Implemented the XHAD Hadamard product routine
parent
ae66782eab
commit
69ed46c8da
|
@ -1,4 +1,8 @@
|
|||
|
||||
Development (next version)
|
||||
- Added non-BLAS level-1 routines:
|
||||
* SHAD/DHAD/CHAD/ZHAD/HHAD (Hadamard element-wise vector-vector product)
|
||||
|
||||
Version 1.3.0
|
||||
- Re-designed and integrated the auto-tuner, no more dependency on CLTune
|
||||
- Made it possible to override the tuning parameters in the clients straight from JSON tuning files
|
||||
|
|
|
@ -319,6 +319,7 @@ In addition, some extra non-BLAS routines are also supported by CLBlast, classif
|
|||
| IxAMIN | ✔ | ✔ | ✔ | ✔ | ✔ |
|
||||
| IxMAX | ✔ | ✔ | ✔ | ✔ | ✔ |
|
||||
| IxMIN | ✔ | ✔ | ✔ | ✔ | ✔ |
|
||||
| xHAD | ✔ | ✔ | ✔ | ✔ | ✔ | (Hadamard product)
|
||||
| xOMATCOPY | ✔ | ✔ | ✔ | ✔ | ✔ |
|
||||
| xIM2COL | ✔ | ✔ | ✔ | ✔ | ✔ |
|
||||
|
||||
|
|
|
@ -13,7 +13,7 @@ This file gives an overview of the main features planned for addition to CLBlast
|
|||
| [#207](https://github.com/CNugteren/CLBlast/issues/207) | Dec '17 | CNugteren | ✔ | Tuning of the TRSM/TRSV routines |
|
||||
| [#195](https://github.com/CNugteren/CLBlast/issues/195) | Jan '18 | CNugteren | ✔ | Extra GEMM API with pre-allocated temporary buffer |
|
||||
| [#95](https://github.com/CNugteren/CLBlast/issues/95) & #237 | Jan '18 | CNugteren | ✔ | Implement strided batch GEMM |
|
||||
| [#224](https://github.com/CNugteren/CLBlast/issues/224) | Jan-Feb '18 | CNugteren | | Implement Hadamard product (element-wise vector-vector product) |
|
||||
| [#224](https://github.com/CNugteren/CLBlast/issues/224) | Jan-Feb '18 | CNugteren | ✔ | Implement Hadamard product (element-wise vector-vector product) |
|
||||
| [#233](https://github.com/CNugteren/CLBlast/issues/233) | Feb '18 | CNugteren | | Add CLBlast to common package managers |
|
||||
| [#223](https://github.com/CNugteren/CLBlast/issues/223) | Feb '18 | CNugteren | | Python OpenCL interface |
|
||||
| [#169](https://github.com/CNugteren/CLBlast/issues/169) | ?? | dividiti | | Problem-specific tuning parameter selection |
|
||||
|
|
|
@ -0,0 +1,145 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file contains the Xhad kernel. It contains one fast vectorized version in case of unit
|
||||
// strides (incx=incy=incz=1) and no offsets (offx=offy=offz=0). Another version is more general,
|
||||
// but doesn't support vector data-types. Based on the XAXPY kernels.
|
||||
//
|
||||
// This kernel uses the level-1 BLAS common tuning parameters.
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
|
||||
// literal). Comment-out this line for syntax-highlighting when developing.
|
||||
R"(
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// A vector-vector multiply function. See also level1.opencl for a vector-scalar version
|
||||
INLINE_FUNC realV MultiplyVectorVector(realV cvec, const realV aval, const realV bvec) {
|
||||
#if VW == 1
|
||||
Multiply(cvec, aval, bvec);
|
||||
#elif VW == 2
|
||||
Multiply(cvec.x, aval.x, bvec.x);
|
||||
Multiply(cvec.y, aval.y, bvec.y);
|
||||
#elif VW == 4
|
||||
Multiply(cvec.x, aval.x, bvec.x);
|
||||
Multiply(cvec.y, aval.y, bvec.y);
|
||||
Multiply(cvec.z, aval.z, bvec.z);
|
||||
Multiply(cvec.w, aval.w, bvec.w);
|
||||
#elif VW == 8
|
||||
Multiply(cvec.s0, aval.s0, bvec.s0);
|
||||
Multiply(cvec.s1, aval.s1, bvec.s1);
|
||||
Multiply(cvec.s2, aval.s2, bvec.s2);
|
||||
Multiply(cvec.s3, aval.s3, bvec.s3);
|
||||
Multiply(cvec.s4, aval.s4, bvec.s4);
|
||||
Multiply(cvec.s5, aval.s5, bvec.s5);
|
||||
Multiply(cvec.s6, aval.s6, bvec.s6);
|
||||
Multiply(cvec.s7, aval.s7, bvec.s7);
|
||||
#elif VW == 16
|
||||
Multiply(cvec.s0, aval.s0, bvec.s0);
|
||||
Multiply(cvec.s1, aval.s1, bvec.s1);
|
||||
Multiply(cvec.s2, aval.s2, bvec.s2);
|
||||
Multiply(cvec.s3, aval.s3, bvec.s3);
|
||||
Multiply(cvec.s4, aval.s4, bvec.s4);
|
||||
Multiply(cvec.s5, aval.s5, bvec.s5);
|
||||
Multiply(cvec.s6, aval.s6, bvec.s6);
|
||||
Multiply(cvec.s7, aval.s7, bvec.s7);
|
||||
Multiply(cvec.s8, aval.s8, bvec.s8);
|
||||
Multiply(cvec.s9, aval.s9, bvec.s9);
|
||||
Multiply(cvec.sA, aval.sA, bvec.sA);
|
||||
Multiply(cvec.sB, aval.sB, bvec.sB);
|
||||
Multiply(cvec.sC, aval.sC, bvec.sC);
|
||||
Multiply(cvec.sD, aval.sD, bvec.sD);
|
||||
Multiply(cvec.sE, aval.sE, bvec.sE);
|
||||
Multiply(cvec.sF, aval.sF, bvec.sF);
|
||||
#endif
|
||||
return cvec;
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Full version of the kernel with offsets and strided accesses
|
||||
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
|
||||
void Xhad(const int n, const real_arg arg_alpha, const real_arg arg_beta,
|
||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
||||
const __global real* restrict ygm, const int y_offset, const int y_inc,
|
||||
__global real* zgm, const int z_offset, const int z_inc) {
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
const real beta = GetRealArg(arg_beta);
|
||||
|
||||
// Loops over the work that needs to be done (allows for an arbitrary number of threads)
|
||||
for (int id = get_global_id(0); id < n; id += get_global_size(0)) {
|
||||
real xvalue = xgm[id*x_inc + x_offset];
|
||||
real yvalue = ygm[id*y_inc + y_offset];
|
||||
real zvalue = zgm[id*z_inc + z_offset];
|
||||
real result;
|
||||
real alpha_times_x;
|
||||
Multiply(alpha_times_x, alpha, xvalue);
|
||||
Multiply(result, alpha_times_x, yvalue);
|
||||
MultiplyAdd(result, beta, zvalue);
|
||||
zgm[id*z_inc + z_offset] = result;
|
||||
}
|
||||
}
|
||||
|
||||
// Faster version of the kernel without offsets and strided accesses but with if-statement. Also
|
||||
// assumes that 'n' is dividable by 'VW' and 'WPT'.
|
||||
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
|
||||
void XhadFaster(const int n, const real_arg arg_alpha, const real_arg arg_beta,
|
||||
const __global realV* restrict xgm, const __global realV* restrict ygm,
|
||||
__global realV* zgm) {
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
const real beta = GetRealArg(arg_beta);
|
||||
|
||||
if (get_global_id(0) < n / (VW)) {
|
||||
#pragma unroll
|
||||
for (int _w = 0; _w < WPT; _w += 1) {
|
||||
const int id = _w*get_global_size(0) + get_global_id(0);
|
||||
realV xvalue = xgm[id];
|
||||
realV yvalue = ygm[id];
|
||||
realV zvalue = zgm[id];
|
||||
realV result;
|
||||
realV alpha_times_x;
|
||||
alpha_times_x = MultiplyVector(alpha_times_x, alpha, xvalue);
|
||||
result = MultiplyVectorVector(result, alpha_times_x, yvalue);
|
||||
zgm[id] = MultiplyAddVector(result, beta, zvalue);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
|
||||
// dividable by 'VW', 'WGS' and 'WPT'.
|
||||
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
|
||||
void XhadFastest(const int n, const real_arg arg_alpha, const real_arg arg_beta,
|
||||
const __global realV* restrict xgm, const __global realV* restrict ygm,
|
||||
__global realV* zgm) {
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
const real beta = GetRealArg(arg_beta);
|
||||
|
||||
#pragma unroll
|
||||
for (int _w = 0; _w < WPT; _w += 1) {
|
||||
const int id = _w*get_global_size(0) + get_global_id(0);
|
||||
realV xvalue = xgm[id];
|
||||
realV yvalue = ygm[id];
|
||||
realV zvalue = zgm[id];
|
||||
realV result;
|
||||
realV alpha_times_x;
|
||||
alpha_times_x = MultiplyVector(alpha_times_x, alpha, xvalue);
|
||||
result = MultiplyVectorVector(result, alpha_times_x, yvalue);
|
||||
zgm[id] = MultiplyAddVector(result, beta, zvalue);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// End of the C++11 raw string literal
|
||||
)"
|
||||
|
||||
// =================================================================================================
|
|
@ -24,7 +24,7 @@ template <typename T>
|
|||
Xhad<T>::Xhad(Queue &queue, EventPointer event, const std::string &name):
|
||||
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
|
||||
#include "../../kernels/level1/level1.opencl"
|
||||
#include "../../kernels/level1/xaxpy.opencl"
|
||||
#include "../../kernels/level1/xhad.opencl"
|
||||
}) {
|
||||
}
|
||||
|
||||
|
@ -45,6 +45,62 @@ void Xhad<T>::DoHad(const size_t n, const T alpha,
|
|||
TestVectorY(n, y_buffer, y_offset, y_inc);
|
||||
TestVectorY(n, z_buffer, z_offset, z_inc); // TODO: Make a TestVectorZ function with error codes
|
||||
|
||||
// Determines whether or not the fast-version can be used
|
||||
const auto use_faster_kernel = (x_offset == 0) && (x_inc == 1) &&
|
||||
(y_offset == 0) && (y_inc == 1) &&
|
||||
(z_offset == 0) && (z_inc == 1) &&
|
||||
IsMultiple(n, db_["WPT"]*db_["VW"]);
|
||||
const auto use_fastest_kernel = use_faster_kernel &&
|
||||
IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]);
|
||||
|
||||
// If possible, run the fast-version of the kernel
|
||||
const auto kernel_name = (use_fastest_kernel) ? "XhadFastest" :
|
||||
(use_faster_kernel) ? "XhadFaster" : "Xhad";
|
||||
|
||||
// Retrieves the Xhad kernel from the compiled binary
|
||||
auto kernel = Kernel(program_, kernel_name);
|
||||
|
||||
// Sets the kernel arguments
|
||||
if (use_faster_kernel || use_fastest_kernel) {
|
||||
kernel.SetArgument(0, static_cast<int>(n));
|
||||
kernel.SetArgument(1, GetRealArg(alpha));
|
||||
kernel.SetArgument(2, GetRealArg(beta));
|
||||
kernel.SetArgument(3, x_buffer());
|
||||
kernel.SetArgument(4, y_buffer());
|
||||
kernel.SetArgument(5, z_buffer());
|
||||
}
|
||||
else {
|
||||
kernel.SetArgument(0, static_cast<int>(n));
|
||||
kernel.SetArgument(1, GetRealArg(alpha));
|
||||
kernel.SetArgument(2, GetRealArg(beta));
|
||||
kernel.SetArgument(3, x_buffer());
|
||||
kernel.SetArgument(4, static_cast<int>(x_offset));
|
||||
kernel.SetArgument(5, static_cast<int>(x_inc));
|
||||
kernel.SetArgument(6, y_buffer());
|
||||
kernel.SetArgument(7, static_cast<int>(y_offset));
|
||||
kernel.SetArgument(8, static_cast<int>(y_inc));
|
||||
kernel.SetArgument(9, z_buffer());
|
||||
kernel.SetArgument(10, static_cast<int>(z_offset));
|
||||
kernel.SetArgument(11, static_cast<int>(z_inc));
|
||||
}
|
||||
|
||||
// Launches the kernel
|
||||
if (use_fastest_kernel) {
|
||||
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
|
||||
auto local = std::vector<size_t>{db_["WGS"]};
|
||||
RunKernel(kernel, queue_, device_, global, local, event_);
|
||||
}
|
||||
else if (use_faster_kernel) {
|
||||
auto global = std::vector<size_t>{Ceil(CeilDiv(n, db_["WPT"]*db_["VW"]), db_["WGS"])};
|
||||
auto local = std::vector<size_t>{db_["WGS"]};
|
||||
RunKernel(kernel, queue_, device_, global, local, event_);
|
||||
}
|
||||
else {
|
||||
const auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
|
||||
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
|
||||
auto local = std::vector<size_t>{db_["WGS"]};
|
||||
RunKernel(kernel, queue_, device_, global, local, event_);
|
||||
}
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
|
|
@ -21,12 +21,43 @@
|
|||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
template <typename T>
|
||||
StatusCode RunReference(const Arguments<T> &args, BuffersHost<T> &buffers_host) {
|
||||
for (auto index = size_t{0}; index < args.n; ++index) {
|
||||
const auto x = buffers_host.x_vec[index * args.x_inc + args.x_offset];
|
||||
const auto y = buffers_host.y_vec[index * args.y_inc + args.y_offset];
|
||||
const auto z = buffers_host.c_mat[index]; // * args.z_inc + args.z_offset];
|
||||
buffers_host.c_mat[index] = args.alpha * x * y + args.beta * z;
|
||||
}
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
|
||||
// Half-precision version calling the above reference implementation after conversions
|
||||
template <>
|
||||
StatusCode RunReference<half>(const Arguments<half> &args, BuffersHost<half> &buffers_host) {
|
||||
auto x_buffer2 = HalfToFloatBuffer(buffers_host.x_vec);
|
||||
auto y_buffer2 = HalfToFloatBuffer(buffers_host.y_vec);
|
||||
auto c_buffer2 = HalfToFloatBuffer(buffers_host.c_mat);
|
||||
auto dummy = std::vector<float>(0);
|
||||
auto buffers2 = BuffersHost<float>{x_buffer2, y_buffer2, dummy, dummy, c_buffer2, dummy, dummy};
|
||||
auto args2 = Arguments<float>();
|
||||
args2.x_size = args.x_size; args2.y_size = args.y_size; args2.c_size = args.c_size;
|
||||
args2.x_inc = args.x_inc; args2.y_inc = args.y_inc; args2.n = args.n;
|
||||
args2.x_offset = args.x_offset; args2.y_offset = args.y_offset;
|
||||
args2.alpha = HalfToFloat(args.alpha); args2.beta = HalfToFloat(args.beta);
|
||||
auto status = RunReference(args2, buffers2);
|
||||
FloatToHalfBuffer(buffers_host.c_mat, buffers2.c_mat);
|
||||
return status;
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// See comment at top of file for a description of the class
|
||||
template <typename T>
|
||||
class TestXhad {
|
||||
public:
|
||||
|
||||
// The BLAS level: 4 for the extra routines
|
||||
// The BLAS level: 4 for the extra routines (note: tested with matrix-size values for 'n')
|
||||
static size_t BLASLevel() { return 4; }
|
||||
|
||||
// The list of arguments relevant for this routine
|
||||
|
@ -34,7 +65,7 @@ public:
|
|||
return {kArgN,
|
||||
kArgXInc, kArgYInc,
|
||||
kArgXOffset, kArgYOffset,
|
||||
kArgAlpha};
|
||||
kArgAlpha, kArgBeta};
|
||||
}
|
||||
static std::vector<std::string> BuffersIn() { return {kBufVecX, kBufVecY, kBufMatC}; }
|
||||
static std::vector<std::string> BuffersOut() { return {kBufMatC}; }
|
||||
|
@ -134,19 +165,6 @@ public:
|
|||
}
|
||||
};
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
template <typename T>
|
||||
StatusCode RunReference(const Arguments<T> &args, BuffersHost<T> &buffers_host) {
|
||||
for (auto index = size_t{0}; index < args.n; ++index) {
|
||||
const auto x = buffers_host.x_vec[index * args.x_inc + args.x_offset];
|
||||
const auto y = buffers_host.y_vec[index * args.y_inc + args.y_offset];
|
||||
const auto z = buffers_host.c_mat[index]; // * args.z_inc + args.z_offset];
|
||||
buffers_host.c_mat[index] = x * y * args.alpha + z * args.beta;
|
||||
}
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
||||
|
||||
|
|
Loading…
Reference in New Issue