mirror of
https://github.com/CNugteren/CLBlast.git
synced 2024-07-04 21:36:57 +02:00
Now passing alpha/beta to the kernel as arguments as before fp16 support; in case of fp16 arguments are cast on host and in kernel
This commit is contained in:
parent
57f09178d8
commit
c87e877bf2
|
@ -5,6 +5,7 @@ Development version (next release)
|
|||
- Fixed memory leaks related to events not being released
|
||||
- Fixed a bug with a size_t and cl_ulong mismatch on 32-bit systems
|
||||
- Fixed a bug related to the cache and retrieval of programs based on the OpenCL context
|
||||
- Fixed a performance issue (caused by fp16 support) by optimizing alpha/beta parameter passing to kernels
|
||||
- Added an option (-warm_up) to do a warm-up run before timing in the performance clients
|
||||
- Added tuned parameters for various devices (see README)
|
||||
|
||||
|
|
|
@ -109,6 +109,16 @@ R"(
|
|||
typedef real singlereal;
|
||||
#endif
|
||||
|
||||
// Converts a 'real argument' value to a 'real' value as passed to the kernel. Normally there is no
|
||||
// conversion, but half-precision is not supported as kernel argument so it is converted from float.
|
||||
#if PRECISION == 16
|
||||
typedef float real_arg;
|
||||
#define GetRealArg(x) (half)x
|
||||
#else
|
||||
typedef real real_arg;
|
||||
#define GetRealArg(x) x
|
||||
#endif
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Don't use the non-IEEE754 compliant OpenCL built-in mad() instruction per default. For specific
|
||||
|
|
|
@ -23,10 +23,10 @@ R"(
|
|||
|
||||
// Full version of the kernel with offsets and strided accesses
|
||||
__attribute__((reqd_work_group_size(WGS, 1, 1)))
|
||||
__kernel void Xaxpy(const int n, const __constant real* restrict arg_alpha,
|
||||
__kernel void Xaxpy(const int n, const real_arg arg_alpha,
|
||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
||||
__global real* ygm, const int y_offset, const int y_inc) {
|
||||
const real alpha = arg_alpha[0];
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
|
||||
// Loops over the work that needs to be done (allows for an arbitrary number of threads)
|
||||
#pragma unroll
|
||||
|
@ -41,10 +41,10 @@ __kernel void Xaxpy(const int n, const __constant real* restrict arg_alpha,
|
|||
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
|
||||
// dividable by 'VW', 'WGS' and 'WPT'.
|
||||
__attribute__((reqd_work_group_size(WGS, 1, 1)))
|
||||
__kernel void XaxpyFast(const int n, const __constant real* restrict arg_alpha,
|
||||
__kernel void XaxpyFast(const int n, const real_arg arg_alpha,
|
||||
const __global realV* restrict xgm,
|
||||
__global realV* ygm) {
|
||||
const real alpha = arg_alpha[0];
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
|
||||
#pragma unroll
|
||||
for (int w=0; w<WPT; ++w) {
|
||||
|
|
|
@ -212,16 +212,16 @@ inline real LoadMatrixA(const __global real* restrict agm, const int x, const in
|
|||
// Full version of the kernel
|
||||
__attribute__((reqd_work_group_size(WGS1, 1, 1)))
|
||||
__kernel void Xgemv(const int m, const int n,
|
||||
const __constant real* restrict arg_alpha,
|
||||
const __constant real* restrict arg_beta,
|
||||
const real_arg arg_alpha,
|
||||
const real_arg arg_beta,
|
||||
const int a_rotated,
|
||||
const __global real* restrict agm, const int a_offset, const int a_ld,
|
||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
||||
__global real* ygm, const int y_offset, const int y_inc,
|
||||
const int do_conjugate, const int parameter,
|
||||
const int kl, const int ku) {
|
||||
const real alpha = arg_alpha[0];
|
||||
const real beta = arg_beta[0];
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
const real beta = GetRealArg(arg_beta);
|
||||
|
||||
// Local memory for the vector X
|
||||
__local real xlm[WGS1];
|
||||
|
|
|
@ -96,16 +96,16 @@ inline realVFR LoadMatrixAVFR(const __global realVFR* restrict agm, const int x,
|
|||
// --> 'do_conjugate' is 0
|
||||
__attribute__((reqd_work_group_size(WGS2, 1, 1)))
|
||||
__kernel void XgemvFast(const int m, const int n,
|
||||
const __constant real* restrict arg_alpha,
|
||||
const __constant real* restrict arg_beta,
|
||||
const real_arg arg_alpha,
|
||||
const real_arg arg_beta,
|
||||
const int a_rotated,
|
||||
const __global realVF* restrict agm, const int a_offset, const int a_ld,
|
||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
||||
__global real* ygm, const int y_offset, const int y_inc,
|
||||
const int do_conjugate, const int parameter,
|
||||
const int kl, const int ku) {
|
||||
const real alpha = arg_alpha[0];
|
||||
const real beta = arg_beta[0];
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
const real beta = GetRealArg(arg_beta);
|
||||
|
||||
// Local memory for the vector X
|
||||
__local real xlm[WGS2];
|
||||
|
@ -198,16 +198,16 @@ __kernel void XgemvFast(const int m, const int n,
|
|||
// --> 'do_conjugate' is 0
|
||||
__attribute__((reqd_work_group_size(WGS3, 1, 1)))
|
||||
__kernel void XgemvFastRot(const int m, const int n,
|
||||
const __constant real* restrict arg_alpha,
|
||||
const __constant real* restrict arg_beta,
|
||||
const real_arg arg_alpha,
|
||||
const real_arg arg_beta,
|
||||
const int a_rotated,
|
||||
const __global realVFR* restrict agm, const int a_offset, const int a_ld,
|
||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
||||
__global real* ygm, const int y_offset, const int y_inc,
|
||||
const int do_conjugate, const int parameter,
|
||||
const int kl, const int ku) {
|
||||
const real alpha = arg_alpha[0];
|
||||
const real beta = arg_beta[0];
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
const real beta = GetRealArg(arg_beta);
|
||||
|
||||
// Local memory for the vector X
|
||||
__local real xlm[WGS3];
|
||||
|
|
|
@ -20,12 +20,12 @@ R"(
|
|||
// Regular version of the rank-1 matrix update kernel (GER, GERU, GERC)
|
||||
__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
|
||||
__kernel void Xger(const int max1, const int max2,
|
||||
const __constant real* restrict arg_alpha,
|
||||
const real_arg arg_alpha,
|
||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
||||
const __global real* ygm, const int y_offset, const int y_inc,
|
||||
__global real* restrict agm, const int a_offset, const int a_ld,
|
||||
const int is_rowmajor) {
|
||||
const real alpha = arg_alpha[0];
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
|
||||
// Register storage for X and Y
|
||||
real xvalues[WPT];
|
||||
|
|
|
@ -20,11 +20,11 @@ R"(
|
|||
// Symmetric version of the rank-1 matrix update kernel (HER, HPR, SYR, SPR)
|
||||
__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
|
||||
__kernel void Xher(const int n,
|
||||
const __constant real* restrict arg_alpha,
|
||||
const real_arg arg_alpha,
|
||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
||||
__global real* restrict agm, const int a_offset, const int a_ld,
|
||||
const int is_upper, const int is_rowmajor) {
|
||||
const real alpha = arg_alpha[0];
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
|
||||
// Register storage for X and XT
|
||||
real xvalues[WPT];
|
||||
|
|
|
@ -20,12 +20,12 @@ R"(
|
|||
// Symmetric version of the rank-2 matrix update kernel (HER2, HPR2, SYR2, SPR2)
|
||||
__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
|
||||
__kernel void Xher2(const int n,
|
||||
const __constant real* restrict arg_alpha,
|
||||
const real_arg arg_alpha,
|
||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
||||
const __global real* restrict ygm, const int y_offset, const int y_inc,
|
||||
__global real* restrict agm, const int a_offset, const int a_ld,
|
||||
const int is_upper, const int is_rowmajor) {
|
||||
const real alpha = arg_alpha[0];
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
|
||||
// Register storage for X and Y
|
||||
real xvalues[WPT];
|
||||
|
|
|
@ -39,8 +39,8 @@ __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
|
|||
__kernel void CopyMatrixFast(const int ld,
|
||||
__global const realC* restrict src,
|
||||
__global realC* dest,
|
||||
const __constant real* restrict arg_alpha) {
|
||||
const real alpha = arg_alpha[0];
|
||||
const real_arg arg_alpha) {
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
#pragma unroll
|
||||
for (int w_one=0; w_one<COPY_WPT; ++w_one) {
|
||||
const int id_one = get_global_id(0);
|
||||
|
|
|
@ -31,9 +31,9 @@ __kernel void CopyPadMatrix(const int src_one, const int src_two,
|
|||
const int dest_one, const int dest_two,
|
||||
const int dest_ld, const int dest_offset,
|
||||
__global real* dest,
|
||||
const __constant real* restrict arg_alpha,
|
||||
const real_arg arg_alpha,
|
||||
const int do_conjugate) {
|
||||
const real alpha = arg_alpha[0];
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
|
||||
// Loops over the work per thread in both dimensions
|
||||
#pragma unroll
|
||||
|
@ -72,10 +72,10 @@ __kernel void CopyMatrix(const int src_one, const int src_two,
|
|||
const int dest_one, const int dest_two,
|
||||
const int dest_ld, const int dest_offset,
|
||||
__global real* dest,
|
||||
const __constant real* restrict arg_alpha,
|
||||
const real_arg arg_alpha,
|
||||
const int upper, const int lower,
|
||||
const int diagonal_imag_zero) {
|
||||
const real alpha = arg_alpha[0];
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
|
||||
// Loops over the work per thread in both dimensions
|
||||
#pragma unroll
|
||||
|
|
|
@ -40,8 +40,8 @@ __attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1)))
|
|||
__kernel void TransposeMatrixFast(const int ld,
|
||||
__global const realT* restrict src,
|
||||
__global realT* dest,
|
||||
const __constant real* restrict arg_alpha) {
|
||||
const real alpha = arg_alpha[0];
|
||||
const real_arg arg_alpha) {
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
|
||||
// Sets the group identifiers. They might be 'shuffled' around to distribute work in a different
|
||||
// way over workgroups, breaking memory-bank dependencies.
|
||||
|
|
|
@ -31,9 +31,9 @@ __kernel void TransposePadMatrix(const int src_one, const int src_two,
|
|||
const int dest_one, const int dest_two,
|
||||
const int dest_ld, const int dest_offset,
|
||||
__global real* dest,
|
||||
const __constant real* restrict arg_alpha,
|
||||
const real_arg arg_alpha,
|
||||
const int do_conjugate) {
|
||||
const real alpha = arg_alpha[0];
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
|
||||
// Local memory to store a tile of the matrix (for coalescing)
|
||||
__local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
|
||||
|
@ -95,10 +95,10 @@ __kernel void TransposeMatrix(const int src_one, const int src_two,
|
|||
const int dest_one, const int dest_two,
|
||||
const int dest_ld, const int dest_offset,
|
||||
__global real* dest,
|
||||
const __constant real* restrict arg_alpha,
|
||||
const real_arg arg_alpha,
|
||||
const int upper, const int lower,
|
||||
const int diagonal_imag_zero) {
|
||||
const real alpha = arg_alpha[0];
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
|
||||
// Local memory to store a tile of the matrix (for coalescing)
|
||||
__local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
|
||||
|
|
|
@ -270,13 +270,13 @@ inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
|
|||
// Main entry point of the kernel. This is the upper-triangular version.
|
||||
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
|
||||
__kernel void XgemmUpper(const int kSizeN, const int kSizeK,
|
||||
const __constant real* restrict arg_alpha,
|
||||
const __constant real* restrict arg_beta,
|
||||
const real_arg arg_alpha,
|
||||
const real_arg arg_beta,
|
||||
const __global realM* restrict agm,
|
||||
const __global realN* restrict bgm,
|
||||
__global realM* cgm) {
|
||||
const real alpha = arg_alpha[0];
|
||||
const real beta = arg_beta[0];
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
const real beta = GetRealArg(arg_beta);
|
||||
|
||||
// Skip these threads if they do not contain threads contributing to the upper-triangle
|
||||
if (GetGroupID1()*NWG < GetGroupID0()*MWG) {
|
||||
|
@ -310,13 +310,13 @@ __kernel void XgemmUpper(const int kSizeN, const int kSizeK,
|
|||
// Main entry point of the kernel. This is the lower-triangular version.
|
||||
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
|
||||
__kernel void XgemmLower(const int kSizeN, const int kSizeK,
|
||||
const __constant real* restrict arg_alpha,
|
||||
const __constant real* restrict arg_beta,
|
||||
const real_arg arg_alpha,
|
||||
const real_arg arg_beta,
|
||||
const __global realM* restrict agm,
|
||||
const __global realN* restrict bgm,
|
||||
__global realM* cgm) {
|
||||
const real alpha = arg_alpha[0];
|
||||
const real beta = arg_beta[0];
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
const real beta = GetRealArg(arg_beta);
|
||||
|
||||
// Skip these threads if they do not contain threads contributing to the lower-triangle
|
||||
if (GetGroupID1()*NWG > GetGroupID0()*MWG) {
|
||||
|
@ -354,13 +354,13 @@ __kernel void XgemmLower(const int kSizeN, const int kSizeK,
|
|||
// Main entry point of the kernel. This is the regular full version.
|
||||
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
|
||||
__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||
const __constant real* restrict arg_alpha,
|
||||
const __constant real* restrict arg_beta,
|
||||
const real_arg arg_alpha,
|
||||
const real_arg arg_beta,
|
||||
const __global realM* restrict agm,
|
||||
const __global realN* restrict bgm,
|
||||
__global realM* cgm) {
|
||||
const real alpha = arg_alpha[0];
|
||||
const real beta = arg_beta[0];
|
||||
const real alpha = GetRealArg(arg_alpha);
|
||||
const real beta = GetRealArg(arg_beta);
|
||||
|
||||
// Allocates workgroup-private memory (local memory)
|
||||
#if SA == 1
|
||||
|
|
|
@ -88,10 +88,6 @@ StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Cont
|
|||
}
|
||||
}
|
||||
|
||||
// Upload the scalar argument as a constant buffer to the device (needed for half-precision)
|
||||
auto alpha_buffer = Buffer<T>(context, 1);
|
||||
alpha_buffer.Write(queue, 1, &alpha);
|
||||
|
||||
// Retrieves the kernel from the compiled binary
|
||||
try {
|
||||
auto kernel = Kernel(program, kernel_name);
|
||||
|
@ -101,7 +97,7 @@ StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Cont
|
|||
kernel.SetArgument(0, static_cast<int>(src_ld));
|
||||
kernel.SetArgument(1, src());
|
||||
kernel.SetArgument(2, dest());
|
||||
kernel.SetArgument(3, alpha_buffer());
|
||||
kernel.SetArgument(3, GetRealArg(alpha));
|
||||
}
|
||||
else {
|
||||
kernel.SetArgument(0, static_cast<int>(src_one));
|
||||
|
@ -114,7 +110,7 @@ StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Cont
|
|||
kernel.SetArgument(7, static_cast<int>(dest_ld));
|
||||
kernel.SetArgument(8, static_cast<int>(dest_offset));
|
||||
kernel.SetArgument(9, dest());
|
||||
kernel.SetArgument(10, alpha_buffer());
|
||||
kernel.SetArgument(10, GetRealArg(alpha));
|
||||
if (do_pad) {
|
||||
kernel.SetArgument(11, static_cast<int>(do_conjugate));
|
||||
}
|
||||
|
|
|
@ -59,20 +59,16 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
|
|||
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
|
||||
auto kernel = Kernel(program, kernel_name);
|
||||
|
||||
// Upload the scalar argument as a constant buffer to the device (needed for half-precision)
|
||||
auto alpha_buffer = Buffer<T>(context_, 1);
|
||||
alpha_buffer.Write(queue_, 1, &alpha);
|
||||
|
||||
// Sets the kernel arguments
|
||||
if (use_fast_kernel) {
|
||||
kernel.SetArgument(0, static_cast<int>(n));
|
||||
kernel.SetArgument(1, alpha_buffer());
|
||||
kernel.SetArgument(1, GetRealArg(alpha));
|
||||
kernel.SetArgument(2, x_buffer());
|
||||
kernel.SetArgument(3, y_buffer());
|
||||
}
|
||||
else {
|
||||
kernel.SetArgument(0, static_cast<int>(n));
|
||||
kernel.SetArgument(1, alpha_buffer());
|
||||
kernel.SetArgument(1, GetRealArg(alpha));
|
||||
kernel.SetArgument(2, x_buffer());
|
||||
kernel.SetArgument(3, static_cast<int>(x_offset));
|
||||
kernel.SetArgument(4, static_cast<int>(x_inc));
|
||||
|
|
|
@ -126,12 +126,6 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
|
|||
local_size = db_["WGS3"];
|
||||
}
|
||||
|
||||
// Upload the scalar arguments as constant buffers to the device (needed for half-precision)
|
||||
auto alpha_buffer = Buffer<T>(context_, 1);
|
||||
auto beta_buffer = Buffer<T>(context_, 1);
|
||||
alpha_buffer.Write(queue_, 1, &alpha);
|
||||
beta_buffer.Write(queue_, 1, &beta);
|
||||
|
||||
// Retrieves the Xgemv kernel from the compiled binary
|
||||
try {
|
||||
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
|
||||
|
@ -140,8 +134,8 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
|
|||
// Sets the kernel arguments
|
||||
kernel.SetArgument(0, static_cast<int>(m_real));
|
||||
kernel.SetArgument(1, static_cast<int>(n_real));
|
||||
kernel.SetArgument(2, alpha_buffer());
|
||||
kernel.SetArgument(3, beta_buffer());
|
||||
kernel.SetArgument(2, GetRealArg(alpha));
|
||||
kernel.SetArgument(3, GetRealArg(beta));
|
||||
kernel.SetArgument(4, static_cast<int>(a_rotated));
|
||||
kernel.SetArgument(5, a_buffer());
|
||||
kernel.SetArgument(6, static_cast<int>(a_offset));
|
||||
|
|
|
@ -56,10 +56,6 @@ StatusCode Xger<T>::DoGer(const Layout layout,
|
|||
status = TestVectorY(n, y_buffer, y_offset, y_inc);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Upload the scalar argument as a constant buffer to the device (needed for half-precision)
|
||||
auto alpha_buffer = Buffer<T>(context_, 1);
|
||||
alpha_buffer.Write(queue_, 1, &alpha);
|
||||
|
||||
// Retrieves the kernel from the compiled binary
|
||||
try {
|
||||
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
|
||||
|
@ -68,7 +64,7 @@ StatusCode Xger<T>::DoGer(const Layout layout,
|
|||
// Sets the kernel arguments
|
||||
kernel.SetArgument(0, static_cast<int>(a_one));
|
||||
kernel.SetArgument(1, static_cast<int>(a_two));
|
||||
kernel.SetArgument(2, alpha_buffer());
|
||||
kernel.SetArgument(2, GetRealArg(alpha));
|
||||
kernel.SetArgument(3, x_buffer());
|
||||
kernel.SetArgument(4, static_cast<int>(x_offset));
|
||||
kernel.SetArgument(5, static_cast<int>(x_inc));
|
||||
|
|
|
@ -70,10 +70,6 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
|
|||
// Creates a matching version of alpha
|
||||
const auto matching_alpha = GetAlpha(alpha);
|
||||
|
||||
// Upload the scalar argument as a constant buffer to the device (needed for half-precision)
|
||||
auto alpha_buffer = Buffer<T>(context_, 1);
|
||||
alpha_buffer.Write(queue_, 1, &matching_alpha);
|
||||
|
||||
// Retrieves the kernel from the compiled binary
|
||||
try {
|
||||
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
|
||||
|
@ -81,7 +77,7 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
|
|||
|
||||
// Sets the kernel arguments
|
||||
kernel.SetArgument(0, static_cast<int>(n));
|
||||
kernel.SetArgument(1, alpha_buffer());
|
||||
kernel.SetArgument(1, GetRealArg(matching_alpha));
|
||||
kernel.SetArgument(2, x_buffer());
|
||||
kernel.SetArgument(3, static_cast<int>(x_offset));
|
||||
kernel.SetArgument(4, static_cast<int>(x_inc));
|
||||
|
|
|
@ -58,10 +58,6 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
|
|||
status = TestVectorY(n, y_buffer, y_offset, y_inc);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Upload the scalar argument as a constant buffer to the device (needed for half-precision)
|
||||
auto alpha_buffer = Buffer<T>(context_, 1);
|
||||
alpha_buffer.Write(queue_, 1, &alpha);
|
||||
|
||||
// Retrieves the kernel from the compiled binary
|
||||
try {
|
||||
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
|
||||
|
@ -69,7 +65,7 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
|
|||
|
||||
// Sets the kernel arguments
|
||||
kernel.SetArgument(0, static_cast<int>(n));
|
||||
kernel.SetArgument(1, alpha_buffer());
|
||||
kernel.SetArgument(1, GetRealArg(alpha));
|
||||
kernel.SetArgument(2, x_buffer());
|
||||
kernel.SetArgument(3, static_cast<int>(x_offset));
|
||||
kernel.SetArgument(4, static_cast<int>(x_inc));
|
||||
|
|
|
@ -118,12 +118,6 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
|
|||
const auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
|
||||
const auto c_temp = (c_no_temp) ? c_buffer : Buffer<T>(context_, m_ceiled*n_ceiled);
|
||||
|
||||
// Upload the scalar arguments as constant buffers to the device (needed for half-precision)
|
||||
auto alpha_buffer = Buffer<T>(context_, 1);
|
||||
auto beta_buffer = Buffer<T>(context_, 1);
|
||||
alpha_buffer.Write(queue_, 1, &alpha);
|
||||
beta_buffer.Write(queue_, 1, &beta);
|
||||
|
||||
// Events of all kernels (including pre/post processing kernels)
|
||||
auto eventWaitList = std::vector<Event>();
|
||||
auto emptyEventList = std::vector<Event>();
|
||||
|
@ -174,8 +168,8 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
|
|||
kernel.SetArgument(0, static_cast<int>(m_ceiled));
|
||||
kernel.SetArgument(1, static_cast<int>(n_ceiled));
|
||||
kernel.SetArgument(2, static_cast<int>(k_ceiled));
|
||||
kernel.SetArgument(3, alpha_buffer());
|
||||
kernel.SetArgument(4, beta_buffer());
|
||||
kernel.SetArgument(3, GetRealArg(alpha));
|
||||
kernel.SetArgument(4, GetRealArg(beta));
|
||||
kernel.SetArgument(5, a_temp());
|
||||
kernel.SetArgument(6, b_temp());
|
||||
kernel.SetArgument(7, c_temp());
|
||||
|
|
|
@ -107,12 +107,8 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
|
|||
auto b2_temp = (b2_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
|
||||
auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
|
||||
|
||||
// Upload the scalar arguments as constant buffers to the device (needed for half-precision)
|
||||
// Convert the arguments to complex versions
|
||||
auto complex_beta = T{beta, static_cast<U>(0.0)};
|
||||
auto alpha_buffer = Buffer<T>(context_, 1);
|
||||
auto beta_buffer = Buffer<T>(context_, 1);
|
||||
alpha_buffer.Write(queue_, 1, &alpha);
|
||||
beta_buffer.Write(queue_, 1, &complex_beta);
|
||||
|
||||
// Events of all kernels (including pre/post processing kernels)
|
||||
auto eventWaitList = std::vector<Event>();
|
||||
|
@ -180,8 +176,8 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
|
|||
// Sets the kernel arguments
|
||||
kernel.SetArgument(0, static_cast<int>(n_ceiled));
|
||||
kernel.SetArgument(1, static_cast<int>(k_ceiled));
|
||||
kernel.SetArgument(2, alpha_buffer());
|
||||
kernel.SetArgument(3, beta_buffer());
|
||||
kernel.SetArgument(2, GetRealArg(alpha));
|
||||
kernel.SetArgument(3, GetRealArg(complex_beta));
|
||||
kernel.SetArgument(4, a1_temp());
|
||||
kernel.SetArgument(5, b2_temp());
|
||||
kernel.SetArgument(6, c_temp());
|
||||
|
@ -202,10 +198,8 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
|
|||
// Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha
|
||||
auto conjugate_alpha = T{alpha.real(), -alpha.imag()};
|
||||
auto complex_one = T{static_cast<U>(1.0), static_cast<U>(0.0)};
|
||||
alpha_buffer.Write(queue_, 1, &conjugate_alpha);
|
||||
beta_buffer.Write(queue_, 1, &complex_one);
|
||||
kernel.SetArgument(2, alpha_buffer());
|
||||
kernel.SetArgument(3, beta_buffer());
|
||||
kernel.SetArgument(2, GetRealArg(conjugate_alpha));
|
||||
kernel.SetArgument(3, GetRealArg(complex_one));
|
||||
kernel.SetArgument(4, b1_temp());
|
||||
kernel.SetArgument(5, a2_temp());
|
||||
|
||||
|
|
|
@ -98,13 +98,9 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
|
|||
auto b_temp = (b_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
|
||||
auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
|
||||
|
||||
// Upload the scalar arguments as constant buffers to the device (needed for half-precision)
|
||||
// Convert the arguments to complex versions
|
||||
auto complex_alpha = T{alpha, static_cast<U>(0.0)};
|
||||
auto complex_beta = T{beta, static_cast<U>(0.0)};
|
||||
auto alpha_buffer = Buffer<T>(context_, 1);
|
||||
auto beta_buffer = Buffer<T>(context_, 1);
|
||||
alpha_buffer.Write(queue_, 1, &complex_alpha);
|
||||
beta_buffer.Write(queue_, 1, &complex_beta);
|
||||
|
||||
// Events of all kernels (including pre/post processing kernels)
|
||||
auto eventWaitList = std::vector<Event>();
|
||||
|
@ -152,8 +148,8 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
|
|||
// Sets the kernel arguments
|
||||
kernel.SetArgument(0, static_cast<int>(n_ceiled));
|
||||
kernel.SetArgument(1, static_cast<int>(k_ceiled));
|
||||
kernel.SetArgument(2, alpha_buffer());
|
||||
kernel.SetArgument(3, beta_buffer());
|
||||
kernel.SetArgument(2, GetRealArg(complex_alpha));
|
||||
kernel.SetArgument(3, GetRealArg(complex_beta));
|
||||
kernel.SetArgument(4, a_temp());
|
||||
kernel.SetArgument(5, b_temp());
|
||||
kernel.SetArgument(6, c_temp());
|
||||
|
|
|
@ -97,12 +97,6 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
|
|||
auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
|
||||
auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
|
||||
|
||||
// Upload the scalar arguments as constant buffers to the device (needed for half-precision)
|
||||
auto alpha_buffer = Buffer<T>(context_, 1);
|
||||
auto beta_buffer = Buffer<T>(context_, 1);
|
||||
alpha_buffer.Write(queue_, 1, &alpha);
|
||||
beta_buffer.Write(queue_, 1, &beta);
|
||||
|
||||
// Events of all kernels (including pre/post processing kernels)
|
||||
auto eventWaitList = std::vector<Event>();
|
||||
auto emptyEventList = std::vector<Event>();
|
||||
|
@ -149,8 +143,8 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
|
|||
// Sets the kernel arguments
|
||||
kernel.SetArgument(0, static_cast<int>(n_ceiled));
|
||||
kernel.SetArgument(1, static_cast<int>(k_ceiled));
|
||||
kernel.SetArgument(2, alpha_buffer());
|
||||
kernel.SetArgument(3, beta_buffer());
|
||||
kernel.SetArgument(2, GetRealArg(alpha));
|
||||
kernel.SetArgument(3, GetRealArg(beta));
|
||||
kernel.SetArgument(4, a_temp());
|
||||
kernel.SetArgument(5, b_temp());
|
||||
kernel.SetArgument(6, c_temp());
|
||||
|
@ -170,8 +164,7 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
|
|||
|
||||
// Swaps the arguments for matrices A and B, and sets 'beta' to 1
|
||||
auto one = static_cast<T>(1);
|
||||
beta_buffer.Write(queue_, 1, &one);
|
||||
kernel.SetArgument(3, beta_buffer());
|
||||
kernel.SetArgument(3, GetRealArg(one));
|
||||
kernel.SetArgument(4, b_temp());
|
||||
kernel.SetArgument(5, a_temp());
|
||||
|
||||
|
|
|
@ -90,12 +90,6 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
|
|||
auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
|
||||
auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
|
||||
|
||||
// Upload the scalar arguments as constant buffers to the device (needed for half-precision)
|
||||
auto alpha_buffer = Buffer<T>(context_, 1);
|
||||
auto beta_buffer = Buffer<T>(context_, 1);
|
||||
alpha_buffer.Write(queue_, 1, &alpha);
|
||||
beta_buffer.Write(queue_, 1, &beta);
|
||||
|
||||
// Events of all kernels (including pre/post processing kernels)
|
||||
auto eventWaitList = std::vector<Event>();
|
||||
auto emptyEventList = std::vector<Event>();
|
||||
|
@ -132,8 +126,8 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
|
|||
// Sets the kernel arguments
|
||||
kernel.SetArgument(0, static_cast<int>(n_ceiled));
|
||||
kernel.SetArgument(1, static_cast<int>(k_ceiled));
|
||||
kernel.SetArgument(2, alpha_buffer());
|
||||
kernel.SetArgument(3, beta_buffer());
|
||||
kernel.SetArgument(2, GetRealArg(alpha));
|
||||
kernel.SetArgument(3, GetRealArg(beta));
|
||||
kernel.SetArgument(4, a_temp());
|
||||
kernel.SetArgument(5, a_temp());
|
||||
kernel.SetArgument(6, c_temp());
|
||||
|
|
|
@ -86,11 +86,10 @@ class TuneCopy {
|
|||
std::vector<T> &, std::vector<T> &,
|
||||
std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
|
||||
std::vector<T> &) {
|
||||
auto alpha_buffer = std::vector<T>{args.alpha};
|
||||
tuner.AddArgumentScalar(static_cast<int>(args.m));
|
||||
tuner.AddArgumentInput(a_mat);
|
||||
tuner.AddArgumentOutput(b_mat);
|
||||
tuner.AddArgumentInput(alpha_buffer);
|
||||
tuner.AddArgumentScalar(GetRealArg(args.alpha));
|
||||
}
|
||||
|
||||
// Describes how to compute the performance metrics
|
||||
|
|
|
@ -86,7 +86,6 @@ class TunePad {
|
|||
std::vector<T> &, std::vector<T> &,
|
||||
std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
|
||||
std::vector<T> &) {
|
||||
auto alpha_buffer = std::vector<T>{args.alpha};
|
||||
tuner.AddArgumentScalar(static_cast<int>(args.m));
|
||||
tuner.AddArgumentScalar(static_cast<int>(args.n));
|
||||
tuner.AddArgumentScalar(static_cast<int>(args.m));
|
||||
|
@ -97,7 +96,7 @@ class TunePad {
|
|||
tuner.AddArgumentScalar(static_cast<int>(args.m));
|
||||
tuner.AddArgumentScalar(0);
|
||||
tuner.AddArgumentOutput(b_mat);
|
||||
tuner.AddArgumentInput(alpha_buffer);
|
||||
tuner.AddArgumentScalar(GetRealArg(args.alpha));
|
||||
tuner.AddArgumentScalar(0);
|
||||
}
|
||||
|
||||
|
|
|
@ -91,11 +91,10 @@ class TuneTranspose {
|
|||
std::vector<T> &, std::vector<T> &,
|
||||
std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
|
||||
std::vector<T> &) {
|
||||
auto alpha_buffer = std::vector<T>{args.alpha};
|
||||
tuner.AddArgumentScalar(static_cast<int>(args.m));
|
||||
tuner.AddArgumentInput(a_mat);
|
||||
tuner.AddArgumentOutput(b_mat);
|
||||
tuner.AddArgumentInput(alpha_buffer);
|
||||
tuner.AddArgumentScalar(GetRealArg(args.alpha));
|
||||
}
|
||||
|
||||
// Describes how to compute the performance metrics
|
||||
|
|
|
@ -90,7 +90,6 @@ class TunePadTranspose {
|
|||
std::vector<T> &, std::vector<T> &,
|
||||
std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
|
||||
std::vector<T> &) {
|
||||
auto alpha_buffer = std::vector<T>{args.alpha};
|
||||
tuner.AddArgumentScalar(static_cast<int>(args.m));
|
||||
tuner.AddArgumentScalar(static_cast<int>(args.n));
|
||||
tuner.AddArgumentScalar(static_cast<int>(args.m));
|
||||
|
@ -101,7 +100,7 @@ class TunePadTranspose {
|
|||
tuner.AddArgumentScalar(static_cast<int>(args.n));
|
||||
tuner.AddArgumentScalar(0);
|
||||
tuner.AddArgumentOutput(b_mat);
|
||||
tuner.AddArgumentInput(alpha_buffer);
|
||||
tuner.AddArgumentScalar(GetRealArg(args.alpha));
|
||||
tuner.AddArgumentScalar(0);
|
||||
}
|
||||
|
||||
|
|
|
@ -89,9 +89,8 @@ class TuneXaxpy {
|
|||
std::vector<T> &x_vec, std::vector<T> &y_vec,
|
||||
std::vector<T> &, std::vector<T> &, std::vector<T> &,
|
||||
std::vector<T> &) {
|
||||
auto alpha_buffer = std::vector<T>{args.alpha};
|
||||
tuner.AddArgumentScalar(static_cast<int>(args.n));
|
||||
tuner.AddArgumentInput(alpha_buffer);
|
||||
tuner.AddArgumentScalar(GetRealArg(args.alpha));
|
||||
tuner.AddArgumentInput(x_vec);
|
||||
tuner.AddArgumentOutput(y_vec);
|
||||
}
|
||||
|
|
|
@ -121,13 +121,11 @@ class TuneXgemm {
|
|||
std::vector<T> &, std::vector<T> &,
|
||||
std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &c_mat,
|
||||
std::vector<T> &) {
|
||||
auto alpha_buffer = std::vector<T>{args.alpha};
|
||||
auto beta_buffer = std::vector<T>{args.beta};
|
||||
tuner.AddArgumentScalar(static_cast<int>(args.m));
|
||||
tuner.AddArgumentScalar(static_cast<int>(args.n));
|
||||
tuner.AddArgumentScalar(static_cast<int>(args.k));
|
||||
tuner.AddArgumentInput(alpha_buffer);
|
||||
tuner.AddArgumentInput(beta_buffer);
|
||||
tuner.AddArgumentScalar(GetRealArg(args.alpha));
|
||||
tuner.AddArgumentScalar(GetRealArg(args.beta));
|
||||
tuner.AddArgumentInput(a_mat);
|
||||
tuner.AddArgumentInput(b_mat);
|
||||
tuner.AddArgumentOutput(c_mat);
|
||||
|
|
|
@ -96,13 +96,11 @@ class TuneXgemv {
|
|||
std::vector<T> &x_vec, std::vector<T> &y_vec,
|
||||
std::vector<T> &a_mat, std::vector<T> &, std::vector<T> &,
|
||||
std::vector<T> &) {
|
||||
auto alpha_buffer = std::vector<T>{args.alpha};
|
||||
auto beta_buffer = std::vector<T>{args.beta};
|
||||
auto a_rotated = (V==3) ? 1 : 0;
|
||||
tuner.AddArgumentScalar(static_cast<int>(args.m));
|
||||
tuner.AddArgumentScalar(static_cast<int>(args.n));
|
||||
tuner.AddArgumentInput(alpha_buffer);
|
||||
tuner.AddArgumentInput(beta_buffer);
|
||||
tuner.AddArgumentScalar(GetRealArg(args.alpha));
|
||||
tuner.AddArgumentScalar(GetRealArg(args.beta));
|
||||
tuner.AddArgumentScalar(static_cast<int>(a_rotated));
|
||||
tuner.AddArgumentInput(a_mat);
|
||||
tuner.AddArgumentScalar(0);
|
||||
|
|
|
@ -85,10 +85,9 @@ class TuneXger {
|
|||
std::vector<T> &x_vec, std::vector<T> &y_vec,
|
||||
std::vector<T> &a_mat, std::vector<T> &, std::vector<T> &,
|
||||
std::vector<T> &) {
|
||||
auto alpha_buffer = std::vector<T>{args.alpha};
|
||||
tuner.AddArgumentScalar(static_cast<int>(args.m));
|
||||
tuner.AddArgumentScalar(static_cast<int>(args.n));
|
||||
tuner.AddArgumentInput(alpha_buffer);
|
||||
tuner.AddArgumentScalar(GetRealArg(args.alpha));
|
||||
tuner.AddArgumentInput(x_vec);
|
||||
tuner.AddArgumentScalar(0); // x_offset
|
||||
tuner.AddArgumentScalar(1); // x_increment
|
||||
|
|
|
@ -332,6 +332,14 @@ void FloatToHalfBuffer(Buffer<half>& result, const Buffer<float>& source, cl_com
|
|||
result.Write(queue, size, result_cpu);
|
||||
}
|
||||
|
||||
// Converts a 'real' value to a 'real argument' value to be passed to a kernel. Normally there is
|
||||
// no conversion, but half-precision is not supported as kernel argument so it is converted to float.
|
||||
template <> typename RealArg<half>::Type GetRealArg(const half value) { return HalfToFloat(value); }
|
||||
template <> typename RealArg<float>::Type GetRealArg(const float value) { return value; }
|
||||
template <> typename RealArg<double>::Type GetRealArg(const double value) { return value; }
|
||||
template <> typename RealArg<float2>::Type GetRealArg(const float2 value) { return value; }
|
||||
template <> typename RealArg<double2>::Type GetRealArg(const double2 value) { return value; }
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Rounding functions performing ceiling and division operations
|
||||
|
|
|
@ -227,6 +227,12 @@ void FloatToHalfBuffer(std::vector<half>& result, const std::vector<float>& sour
|
|||
Buffer<float> HalfToFloatBuffer(const Buffer<half>& source, cl_command_queue queue_raw);
|
||||
void FloatToHalfBuffer(Buffer<half>& result, const Buffer<float>& source, cl_command_queue queue_raw);
|
||||
|
||||
// Converts a 'real' value to a 'real argument' value to be passed to a kernel. Normally there is
|
||||
// no conversion, but half-precision is not supported as kernel argument so it is converted to float.
|
||||
template <typename T> struct RealArg { using Type = T; };
|
||||
template <> struct RealArg<half> { using Type = float; };
|
||||
template <typename T> typename RealArg<T>::Type GetRealArg(const T value);
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Rounding functions
|
||||
|
|
Loading…
Reference in a new issue