From c87e877bf23d2fe38a7da2898e1734a3cdeaf48c Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 10 Jul 2016 20:32:01 +0200 Subject: [PATCH] Now passing alpha/beta to the kernel as arguments as before fp16 support; in case of fp16 arguments are cast on host and in kernel --- CHANGELOG | 1 + src/kernels/common.opencl | 10 ++++++++++ src/kernels/level1/xaxpy.opencl | 8 ++++---- src/kernels/level2/xgemv.opencl | 8 ++++---- src/kernels/level2/xgemv_fast.opencl | 16 ++++++++-------- src/kernels/level2/xger.opencl | 4 ++-- src/kernels/level2/xher.opencl | 4 ++-- src/kernels/level2/xher2.opencl | 4 ++-- src/kernels/level3/copy_fast.opencl | 4 ++-- src/kernels/level3/copy_pad.opencl | 8 ++++---- src/kernels/level3/transpose_fast.opencl | 4 ++-- src/kernels/level3/transpose_pad.opencl | 8 ++++---- src/kernels/level3/xgemm_part2.opencl | 24 ++++++++++++------------ src/routines/common.hpp | 8 ++------ src/routines/level1/xaxpy.cpp | 8 ++------ src/routines/level2/xgemv.cpp | 10 ++-------- src/routines/level2/xger.cpp | 6 +----- src/routines/level2/xher.cpp | 6 +----- src/routines/level2/xher2.cpp | 6 +----- src/routines/level3/xgemm.cpp | 10 ++-------- src/routines/level3/xher2k.cpp | 16 +++++----------- src/routines/level3/xherk.cpp | 10 +++------- src/routines/level3/xsyr2k.cpp | 13 +++---------- src/routines/level3/xsyrk.cpp | 10 ++-------- src/tuning/kernels/copy_fast.cpp | 3 +-- src/tuning/kernels/copy_pad.cpp | 3 +-- src/tuning/kernels/transpose_fast.cpp | 3 +-- src/tuning/kernels/transpose_pad.cpp | 3 +-- src/tuning/kernels/xaxpy.cpp | 3 +-- src/tuning/kernels/xgemm.cpp | 6 ++---- src/tuning/kernels/xgemv.cpp | 6 ++---- src/tuning/kernels/xger.cpp | 3 +-- src/utilities.cpp | 8 ++++++++ src/utilities.hpp | 6 ++++++ 34 files changed, 105 insertions(+), 145 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 248db397..b6e09102 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -5,6 +5,7 @@ Development version (next release) - Fixed memory leaks related to events not being released - Fixed a bug with a size_t and cl_ulong mismatch on 32-bit systems - Fixed a bug related to the cache and retrieval of programs based on the OpenCL context +- Fixed a performance issue (caused by fp16 support) by optimizing alpha/beta parameter passing to kernels - Added an option (-warm_up) to do a warm-up run before timing in the performance clients - Added tuned parameters for various devices (see README) diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl index 08c47d87..9d2bb65e 100644 --- a/src/kernels/common.opencl +++ b/src/kernels/common.opencl @@ -109,6 +109,16 @@ R"( typedef real singlereal; #endif +// Converts a 'real argument' value to a 'real' value as passed to the kernel. Normally there is no +// conversion, but half-precision is not supported as kernel argument so it is converted from float. +#if PRECISION == 16 + typedef float real_arg; + #define GetRealArg(x) (half)x +#else + typedef real real_arg; + #define GetRealArg(x) x +#endif + // ================================================================================================= // Don't use the non-IEEE754 compliant OpenCL built-in mad() instruction per default. For specific diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl index e0efadc1..d533041b 100644 --- a/src/kernels/level1/xaxpy.opencl +++ b/src/kernels/level1/xaxpy.opencl @@ -23,10 +23,10 @@ R"( // Full version of the kernel with offsets and strided accesses __attribute__((reqd_work_group_size(WGS, 1, 1))) -__kernel void Xaxpy(const int n, const __constant real* restrict arg_alpha, +__kernel void Xaxpy(const int n, const real_arg arg_alpha, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* ygm, const int y_offset, const int y_inc) { - const real alpha = arg_alpha[0]; + const real alpha = GetRealArg(arg_alpha); // Loops over the work that needs to be done (allows for an arbitrary number of threads) #pragma unroll @@ -41,10 +41,10 @@ __kernel void Xaxpy(const int n, const __constant real* restrict arg_alpha, // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is // dividable by 'VW', 'WGS' and 'WPT'. __attribute__((reqd_work_group_size(WGS, 1, 1))) -__kernel void XaxpyFast(const int n, const __constant real* restrict arg_alpha, +__kernel void XaxpyFast(const int n, const real_arg arg_alpha, const __global realV* restrict xgm, __global realV* ygm) { - const real alpha = arg_alpha[0]; + const real alpha = GetRealArg(arg_alpha); #pragma unroll for (int w=0; w 'do_conjugate' is 0 __attribute__((reqd_work_group_size(WGS2, 1, 1))) __kernel void XgemvFast(const int m, const int n, - const __constant real* restrict arg_alpha, - const __constant real* restrict arg_beta, + const real_arg arg_alpha, + const real_arg arg_beta, const int a_rotated, const __global realVF* restrict agm, const int a_offset, const int a_ld, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* ygm, const int y_offset, const int y_inc, const int do_conjugate, const int parameter, const int kl, const int ku) { - const real alpha = arg_alpha[0]; - const real beta = arg_beta[0]; + const real alpha = GetRealArg(arg_alpha); + const real beta = GetRealArg(arg_beta); // Local memory for the vector X __local real xlm[WGS2]; @@ -198,16 +198,16 @@ __kernel void XgemvFast(const int m, const int n, // --> 'do_conjugate' is 0 __attribute__((reqd_work_group_size(WGS3, 1, 1))) __kernel void XgemvFastRot(const int m, const int n, - const __constant real* restrict arg_alpha, - const __constant real* restrict arg_beta, + const real_arg arg_alpha, + const real_arg arg_beta, const int a_rotated, const __global realVFR* restrict agm, const int a_offset, const int a_ld, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* ygm, const int y_offset, const int y_inc, const int do_conjugate, const int parameter, const int kl, const int ku) { - const real alpha = arg_alpha[0]; - const real beta = arg_beta[0]; + const real alpha = GetRealArg(arg_alpha); + const real beta = GetRealArg(arg_beta); // Local memory for the vector X __local real xlm[WGS3]; diff --git a/src/kernels/level2/xger.opencl b/src/kernels/level2/xger.opencl index 63817afb..f218a346 100644 --- a/src/kernels/level2/xger.opencl +++ b/src/kernels/level2/xger.opencl @@ -20,12 +20,12 @@ R"( // Regular version of the rank-1 matrix update kernel (GER, GERU, GERC) __attribute__((reqd_work_group_size(WGS1, WGS2, 1))) __kernel void Xger(const int max1, const int max2, - const __constant real* restrict arg_alpha, + const real_arg arg_alpha, const __global real* restrict xgm, const int x_offset, const int x_inc, const __global real* ygm, const int y_offset, const int y_inc, __global real* restrict agm, const int a_offset, const int a_ld, const int is_rowmajor) { - const real alpha = arg_alpha[0]; + const real alpha = GetRealArg(arg_alpha); // Register storage for X and Y real xvalues[WPT]; diff --git a/src/kernels/level2/xher.opencl b/src/kernels/level2/xher.opencl index fc635f2e..1200ee63 100644 --- a/src/kernels/level2/xher.opencl +++ b/src/kernels/level2/xher.opencl @@ -20,11 +20,11 @@ R"( // Symmetric version of the rank-1 matrix update kernel (HER, HPR, SYR, SPR) __attribute__((reqd_work_group_size(WGS1, WGS2, 1))) __kernel void Xher(const int n, - const __constant real* restrict arg_alpha, + const real_arg arg_alpha, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* restrict agm, const int a_offset, const int a_ld, const int is_upper, const int is_rowmajor) { - const real alpha = arg_alpha[0]; + const real alpha = GetRealArg(arg_alpha); // Register storage for X and XT real xvalues[WPT]; diff --git a/src/kernels/level2/xher2.opencl b/src/kernels/level2/xher2.opencl index a66f255f..d0f41571 100644 --- a/src/kernels/level2/xher2.opencl +++ b/src/kernels/level2/xher2.opencl @@ -20,12 +20,12 @@ R"( // Symmetric version of the rank-2 matrix update kernel (HER2, HPR2, SYR2, SPR2) __attribute__((reqd_work_group_size(WGS1, WGS2, 1))) __kernel void Xher2(const int n, - const __constant real* restrict arg_alpha, + const real_arg arg_alpha, const __global real* restrict xgm, const int x_offset, const int x_inc, const __global real* restrict ygm, const int y_offset, const int y_inc, __global real* restrict agm, const int a_offset, const int a_ld, const int is_upper, const int is_rowmajor) { - const real alpha = arg_alpha[0]; + const real alpha = GetRealArg(arg_alpha); // Register storage for X and Y real xvalues[WPT]; diff --git a/src/kernels/level3/copy_fast.opencl b/src/kernels/level3/copy_fast.opencl index 09e54e6d..dd975bf1 100644 --- a/src/kernels/level3/copy_fast.opencl +++ b/src/kernels/level3/copy_fast.opencl @@ -39,8 +39,8 @@ __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) __kernel void CopyMatrixFast(const int ld, __global const realC* restrict src, __global realC* dest, - const __constant real* restrict arg_alpha) { - const real alpha = arg_alpha[0]; + const real_arg arg_alpha) { + const real alpha = GetRealArg(arg_alpha); #pragma unroll for (int w_one=0; w_one GetGroupID0()*MWG) { @@ -354,13 +354,13 @@ __kernel void XgemmLower(const int kSizeN, const int kSizeK, // Main entry point of the kernel. This is the regular full version. __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) __kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK, - const __constant real* restrict arg_alpha, - const __constant real* restrict arg_beta, + const real_arg arg_alpha, + const real_arg arg_beta, const __global realM* restrict agm, const __global realN* restrict bgm, __global realM* cgm) { - const real alpha = arg_alpha[0]; - const real beta = arg_beta[0]; + const real alpha = GetRealArg(arg_alpha); + const real beta = GetRealArg(arg_beta); // Allocates workgroup-private memory (local memory) #if SA == 1 diff --git a/src/routines/common.hpp b/src/routines/common.hpp index c99cd39d..e624a2b1 100644 --- a/src/routines/common.hpp +++ b/src/routines/common.hpp @@ -88,10 +88,6 @@ StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Cont } } - // Upload the scalar argument as a constant buffer to the device (needed for half-precision) - auto alpha_buffer = Buffer(context, 1); - alpha_buffer.Write(queue, 1, &alpha); - // Retrieves the kernel from the compiled binary try { auto kernel = Kernel(program, kernel_name); @@ -101,7 +97,7 @@ StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Cont kernel.SetArgument(0, static_cast(src_ld)); kernel.SetArgument(1, src()); kernel.SetArgument(2, dest()); - kernel.SetArgument(3, alpha_buffer()); + kernel.SetArgument(3, GetRealArg(alpha)); } else { kernel.SetArgument(0, static_cast(src_one)); @@ -114,7 +110,7 @@ StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Cont kernel.SetArgument(7, static_cast(dest_ld)); kernel.SetArgument(8, static_cast(dest_offset)); kernel.SetArgument(9, dest()); - kernel.SetArgument(10, alpha_buffer()); + kernel.SetArgument(10, GetRealArg(alpha)); if (do_pad) { kernel.SetArgument(11, static_cast(do_conjugate)); } diff --git a/src/routines/level1/xaxpy.cpp b/src/routines/level1/xaxpy.cpp index 5b6c9e77..3445e2b5 100644 --- a/src/routines/level1/xaxpy.cpp +++ b/src/routines/level1/xaxpy.cpp @@ -59,20 +59,16 @@ StatusCode Xaxpy::DoAxpy(const size_t n, const T alpha, const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); auto kernel = Kernel(program, kernel_name); - // Upload the scalar argument as a constant buffer to the device (needed for half-precision) - auto alpha_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - // Sets the kernel arguments if (use_fast_kernel) { kernel.SetArgument(0, static_cast(n)); - kernel.SetArgument(1, alpha_buffer()); + kernel.SetArgument(1, GetRealArg(alpha)); kernel.SetArgument(2, x_buffer()); kernel.SetArgument(3, y_buffer()); } else { kernel.SetArgument(0, static_cast(n)); - kernel.SetArgument(1, alpha_buffer()); + kernel.SetArgument(1, GetRealArg(alpha)); kernel.SetArgument(2, x_buffer()); kernel.SetArgument(3, static_cast(x_offset)); kernel.SetArgument(4, static_cast(x_inc)); diff --git a/src/routines/level2/xgemv.cpp b/src/routines/level2/xgemv.cpp index 21fb397c..2842ef07 100644 --- a/src/routines/level2/xgemv.cpp +++ b/src/routines/level2/xgemv.cpp @@ -126,12 +126,6 @@ StatusCode Xgemv::MatVec(const Layout layout, const Transpose a_transpose, local_size = db_["WGS3"]; } - // Upload the scalar arguments as constant buffers to the device (needed for half-precision) - auto alpha_buffer = Buffer(context_, 1); - auto beta_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - beta_buffer.Write(queue_, 1, &beta); - // Retrieves the Xgemv kernel from the compiled binary try { const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); @@ -140,8 +134,8 @@ StatusCode Xgemv::MatVec(const Layout layout, const Transpose a_transpose, // Sets the kernel arguments kernel.SetArgument(0, static_cast(m_real)); kernel.SetArgument(1, static_cast(n_real)); - kernel.SetArgument(2, alpha_buffer()); - kernel.SetArgument(3, beta_buffer()); + kernel.SetArgument(2, GetRealArg(alpha)); + kernel.SetArgument(3, GetRealArg(beta)); kernel.SetArgument(4, static_cast(a_rotated)); kernel.SetArgument(5, a_buffer()); kernel.SetArgument(6, static_cast(a_offset)); diff --git a/src/routines/level2/xger.cpp b/src/routines/level2/xger.cpp index 353047d2..29cffe0c 100644 --- a/src/routines/level2/xger.cpp +++ b/src/routines/level2/xger.cpp @@ -56,10 +56,6 @@ StatusCode Xger::DoGer(const Layout layout, status = TestVectorY(n, y_buffer, y_offset, y_inc); if (ErrorIn(status)) { return status; } - // Upload the scalar argument as a constant buffer to the device (needed for half-precision) - auto alpha_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - // Retrieves the kernel from the compiled binary try { const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); @@ -68,7 +64,7 @@ StatusCode Xger::DoGer(const Layout layout, // Sets the kernel arguments kernel.SetArgument(0, static_cast(a_one)); kernel.SetArgument(1, static_cast(a_two)); - kernel.SetArgument(2, alpha_buffer()); + kernel.SetArgument(2, GetRealArg(alpha)); kernel.SetArgument(3, x_buffer()); kernel.SetArgument(4, static_cast(x_offset)); kernel.SetArgument(5, static_cast(x_inc)); diff --git a/src/routines/level2/xher.cpp b/src/routines/level2/xher.cpp index ed8ba9e9..6dd95938 100644 --- a/src/routines/level2/xher.cpp +++ b/src/routines/level2/xher.cpp @@ -70,10 +70,6 @@ StatusCode Xher::DoHer(const Layout layout, const Triangle triangle, // Creates a matching version of alpha const auto matching_alpha = GetAlpha(alpha); - // Upload the scalar argument as a constant buffer to the device (needed for half-precision) - auto alpha_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &matching_alpha); - // Retrieves the kernel from the compiled binary try { const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); @@ -81,7 +77,7 @@ StatusCode Xher::DoHer(const Layout layout, const Triangle triangle, // Sets the kernel arguments kernel.SetArgument(0, static_cast(n)); - kernel.SetArgument(1, alpha_buffer()); + kernel.SetArgument(1, GetRealArg(matching_alpha)); kernel.SetArgument(2, x_buffer()); kernel.SetArgument(3, static_cast(x_offset)); kernel.SetArgument(4, static_cast(x_inc)); diff --git a/src/routines/level2/xher2.cpp b/src/routines/level2/xher2.cpp index 50572cea..3d57a9b9 100644 --- a/src/routines/level2/xher2.cpp +++ b/src/routines/level2/xher2.cpp @@ -58,10 +58,6 @@ StatusCode Xher2::DoHer2(const Layout layout, const Triangle triangle, status = TestVectorY(n, y_buffer, y_offset, y_inc); if (ErrorIn(status)) { return status; } - // Upload the scalar argument as a constant buffer to the device (needed for half-precision) - auto alpha_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - // Retrieves the kernel from the compiled binary try { const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); @@ -69,7 +65,7 @@ StatusCode Xher2::DoHer2(const Layout layout, const Triangle triangle, // Sets the kernel arguments kernel.SetArgument(0, static_cast(n)); - kernel.SetArgument(1, alpha_buffer()); + kernel.SetArgument(1, GetRealArg(alpha)); kernel.SetArgument(2, x_buffer()); kernel.SetArgument(3, static_cast(x_offset)); kernel.SetArgument(4, static_cast(x_inc)); diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp index 9ea5559c..97e8db7e 100644 --- a/src/routines/level3/xgemm.cpp +++ b/src/routines/level3/xgemm.cpp @@ -118,12 +118,6 @@ StatusCode Xgemm::DoGemm(const Layout layout, const auto b_temp = (b_no_temp) ? b_buffer : Buffer(context_, k_ceiled*n_ceiled); const auto c_temp = (c_no_temp) ? c_buffer : Buffer(context_, m_ceiled*n_ceiled); - // Upload the scalar arguments as constant buffers to the device (needed for half-precision) - auto alpha_buffer = Buffer(context_, 1); - auto beta_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - beta_buffer.Write(queue_, 1, &beta); - // Events of all kernels (including pre/post processing kernels) auto eventWaitList = std::vector(); auto emptyEventList = std::vector(); @@ -174,8 +168,8 @@ StatusCode Xgemm::DoGemm(const Layout layout, kernel.SetArgument(0, static_cast(m_ceiled)); kernel.SetArgument(1, static_cast(n_ceiled)); kernel.SetArgument(2, static_cast(k_ceiled)); - kernel.SetArgument(3, alpha_buffer()); - kernel.SetArgument(4, beta_buffer()); + kernel.SetArgument(3, GetRealArg(alpha)); + kernel.SetArgument(4, GetRealArg(beta)); kernel.SetArgument(5, a_temp()); kernel.SetArgument(6, b_temp()); kernel.SetArgument(7, c_temp()); diff --git a/src/routines/level3/xher2k.cpp b/src/routines/level3/xher2k.cpp index bd7a053e..65e2be55 100644 --- a/src/routines/level3/xher2k.cpp +++ b/src/routines/level3/xher2k.cpp @@ -107,12 +107,8 @@ StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, co auto b2_temp = (b2_no_temp) ? b_buffer : Buffer(context_, k_ceiled*n_ceiled); auto c_temp = Buffer(context_, n_ceiled*n_ceiled); - // Upload the scalar arguments as constant buffers to the device (needed for half-precision) + // Convert the arguments to complex versions auto complex_beta = T{beta, static_cast(0.0)}; - auto alpha_buffer = Buffer(context_, 1); - auto beta_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - beta_buffer.Write(queue_, 1, &complex_beta); // Events of all kernels (including pre/post processing kernels) auto eventWaitList = std::vector(); @@ -180,8 +176,8 @@ StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, co // Sets the kernel arguments kernel.SetArgument(0, static_cast(n_ceiled)); kernel.SetArgument(1, static_cast(k_ceiled)); - kernel.SetArgument(2, alpha_buffer()); - kernel.SetArgument(3, beta_buffer()); + kernel.SetArgument(2, GetRealArg(alpha)); + kernel.SetArgument(3, GetRealArg(complex_beta)); kernel.SetArgument(4, a1_temp()); kernel.SetArgument(5, b2_temp()); kernel.SetArgument(6, c_temp()); @@ -202,10 +198,8 @@ StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, co // Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha auto conjugate_alpha = T{alpha.real(), -alpha.imag()}; auto complex_one = T{static_cast(1.0), static_cast(0.0)}; - alpha_buffer.Write(queue_, 1, &conjugate_alpha); - beta_buffer.Write(queue_, 1, &complex_one); - kernel.SetArgument(2, alpha_buffer()); - kernel.SetArgument(3, beta_buffer()); + kernel.SetArgument(2, GetRealArg(conjugate_alpha)); + kernel.SetArgument(3, GetRealArg(complex_one)); kernel.SetArgument(4, b1_temp()); kernel.SetArgument(5, a2_temp()); diff --git a/src/routines/level3/xherk.cpp b/src/routines/level3/xherk.cpp index 6ef7f21f..cc87e3e9 100644 --- a/src/routines/level3/xherk.cpp +++ b/src/routines/level3/xherk.cpp @@ -98,13 +98,9 @@ StatusCode Xherk::DoHerk(const Layout layout, const Triangle triangle, cons auto b_temp = (b_no_temp) ? a_buffer : Buffer(context_, k_ceiled*n_ceiled); auto c_temp = Buffer(context_, n_ceiled*n_ceiled); - // Upload the scalar arguments as constant buffers to the device (needed for half-precision) + // Convert the arguments to complex versions auto complex_alpha = T{alpha, static_cast(0.0)}; auto complex_beta = T{beta, static_cast(0.0)}; - auto alpha_buffer = Buffer(context_, 1); - auto beta_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &complex_alpha); - beta_buffer.Write(queue_, 1, &complex_beta); // Events of all kernels (including pre/post processing kernels) auto eventWaitList = std::vector(); @@ -152,8 +148,8 @@ StatusCode Xherk::DoHerk(const Layout layout, const Triangle triangle, cons // Sets the kernel arguments kernel.SetArgument(0, static_cast(n_ceiled)); kernel.SetArgument(1, static_cast(k_ceiled)); - kernel.SetArgument(2, alpha_buffer()); - kernel.SetArgument(3, beta_buffer()); + kernel.SetArgument(2, GetRealArg(complex_alpha)); + kernel.SetArgument(3, GetRealArg(complex_beta)); kernel.SetArgument(4, a_temp()); kernel.SetArgument(5, b_temp()); kernel.SetArgument(6, c_temp()); diff --git a/src/routines/level3/xsyr2k.cpp b/src/routines/level3/xsyr2k.cpp index 424d4d2d..18a1eac7 100644 --- a/src/routines/level3/xsyr2k.cpp +++ b/src/routines/level3/xsyr2k.cpp @@ -97,12 +97,6 @@ StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, cons auto b_temp = (b_no_temp) ? b_buffer : Buffer(context_, k_ceiled*n_ceiled); auto c_temp = Buffer(context_, n_ceiled*n_ceiled); - // Upload the scalar arguments as constant buffers to the device (needed for half-precision) - auto alpha_buffer = Buffer(context_, 1); - auto beta_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - beta_buffer.Write(queue_, 1, &beta); - // Events of all kernels (including pre/post processing kernels) auto eventWaitList = std::vector(); auto emptyEventList = std::vector(); @@ -149,8 +143,8 @@ StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, cons // Sets the kernel arguments kernel.SetArgument(0, static_cast(n_ceiled)); kernel.SetArgument(1, static_cast(k_ceiled)); - kernel.SetArgument(2, alpha_buffer()); - kernel.SetArgument(3, beta_buffer()); + kernel.SetArgument(2, GetRealArg(alpha)); + kernel.SetArgument(3, GetRealArg(beta)); kernel.SetArgument(4, a_temp()); kernel.SetArgument(5, b_temp()); kernel.SetArgument(6, c_temp()); @@ -170,8 +164,7 @@ StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, cons // Swaps the arguments for matrices A and B, and sets 'beta' to 1 auto one = static_cast(1); - beta_buffer.Write(queue_, 1, &one); - kernel.SetArgument(3, beta_buffer()); + kernel.SetArgument(3, GetRealArg(one)); kernel.SetArgument(4, b_temp()); kernel.SetArgument(5, a_temp()); diff --git a/src/routines/level3/xsyrk.cpp b/src/routines/level3/xsyrk.cpp index f56c232b..1992cec1 100644 --- a/src/routines/level3/xsyrk.cpp +++ b/src/routines/level3/xsyrk.cpp @@ -90,12 +90,6 @@ StatusCode Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, k_ceiled*n_ceiled); auto c_temp = Buffer(context_, n_ceiled*n_ceiled); - // Upload the scalar arguments as constant buffers to the device (needed for half-precision) - auto alpha_buffer = Buffer(context_, 1); - auto beta_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - beta_buffer.Write(queue_, 1, &beta); - // Events of all kernels (including pre/post processing kernels) auto eventWaitList = std::vector(); auto emptyEventList = std::vector(); @@ -132,8 +126,8 @@ StatusCode Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const // Sets the kernel arguments kernel.SetArgument(0, static_cast(n_ceiled)); kernel.SetArgument(1, static_cast(k_ceiled)); - kernel.SetArgument(2, alpha_buffer()); - kernel.SetArgument(3, beta_buffer()); + kernel.SetArgument(2, GetRealArg(alpha)); + kernel.SetArgument(3, GetRealArg(beta)); kernel.SetArgument(4, a_temp()); kernel.SetArgument(5, a_temp()); kernel.SetArgument(6, c_temp()); diff --git a/src/tuning/kernels/copy_fast.cpp b/src/tuning/kernels/copy_fast.cpp index 34269bc7..78ded56e 100644 --- a/src/tuning/kernels/copy_fast.cpp +++ b/src/tuning/kernels/copy_fast.cpp @@ -86,11 +86,10 @@ class TuneCopy { std::vector &, std::vector &, std::vector &a_mat, std::vector &b_mat, std::vector &, std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; tuner.AddArgumentScalar(static_cast(args.m)); tuner.AddArgumentInput(a_mat); tuner.AddArgumentOutput(b_mat); - tuner.AddArgumentInput(alpha_buffer); + tuner.AddArgumentScalar(GetRealArg(args.alpha)); } // Describes how to compute the performance metrics diff --git a/src/tuning/kernels/copy_pad.cpp b/src/tuning/kernels/copy_pad.cpp index 1e0dccd3..90f5ea82 100644 --- a/src/tuning/kernels/copy_pad.cpp +++ b/src/tuning/kernels/copy_pad.cpp @@ -86,7 +86,6 @@ class TunePad { std::vector &, std::vector &, std::vector &a_mat, std::vector &b_mat, std::vector &, std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; tuner.AddArgumentScalar(static_cast(args.m)); tuner.AddArgumentScalar(static_cast(args.n)); tuner.AddArgumentScalar(static_cast(args.m)); @@ -97,7 +96,7 @@ class TunePad { tuner.AddArgumentScalar(static_cast(args.m)); tuner.AddArgumentScalar(0); tuner.AddArgumentOutput(b_mat); - tuner.AddArgumentInput(alpha_buffer); + tuner.AddArgumentScalar(GetRealArg(args.alpha)); tuner.AddArgumentScalar(0); } diff --git a/src/tuning/kernels/transpose_fast.cpp b/src/tuning/kernels/transpose_fast.cpp index 7ac19cb6..10fa80cb 100644 --- a/src/tuning/kernels/transpose_fast.cpp +++ b/src/tuning/kernels/transpose_fast.cpp @@ -91,11 +91,10 @@ class TuneTranspose { std::vector &, std::vector &, std::vector &a_mat, std::vector &b_mat, std::vector &, std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; tuner.AddArgumentScalar(static_cast(args.m)); tuner.AddArgumentInput(a_mat); tuner.AddArgumentOutput(b_mat); - tuner.AddArgumentInput(alpha_buffer); + tuner.AddArgumentScalar(GetRealArg(args.alpha)); } // Describes how to compute the performance metrics diff --git a/src/tuning/kernels/transpose_pad.cpp b/src/tuning/kernels/transpose_pad.cpp index 63274415..507718eb 100644 --- a/src/tuning/kernels/transpose_pad.cpp +++ b/src/tuning/kernels/transpose_pad.cpp @@ -90,7 +90,6 @@ class TunePadTranspose { std::vector &, std::vector &, std::vector &a_mat, std::vector &b_mat, std::vector &, std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; tuner.AddArgumentScalar(static_cast(args.m)); tuner.AddArgumentScalar(static_cast(args.n)); tuner.AddArgumentScalar(static_cast(args.m)); @@ -101,7 +100,7 @@ class TunePadTranspose { tuner.AddArgumentScalar(static_cast(args.n)); tuner.AddArgumentScalar(0); tuner.AddArgumentOutput(b_mat); - tuner.AddArgumentInput(alpha_buffer); + tuner.AddArgumentScalar(GetRealArg(args.alpha)); tuner.AddArgumentScalar(0); } diff --git a/src/tuning/kernels/xaxpy.cpp b/src/tuning/kernels/xaxpy.cpp index 88d12c1f..0033b3c6 100644 --- a/src/tuning/kernels/xaxpy.cpp +++ b/src/tuning/kernels/xaxpy.cpp @@ -89,9 +89,8 @@ class TuneXaxpy { std::vector &x_vec, std::vector &y_vec, std::vector &, std::vector &, std::vector &, std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentInput(alpha_buffer); + tuner.AddArgumentScalar(GetRealArg(args.alpha)); tuner.AddArgumentInput(x_vec); tuner.AddArgumentOutput(y_vec); } diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp index 4b1efdef..898b8435 100644 --- a/src/tuning/kernels/xgemm.cpp +++ b/src/tuning/kernels/xgemm.cpp @@ -121,13 +121,11 @@ class TuneXgemm { std::vector &, std::vector &, std::vector &a_mat, std::vector &b_mat, std::vector &c_mat, std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; - auto beta_buffer = std::vector{args.beta}; tuner.AddArgumentScalar(static_cast(args.m)); tuner.AddArgumentScalar(static_cast(args.n)); tuner.AddArgumentScalar(static_cast(args.k)); - tuner.AddArgumentInput(alpha_buffer); - tuner.AddArgumentInput(beta_buffer); + tuner.AddArgumentScalar(GetRealArg(args.alpha)); + tuner.AddArgumentScalar(GetRealArg(args.beta)); tuner.AddArgumentInput(a_mat); tuner.AddArgumentInput(b_mat); tuner.AddArgumentOutput(c_mat); diff --git a/src/tuning/kernels/xgemv.cpp b/src/tuning/kernels/xgemv.cpp index d42155ae..5c187d33 100644 --- a/src/tuning/kernels/xgemv.cpp +++ b/src/tuning/kernels/xgemv.cpp @@ -96,13 +96,11 @@ class TuneXgemv { std::vector &x_vec, std::vector &y_vec, std::vector &a_mat, std::vector &, std::vector &, std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; - auto beta_buffer = std::vector{args.beta}; auto a_rotated = (V==3) ? 1 : 0; tuner.AddArgumentScalar(static_cast(args.m)); tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentInput(alpha_buffer); - tuner.AddArgumentInput(beta_buffer); + tuner.AddArgumentScalar(GetRealArg(args.alpha)); + tuner.AddArgumentScalar(GetRealArg(args.beta)); tuner.AddArgumentScalar(static_cast(a_rotated)); tuner.AddArgumentInput(a_mat); tuner.AddArgumentScalar(0); diff --git a/src/tuning/kernels/xger.cpp b/src/tuning/kernels/xger.cpp index d2590c53..1fb5c531 100644 --- a/src/tuning/kernels/xger.cpp +++ b/src/tuning/kernels/xger.cpp @@ -85,10 +85,9 @@ class TuneXger { std::vector &x_vec, std::vector &y_vec, std::vector &a_mat, std::vector &, std::vector &, std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; tuner.AddArgumentScalar(static_cast(args.m)); tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentInput(alpha_buffer); + tuner.AddArgumentScalar(GetRealArg(args.alpha)); tuner.AddArgumentInput(x_vec); tuner.AddArgumentScalar(0); // x_offset tuner.AddArgumentScalar(1); // x_increment diff --git a/src/utilities.cpp b/src/utilities.cpp index 68e480c5..11a6c439 100644 --- a/src/utilities.cpp +++ b/src/utilities.cpp @@ -332,6 +332,14 @@ void FloatToHalfBuffer(Buffer& result, const Buffer& source, cl_com result.Write(queue, size, result_cpu); } +// Converts a 'real' value to a 'real argument' value to be passed to a kernel. Normally there is +// no conversion, but half-precision is not supported as kernel argument so it is converted to float. +template <> typename RealArg::Type GetRealArg(const half value) { return HalfToFloat(value); } +template <> typename RealArg::Type GetRealArg(const float value) { return value; } +template <> typename RealArg::Type GetRealArg(const double value) { return value; } +template <> typename RealArg::Type GetRealArg(const float2 value) { return value; } +template <> typename RealArg::Type GetRealArg(const double2 value) { return value; } + // ================================================================================================= // Rounding functions performing ceiling and division operations diff --git a/src/utilities.hpp b/src/utilities.hpp index d5efab9f..700d30d6 100644 --- a/src/utilities.hpp +++ b/src/utilities.hpp @@ -227,6 +227,12 @@ void FloatToHalfBuffer(std::vector& result, const std::vector& sour Buffer HalfToFloatBuffer(const Buffer& source, cl_command_queue queue_raw); void FloatToHalfBuffer(Buffer& result, const Buffer& source, cl_command_queue queue_raw); +// Converts a 'real' value to a 'real argument' value to be passed to a kernel. Normally there is +// no conversion, but half-precision is not supported as kernel argument so it is converted to float. +template struct RealArg { using Type = T; }; +template <> struct RealArg { using Type = float; }; +template typename RealArg::Type GetRealArg(const T value); + // ================================================================================================= // Rounding functions