diff --git a/CHANGELOG b/CHANGELOG index db14f037..c9770dc2 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -6,6 +6,8 @@ Development version (next release) - Fixed the use of events within the library - Added level-1 routines: * SNRM2/DNRM2/ScNRM2/DzNRM2 + * SASUM/DASUM/ScASUM/DzASUM + * iSAMAX/iDAMAX/iCAMAX/iZAMAX Version 0.6.0 - Added support for MSVC (Visual Studio) 2015 diff --git a/CMakeLists.txt b/CMakeLists.txt index a4eb5b85..efdf6be0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -121,7 +121,7 @@ include_directories(${clblast_SOURCE_DIR}/include ${OPENCL_INCLUDE_DIRS}) set(KERNELS copy pad transpose padtranspose xaxpy xdot xger xgemm xgemv) set(SAMPLE_PROGRAMS_CPP sgemm) set(SAMPLE_PROGRAMS_C sgemm) -set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2) +set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax) set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2) set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm) diff --git a/README.md b/README.md index 74d8c9cc..b4f0981f 100644 --- a/README.md +++ b/README.md @@ -185,8 +185,8 @@ CLBlast is in active development but already supports almost all the BLAS routin | xDOTU | - | - | ✔ | ✔ | | | xDOTC | - | - | ✔ | ✔ | | | xNRM2 | ✔ | ✔ | ✔ | ✔ | | -| xASUM | | | | | | -| IxAMAX | | | | | | +| xASUM | ✔ | ✔ | ✔ | ✔ | | +| IxAMAX | ✔ | ✔ | ✔ | ✔ | | | Level-2 | S | D | C | Z | Notes | | ---------|---|---|---|---|---------| diff --git a/include/clblast.h b/include/clblast.h index 431f2510..57948581 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -181,6 +181,20 @@ StatusCode Nrm2(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); +// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM +template +StatusCode Asum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Index of absolute maxium value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX +template +StatusCode Amax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event = nullptr); + // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= diff --git a/include/clblast_c.h b/include/clblast_c.h index f72cff3a..92f4afe5 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -278,6 +278,42 @@ StatusCode PUBLIC_API CLBlastDznrm2(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); +// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM +StatusCode PUBLIC_API CLBlastSasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastScasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDzasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + +// Index of absolute maxium value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX +StatusCode PUBLIC_API CLBlastiSamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastiDamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastiCamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastiZamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= diff --git a/include/internal/routines/level1/xamax.h b/include/internal/routines/level1/xamax.h new file mode 100644 index 00000000..b815e8d2 --- /dev/null +++ b/include/internal/routines/level1/xamax.h @@ -0,0 +1,56 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xamax routine. The precision is implemented using a template argument. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XAMAX_H_ +#define CLBLAST_ROUTINES_XAMAX_H_ + +#include "internal/routine.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xamax: public Routine { + public: + + // Members and methods from the base class + using Routine::db_; + using Routine::source_string_; + using Routine::queue_; + using Routine::event_; + using Routine::context_; + using Routine::GetProgramFromCache; + using Routine::TestVectorX; + using Routine::TestVectorDot; + using Routine::RunKernel; + using Routine::ErrorIn; + + // Constructor + Xamax(Queue &queue, EventPointer event, const std::string &name = "AMAX"); + + // Templated-precision implementation of the routine + StatusCode DoAmax(const size_t n, + const Buffer &imax_buffer, const size_t imax_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc); + + private: + // Static variable to get the precision + const static Precision precision_; +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XAMAX_H_ +#endif diff --git a/include/internal/routines/level1/xasum.h b/include/internal/routines/level1/xasum.h new file mode 100644 index 00000000..b6e5d2cd --- /dev/null +++ b/include/internal/routines/level1/xasum.h @@ -0,0 +1,56 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xasum routine. The precision is implemented using a template argument. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XASUM_H_ +#define CLBLAST_ROUTINES_XASUM_H_ + +#include "internal/routine.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xasum: public Routine { + public: + + // Members and methods from the base class + using Routine::db_; + using Routine::source_string_; + using Routine::queue_; + using Routine::event_; + using Routine::context_; + using Routine::GetProgramFromCache; + using Routine::TestVectorX; + using Routine::TestVectorDot; + using Routine::RunKernel; + using Routine::ErrorIn; + + // Constructor + Xasum(Queue &queue, EventPointer event, const std::string &name = "ASUM"); + + // Templated-precision implementation of the routine + StatusCode DoAsum(const size_t n, + const Buffer &asum_buffer, const size_t asum_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc); + + private: + // Static variable to get the precision + const static Precision precision_; +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XASUM_H_ +#endif diff --git a/include/internal/utilities.h b/include/internal/utilities.h index 6adc1d0a..75b3d27d 100644 --- a/include/internal/utilities.h +++ b/include/internal/utilities.h @@ -65,6 +65,8 @@ constexpr auto kArgCOffset = "offc"; constexpr auto kArgAPOffset = "offap"; constexpr auto kArgDotOffset = "offdot"; constexpr auto kArgNrm2Offset = "offnrm2"; +constexpr auto kArgAsumOffset = "offasum"; +constexpr auto kArgImaxOffset = "offimax"; constexpr auto kArgAlpha = "alpha"; constexpr auto kArgBeta = "beta"; @@ -119,6 +121,8 @@ struct Arguments { size_t ap_offset = 0; size_t dot_offset = 0; size_t nrm2_offset = 0; + size_t asum_offset = 0; + size_t imax_offset = 0; T alpha = T{1.0}; T beta = T{1.0}; size_t x_size = 1; diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 8cd35f95..d8bd4e2c 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -40,6 +40,10 @@ Z = DataType("Z", "Z", DBL2, [DBL2, DBL2, D2CL, D2CL], DBL2) # double-complex (6 # Special cases Sc = DataType("C", "Sc", FLT2, [FLT2, FLT2, FLT2, FLT2], FLT2) # As C, but with real output Dz = DataType("Z", "Dz", DBL2, [DBL2, DBL2, DBL2, DBL2], DBL2) # As Z, but with real output +iS = DataType("S", "iS", FLT, [FLT, FLT, FLT, FLT], FLT ) # As S, but with integer output +iD = DataType("D", "iD", DBL, [DBL, DBL, DBL, DBL], DBL ) # As D, but with integer output +iC = DataType("C", "iC", FLT2, [FLT2, FLT2, F2CL, F2CL], FLT2) # As C, but with integer output +iZ = DataType("Z", "iZ", DBL2, [DBL2, DBL2, D2CL, D2CL], DBL2) # As Z, but with integer output Css = DataType("C", "C", FLT, [FLT, FLT, FLT, FLT], FLT2) # As C, but with constants from S Zdd = DataType("Z", "Z", DBL, [DBL, DBL, DBL, DBL], DBL2) # As Z, but with constants from D Ccs = DataType("C", "C", FLT2+","+FLT, [FLT2, FLT, F2CL, FLT], FLT2) # As C, but with one constant from S @@ -67,6 +71,8 @@ routines = [ Routine(True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors"), Routine(True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated"), Routine(True, "1", "nrm2", T, [S,D,Sc,Dz],["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector"), + Routine(True, "1", "asum", T, [S,D,Sc,Dz],["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector"), + Routine(True, "1", "amax", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maxium value in a vector"), ], [ # Level 2: matrix-vector Routine(True, "2a", "gemv", T, [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication"), @@ -288,7 +294,7 @@ files = [ path_clblast+"/test/wrapper_clblas.h", path_clblast+"/test/wrapper_cblas.h", ] -header_lines = [84, 65, 93, 22, 22, 38] +header_lines = [84, 67, 93, 22, 22, 38] footer_lines = [6, 3, 9, 2, 6, 6] # Checks whether the command-line arguments are valid; exists otherwise @@ -368,7 +374,7 @@ for level in [1,2,3]: body += "using double2 = clblast::double2;\n\n" body += "// Main function (not within the clblast namespace)\n" body += "int main(int argc, char *argv[]) {\n" - default = PrecisionToFullName(routine.flavours[0].name) + default = PrecisionToFullName(routine.flavours[0].precision_name) body += " switch(clblast::GetPrecision(argc, argv, clblast::Precision::k"+default+")) {\n" for precision in ["H","S","D","C","Z"]: body += " case clblast::Precision::k"+PrecisionToFullName(precision)+":" diff --git a/scripts/generator/routine.py b/scripts/generator/routine.py index fffa19f6..9806d960 100644 --- a/scripts/generator/routine.py +++ b/scripts/generator/routine.py @@ -72,7 +72,7 @@ class Routine(): # List of scalar buffers def ScalarBuffersFirst(self): - return ["dot","nrm2"] + return ["dot","nrm2","asum","imax"] def ScalarBuffersSecond(self): return ["sa","sb","sc","ss","sd1","sd2","sx1","sy1","sparam"] diff --git a/src/clblast.cc b/src/clblast.cc index 4f4b6078..145b6bf6 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -27,6 +27,8 @@ #include "internal/routines/level1/xdotu.h" #include "internal/routines/level1/xdotc.h" #include "internal/routines/level1/xnrm2.h" +#include "internal/routines/level1/xasum.h" +#include "internal/routines/level1/xamax.h" // BLAS level-2 includes #include "internal/routines/level2/xgemv.h" @@ -396,6 +398,68 @@ template StatusCode PUBLIC_API Nrm2(const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM +template +StatusCode Asum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xasum(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoAsum(n, + Buffer(asum_buffer), asum_offset, + Buffer(x_buffer), x_offset, x_inc); +} +template StatusCode PUBLIC_API Asum(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Asum(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Asum(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Asum(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Index of absolute maxium value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX +template +StatusCode Amax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xamax(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoAmax(n, + Buffer(imax_buffer), imax_offset, + Buffer(x_buffer), x_offset, x_inc); +} +template StatusCode PUBLIC_API Amax(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Amax(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Amax(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Amax(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= diff --git a/src/clblast_c.cc b/src/clblast_c.cc index 23e97bd5..23c96feb 100644 --- a/src/clblast_c.cc +++ b/src/clblast_c.cc @@ -433,6 +433,90 @@ StatusCode CLBlastDznrm2(const size_t n, return static_cast(status); } +// ASUM +StatusCode CLBlastSasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Asum(n, + asum_buffer, asum_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Asum(n, + asum_buffer, asum_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastScasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Asum(n, + asum_buffer, asum_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDzasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Asum(n, + asum_buffer, asum_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} + +// AMAX +StatusCode CLBlastiSamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Amax(n, + imax_buffer, imax_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastiDamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Amax(n, + imax_buffer, imax_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastiCamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Amax(n, + imax_buffer, imax_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastiZamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Amax(n, + imax_buffer, imax_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} + // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl index f2a2e7a7..57d75ee0 100644 --- a/src/kernels/common.opencl +++ b/src/kernels/common.opencl @@ -80,6 +80,15 @@ R"( #define ONE 1.0 #endif +// Single-element version of a complex number +#if PRECISION == 3232 + typedef float singlereal; +#elif PRECISION == 6464 + typedef double singlereal; +#else + typedef real singlereal; +#endif + // ================================================================================================= // Don't use the non-IEEE754 compliant OpenCL built-in mad() instruction per default. For specific @@ -109,6 +118,13 @@ R"( #define SetToOne(a) a = ONE #endif +// The absolute value (component-wise) +#if PRECISION == 3232 || PRECISION == 6464 + #define AbsoluteValue(value) value.x = fabs(value.x); value.y = fabs(value.y) +#else + #define AbsoluteValue(value) value = fabs(value) +#endif + // Adds two complex variables #if PRECISION == 3232 || PRECISION == 6464 #define Add(c, a, b) c.x = a.x + b.x; c.y = a.y + b.y diff --git a/src/kernels/level1/xamax.opencl b/src/kernels/level1/xamax.opencl new file mode 100644 index 00000000..03dd05e5 --- /dev/null +++ b/src/kernels/level1/xamax.opencl @@ -0,0 +1,128 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains the Xamax kernel. It implements an index of absolute max computation using +// reduction kernels. Reduction is split in two parts. In the first (main) kernel the X vector is +// loaded, followed by a per-thread and a per-workgroup reduction. The second (epilogue) kernel +// is executed with a single workgroup only, computing the final result. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// Parameters set by the tuner or by the database. Here they are given a basic default value in case +// this kernel file is used outside of the CLBlast library. +#ifndef WGS1 + #define WGS1 64 // The local work-group size of the main kernel +#endif +#ifndef WGS2 + #define WGS2 64 // The local work-group size of the epilogue kernel +#endif + +// ================================================================================================= + +// The main reduction kernel, performing the loading and the majority of the operation +__attribute__((reqd_work_group_size(WGS1, 1, 1))) +__kernel void Xamax(const int n, + const __global real* restrict xgm, const int x_offset, const int x_inc, + __global singlereal* maxgm, __global unsigned int* imaxgm) { + __local singlereal maxlm[WGS1]; + __local unsigned int imaxlm[WGS1]; + const int lid = get_local_id(0); + const int wgid = get_group_id(0); + const int num_groups = get_num_groups(0); + + // Performs loading and the first steps of the reduction + singlereal max = ZERO; + unsigned int imax = 0; + int id = wgid*WGS1 + lid; + while (id < n) { + #if PRECISION == 3232 || PRECISION == 6464 + singlereal x = fabs(xgm[id*x_inc + x_offset].x); + #else + singlereal x = fabs(xgm[id*x_inc + x_offset]); + #endif + if (x >= max) { + max = x; + imax = id*x_inc + x_offset; + } + id += WGS1*num_groups; + } + maxlm[lid] = max; + imaxlm[lid] = imax; + barrier(CLK_LOCAL_MEM_FENCE); + + // Performs reduction in local memory + #pragma unroll + for (int s=WGS1/2; s>0; s=s>>1) { + if (lid < s) { + if (maxlm[lid + s] >= maxlm[lid]) { + maxlm[lid] = maxlm[lid + s]; + imaxlm[lid] = imaxlm[lid + s]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + // Stores the per-workgroup result + if (lid == 0) { + maxgm[wgid] = maxlm[0]; + imaxgm[wgid] = imaxlm[0]; + } +} + +// ================================================================================================= + +// The epilogue reduction kernel, performing the final bit of the operation. This kernel has to +// be launched with a single workgroup only. +__attribute__((reqd_work_group_size(WGS2, 1, 1))) +__kernel void XamaxEpilogue(const __global singlereal* restrict maxgm, + const __global unsigned int* restrict imaxgm, + __global unsigned int* imax, const int imax_offset) { + __local singlereal maxlm[WGS2]; + __local unsigned int imaxlm[WGS2]; + const int lid = get_local_id(0); + + // Performs the first step of the reduction while loading the data + if (maxgm[lid + WGS2] >= maxgm[lid]) { + maxlm[lid] = maxgm[lid + WGS2]; + imaxlm[lid] = imaxgm[lid + WGS2]; + } + else { + maxlm[lid] = maxgm[lid]; + imaxlm[lid] = imaxgm[lid]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + // Performs reduction in local memory + #pragma unroll + for (int s=WGS2/2; s>0; s=s>>1) { + if (lid < s) { + if (maxlm[lid + s] >= maxlm[lid]) { + maxlm[lid] = maxlm[lid + s]; + imaxlm[lid] = imaxlm[lid + s]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + // Stores the final result + if (lid == 0) { + imax[imax_offset] = imaxlm[0]; + } +} + +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/kernels/level1/xasum.opencl b/src/kernels/level1/xasum.opencl new file mode 100644 index 00000000..037dc57e --- /dev/null +++ b/src/kernels/level1/xasum.opencl @@ -0,0 +1,108 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains the Xasum kernel. It implements a absolute sum computation using reduction +// kernels. Reduction is split in two parts. In the first (main) kernel the X vector is loaded, +// followed by a per-thread and a per-workgroup reduction. The second (epilogue) kernel +// is executed with a single workgroup only, computing the final result. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// Parameters set by the tuner or by the database. Here they are given a basic default value in case +// this kernel file is used outside of the CLBlast library. +#ifndef WGS1 + #define WGS1 64 // The local work-group size of the main kernel +#endif +#ifndef WGS2 + #define WGS2 64 // The local work-group size of the epilogue kernel +#endif + +// ================================================================================================= + +// The main reduction kernel, performing the loading and the majority of the operation +__attribute__((reqd_work_group_size(WGS1, 1, 1))) +__kernel void Xasum(const int n, + const __global real* restrict xgm, const int x_offset, const int x_inc, + __global real* output) { + __local real lm[WGS1]; + const int lid = get_local_id(0); + const int wgid = get_group_id(0); + const int num_groups = get_num_groups(0); + + // Performs loading and the first steps of the reduction + real acc; + SetToZero(acc); + int id = wgid*WGS1 + lid; + while (id < n) { + real x = xgm[id*x_inc + x_offset]; + AbsoluteValue(x); + Add(acc, acc, x); + id += WGS1*num_groups; + } + lm[lid] = acc; + barrier(CLK_LOCAL_MEM_FENCE); + + // Performs reduction in local memory + #pragma unroll + for (int s=WGS1/2; s>0; s=s>>1) { + if (lid < s) { + Add(lm[lid], lm[lid], lm[lid + s]); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + // Stores the per-workgroup result + if (lid == 0) { + output[wgid] = lm[0]; + } +} + +// ================================================================================================= + +// The epilogue reduction kernel, performing the final bit of the operation. This kernel has to +// be launched with a single workgroup only. +__attribute__((reqd_work_group_size(WGS2, 1, 1))) +__kernel void XasumEpilogue(const __global real* restrict input, + __global real* asum, const int asum_offset) { + __local real lm[WGS2]; + const int lid = get_local_id(0); + + // Performs the first step of the reduction while loading the data + Add(lm[lid], input[lid], input[lid + WGS2]); + barrier(CLK_LOCAL_MEM_FENCE); + + // Performs reduction in local memory + #pragma unroll + for (int s=WGS2/2; s>0; s=s>>1) { + if (lid < s) { + Add(lm[lid], lm[lid], lm[lid + s]); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + // Computes the absolute value and stores the final result + if (lid == 0) { + #if PRECISION == 3232 || PRECISION == 6464 + asum[asum_offset].x = lm[0].x + lm[0].y; // the result is a non-complex number + #else + asum[asum_offset] = lm[0]; + #endif + } +} + +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/kernels/level1/xnrm2.opencl b/src/kernels/level1/xnrm2.opencl index cf579457..9803687a 100644 --- a/src/kernels/level1/xnrm2.opencl +++ b/src/kernels/level1/xnrm2.opencl @@ -7,9 +7,9 @@ // Author(s): // Cedric Nugteren // -// This file contains the Xnrm2 kernel. It implements a dot-product computation using reduction -// kernels. Reduction is split in two parts. In the first (main) kernel the X and Y vectors are -// multiplied, followed by a per-thread and a per-workgroup reduction. The second (epilogue) kernel +// This file contains the Xnrm2 kernel. It implements a squared norm computation using reduction +// kernels. Reduction is split in two parts. In the first (main) kernel the X vector is squared, +// followed by a per-thread and a per-workgroup reduction. The second (epilogue) kernel // is executed with a single workgroup only, computing the final result. // // ================================================================================================= @@ -29,7 +29,7 @@ R"( // ================================================================================================= -// The main reduction kernel, performing the multiplication and the majority of the sum operation +// The main reduction kernel, performing the multiplication and the majority of the operation __attribute__((reqd_work_group_size(WGS1, 1, 1))) __kernel void Xnrm2(const int n, const __global real* restrict xgm, const int x_offset, const int x_inc, @@ -70,7 +70,7 @@ __kernel void Xnrm2(const int n, // ================================================================================================= -// The epilogue reduction kernel, performing the final bit of the sum operation. This kernel has to +// The epilogue reduction kernel, performing the final bit of the operation. This kernel has to // be launched with a single workgroup only. __attribute__((reqd_work_group_size(WGS2, 1, 1))) __kernel void Xnrm2Epilogue(const __global real* restrict input, diff --git a/src/routines/level1/xamax.cc b/src/routines/level1/xamax.cc new file mode 100644 index 00000000..ffdfa496 --- /dev/null +++ b/src/routines/level1/xamax.cc @@ -0,0 +1,112 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xamax class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/level1/xamax.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Specific implementations to get the memory-type based on a template argument +template <> const Precision Xamax::precision_ = Precision::kSingle; +template <> const Precision Xamax::precision_ = Precision::kDouble; +template <> const Precision Xamax::precision_ = Precision::kComplexSingle; +template <> const Precision Xamax::precision_ = Precision::kComplexDouble; + +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xamax::Xamax(Queue &queue, EventPointer event, const std::string &name): + Routine(queue, event, name, {"Xdot"}, precision_) { + source_string_ = + #include "../../kernels/level1/xamax.opencl" + ; +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xamax::DoAmax(const size_t n, + const Buffer &imax_buffer, const size_t imax_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { + + // Makes sure all dimensions are larger than zero + if (n == 0) { return StatusCode::kInvalidDimension; } + + // Tests the vectors for validity + auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestVectorDot(1, imax_buffer, imax_offset, sizeof(T)); + if (ErrorIn(status)) { return status; } + + // Retrieves the Xamax kernels from the compiled binary + try { + auto& program = GetProgramFromCache(); + auto kernel1 = Kernel(program, "Xamax"); + auto kernel2 = Kernel(program, "XamaxEpilogue"); + + // Creates the buffer for intermediate values + auto temp_size = 2*db_["WGS2"]; + auto temp_buffer1 = Buffer(context_, temp_size); + auto temp_buffer2 = Buffer(context_, temp_size); + + // Sets the kernel arguments + kernel1.SetArgument(0, static_cast(n)); + kernel1.SetArgument(1, x_buffer()); + kernel1.SetArgument(2, static_cast(x_offset)); + kernel1.SetArgument(3, static_cast(x_inc)); + kernel1.SetArgument(4, temp_buffer1()); + kernel1.SetArgument(5, temp_buffer2()); + + // Event waiting list + auto eventWaitList = std::vector(); + + // Launches the main kernel + auto global1 = std::vector{db_["WGS1"]*temp_size}; + auto local1 = std::vector{db_["WGS1"]}; + auto kernelEvent = Event(); + status = RunKernel(kernel1, global1, local1, kernelEvent.pointer()); + if (ErrorIn(status)) { return status; } + eventWaitList.push_back(kernelEvent); + + // Sets the arguments for the epilogue kernel + kernel2.SetArgument(0, temp_buffer1()); + kernel2.SetArgument(1, temp_buffer2()); + kernel2.SetArgument(2, imax_buffer()); + kernel2.SetArgument(3, static_cast(imax_offset)); + + // Launches the epilogue kernel + auto global2 = std::vector{db_["WGS2"]}; + auto local2 = std::vector{db_["WGS2"]}; + status = RunKernel(kernel2, global2, local2, event_, eventWaitList); + if (ErrorIn(status)) { return status; } + + // Succesfully finished the computation + return StatusCode::kSuccess; + } catch (...) { return StatusCode::kInvalidKernel; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xamax; +template class Xamax; +template class Xamax; +template class Xamax; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level1/xasum.cc b/src/routines/level1/xasum.cc new file mode 100644 index 00000000..5799e25a --- /dev/null +++ b/src/routines/level1/xasum.cc @@ -0,0 +1,109 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xasum class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/level1/xasum.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Specific implementations to get the memory-type based on a template argument +template <> const Precision Xasum::precision_ = Precision::kSingle; +template <> const Precision Xasum::precision_ = Precision::kDouble; +template <> const Precision Xasum::precision_ = Precision::kComplexSingle; +template <> const Precision Xasum::precision_ = Precision::kComplexDouble; + +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xasum::Xasum(Queue &queue, EventPointer event, const std::string &name): + Routine(queue, event, name, {"Xdot"}, precision_) { + source_string_ = + #include "../../kernels/level1/xasum.opencl" + ; +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xasum::DoAsum(const size_t n, + const Buffer &asum_buffer, const size_t asum_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { + + // Makes sure all dimensions are larger than zero + if (n == 0) { return StatusCode::kInvalidDimension; } + + // Tests the vectors for validity + auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestVectorDot(1, asum_buffer, asum_offset, sizeof(T)); + if (ErrorIn(status)) { return status; } + + // Retrieves the Xasum kernels from the compiled binary + try { + auto& program = GetProgramFromCache(); + auto kernel1 = Kernel(program, "Xasum"); + auto kernel2 = Kernel(program, "XasumEpilogue"); + + // Creates the buffer for intermediate values + auto temp_size = 2*db_["WGS2"]; + auto temp_buffer = Buffer(context_, temp_size); + + // Sets the kernel arguments + kernel1.SetArgument(0, static_cast(n)); + kernel1.SetArgument(1, x_buffer()); + kernel1.SetArgument(2, static_cast(x_offset)); + kernel1.SetArgument(3, static_cast(x_inc)); + kernel1.SetArgument(4, temp_buffer()); + + // Event waiting list + auto eventWaitList = std::vector(); + + // Launches the main kernel + auto global1 = std::vector{db_["WGS1"]*temp_size}; + auto local1 = std::vector{db_["WGS1"]}; + auto kernelEvent = Event(); + status = RunKernel(kernel1, global1, local1, kernelEvent.pointer()); + if (ErrorIn(status)) { return status; } + eventWaitList.push_back(kernelEvent); + + // Sets the arguments for the epilogue kernel + kernel2.SetArgument(0, temp_buffer()); + kernel2.SetArgument(1, asum_buffer()); + kernel2.SetArgument(2, static_cast(asum_offset)); + + // Launches the epilogue kernel + auto global2 = std::vector{db_["WGS2"]}; + auto local2 = std::vector{db_["WGS2"]}; + status = RunKernel(kernel2, global2, local2, event_, eventWaitList); + if (ErrorIn(status)) { return status; } + + // Succesfully finished the computation + return StatusCode::kSuccess; + } catch (...) { return StatusCode::kInvalidKernel; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xasum; +template class Xasum; +template class Xasum; +template class Xasum; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level1/xnrm2.cc b/src/routines/level1/xnrm2.cc index 04e4137c..ceabe586 100644 --- a/src/routines/level1/xnrm2.cc +++ b/src/routines/level1/xnrm2.cc @@ -69,6 +69,7 @@ StatusCode Xnrm2::DoNrm2(const size_t n, kernel1.SetArgument(2, static_cast(x_offset)); kernel1.SetArgument(3, static_cast(x_inc)); kernel1.SetArgument(4, temp_buffer()); + // Event waiting list auto eventWaitList = std::vector(); diff --git a/src/tuning/xdot.cc b/src/tuning/xdot.cc index ff6bee16..48fa800b 100644 --- a/src/tuning/xdot.cc +++ b/src/tuning/xdot.cc @@ -22,13 +22,13 @@ namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class -template +template class TuneXdot { public: // The representative kernel and the source code - static std::string KernelFamily() { return "xdot"; } - static std::string KernelName() { return "Xdot"; } + static std::string KernelFamily() { return "xdot_"+std::to_string(V); } + static std::string KernelName() { return (V==1) ? "Xdot" : "XdotEpilogue"; } static std::string GetSources() { return #include "../src/kernels/common.opencl" @@ -44,7 +44,7 @@ class TuneXdot { // Sets the default values for the arguments static size_t DefaultM() { return 1; } // N/A for this kernel - static size_t DefaultN() { return 4096*1024; } + static size_t DefaultN() { return 64*1024*1024; } static size_t DefaultK() { return 1; } // N/A for this kernel static double DefaultFraction() { return 1.0; } // N/A for this kernel @@ -58,9 +58,7 @@ class TuneXdot { // Sets the tuning parameters and their possible values static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "WGS1", {32, 64, 128, 256, 512, 1024}); - tuner.AddParameter(id, "WGS2", {32, 64, 128, 256, 512, 1024}); - tuner.AddParameter(id, "VW", {1}); + tuner.AddParameter(id, "WGS"+std::to_string(V), {32, 64, 128, 256, 512, 1024}); } // Sets the constraints and local memory size @@ -68,16 +66,16 @@ class TuneXdot { static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments &) { } // Sets the base thread configuration - static std::vector GlobalSize(const Arguments &) { return {2}; } - static std::vector GlobalSizeRef(const Arguments &) { return {2*64*64}; } + static std::vector GlobalSize(const Arguments &) { return (V==1) ? std::vector{2*64} : std::vector{1}; } + static std::vector GlobalSizeRef(const Arguments &) { return (V==1) ? std::vector{2*64*64} : std::vector{64}; } static std::vector LocalSize() { return {1}; } static std::vector LocalSizeRef() { return {64}; } // Transforms the thread configuration based on the parameters using TransformVector = std::vector>; - static TransformVector MulLocal() { return {{"WGS1"}}; } + static TransformVector MulLocal() { return (V==1) ? TransformVector{{"WGS1"}} : TransformVector{{"WGS2"}}; } static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {{"WGS1"},{"WGS2"}}; } + static TransformVector MulGlobal() { return (V==1) ? TransformVector{{"WGS1"}} : TransformVector{{"WGS2"}}; } static TransformVector DivGlobal() { return {}; } // Sets the kernel's arguments @@ -85,22 +83,29 @@ class TuneXdot { std::vector &x_vec, std::vector &y_vec, std::vector &, std::vector &, std::vector &, std::vector &temp) { - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentInput(x_vec); - tuner.AddArgumentScalar(0); - tuner.AddArgumentScalar(1); - tuner.AddArgumentInput(y_vec); - tuner.AddArgumentScalar(0); - tuner.AddArgumentScalar(1); - tuner.AddArgumentInput(temp); // No output checking for the result - size varies - tuner.AddArgumentScalar(static_cast(false)); + if (V == 1) { + tuner.AddArgumentScalar(static_cast(args.n)); + tuner.AddArgumentInput(x_vec); + tuner.AddArgumentScalar(0); + tuner.AddArgumentScalar(1); + tuner.AddArgumentInput(y_vec); + tuner.AddArgumentScalar(0); + tuner.AddArgumentScalar(1); + tuner.AddArgumentInput(temp); // No output checking for the result - size varies + tuner.AddArgumentScalar(static_cast(false)); + } + else { + tuner.AddArgumentInput(temp); + tuner.AddArgumentInput(x_vec); // No output checking for the result - store somewhere + tuner.AddArgumentScalar(0); + } } // Describes how to compute the performance metrics static size_t GetMetric(const Arguments &args) { - return (2*args.n + 1) * GetBytes(args.precision); + return (V==1) ? (2*args.n + 1) * GetBytes(args.precision) : 1 * GetBytes(args.precision); } - static std::string PerformanceUnit() { return "GB/s"; } + static std::string PerformanceUnit() { return (V==1) ? "GB/s" : "N/A"; } }; // ================================================================================================= @@ -110,15 +115,22 @@ class TuneXdot { using float2 = clblast::float2; using double2 = clblast::double2; -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { +// Function to tune a specific variation V (not within the clblast namespace) +template +void StartVariation(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; - case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; + case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; + case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; } +} + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + StartVariation<1>(argc, argv); + StartVariation<2>(argc, argv); return 0; } diff --git a/test/correctness/routines/level1/xamax.cc b/test/correctness/routines/level1/xamax.cc new file mode 100644 index 00000000..ade09e7a --- /dev/null +++ b/test/correctness/routines/level1/xamax.cc @@ -0,0 +1,28 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level1/xamax.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "iSAMAX"); + clblast::RunTests, double, double>(argc, argv, true, "iDAMAX"); + clblast::RunTests, float2, float2>(argc, argv, true, "iCAMAX"); + clblast::RunTests, double2, double2>(argc, argv, true, "iZAMAX"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level1/xasum.cc b/test/correctness/routines/level1/xasum.cc new file mode 100644 index 00000000..5ec20596 --- /dev/null +++ b/test/correctness/routines/level1/xasum.cc @@ -0,0 +1,28 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level1/xasum.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "SASUM"); + clblast::RunTests, double, double>(argc, argv, true, "DASUM"); + clblast::RunTests, float2, float2>(argc, argv, true, "ScASUM"); + clblast::RunTests, double2, double2>(argc, argv, true, "DzASUM"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/testblas.h b/test/correctness/testblas.h index 8181aaf6..13be921a 100644 --- a/test/correctness/testblas.h +++ b/test/correctness/testblas.h @@ -153,6 +153,8 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name auto ap_offsets = std::vector{args.ap_offset}; auto dot_offsets = std::vector{args.dot_offset}; auto nrm2_offsets = std::vector{args.nrm2_offset}; + auto asum_offsets = std::vector{args.asum_offset}; + auto imax_offsets = std::vector{args.imax_offset}; auto alphas = std::vector{args.alpha}; auto betas = std::vector{args.beta}; auto x_sizes = std::vector{args.x_size}; @@ -193,6 +195,8 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name if (option == kArgAPOffset) { ap_offsets = tester.kOffsets; } if (option == kArgDotOffset) { dot_offsets = tester.kOffsets; } if (option == kArgNrm2Offset) { nrm2_offsets = tester.kOffsets; } + if (option == kArgAsumOffset) { asum_offsets = tester.kOffsets; } + if (option == kArgImaxOffset) { imax_offsets = tester.kOffsets; } if (option == kArgAlpha) { alphas = tester.kAlphaValues; } if (option == kArgBeta) { betas = tester.kBetaValues; } @@ -233,10 +237,14 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name for (auto &ap_offset: ap_offsets) { r_args.ap_offset = ap_offset; for (auto &dot_offset: dot_offsets) { r_args.dot_offset = dot_offset; for (auto &nrm2_offset: nrm2_offsets) { r_args.nrm2_offset = nrm2_offset; - for (auto &alpha: alphas) { r_args.alpha = alpha; - for (auto &beta: betas) { r_args.beta = beta; - C::SetSizes(r_args); - regular_test_vector.push_back(r_args); + for (auto &asum_offset: asum_offsets) { r_args.asum_offset = asum_offset; + for (auto &imax_offset: imax_offsets) { r_args.imax_offset = imax_offset; + for (auto &alpha: alphas) { r_args.alpha = alpha; + for (auto &beta: betas) { r_args.beta = beta; + C::SetSizes(r_args); + regular_test_vector.push_back(r_args); + } + } } } } diff --git a/test/performance/client.cc b/test/performance/client.cc index 56ab8c8d..9aaf1e4e 100644 --- a/test/performance/client.cc +++ b/test/performance/client.cc @@ -80,8 +80,11 @@ Arguments Client::ParseArguments(int argc, char *argv[], const GetMetric if (o == kArgCOffset) { args.c_offset = GetArgument(argc, argv, help, kArgCOffset, size_t{0}); } if (o == kArgAPOffset) { args.ap_offset= GetArgument(argc, argv, help, kArgAPOffset, size_t{0}); } - // Dot arguments + // Scalar result arguments if (o == kArgDotOffset) { args.dot_offset = GetArgument(argc, argv, help, kArgDotOffset, size_t{0}); } + if (o == kArgNrm2Offset) { args.nrm2_offset = GetArgument(argc, argv, help, kArgNrm2Offset, size_t{0}); } + if (o == kArgAsumOffset) { args.asum_offset = GetArgument(argc, argv, help, kArgAsumOffset, size_t{0}); } + if (o == kArgImaxOffset) { args.imax_offset = GetArgument(argc, argv, help, kArgImaxOffset, size_t{0}); } // Scalar values if (o == kArgAlpha) { args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar()); } @@ -292,6 +295,8 @@ void Client::PrintTableRow(const Arguments& args, else if (o == kArgAPOffset) { integers.push_back(args.ap_offset); } else if (o == kArgDotOffset) {integers.push_back(args.dot_offset); } else if (o == kArgNrm2Offset){integers.push_back(args.nrm2_offset); } + else if (o == kArgAsumOffset){integers.push_back(args.asum_offset); } + else if (o == kArgImaxOffset){integers.push_back(args.imax_offset); } } auto strings = std::vector{}; for (auto &o: options_) { diff --git a/test/performance/routines/level1/xamax.cc b/test/performance/routines/level1/xamax.cc new file mode 100644 index 00000000..85caa483 --- /dev/null +++ b/test/performance/routines/level1/xamax.cc @@ -0,0 +1,35 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level1/xamax.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level1/xasum.cc b/test/performance/routines/level1/xasum.cc new file mode 100644 index 00000000..2680966e --- /dev/null +++ b/test/performance/routines/level1/xasum.cc @@ -0,0 +1,35 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level1/xasum.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/routines/level1/xamax.h b/test/routines/level1/xamax.h new file mode 100644 index 00000000..7b404dc3 --- /dev/null +++ b/test/routines/level1/xamax.h @@ -0,0 +1,139 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements a class with static methods to describe the Xamax routine. Examples of +// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. +// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XAMAX_H_ +#define CLBLAST_TEST_ROUTINES_XAMAX_H_ + +#include +#include + +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TestXamax { + public: + + // The BLAS level: 1, 2, or 3 + static size_t BLASLevel() { return 1; } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { + return {kArgN, + kArgXInc, + kArgXOffset, kArgImaxOffset}; + } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &args) { + return args.n * args.x_inc + args.x_offset; + } + static size_t GetSizeImax(const Arguments &args) { + return 1 + args.imax_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments &args) { + args.x_size = GetSizeX(args); + args.scalar_size = GetSizeImax(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine + + // Describes which transpose options are relevant for this routine + using Transposes = std::vector; + static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine + static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Amax(args.n, + buffers.scalar(), args.imax_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + &queue_plain, &event); + clWaitForEvents(1, &event); + return status; + } + + // Describes how to run the clBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXamax(args.n, + buffers.scalar(), args.imax_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector scalar_cpu(args.scalar_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + cblasXamax(args.n, + scalar_cpu, args.imax_offset, + x_vec_cpu, args.x_offset, args.x_inc); + buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); + return StatusCode::kSuccess; + } + #endif + + // Describes how to download the results of the computation (more importantly: which buffer) + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector result(args.scalar_size, static_cast(0)); + buffers.scalar.Read(queue, args.scalar_size, result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments &) { return 1; } // N/A for this routine + static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine + static size_t GetResultIndex(const Arguments &args, const size_t, const size_t) { + return args.imax_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments &args) { + return args.n; + } + static size_t GetBytes(const Arguments &args) { + return ((args.n) + 1) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XAMAX_H_ +#endif diff --git a/test/routines/level1/xasum.h b/test/routines/level1/xasum.h new file mode 100644 index 00000000..6eae3c83 --- /dev/null +++ b/test/routines/level1/xasum.h @@ -0,0 +1,139 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements a class with static methods to describe the Xasum routine. Examples of +// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. +// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XASUM_H_ +#define CLBLAST_TEST_ROUTINES_XASUM_H_ + +#include +#include + +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TestXasum { + public: + + // The BLAS level: 1, 2, or 3 + static size_t BLASLevel() { return 1; } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { + return {kArgN, + kArgXInc, + kArgXOffset, kArgAsumOffset}; + } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &args) { + return args.n * args.x_inc + args.x_offset; + } + static size_t GetSizeAsum(const Arguments &args) { + return 1 + args.asum_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments &args) { + args.x_size = GetSizeX(args); + args.scalar_size = GetSizeAsum(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine + + // Describes which transpose options are relevant for this routine + using Transposes = std::vector; + static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine + static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Asum(args.n, + buffers.scalar(), args.asum_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + &queue_plain, &event); + clWaitForEvents(1, &event); + return status; + } + + // Describes how to run the clBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXasum(args.n, + buffers.scalar(), args.asum_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector scalar_cpu(args.scalar_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + cblasXasum(args.n, + scalar_cpu, args.asum_offset, + x_vec_cpu, args.x_offset, args.x_inc); + buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); + return StatusCode::kSuccess; + } + #endif + + // Describes how to download the results of the computation (more importantly: which buffer) + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector result(args.scalar_size, static_cast(0)); + buffers.scalar.Read(queue, args.scalar_size, result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments &) { return 1; } // N/A for this routine + static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine + static size_t GetResultIndex(const Arguments &args, const size_t, const size_t) { + return args.asum_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments &args) { + return args.n; + } + static size_t GetBytes(const Arguments &args) { + return ((args.n) + 1) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XASUM_H_ +#endif diff --git a/test/wrapper_cblas.h b/test/wrapper_cblas.h index dec272b0..994b48b1 100644 --- a/test/wrapper_cblas.h +++ b/test/wrapper_cblas.h @@ -345,6 +345,58 @@ void cblasXnrm2(const size_t n, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); } +// Forwards the Netlib BLAS calls for SASUM/DASUM/ScASUM/DzASUM +void cblasXasum(const size_t n, + std::vector& asum_buffer, const size_t asum_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + asum_buffer[asum_offset] = cblas_sasum(n, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXasum(const size_t n, + std::vector& asum_buffer, const size_t asum_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + asum_buffer[asum_offset] = cblas_dasum(n, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXasum(const size_t n, + std::vector& asum_buffer, const size_t asum_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + asum_buffer[asum_offset] = cblas_scasum(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} +void cblasXasum(const size_t n, + std::vector& asum_buffer, const size_t asum_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + asum_buffer[asum_offset] = cblas_dzasum(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} + +// Forwards the Netlib BLAS calls for iSAMAX/iDAMAX/iCAMAX/iZAMAX +void cblasXamax(const size_t n, + std::vector& imax_buffer, const size_t imax_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + imax_buffer[imax_offset] = cblas_isamax(n, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXamax(const size_t n, + std::vector& imax_buffer, const size_t imax_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + imax_buffer[imax_offset] = cblas_idamax(n, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXamax(const size_t n, + std::vector& imax_buffer, const size_t imax_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + imax_buffer[imax_offset] = cblas_icamax(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} +void cblasXamax(const size_t n, + std::vector& imax_buffer, const size_t imax_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + imax_buffer[imax_offset] = cblas_izamax(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} + // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= diff --git a/test/wrapper_clblas.h b/test/wrapper_clblas.h index 89b708b8..a44466c6 100644 --- a/test/wrapper_clblas.h +++ b/test/wrapper_clblas.h @@ -558,6 +558,142 @@ clblasStatus clblasXnrm2(const size_t n, num_queues, queues, num_wait_events, wait_events, events); } +// Forwards the clBLAS calls for SASUM/DASUM/ScASUM/DzASUM +template +clblasStatus clblasXasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); +template <> +clblasStatus clblasXasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n); + return clblasSasum(n, + asum_buffer, asum_offset, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n); + return clblasDasum(n, + asum_buffer, asum_offset, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n); + return clblasScasum(n, + asum_buffer, asum_offset, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n); + return clblasDzasum(n, + asum_buffer, asum_offset, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for iSAMAX/iDAMAX/iCAMAX/iZAMAX +template +clblasStatus clblasXamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); +template <> +clblasStatus clblasXamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, 2*n); + return clblasiSamax(n, + imax_buffer, imax_offset, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, 2*n); + return clblasiDamax(n, + imax_buffer, imax_offset, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, 2*n); + return clblasiCamax(n, + imax_buffer, imax_offset, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, 2*n); + return clblasiZamax(n, + imax_buffer, imax_offset, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} + // ================================================================================================= // BLAS level-2 (matrix-vector) routines // =================================================================================================