Merge branch 'level1_routines' into development

This commit is contained in:
cnugteren 2016-04-20 22:14:55 -06:00
commit c8e28a33c0
30 changed files with 1457 additions and 44 deletions

View file

@ -6,6 +6,8 @@ Development version (next release)
- Fixed the use of events within the library - Fixed the use of events within the library
- Added level-1 routines: - Added level-1 routines:
* SNRM2/DNRM2/ScNRM2/DzNRM2 * SNRM2/DNRM2/ScNRM2/DzNRM2
* SASUM/DASUM/ScASUM/DzASUM
* iSAMAX/iDAMAX/iCAMAX/iZAMAX
Version 0.6.0 Version 0.6.0
- Added support for MSVC (Visual Studio) 2015 - Added support for MSVC (Visual Studio) 2015

View file

@ -121,7 +121,7 @@ include_directories(${clblast_SOURCE_DIR}/include ${OPENCL_INCLUDE_DIRS})
set(KERNELS copy pad transpose padtranspose xaxpy xdot xger xgemm xgemv) set(KERNELS copy pad transpose padtranspose xaxpy xdot xger xgemm xgemv)
set(SAMPLE_PROGRAMS_CPP sgemm) set(SAMPLE_PROGRAMS_CPP sgemm)
set(SAMPLE_PROGRAMS_C sgemm) set(SAMPLE_PROGRAMS_C sgemm)
set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2) set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv
xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2) xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm) set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm)

View file

@ -185,8 +185,8 @@ CLBlast is in active development but already supports almost all the BLAS routin
| xDOTU | - | - | ✔ | ✔ | | | xDOTU | - | - | ✔ | ✔ | |
| xDOTC | - | - | ✔ | ✔ | | | xDOTC | - | - | ✔ | ✔ | |
| xNRM2 | ✔ | ✔ | ✔ | ✔ | | | xNRM2 | ✔ | ✔ | ✔ | ✔ | |
| xASUM | | | | | | | xASUM | ✔ | ✔ | ✔ | ✔ | |
| IxAMAX | | | | | | | IxAMAX | ✔ | ✔ | ✔ | ✔ | |
| Level-2 | S | D | C | Z | Notes | | Level-2 | S | D | C | Z | Notes |
| ---------|---|---|---|---|---------| | ---------|---|---|---|---|---------|

View file

@ -181,6 +181,20 @@ StatusCode Nrm2(const size_t n,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event = nullptr); cl_command_queue* queue, cl_event* event = nullptr);
// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM
template <typename T>
StatusCode Asum(const size_t n,
cl_mem asum_buffer, const size_t asum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event = nullptr);
// Index of absolute maxium value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX
template <typename T>
StatusCode Amax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event = nullptr);
// ================================================================================================= // =================================================================================================
// BLAS level-2 (matrix-vector) routines // BLAS level-2 (matrix-vector) routines
// ================================================================================================= // =================================================================================================

View file

@ -278,6 +278,42 @@ StatusCode PUBLIC_API CLBlastDznrm2(const size_t n,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event); cl_command_queue* queue, cl_event* event);
// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM
StatusCode PUBLIC_API CLBlastSasum(const size_t n,
cl_mem asum_buffer, const size_t asum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
StatusCode PUBLIC_API CLBlastDasum(const size_t n,
cl_mem asum_buffer, const size_t asum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
StatusCode PUBLIC_API CLBlastScasum(const size_t n,
cl_mem asum_buffer, const size_t asum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
StatusCode PUBLIC_API CLBlastDzasum(const size_t n,
cl_mem asum_buffer, const size_t asum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
// Index of absolute maxium value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX
StatusCode PUBLIC_API CLBlastiSamax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
StatusCode PUBLIC_API CLBlastiDamax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
StatusCode PUBLIC_API CLBlastiCamax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
StatusCode PUBLIC_API CLBlastiZamax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
// ================================================================================================= // =================================================================================================
// BLAS level-2 (matrix-vector) routines // BLAS level-2 (matrix-vector) routines
// ================================================================================================= // =================================================================================================

View file

@ -0,0 +1,56 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xamax routine. The precision is implemented using a template argument.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XAMAX_H_
#define CLBLAST_ROUTINES_XAMAX_H_
#include "internal/routine.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class Xamax: public Routine<T> {
public:
// Members and methods from the base class
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorDot;
using Routine<T>::RunKernel;
using Routine<T>::ErrorIn;
// Constructor
Xamax(Queue &queue, EventPointer event, const std::string &name = "AMAX");
// Templated-precision implementation of the routine
StatusCode DoAmax(const size_t n,
const Buffer<T> &imax_buffer, const size_t imax_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
private:
// Static variable to get the precision
const static Precision precision_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XAMAX_H_
#endif

View file

@ -0,0 +1,56 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xasum routine. The precision is implemented using a template argument.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XASUM_H_
#define CLBLAST_ROUTINES_XASUM_H_
#include "internal/routine.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class Xasum: public Routine<T> {
public:
// Members and methods from the base class
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorDot;
using Routine<T>::RunKernel;
using Routine<T>::ErrorIn;
// Constructor
Xasum(Queue &queue, EventPointer event, const std::string &name = "ASUM");
// Templated-precision implementation of the routine
StatusCode DoAsum(const size_t n,
const Buffer<T> &asum_buffer, const size_t asum_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
private:
// Static variable to get the precision
const static Precision precision_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XASUM_H_
#endif

View file

@ -65,6 +65,8 @@ constexpr auto kArgCOffset = "offc";
constexpr auto kArgAPOffset = "offap"; constexpr auto kArgAPOffset = "offap";
constexpr auto kArgDotOffset = "offdot"; constexpr auto kArgDotOffset = "offdot";
constexpr auto kArgNrm2Offset = "offnrm2"; constexpr auto kArgNrm2Offset = "offnrm2";
constexpr auto kArgAsumOffset = "offasum";
constexpr auto kArgImaxOffset = "offimax";
constexpr auto kArgAlpha = "alpha"; constexpr auto kArgAlpha = "alpha";
constexpr auto kArgBeta = "beta"; constexpr auto kArgBeta = "beta";
@ -119,6 +121,8 @@ struct Arguments {
size_t ap_offset = 0; size_t ap_offset = 0;
size_t dot_offset = 0; size_t dot_offset = 0;
size_t nrm2_offset = 0; size_t nrm2_offset = 0;
size_t asum_offset = 0;
size_t imax_offset = 0;
T alpha = T{1.0}; T alpha = T{1.0};
T beta = T{1.0}; T beta = T{1.0};
size_t x_size = 1; size_t x_size = 1;

View file

@ -40,6 +40,10 @@ Z = DataType("Z", "Z", DBL2, [DBL2, DBL2, D2CL, D2CL], DBL2) # double-complex (6
# Special cases # Special cases
Sc = DataType("C", "Sc", FLT2, [FLT2, FLT2, FLT2, FLT2], FLT2) # As C, but with real output Sc = DataType("C", "Sc", FLT2, [FLT2, FLT2, FLT2, FLT2], FLT2) # As C, but with real output
Dz = DataType("Z", "Dz", DBL2, [DBL2, DBL2, DBL2, DBL2], DBL2) # As Z, but with real output Dz = DataType("Z", "Dz", DBL2, [DBL2, DBL2, DBL2, DBL2], DBL2) # As Z, but with real output
iS = DataType("S", "iS", FLT, [FLT, FLT, FLT, FLT], FLT ) # As S, but with integer output
iD = DataType("D", "iD", DBL, [DBL, DBL, DBL, DBL], DBL ) # As D, but with integer output
iC = DataType("C", "iC", FLT2, [FLT2, FLT2, F2CL, F2CL], FLT2) # As C, but with integer output
iZ = DataType("Z", "iZ", DBL2, [DBL2, DBL2, D2CL, D2CL], DBL2) # As Z, but with integer output
Css = DataType("C", "C", FLT, [FLT, FLT, FLT, FLT], FLT2) # As C, but with constants from S Css = DataType("C", "C", FLT, [FLT, FLT, FLT, FLT], FLT2) # As C, but with constants from S
Zdd = DataType("Z", "Z", DBL, [DBL, DBL, DBL, DBL], DBL2) # As Z, but with constants from D Zdd = DataType("Z", "Z", DBL, [DBL, DBL, DBL, DBL], DBL2) # As Z, but with constants from D
Ccs = DataType("C", "C", FLT2+","+FLT, [FLT2, FLT, F2CL, FLT], FLT2) # As C, but with one constant from S Ccs = DataType("C", "C", FLT2+","+FLT, [FLT2, FLT, F2CL, FLT], FLT2) # As C, but with one constant from S
@ -67,6 +71,8 @@ routines = [
Routine(True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors"), Routine(True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors"),
Routine(True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated"), Routine(True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated"),
Routine(True, "1", "nrm2", T, [S,D,Sc,Dz],["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector"), Routine(True, "1", "nrm2", T, [S,D,Sc,Dz],["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector"),
Routine(True, "1", "asum", T, [S,D,Sc,Dz],["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector"),
Routine(True, "1", "amax", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maxium value in a vector"),
], ],
[ # Level 2: matrix-vector [ # Level 2: matrix-vector
Routine(True, "2a", "gemv", T, [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication"), Routine(True, "2a", "gemv", T, [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication"),
@ -288,7 +294,7 @@ files = [
path_clblast+"/test/wrapper_clblas.h", path_clblast+"/test/wrapper_clblas.h",
path_clblast+"/test/wrapper_cblas.h", path_clblast+"/test/wrapper_cblas.h",
] ]
header_lines = [84, 65, 93, 22, 22, 38] header_lines = [84, 67, 93, 22, 22, 38]
footer_lines = [6, 3, 9, 2, 6, 6] footer_lines = [6, 3, 9, 2, 6, 6]
# Checks whether the command-line arguments are valid; exists otherwise # Checks whether the command-line arguments are valid; exists otherwise
@ -368,7 +374,7 @@ for level in [1,2,3]:
body += "using double2 = clblast::double2;\n\n" body += "using double2 = clblast::double2;\n\n"
body += "// Main function (not within the clblast namespace)\n" body += "// Main function (not within the clblast namespace)\n"
body += "int main(int argc, char *argv[]) {\n" body += "int main(int argc, char *argv[]) {\n"
default = PrecisionToFullName(routine.flavours[0].name) default = PrecisionToFullName(routine.flavours[0].precision_name)
body += " switch(clblast::GetPrecision(argc, argv, clblast::Precision::k"+default+")) {\n" body += " switch(clblast::GetPrecision(argc, argv, clblast::Precision::k"+default+")) {\n"
for precision in ["H","S","D","C","Z"]: for precision in ["H","S","D","C","Z"]:
body += " case clblast::Precision::k"+PrecisionToFullName(precision)+":" body += " case clblast::Precision::k"+PrecisionToFullName(precision)+":"

View file

@ -72,7 +72,7 @@ class Routine():
# List of scalar buffers # List of scalar buffers
def ScalarBuffersFirst(self): def ScalarBuffersFirst(self):
return ["dot","nrm2"] return ["dot","nrm2","asum","imax"]
def ScalarBuffersSecond(self): def ScalarBuffersSecond(self):
return ["sa","sb","sc","ss","sd1","sd2","sx1","sy1","sparam"] return ["sa","sb","sc","ss","sd1","sd2","sx1","sy1","sparam"]

View file

@ -27,6 +27,8 @@
#include "internal/routines/level1/xdotu.h" #include "internal/routines/level1/xdotu.h"
#include "internal/routines/level1/xdotc.h" #include "internal/routines/level1/xdotc.h"
#include "internal/routines/level1/xnrm2.h" #include "internal/routines/level1/xnrm2.h"
#include "internal/routines/level1/xasum.h"
#include "internal/routines/level1/xamax.h"
// BLAS level-2 includes // BLAS level-2 includes
#include "internal/routines/level2/xgemv.h" #include "internal/routines/level2/xgemv.h"
@ -396,6 +398,68 @@ template StatusCode PUBLIC_API Nrm2<double2>(const size_t,
const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*); cl_command_queue*, cl_event*);
// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM
template <typename T>
StatusCode Asum(const size_t n,
cl_mem asum_buffer, const size_t asum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
auto routine = Xasum<T>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoAsum(n,
Buffer<T>(asum_buffer), asum_offset,
Buffer<T>(x_buffer), x_offset, x_inc);
}
template StatusCode PUBLIC_API Asum<float>(const size_t,
cl_mem, const size_t,
const cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Asum<double>(const size_t,
cl_mem, const size_t,
const cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Asum<float2>(const size_t,
cl_mem, const size_t,
const cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Asum<double2>(const size_t,
cl_mem, const size_t,
const cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Index of absolute maxium value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX
template <typename T>
StatusCode Amax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
auto routine = Xamax<T>(queue_cpp, event);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoAmax(n,
Buffer<T>(imax_buffer), imax_offset,
Buffer<T>(x_buffer), x_offset, x_inc);
}
template StatusCode PUBLIC_API Amax<float>(const size_t,
cl_mem, const size_t,
const cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Amax<double>(const size_t,
cl_mem, const size_t,
const cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Amax<float2>(const size_t,
cl_mem, const size_t,
const cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Amax<double2>(const size_t,
cl_mem, const size_t,
const cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// ================================================================================================= // =================================================================================================
// BLAS level-2 (matrix-vector) routines // BLAS level-2 (matrix-vector) routines
// ================================================================================================= // =================================================================================================

View file

@ -433,6 +433,90 @@ StatusCode CLBlastDznrm2(const size_t n,
return static_cast<StatusCode>(status); return static_cast<StatusCode>(status);
} }
// ASUM
StatusCode CLBlastSasum(const size_t n,
cl_mem asum_buffer, const size_t asum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Asum<float>(n,
asum_buffer, asum_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastDasum(const size_t n,
cl_mem asum_buffer, const size_t asum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Asum<double>(n,
asum_buffer, asum_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastScasum(const size_t n,
cl_mem asum_buffer, const size_t asum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Asum<float2>(n,
asum_buffer, asum_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastDzasum(const size_t n,
cl_mem asum_buffer, const size_t asum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Asum<double2>(n,
asum_buffer, asum_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
// AMAX
StatusCode CLBlastiSamax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Amax<float>(n,
imax_buffer, imax_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastiDamax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Amax<double>(n,
imax_buffer, imax_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastiCamax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Amax<float2>(n,
imax_buffer, imax_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastiZamax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Amax<double2>(n,
imax_buffer, imax_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
// ================================================================================================= // =================================================================================================
// BLAS level-2 (matrix-vector) routines // BLAS level-2 (matrix-vector) routines
// ================================================================================================= // =================================================================================================

View file

@ -80,6 +80,15 @@ R"(
#define ONE 1.0 #define ONE 1.0
#endif #endif
// Single-element version of a complex number
#if PRECISION == 3232
typedef float singlereal;
#elif PRECISION == 6464
typedef double singlereal;
#else
typedef real singlereal;
#endif
// ================================================================================================= // =================================================================================================
// Don't use the non-IEEE754 compliant OpenCL built-in mad() instruction per default. For specific // Don't use the non-IEEE754 compliant OpenCL built-in mad() instruction per default. For specific
@ -109,6 +118,13 @@ R"(
#define SetToOne(a) a = ONE #define SetToOne(a) a = ONE
#endif #endif
// The absolute value (component-wise)
#if PRECISION == 3232 || PRECISION == 6464
#define AbsoluteValue(value) value.x = fabs(value.x); value.y = fabs(value.y)
#else
#define AbsoluteValue(value) value = fabs(value)
#endif
// Adds two complex variables // Adds two complex variables
#if PRECISION == 3232 || PRECISION == 6464 #if PRECISION == 3232 || PRECISION == 6464
#define Add(c, a, b) c.x = a.x + b.x; c.y = a.y + b.y #define Add(c, a, b) c.x = a.x + b.x; c.y = a.y + b.y

View file

@ -0,0 +1,128 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains the Xamax kernel. It implements an index of absolute max computation using
// reduction kernels. Reduction is split in two parts. In the first (main) kernel the X vector is
// loaded, followed by a per-thread and a per-workgroup reduction. The second (epilogue) kernel
// is executed with a single workgroup only, computing the final result.
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
// this kernel file is used outside of the CLBlast library.
#ifndef WGS1
#define WGS1 64 // The local work-group size of the main kernel
#endif
#ifndef WGS2
#define WGS2 64 // The local work-group size of the epilogue kernel
#endif
// =================================================================================================
// The main reduction kernel, performing the loading and the majority of the operation
__attribute__((reqd_work_group_size(WGS1, 1, 1)))
__kernel void Xamax(const int n,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global singlereal* maxgm, __global unsigned int* imaxgm) {
__local singlereal maxlm[WGS1];
__local unsigned int imaxlm[WGS1];
const int lid = get_local_id(0);
const int wgid = get_group_id(0);
const int num_groups = get_num_groups(0);
// Performs loading and the first steps of the reduction
singlereal max = ZERO;
unsigned int imax = 0;
int id = wgid*WGS1 + lid;
while (id < n) {
#if PRECISION == 3232 || PRECISION == 6464
singlereal x = fabs(xgm[id*x_inc + x_offset].x);
#else
singlereal x = fabs(xgm[id*x_inc + x_offset]);
#endif
if (x >= max) {
max = x;
imax = id*x_inc + x_offset;
}
id += WGS1*num_groups;
}
maxlm[lid] = max;
imaxlm[lid] = imax;
barrier(CLK_LOCAL_MEM_FENCE);
// Performs reduction in local memory
#pragma unroll
for (int s=WGS1/2; s>0; s=s>>1) {
if (lid < s) {
if (maxlm[lid + s] >= maxlm[lid]) {
maxlm[lid] = maxlm[lid + s];
imaxlm[lid] = imaxlm[lid + s];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// Stores the per-workgroup result
if (lid == 0) {
maxgm[wgid] = maxlm[0];
imaxgm[wgid] = imaxlm[0];
}
}
// =================================================================================================
// The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
// be launched with a single workgroup only.
__attribute__((reqd_work_group_size(WGS2, 1, 1)))
__kernel void XamaxEpilogue(const __global singlereal* restrict maxgm,
const __global unsigned int* restrict imaxgm,
__global unsigned int* imax, const int imax_offset) {
__local singlereal maxlm[WGS2];
__local unsigned int imaxlm[WGS2];
const int lid = get_local_id(0);
// Performs the first step of the reduction while loading the data
if (maxgm[lid + WGS2] >= maxgm[lid]) {
maxlm[lid] = maxgm[lid + WGS2];
imaxlm[lid] = imaxgm[lid + WGS2];
}
else {
maxlm[lid] = maxgm[lid];
imaxlm[lid] = imaxgm[lid];
}
barrier(CLK_LOCAL_MEM_FENCE);
// Performs reduction in local memory
#pragma unroll
for (int s=WGS2/2; s>0; s=s>>1) {
if (lid < s) {
if (maxlm[lid + s] >= maxlm[lid]) {
maxlm[lid] = maxlm[lid + s];
imaxlm[lid] = imaxlm[lid + s];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// Stores the final result
if (lid == 0) {
imax[imax_offset] = imaxlm[0];
}
}
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View file

@ -0,0 +1,108 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains the Xasum kernel. It implements a absolute sum computation using reduction
// kernels. Reduction is split in two parts. In the first (main) kernel the X vector is loaded,
// followed by a per-thread and a per-workgroup reduction. The second (epilogue) kernel
// is executed with a single workgroup only, computing the final result.
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
// this kernel file is used outside of the CLBlast library.
#ifndef WGS1
#define WGS1 64 // The local work-group size of the main kernel
#endif
#ifndef WGS2
#define WGS2 64 // The local work-group size of the epilogue kernel
#endif
// =================================================================================================
// The main reduction kernel, performing the loading and the majority of the operation
__attribute__((reqd_work_group_size(WGS1, 1, 1)))
__kernel void Xasum(const int n,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* output) {
__local real lm[WGS1];
const int lid = get_local_id(0);
const int wgid = get_group_id(0);
const int num_groups = get_num_groups(0);
// Performs loading and the first steps of the reduction
real acc;
SetToZero(acc);
int id = wgid*WGS1 + lid;
while (id < n) {
real x = xgm[id*x_inc + x_offset];
AbsoluteValue(x);
Add(acc, acc, x);
id += WGS1*num_groups;
}
lm[lid] = acc;
barrier(CLK_LOCAL_MEM_FENCE);
// Performs reduction in local memory
#pragma unroll
for (int s=WGS1/2; s>0; s=s>>1) {
if (lid < s) {
Add(lm[lid], lm[lid], lm[lid + s]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// Stores the per-workgroup result
if (lid == 0) {
output[wgid] = lm[0];
}
}
// =================================================================================================
// The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
// be launched with a single workgroup only.
__attribute__((reqd_work_group_size(WGS2, 1, 1)))
__kernel void XasumEpilogue(const __global real* restrict input,
__global real* asum, const int asum_offset) {
__local real lm[WGS2];
const int lid = get_local_id(0);
// Performs the first step of the reduction while loading the data
Add(lm[lid], input[lid], input[lid + WGS2]);
barrier(CLK_LOCAL_MEM_FENCE);
// Performs reduction in local memory
#pragma unroll
for (int s=WGS2/2; s>0; s=s>>1) {
if (lid < s) {
Add(lm[lid], lm[lid], lm[lid + s]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// Computes the absolute value and stores the final result
if (lid == 0) {
#if PRECISION == 3232 || PRECISION == 6464
asum[asum_offset].x = lm[0].x + lm[0].y; // the result is a non-complex number
#else
asum[asum_offset] = lm[0];
#endif
}
}
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View file

@ -7,9 +7,9 @@
// Author(s): // Author(s):
// Cedric Nugteren <www.cedricnugteren.nl> // Cedric Nugteren <www.cedricnugteren.nl>
// //
// This file contains the Xnrm2 kernel. It implements a dot-product computation using reduction // This file contains the Xnrm2 kernel. It implements a squared norm computation using reduction
// kernels. Reduction is split in two parts. In the first (main) kernel the X and Y vectors are // kernels. Reduction is split in two parts. In the first (main) kernel the X vector is squared,
// multiplied, followed by a per-thread and a per-workgroup reduction. The second (epilogue) kernel // followed by a per-thread and a per-workgroup reduction. The second (epilogue) kernel
// is executed with a single workgroup only, computing the final result. // is executed with a single workgroup only, computing the final result.
// //
// ================================================================================================= // =================================================================================================
@ -29,7 +29,7 @@ R"(
// ================================================================================================= // =================================================================================================
// The main reduction kernel, performing the multiplication and the majority of the sum operation // The main reduction kernel, performing the multiplication and the majority of the operation
__attribute__((reqd_work_group_size(WGS1, 1, 1))) __attribute__((reqd_work_group_size(WGS1, 1, 1)))
__kernel void Xnrm2(const int n, __kernel void Xnrm2(const int n,
const __global real* restrict xgm, const int x_offset, const int x_inc, const __global real* restrict xgm, const int x_offset, const int x_inc,
@ -70,7 +70,7 @@ __kernel void Xnrm2(const int n,
// ================================================================================================= // =================================================================================================
// The epilogue reduction kernel, performing the final bit of the sum operation. This kernel has to // The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
// be launched with a single workgroup only. // be launched with a single workgroup only.
__attribute__((reqd_work_group_size(WGS2, 1, 1))) __attribute__((reqd_work_group_size(WGS2, 1, 1)))
__kernel void Xnrm2Epilogue(const __global real* restrict input, __kernel void Xnrm2Epilogue(const __global real* restrict input,

View file

@ -0,0 +1,112 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xamax class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level1/xamax.h"
#include <string>
#include <vector>
namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xamax<float>::precision_ = Precision::kSingle;
template <> const Precision Xamax<double>::precision_ = Precision::kDouble;
template <> const Precision Xamax<float2>::precision_ = Precision::kComplexSingle;
template <> const Precision Xamax<double2>::precision_ = Precision::kComplexDouble;
// =================================================================================================
// Constructor: forwards to base class constructor
template <typename T>
Xamax<T>::Xamax(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xdot"}, precision_) {
source_string_ =
#include "../../kernels/level1/xamax.opencl"
;
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xamax<T>::DoAmax(const size_t n,
const Buffer<T> &imax_buffer, const size_t imax_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
// Makes sure all dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
// Tests the vectors for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
if (ErrorIn(status)) { return status; }
status = TestVectorDot(1, imax_buffer, imax_offset, sizeof(T));
if (ErrorIn(status)) { return status; }
// Retrieves the Xamax kernels from the compiled binary
try {
auto& program = GetProgramFromCache();
auto kernel1 = Kernel(program, "Xamax");
auto kernel2 = Kernel(program, "XamaxEpilogue");
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];
auto temp_buffer1 = Buffer<T>(context_, temp_size);
auto temp_buffer2 = Buffer<unsigned int>(context_, temp_size);
// Sets the kernel arguments
kernel1.SetArgument(0, static_cast<int>(n));
kernel1.SetArgument(1, x_buffer());
kernel1.SetArgument(2, static_cast<int>(x_offset));
kernel1.SetArgument(3, static_cast<int>(x_inc));
kernel1.SetArgument(4, temp_buffer1());
kernel1.SetArgument(5, temp_buffer2());
// Event waiting list
auto eventWaitList = std::vector<Event>();
// Launches the main kernel
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
status = RunKernel(kernel1, global1, local1, kernelEvent.pointer());
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(kernelEvent);
// Sets the arguments for the epilogue kernel
kernel2.SetArgument(0, temp_buffer1());
kernel2.SetArgument(1, temp_buffer2());
kernel2.SetArgument(2, imax_buffer());
kernel2.SetArgument(3, static_cast<int>(imax_offset));
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
status = RunKernel(kernel2, global2, local2, event_, eventWaitList);
if (ErrorIn(status)) { return status; }
// Succesfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
}
// =================================================================================================
// Compiles the templated class
template class Xamax<float>;
template class Xamax<double>;
template class Xamax<float2>;
template class Xamax<double2>;
// =================================================================================================
} // namespace clblast

View file

@ -0,0 +1,109 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xasum class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level1/xasum.h"
#include <string>
#include <vector>
namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xasum<float>::precision_ = Precision::kSingle;
template <> const Precision Xasum<double>::precision_ = Precision::kDouble;
template <> const Precision Xasum<float2>::precision_ = Precision::kComplexSingle;
template <> const Precision Xasum<double2>::precision_ = Precision::kComplexDouble;
// =================================================================================================
// Constructor: forwards to base class constructor
template <typename T>
Xasum<T>::Xasum(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xdot"}, precision_) {
source_string_ =
#include "../../kernels/level1/xasum.opencl"
;
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xasum<T>::DoAsum(const size_t n,
const Buffer<T> &asum_buffer, const size_t asum_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
// Makes sure all dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
// Tests the vectors for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
if (ErrorIn(status)) { return status; }
status = TestVectorDot(1, asum_buffer, asum_offset, sizeof(T));
if (ErrorIn(status)) { return status; }
// Retrieves the Xasum kernels from the compiled binary
try {
auto& program = GetProgramFromCache();
auto kernel1 = Kernel(program, "Xasum");
auto kernel2 = Kernel(program, "XasumEpilogue");
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];
auto temp_buffer = Buffer<T>(context_, temp_size);
// Sets the kernel arguments
kernel1.SetArgument(0, static_cast<int>(n));
kernel1.SetArgument(1, x_buffer());
kernel1.SetArgument(2, static_cast<int>(x_offset));
kernel1.SetArgument(3, static_cast<int>(x_inc));
kernel1.SetArgument(4, temp_buffer());
// Event waiting list
auto eventWaitList = std::vector<Event>();
// Launches the main kernel
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
status = RunKernel(kernel1, global1, local1, kernelEvent.pointer());
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(kernelEvent);
// Sets the arguments for the epilogue kernel
kernel2.SetArgument(0, temp_buffer());
kernel2.SetArgument(1, asum_buffer());
kernel2.SetArgument(2, static_cast<int>(asum_offset));
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
status = RunKernel(kernel2, global2, local2, event_, eventWaitList);
if (ErrorIn(status)) { return status; }
// Succesfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
}
// =================================================================================================
// Compiles the templated class
template class Xasum<float>;
template class Xasum<double>;
template class Xasum<float2>;
template class Xasum<double2>;
// =================================================================================================
} // namespace clblast

View file

@ -69,6 +69,7 @@ StatusCode Xnrm2<T>::DoNrm2(const size_t n,
kernel1.SetArgument(2, static_cast<int>(x_offset)); kernel1.SetArgument(2, static_cast<int>(x_offset));
kernel1.SetArgument(3, static_cast<int>(x_inc)); kernel1.SetArgument(3, static_cast<int>(x_inc));
kernel1.SetArgument(4, temp_buffer()); kernel1.SetArgument(4, temp_buffer());
// Event waiting list // Event waiting list
auto eventWaitList = std::vector<Event>(); auto eventWaitList = std::vector<Event>();

View file

@ -22,13 +22,13 @@ namespace clblast {
// ================================================================================================= // =================================================================================================
// See comment at top of file for a description of the class // See comment at top of file for a description of the class
template <typename T> template <typename T, int V>
class TuneXdot { class TuneXdot {
public: public:
// The representative kernel and the source code // The representative kernel and the source code
static std::string KernelFamily() { return "xdot"; } static std::string KernelFamily() { return "xdot_"+std::to_string(V); }
static std::string KernelName() { return "Xdot"; } static std::string KernelName() { return (V==1) ? "Xdot" : "XdotEpilogue"; }
static std::string GetSources() { static std::string GetSources() {
return return
#include "../src/kernels/common.opencl" #include "../src/kernels/common.opencl"
@ -44,7 +44,7 @@ class TuneXdot {
// Sets the default values for the arguments // Sets the default values for the arguments
static size_t DefaultM() { return 1; } // N/A for this kernel static size_t DefaultM() { return 1; } // N/A for this kernel
static size_t DefaultN() { return 4096*1024; } static size_t DefaultN() { return 64*1024*1024; }
static size_t DefaultK() { return 1; } // N/A for this kernel static size_t DefaultK() { return 1; } // N/A for this kernel
static double DefaultFraction() { return 1.0; } // N/A for this kernel static double DefaultFraction() { return 1.0; } // N/A for this kernel
@ -58,9 +58,7 @@ class TuneXdot {
// Sets the tuning parameters and their possible values // Sets the tuning parameters and their possible values
static void SetParameters(cltune::Tuner &tuner, const size_t id) { static void SetParameters(cltune::Tuner &tuner, const size_t id) {
tuner.AddParameter(id, "WGS1", {32, 64, 128, 256, 512, 1024}); tuner.AddParameter(id, "WGS"+std::to_string(V), {32, 64, 128, 256, 512, 1024});
tuner.AddParameter(id, "WGS2", {32, 64, 128, 256, 512, 1024});
tuner.AddParameter(id, "VW", {1});
} }
// Sets the constraints and local memory size // Sets the constraints and local memory size
@ -68,16 +66,16 @@ class TuneXdot {
static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { } static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { }
// Sets the base thread configuration // Sets the base thread configuration
static std::vector<size_t> GlobalSize(const Arguments<T> &) { return {2}; } static std::vector<size_t> GlobalSize(const Arguments<T> &) { return (V==1) ? std::vector<size_t>{2*64} : std::vector<size_t>{1}; }
static std::vector<size_t> GlobalSizeRef(const Arguments<T> &) { return {2*64*64}; } static std::vector<size_t> GlobalSizeRef(const Arguments<T> &) { return (V==1) ? std::vector<size_t>{2*64*64} : std::vector<size_t>{64}; }
static std::vector<size_t> LocalSize() { return {1}; } static std::vector<size_t> LocalSize() { return {1}; }
static std::vector<size_t> LocalSizeRef() { return {64}; } static std::vector<size_t> LocalSizeRef() { return {64}; }
// Transforms the thread configuration based on the parameters // Transforms the thread configuration based on the parameters
using TransformVector = std::vector<std::vector<std::string>>; using TransformVector = std::vector<std::vector<std::string>>;
static TransformVector MulLocal() { return {{"WGS1"}}; } static TransformVector MulLocal() { return (V==1) ? TransformVector{{"WGS1"}} : TransformVector{{"WGS2"}}; }
static TransformVector DivLocal() { return {}; } static TransformVector DivLocal() { return {}; }
static TransformVector MulGlobal() { return {{"WGS1"},{"WGS2"}}; } static TransformVector MulGlobal() { return (V==1) ? TransformVector{{"WGS1"}} : TransformVector{{"WGS2"}}; }
static TransformVector DivGlobal() { return {}; } static TransformVector DivGlobal() { return {}; }
// Sets the kernel's arguments // Sets the kernel's arguments
@ -85,6 +83,7 @@ class TuneXdot {
std::vector<T> &x_vec, std::vector<T> &y_vec, std::vector<T> &x_vec, std::vector<T> &y_vec,
std::vector<T> &, std::vector<T> &, std::vector<T> &, std::vector<T> &, std::vector<T> &, std::vector<T> &,
std::vector<T> &temp) { std::vector<T> &temp) {
if (V == 1) {
tuner.AddArgumentScalar(static_cast<int>(args.n)); tuner.AddArgumentScalar(static_cast<int>(args.n));
tuner.AddArgumentInput(x_vec); tuner.AddArgumentInput(x_vec);
tuner.AddArgumentScalar(0); tuner.AddArgumentScalar(0);
@ -95,12 +94,18 @@ class TuneXdot {
tuner.AddArgumentInput(temp); // No output checking for the result - size varies tuner.AddArgumentInput(temp); // No output checking for the result - size varies
tuner.AddArgumentScalar(static_cast<int>(false)); tuner.AddArgumentScalar(static_cast<int>(false));
} }
else {
tuner.AddArgumentInput(temp);
tuner.AddArgumentInput(x_vec); // No output checking for the result - store somewhere
tuner.AddArgumentScalar(0);
}
}
// Describes how to compute the performance metrics // Describes how to compute the performance metrics
static size_t GetMetric(const Arguments<T> &args) { static size_t GetMetric(const Arguments<T> &args) {
return (2*args.n + 1) * GetBytes(args.precision); return (V==1) ? (2*args.n + 1) * GetBytes(args.precision) : 1 * GetBytes(args.precision);
} }
static std::string PerformanceUnit() { return "GB/s"; } static std::string PerformanceUnit() { return (V==1) ? "GB/s" : "N/A"; }
}; };
// ================================================================================================= // =================================================================================================
@ -110,15 +115,22 @@ class TuneXdot {
using float2 = clblast::float2; using float2 = clblast::float2;
using double2 = clblast::double2; using double2 = clblast::double2;
// Main function (not within the clblast namespace) // Function to tune a specific variation V (not within the clblast namespace)
int main(int argc, char *argv[]) { template <int V>
void StartVariation(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) { switch(clblast::GetPrecision(argc, argv)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneXdot<float>, float>(argc, argv); break; case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneXdot<float, V>, float>(argc, argv); break;
case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneXdot<double>, double>(argc, argv); break; case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneXdot<double, V>, double>(argc, argv); break;
case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneXdot<float2>, float2>(argc, argv); break; case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneXdot<float2, V>, float2>(argc, argv); break;
case clblast::Precision::kComplexDouble: clblast::Tuner<clblast::TuneXdot<double2>, double2>(argc, argv); break; case clblast::Precision::kComplexDouble: clblast::Tuner<clblast::TuneXdot<double2, V>, double2>(argc, argv); break;
} }
}
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
StartVariation<1>(argc, argv);
StartVariation<2>(argc, argv);
return 0; return 0;
} }

View file

@ -0,0 +1,28 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================
#include "correctness/testblas.h"
#include "routines/level1/xamax.h"
// Shortcuts to the clblast namespace
using float2 = clblast::float2;
using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
clblast::RunTests<clblast::TestXamax<float>, float, float>(argc, argv, false, "iSAMAX");
clblast::RunTests<clblast::TestXamax<double>, double, double>(argc, argv, true, "iDAMAX");
clblast::RunTests<clblast::TestXamax<float2>, float2, float2>(argc, argv, true, "iCAMAX");
clblast::RunTests<clblast::TestXamax<double2>, double2, double2>(argc, argv, true, "iZAMAX");
return 0;
}
// =================================================================================================

View file

@ -0,0 +1,28 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================
#include "correctness/testblas.h"
#include "routines/level1/xasum.h"
// Shortcuts to the clblast namespace
using float2 = clblast::float2;
using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
clblast::RunTests<clblast::TestXasum<float>, float, float>(argc, argv, false, "SASUM");
clblast::RunTests<clblast::TestXasum<double>, double, double>(argc, argv, true, "DASUM");
clblast::RunTests<clblast::TestXasum<float2>, float2, float2>(argc, argv, true, "ScASUM");
clblast::RunTests<clblast::TestXasum<double2>, double2, double2>(argc, argv, true, "DzASUM");
return 0;
}
// =================================================================================================

View file

@ -153,6 +153,8 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
auto ap_offsets = std::vector<size_t>{args.ap_offset}; auto ap_offsets = std::vector<size_t>{args.ap_offset};
auto dot_offsets = std::vector<size_t>{args.dot_offset}; auto dot_offsets = std::vector<size_t>{args.dot_offset};
auto nrm2_offsets = std::vector<size_t>{args.nrm2_offset}; auto nrm2_offsets = std::vector<size_t>{args.nrm2_offset};
auto asum_offsets = std::vector<size_t>{args.asum_offset};
auto imax_offsets = std::vector<size_t>{args.imax_offset};
auto alphas = std::vector<U>{args.alpha}; auto alphas = std::vector<U>{args.alpha};
auto betas = std::vector<U>{args.beta}; auto betas = std::vector<U>{args.beta};
auto x_sizes = std::vector<size_t>{args.x_size}; auto x_sizes = std::vector<size_t>{args.x_size};
@ -193,6 +195,8 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
if (option == kArgAPOffset) { ap_offsets = tester.kOffsets; } if (option == kArgAPOffset) { ap_offsets = tester.kOffsets; }
if (option == kArgDotOffset) { dot_offsets = tester.kOffsets; } if (option == kArgDotOffset) { dot_offsets = tester.kOffsets; }
if (option == kArgNrm2Offset) { nrm2_offsets = tester.kOffsets; } if (option == kArgNrm2Offset) { nrm2_offsets = tester.kOffsets; }
if (option == kArgAsumOffset) { asum_offsets = tester.kOffsets; }
if (option == kArgImaxOffset) { imax_offsets = tester.kOffsets; }
if (option == kArgAlpha) { alphas = tester.kAlphaValues; } if (option == kArgAlpha) { alphas = tester.kAlphaValues; }
if (option == kArgBeta) { betas = tester.kBetaValues; } if (option == kArgBeta) { betas = tester.kBetaValues; }
@ -233,6 +237,8 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
for (auto &ap_offset: ap_offsets) { r_args.ap_offset = ap_offset; for (auto &ap_offset: ap_offsets) { r_args.ap_offset = ap_offset;
for (auto &dot_offset: dot_offsets) { r_args.dot_offset = dot_offset; for (auto &dot_offset: dot_offsets) { r_args.dot_offset = dot_offset;
for (auto &nrm2_offset: nrm2_offsets) { r_args.nrm2_offset = nrm2_offset; for (auto &nrm2_offset: nrm2_offsets) { r_args.nrm2_offset = nrm2_offset;
for (auto &asum_offset: asum_offsets) { r_args.asum_offset = asum_offset;
for (auto &imax_offset: imax_offsets) { r_args.imax_offset = imax_offset;
for (auto &alpha: alphas) { r_args.alpha = alpha; for (auto &alpha: alphas) { r_args.alpha = alpha;
for (auto &beta: betas) { r_args.beta = beta; for (auto &beta: betas) { r_args.beta = beta;
C::SetSizes(r_args); C::SetSizes(r_args);
@ -257,6 +263,8 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
} }
} }
} }
}
}
// Creates the arguments vector for the invalid-buffer tests // Creates the arguments vector for the invalid-buffer tests
#ifdef CLBLAST_REF_CLBLAS #ifdef CLBLAST_REF_CLBLAS

View file

@ -80,8 +80,11 @@ Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const GetMetric
if (o == kArgCOffset) { args.c_offset = GetArgument(argc, argv, help, kArgCOffset, size_t{0}); } if (o == kArgCOffset) { args.c_offset = GetArgument(argc, argv, help, kArgCOffset, size_t{0}); }
if (o == kArgAPOffset) { args.ap_offset= GetArgument(argc, argv, help, kArgAPOffset, size_t{0}); } if (o == kArgAPOffset) { args.ap_offset= GetArgument(argc, argv, help, kArgAPOffset, size_t{0}); }
// Dot arguments // Scalar result arguments
if (o == kArgDotOffset) { args.dot_offset = GetArgument(argc, argv, help, kArgDotOffset, size_t{0}); } if (o == kArgDotOffset) { args.dot_offset = GetArgument(argc, argv, help, kArgDotOffset, size_t{0}); }
if (o == kArgNrm2Offset) { args.nrm2_offset = GetArgument(argc, argv, help, kArgNrm2Offset, size_t{0}); }
if (o == kArgAsumOffset) { args.asum_offset = GetArgument(argc, argv, help, kArgAsumOffset, size_t{0}); }
if (o == kArgImaxOffset) { args.imax_offset = GetArgument(argc, argv, help, kArgImaxOffset, size_t{0}); }
// Scalar values // Scalar values
if (o == kArgAlpha) { args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar<U>()); } if (o == kArgAlpha) { args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar<U>()); }
@ -292,6 +295,8 @@ void Client<T,U>::PrintTableRow(const Arguments<U>& args,
else if (o == kArgAPOffset) { integers.push_back(args.ap_offset); } else if (o == kArgAPOffset) { integers.push_back(args.ap_offset); }
else if (o == kArgDotOffset) {integers.push_back(args.dot_offset); } else if (o == kArgDotOffset) {integers.push_back(args.dot_offset); }
else if (o == kArgNrm2Offset){integers.push_back(args.nrm2_offset); } else if (o == kArgNrm2Offset){integers.push_back(args.nrm2_offset); }
else if (o == kArgAsumOffset){integers.push_back(args.asum_offset); }
else if (o == kArgImaxOffset){integers.push_back(args.imax_offset); }
} }
auto strings = std::vector<std::string>{}; auto strings = std::vector<std::string>{};
for (auto &o: options_) { for (auto &o: options_) {

View file

@ -0,0 +1,35 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================
#include "performance/client.h"
#include "routines/level1/xamax.h"
// Shortcuts to the clblast namespace
using float2 = clblast::float2;
using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kSingle:
clblast::RunClient<clblast::TestXamax<float>, float, float>(argc, argv); break;
case clblast::Precision::kDouble:
clblast::RunClient<clblast::TestXamax<double>, double, double>(argc, argv); break;
case clblast::Precision::kComplexSingle:
clblast::RunClient<clblast::TestXamax<float2>, float2, float2>(argc, argv); break;
case clblast::Precision::kComplexDouble:
clblast::RunClient<clblast::TestXamax<double2>, double2, double2>(argc, argv); break;
}
return 0;
}
// =================================================================================================

View file

@ -0,0 +1,35 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================
#include "performance/client.h"
#include "routines/level1/xasum.h"
// Shortcuts to the clblast namespace
using float2 = clblast::float2;
using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kSingle:
clblast::RunClient<clblast::TestXasum<float>, float, float>(argc, argv); break;
case clblast::Precision::kDouble:
clblast::RunClient<clblast::TestXasum<double>, double, double>(argc, argv); break;
case clblast::Precision::kComplexSingle:
clblast::RunClient<clblast::TestXasum<float2>, float2, float2>(argc, argv); break;
case clblast::Precision::kComplexDouble:
clblast::RunClient<clblast::TestXasum<double2>, double2, double2>(argc, argv); break;
}
return 0;
}
// =================================================================================================

View file

@ -0,0 +1,139 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements a class with static methods to describe the Xamax routine. Examples of
// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
// static methods are used by the correctness tester and the performance tester.
//
// =================================================================================================
#ifndef CLBLAST_TEST_ROUTINES_XAMAX_H_
#define CLBLAST_TEST_ROUTINES_XAMAX_H_
#include <vector>
#include <string>
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class TestXamax {
public:
// The BLAS level: 1, 2, or 3
static size_t BLASLevel() { return 1; }
// The list of arguments relevant for this routine
static std::vector<std::string> GetOptions() {
return {kArgN,
kArgXInc,
kArgXOffset, kArgImaxOffset};
}
// Describes how to obtain the sizes of the buffers
static size_t GetSizeX(const Arguments<T> &args) {
return args.n * args.x_inc + args.x_offset;
}
static size_t GetSizeImax(const Arguments<T> &args) {
return 1 + args.imax_offset;
}
// Describes how to set the sizes of all the buffers
static void SetSizes(Arguments<T> &args) {
args.x_size = GetSizeX(args);
args.scalar_size = GetSizeImax(args);
}
// Describes what the default values of the leading dimensions of the matrices are
static size_t DefaultLDA(const Arguments<T> &) { return 1; } // N/A for this routine
static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
// Describes which transpose options are relevant for this routine
using Transposes = std::vector<Transpose>;
static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Amax<T>(args.n,
buffers.scalar(), args.imax_offset,
buffers.x_vec(), args.x_offset, args.x_inc,
&queue_plain, &event);
clWaitForEvents(1, &event);
return status;
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CLBLAS
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = clblasXamax<T>(args.n,
buffers.scalar(), args.imax_offset,
buffers.x_vec(), args.x_offset, args.x_inc,
1, &queue_plain, 0, nullptr, &event);
clWaitForEvents(1, &event);
return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0));
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
buffers.scalar.Read(queue, args.scalar_size, scalar_cpu);
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
cblasXamax(args.n,
scalar_cpu, args.imax_offset,
x_vec_cpu, args.x_offset, args.x_inc);
buffers.scalar.Write(queue, args.scalar_size, scalar_cpu);
return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
std::vector<T> result(args.scalar_size, static_cast<T>(0));
buffers.scalar.Read(queue, args.scalar_size, result);
return result;
}
// Describes how to compute the indices of the result buffer
static size_t ResultID1(const Arguments<T> &) { return 1; } // N/A for this routine
static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
static size_t GetResultIndex(const Arguments<T> &args, const size_t, const size_t) {
return args.imax_offset;
}
// Describes how to compute performance metrics
static size_t GetFlops(const Arguments<T> &args) {
return args.n;
}
static size_t GetBytes(const Arguments<T> &args) {
return ((args.n) + 1) * sizeof(T);
}
};
// =================================================================================================
} // namespace clblast
// CLBLAST_TEST_ROUTINES_XAMAX_H_
#endif

View file

@ -0,0 +1,139 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements a class with static methods to describe the Xasum routine. Examples of
// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
// static methods are used by the correctness tester and the performance tester.
//
// =================================================================================================
#ifndef CLBLAST_TEST_ROUTINES_XASUM_H_
#define CLBLAST_TEST_ROUTINES_XASUM_H_
#include <vector>
#include <string>
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class TestXasum {
public:
// The BLAS level: 1, 2, or 3
static size_t BLASLevel() { return 1; }
// The list of arguments relevant for this routine
static std::vector<std::string> GetOptions() {
return {kArgN,
kArgXInc,
kArgXOffset, kArgAsumOffset};
}
// Describes how to obtain the sizes of the buffers
static size_t GetSizeX(const Arguments<T> &args) {
return args.n * args.x_inc + args.x_offset;
}
static size_t GetSizeAsum(const Arguments<T> &args) {
return 1 + args.asum_offset;
}
// Describes how to set the sizes of all the buffers
static void SetSizes(Arguments<T> &args) {
args.x_size = GetSizeX(args);
args.scalar_size = GetSizeAsum(args);
}
// Describes what the default values of the leading dimensions of the matrices are
static size_t DefaultLDA(const Arguments<T> &) { return 1; } // N/A for this routine
static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
// Describes which transpose options are relevant for this routine
using Transposes = std::vector<Transpose>;
static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Asum<T>(args.n,
buffers.scalar(), args.asum_offset,
buffers.x_vec(), args.x_offset, args.x_inc,
&queue_plain, &event);
clWaitForEvents(1, &event);
return status;
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CLBLAS
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = clblasXasum<T>(args.n,
buffers.scalar(), args.asum_offset,
buffers.x_vec(), args.x_offset, args.x_inc,
1, &queue_plain, 0, nullptr, &event);
clWaitForEvents(1, &event);
return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0));
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
buffers.scalar.Read(queue, args.scalar_size, scalar_cpu);
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
cblasXasum(args.n,
scalar_cpu, args.asum_offset,
x_vec_cpu, args.x_offset, args.x_inc);
buffers.scalar.Write(queue, args.scalar_size, scalar_cpu);
return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
std::vector<T> result(args.scalar_size, static_cast<T>(0));
buffers.scalar.Read(queue, args.scalar_size, result);
return result;
}
// Describes how to compute the indices of the result buffer
static size_t ResultID1(const Arguments<T> &) { return 1; } // N/A for this routine
static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
static size_t GetResultIndex(const Arguments<T> &args, const size_t, const size_t) {
return args.asum_offset;
}
// Describes how to compute performance metrics
static size_t GetFlops(const Arguments<T> &args) {
return args.n;
}
static size_t GetBytes(const Arguments<T> &args) {
return ((args.n) + 1) * sizeof(T);
}
};
// =================================================================================================
} // namespace clblast
// CLBLAST_TEST_ROUTINES_XASUM_H_
#endif

View file

@ -345,6 +345,58 @@ void cblasXnrm2(const size_t n,
reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc)); reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
} }
// Forwards the Netlib BLAS calls for SASUM/DASUM/ScASUM/DzASUM
void cblasXasum(const size_t n,
std::vector<float>& asum_buffer, const size_t asum_offset,
const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc) {
asum_buffer[asum_offset] = cblas_sasum(n,
&x_buffer[x_offset], static_cast<int>(x_inc));
}
void cblasXasum(const size_t n,
std::vector<double>& asum_buffer, const size_t asum_offset,
const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc) {
asum_buffer[asum_offset] = cblas_dasum(n,
&x_buffer[x_offset], static_cast<int>(x_inc));
}
void cblasXasum(const size_t n,
std::vector<float2>& asum_buffer, const size_t asum_offset,
const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc) {
asum_buffer[asum_offset] = cblas_scasum(n,
reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
}
void cblasXasum(const size_t n,
std::vector<double2>& asum_buffer, const size_t asum_offset,
const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc) {
asum_buffer[asum_offset] = cblas_dzasum(n,
reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
}
// Forwards the Netlib BLAS calls for iSAMAX/iDAMAX/iCAMAX/iZAMAX
void cblasXamax(const size_t n,
std::vector<float>& imax_buffer, const size_t imax_offset,
const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc) {
imax_buffer[imax_offset] = cblas_isamax(n,
&x_buffer[x_offset], static_cast<int>(x_inc));
}
void cblasXamax(const size_t n,
std::vector<double>& imax_buffer, const size_t imax_offset,
const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc) {
imax_buffer[imax_offset] = cblas_idamax(n,
&x_buffer[x_offset], static_cast<int>(x_inc));
}
void cblasXamax(const size_t n,
std::vector<float2>& imax_buffer, const size_t imax_offset,
const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc) {
imax_buffer[imax_offset] = cblas_icamax(n,
reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
}
void cblasXamax(const size_t n,
std::vector<double2>& imax_buffer, const size_t imax_offset,
const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc) {
imax_buffer[imax_offset] = cblas_izamax(n,
reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
}
// ================================================================================================= // =================================================================================================
// BLAS level-2 (matrix-vector) routines // BLAS level-2 (matrix-vector) routines
// ================================================================================================= // =================================================================================================

View file

@ -558,6 +558,142 @@ clblasStatus clblasXnrm2<double2>(const size_t n,
num_queues, queues, num_wait_events, wait_events, events); num_queues, queues, num_wait_events, wait_events, events);
} }
// Forwards the clBLAS calls for SASUM/DASUM/ScASUM/DzASUM
template <typename T>
clblasStatus clblasXasum(const size_t n,
cl_mem asum_buffer, const size_t asum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_uint num_queues, cl_command_queue *queues,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events);
template <>
clblasStatus clblasXasum<float>(const size_t n,
cl_mem asum_buffer, const size_t asum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_uint num_queues, cl_command_queue *queues,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
auto scratch_buffer = Buffer<float>(context, n);
return clblasSasum(n,
asum_buffer, asum_offset,
x_buffer, x_offset, static_cast<int>(x_inc),
scratch_buffer(),
num_queues, queues, num_wait_events, wait_events, events);
}
template <>
clblasStatus clblasXasum<double>(const size_t n,
cl_mem asum_buffer, const size_t asum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_uint num_queues, cl_command_queue *queues,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
auto scratch_buffer = Buffer<double>(context, n);
return clblasDasum(n,
asum_buffer, asum_offset,
x_buffer, x_offset, static_cast<int>(x_inc),
scratch_buffer(),
num_queues, queues, num_wait_events, wait_events, events);
}
template <>
clblasStatus clblasXasum<float2>(const size_t n,
cl_mem asum_buffer, const size_t asum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_uint num_queues, cl_command_queue *queues,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
auto scratch_buffer = Buffer<float2>(context, n);
return clblasScasum(n,
asum_buffer, asum_offset,
x_buffer, x_offset, static_cast<int>(x_inc),
scratch_buffer(),
num_queues, queues, num_wait_events, wait_events, events);
}
template <>
clblasStatus clblasXasum<double2>(const size_t n,
cl_mem asum_buffer, const size_t asum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_uint num_queues, cl_command_queue *queues,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
auto scratch_buffer = Buffer<double2>(context, n);
return clblasDzasum(n,
asum_buffer, asum_offset,
x_buffer, x_offset, static_cast<int>(x_inc),
scratch_buffer(),
num_queues, queues, num_wait_events, wait_events, events);
}
// Forwards the clBLAS calls for iSAMAX/iDAMAX/iCAMAX/iZAMAX
template <typename T>
clblasStatus clblasXamax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_uint num_queues, cl_command_queue *queues,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events);
template <>
clblasStatus clblasXamax<float>(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_uint num_queues, cl_command_queue *queues,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
auto scratch_buffer = Buffer<float>(context, 2*n);
return clblasiSamax(n,
imax_buffer, imax_offset,
x_buffer, x_offset, static_cast<int>(x_inc),
scratch_buffer(),
num_queues, queues, num_wait_events, wait_events, events);
}
template <>
clblasStatus clblasXamax<double>(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_uint num_queues, cl_command_queue *queues,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
auto scratch_buffer = Buffer<double>(context, 2*n);
return clblasiDamax(n,
imax_buffer, imax_offset,
x_buffer, x_offset, static_cast<int>(x_inc),
scratch_buffer(),
num_queues, queues, num_wait_events, wait_events, events);
}
template <>
clblasStatus clblasXamax<float2>(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_uint num_queues, cl_command_queue *queues,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
auto scratch_buffer = Buffer<float2>(context, 2*n);
return clblasiCamax(n,
imax_buffer, imax_offset,
x_buffer, x_offset, static_cast<int>(x_inc),
scratch_buffer(),
num_queues, queues, num_wait_events, wait_events, events);
}
template <>
clblasStatus clblasXamax<double2>(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_uint num_queues, cl_command_queue *queues,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto queue = Queue(queues[0]);
auto context = queue.GetContext();
auto scratch_buffer = Buffer<double2>(context, 2*n);
return clblasiZamax(n,
imax_buffer, imax_offset,
x_buffer, x_offset, static_cast<int>(x_inc),
scratch_buffer(),
num_queues, queues, num_wait_events, wait_events, events);
}
// ================================================================================================= // =================================================================================================
// BLAS level-2 (matrix-vector) routines // BLAS level-2 (matrix-vector) routines
// ================================================================================================= // =================================================================================================