From fda335ddf21d174101e3d72a10f2eba8b4faefcf Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 13 Mar 2016 11:09:02 +0100 Subject: [PATCH 01/60] Prepared the changelog for the next release --- CHANGELOG | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index f0648ebc..5a41f3c9 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,7 @@ +Development version (next release) +- + Version 0.6.0 - Added support for MSVC (Visual Studio) 2015 - Added tuned parameters for various devices (see README) From 918797735da29f102f027b9dc19dccc751950c76 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Mon, 14 Mar 2016 22:55:22 +0100 Subject: [PATCH 02/60] Made the library thread-safe by guarding the kernel cache with a mutex --- CHANGELOG | 2 +- include/internal/routine.h | 4 +++- src/routine.cc | 24 +++++++++++++++++++----- 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 5a41f3c9..1606168c 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,6 @@ Development version (next release) -- +- Made the library thread-safe Version 0.6.0 - Added support for MSVC (Visual Studio) 2015 diff --git a/include/internal/routine.h b/include/internal/routine.h index b7c06a97..5f5b8211 100644 --- a/include/internal/routine.h +++ b/include/internal/routine.h @@ -18,6 +18,7 @@ #include #include +#include #include "internal/utilities.h" #include "internal/database.h" @@ -46,8 +47,9 @@ class Routine { } }; - // The actual cache, implemented as a vector of the above data-type + // The actual cache, implemented as a vector of the above data-type, and its mutex static std::vector program_cache_; + static std::mutex program_cache_mutex_; // Helper functions which check for errors in the status code static constexpr bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); } diff --git a/src/routine.cc b/src/routine.cc index 2978c94a..ff7b3e1a 100644 --- a/src/routine.cc +++ b/src/routine.cc @@ -11,14 +11,18 @@ // // 
================================================================================================= +#include +#include +#include + #include "internal/routine.h" namespace clblast { // ================================================================================================= -// The cache of compiled OpenCL programs -template -std::vector::ProgramCache> Routine::program_cache_; +// The cache of compiled OpenCL programs and its mutex for thread safety +template std::vector::ProgramCache> Routine::program_cache_; +template std::mutex Routine::program_cache_mutex_; // Constructor: not much here, because no status codes can be returned template @@ -97,8 +101,10 @@ StatusCode Routine::SetUp() { } if (build_status == BuildStatus::kInvalid) { return StatusCode::kInvalidBinary; } - // Store the compiled program in the cache + // Store the compiled program in the cache (atomic for thread-safety) + program_cache_mutex_.lock(); program_cache_.push_back({program, device_name_, precision_, routine_name_}); + program_cache_mutex_.unlock(); } catch (...) { return StatusCode::kBuildProgramFailure; } } @@ -367,20 +373,28 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t // otherwise. 
template const Program& Routine::GetProgramFromCache() const { + program_cache_mutex_.lock(); for (auto &cached_program: program_cache_) { if (cached_program.MatchInCache(device_name_, precision_, routine_name_)) { + program_cache_mutex_.unlock(); return cached_program.program; } } + program_cache_mutex_.unlock(); throw std::runtime_error("Internal CLBlast error: Expected program in cache, but found none."); } // Queries the cache to see whether or not the compiled kernel is already there template bool Routine::ProgramIsInCache() const { + program_cache_mutex_.lock(); for (auto &cached_program: program_cache_) { - if (cached_program.MatchInCache(device_name_, precision_, routine_name_)) { return true; } + if (cached_program.MatchInCache(device_name_, precision_, routine_name_)) { + program_cache_mutex_.unlock(); + return true; + } } + program_cache_mutex_.unlock(); return false; } From d9356954176eba875138e38b5cf0fb131e1ac925 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 19 Mar 2016 11:09:09 +0100 Subject: [PATCH 03/60] Added __declspec(dllexport) to create a DLL on Windows --- include/internal/public_api.h | 32 + scripts/generator/generator.py | 10 +- scripts/generator/routine.py | 4 +- src/clblast.cc | 1485 ++++++++++++++++---------------- src/clblast_c.cc | 1485 ++++++++++++++++---------------- 5 files changed, 1525 insertions(+), 1491 deletions(-) create mode 100644 include/internal/public_api.h diff --git a/include/internal/public_api.h b/include/internal/public_api.h new file mode 100644 index 00000000..ed3f1e29 --- /dev/null +++ b/include/internal/public_api.h @@ -0,0 +1,32 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// This file provides macro's to define the public API. This is needed when building a Windows DLL. +// +// ================================================================================================= + +#ifndef CLBLAST_PUBLIC_API_H_ +#define CLBLAST_PUBLIC_API_H_ + +namespace clblast { +// ================================================================================================= + +// Exports library functions under Windows when building a DLL. See also: +// https://msdn.microsoft.com/en-us/library/a90k134d.aspx +#ifdef _WIN32 + #define PUBLIC_API __declspec(dllexport) +#else + #define PUBLIC_API +#endif + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_PUBLIC_API_H_ +#endif \ No newline at end of file diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 99f326cd..867fe520 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -175,8 +175,8 @@ def clblast_cc(routines): result += " return StatusCode::kNotImplemented;\n" result += "}\n" for flavour in routine.flavours: - indent2 = " "*(23 + routine.Length() + len(flavour.template)) - result += "template StatusCode "+routine.name.capitalize()+"<"+flavour.template+">(" + indent2 = " "*(34 + routine.Length() + len(flavour.template)) + result += "template StatusCode PUBLIC_API "+routine.name.capitalize()+"<"+flavour.template+">(" result += (",\n"+indent2).join([a for a in routine.ArgumentsType(flavour)]) result += ",\n"+indent2+"cl_command_queue*, cl_event*);\n" return result @@ -189,7 +189,7 @@ def clblast_c_h(routines): for routine in routines: result += "\n// "+routine.description+": "+routine.ShortNames()+"\n" for flavour in routine.flavours: - result += routine.RoutineHeaderC(flavour, 20)+";\n" + result += routine.RoutineHeaderC(flavour, 20, "")+";\n" return result # The C API implementation (.cc) @@ -200,7 
+200,7 @@ def clblast_c_cc(routines): for flavour in routine.flavours: template = "<"+flavour.template+">" if routine.NoScalars() else "" indent = " "*(26 + routine.Length() + len(template)) - result += routine.RoutineHeaderC(flavour, 20)+" {\n" + result += routine.RoutineHeaderC(flavour, 31, " PUBLIC_API")+" {\n" result += " auto status = clblast::"+routine.name.capitalize()+template+"(" result += (",\n"+indent).join([a for a in routine.ArgumentsCast(flavour, indent)]) result += ",\n"+indent+"queue, event);" @@ -247,7 +247,7 @@ files = [ path_clblast+"/src/clblast_c.cc", path_clblast+"/test/wrapper_clblas.h", ] -header_lines = [84, 63, 80, 24, 22] +header_lines = [84, 64, 80, 25, 22] footer_lines = [6, 3, 5, 2, 6] # Checks whether the command-line arguments are valid; exists otherwise diff --git a/scripts/generator/routine.py b/scripts/generator/routine.py index df4dd019..60b9fcc5 100644 --- a/scripts/generator/routine.py +++ b/scripts/generator/routine.py @@ -326,9 +326,9 @@ class Routine(): return result # As above, but now for C - def RoutineHeaderC(self, flavour, spaces): + def RoutineHeaderC(self, flavour, spaces, extra_qualifier): indent = " "*(spaces + self.Length()) - result = "StatusCode CLBlast"+flavour.name+self.name+"(" + result = "StatusCode"+extra_qualifier+" CLBlast"+flavour.name+self.name+"(" result += (",\n"+indent).join([a for a in self.ArgumentsDef(flavour)]) result += ",\n"+indent+"cl_command_queue* queue, cl_event* event)" return result diff --git a/src/clblast.cc b/src/clblast.cc index 3695aa02..e7f2477f 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -16,6 +16,7 @@ #include #include "clblast.h" +#include "internal/public_api.h" // BLAS level-1 includes #include "internal/routines/level1/xswap.h" @@ -81,22 +82,22 @@ StatusCode Swap(const size_t n, Buffer(x_buffer), x_offset, x_inc, Buffer(y_buffer), y_offset, y_inc); } -template StatusCode Swap(const size_t, - cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - 
cl_command_queue*, cl_event*); -template StatusCode Swap(const size_t, - cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Swap(const size_t, - cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Swap(const size_t, - cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Swap(const size_t, + cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Swap(const size_t, + cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Swap(const size_t, + cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Swap(const size_t, + cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL template @@ -113,22 +114,22 @@ StatusCode Scal(const size_t n, alpha, Buffer(x_buffer), x_offset, x_inc); } -template StatusCode Scal(const size_t, - const float, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Scal(const size_t, - const double, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Scal(const size_t, - const float2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Scal(const size_t, - const double2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Scal(const size_t, + const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Scal(const size_t, + const double, + cl_mem, const 
size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Scal(const size_t, + const float2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Scal(const size_t, + const double2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY template @@ -145,22 +146,22 @@ StatusCode Copy(const size_t n, Buffer(x_buffer), x_offset, x_inc, Buffer(y_buffer), y_offset, y_inc); } -template StatusCode Copy(const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Copy(const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Copy(const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Copy(const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Copy(const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Copy(const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Copy(const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Copy(const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY template @@ -179,26 +180,26 @@ StatusCode Axpy(const size_t n, Buffer(x_buffer), x_offset, x_inc, Buffer(y_buffer), y_offset, y_inc); } -template 
StatusCode Axpy(const size_t, - const float, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Axpy(const size_t, - const double, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Axpy(const size_t, - const float2, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Axpy(const size_t, - const double2, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Axpy(const size_t, + const float, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Axpy(const size_t, + const double, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Axpy(const size_t, + const float2, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Axpy(const size_t, + const double2, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // Dot product of two vectors: SDOT/DDOT template @@ -217,16 +218,16 @@ StatusCode Dot(const size_t n, Buffer(x_buffer), x_offset, x_inc, Buffer(y_buffer), y_offset, y_inc); } -template StatusCode Dot(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Dot(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Dot(const size_t, + 
cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Dot(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // Dot product of two complex vectors: CDOTU/ZDOTU template @@ -245,16 +246,16 @@ StatusCode Dotu(const size_t n, Buffer(x_buffer), x_offset, x_inc, Buffer(y_buffer), y_offset, y_inc); } -template StatusCode Dotu(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Dotu(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Dotu(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Dotu(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC template @@ -273,16 +274,16 @@ StatusCode Dotc(const size_t n, Buffer(x_buffer), x_offset, x_inc, Buffer(y_buffer), y_offset, y_inc); } -template StatusCode Dotc(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Dotc(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Dotc(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const cl_mem, 
const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Dotc(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // ================================================================================================= // BLAS level-2 (matrix-vector) routines @@ -311,38 +312,38 @@ StatusCode Gemv(const Layout layout, const Transpose a_transpose, beta, Buffer(y_buffer), y_offset, y_inc); } -template StatusCode Gemv(const Layout, const Transpose, - const size_t, const size_t, - const float, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const float, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Gemv(const Layout, const Transpose, - const size_t, const size_t, - const double, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const double, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Gemv(const Layout, const Transpose, - const size_t, const size_t, - const float2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const float2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Gemv(const Layout, const Transpose, - const size_t, const size_t, - const double2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const double2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, + const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, + 
const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV template @@ -367,38 +368,38 @@ StatusCode Gbmv(const Layout layout, const Transpose a_transpose, beta, Buffer(y_buffer), y_offset, y_inc); } -template StatusCode Gbmv(const Layout, const Transpose, - const size_t, const size_t, const size_t, const size_t, - const float, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const float, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Gbmv(const Layout, const Transpose, - const size_t, const size_t, const size_t, const size_t, - const double, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const double, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Gbmv(const Layout, const Transpose, - const size_t, const size_t, const size_t, const size_t, - const float2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const float2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Gbmv(const Layout, const Transpose, - const 
size_t, const size_t, const size_t, const size_t, - const double2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const double2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, + const size_t, const size_t, const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // Hermitian matrix-vector multiplication: CHEMV/ZHEMV template @@ -423,22 +424,22 @@ StatusCode Hemv(const Layout layout, const Triangle triangle, beta, Buffer(y_buffer), y_offset, y_inc); } -template StatusCode Hemv(const Layout, const Triangle, - const size_t, - const float2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const float2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Hemv(const 
Layout, const Triangle, - const size_t, - const double2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const double2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Hemv(const Layout, const Triangle, + const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Hemv(const Layout, const Triangle, + const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV template @@ -463,22 +464,22 @@ StatusCode Hbmv(const Layout layout, const Triangle triangle, beta, Buffer(y_buffer), y_offset, y_inc); } -template StatusCode Hbmv(const Layout, const Triangle, - const size_t, const size_t, - const float2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const float2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Hbmv(const Layout, const Triangle, - const size_t, const size_t, - const double2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const double2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Hbmv(const Layout, const Triangle, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Hbmv(const Layout, const Triangle, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, 
const size_t, const size_t, + const double2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV template @@ -503,22 +504,22 @@ StatusCode Hpmv(const Layout layout, const Triangle triangle, beta, Buffer(y_buffer), y_offset, y_inc); } -template StatusCode Hpmv(const Layout, const Triangle, - const size_t, - const float2, - const cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - const float2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Hpmv(const Layout, const Triangle, - const size_t, - const double2, - const cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - const double2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Hpmv(const Layout, const Triangle, + const size_t, + const float2, + const cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const float2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Hpmv(const Layout, const Triangle, + const size_t, + const double2, + const cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const double2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // Symmetric matrix-vector multiplication: SSYMV/DSYMV template @@ -543,22 +544,22 @@ StatusCode Symv(const Layout layout, const Triangle triangle, beta, Buffer(y_buffer), y_offset, y_inc); } -template StatusCode Symv(const Layout, const Triangle, - const size_t, - const float, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const float, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Symv(const Layout, const Triangle, - const size_t, - const double, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const double, - cl_mem, const 
size_t, const size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Symv(const Layout, const Triangle, + const size_t, + const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Symv(const Layout, const Triangle, + const size_t, + const double, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // Symmetric banded matrix-vector multiplication: SSBMV/DSBMV template @@ -583,22 +584,22 @@ StatusCode Sbmv(const Layout layout, const Triangle triangle, beta, Buffer(y_buffer), y_offset, y_inc); } -template StatusCode Sbmv(const Layout, const Triangle, - const size_t, const size_t, - const float, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const float, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Sbmv(const Layout, const Triangle, - const size_t, const size_t, - const double, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const double, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Sbmv(const Layout, const Triangle, + const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Sbmv(const Layout, const Triangle, + const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // Symmetric packed matrix-vector multiplication: SSPMV/DSPMV template @@ 
-623,22 +624,22 @@ StatusCode Spmv(const Layout layout, const Triangle triangle, beta, Buffer(y_buffer), y_offset, y_inc); } -template StatusCode Spmv(const Layout, const Triangle, - const size_t, - const float, - const cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - const float, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Spmv(const Layout, const Triangle, - const size_t, - const double, - const cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - const double, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Spmv(const Layout, const Triangle, + const size_t, + const float, + const cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Spmv(const Layout, const Triangle, + const size_t, + const double, + const cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV template @@ -657,26 +658,26 @@ StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_ Buffer(a_buffer), a_offset, a_ld, Buffer(x_buffer), x_offset, x_inc); } -template StatusCode Trmv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Trmv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Trmv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const 
size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Trmv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV template @@ -695,26 +696,26 @@ StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_ Buffer(a_buffer), a_offset, a_ld, Buffer(x_buffer), x_offset, x_inc); } -template StatusCode Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Tbmv(const Layout, const Triangle, const Transpose, 
const Diagonal, - const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV template @@ -733,26 +734,26 @@ StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_ Buffer(ap_buffer), ap_offset, Buffer(x_buffer), x_offset, x_inc); } -template StatusCode Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, - cl_mem, const size_t, 
const size_t, - cl_command_queue*, cl_event*); -template StatusCode Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV template @@ -763,26 +764,26 @@ StatusCode Trsv(const Layout, const Triangle, const Transpose, const Diagonal, cl_command_queue*, cl_event*) { return StatusCode::kNotImplemented; } -template StatusCode Trsv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Trsv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - 
cl_command_queue*, cl_event*); -template StatusCode Trsv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Trsv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV template @@ -793,26 +794,26 @@ StatusCode Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, cl_command_queue*, cl_event*) { return StatusCode::kNotImplemented; } -template StatusCode Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, const 
size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV template @@ -823,26 +824,26 @@ StatusCode Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, cl_command_queue*, cl_event*) { return StatusCode::kNotImplemented; } -template StatusCode Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, - cl_mem, const size_t, const size_t, 
- cl_command_queue*, cl_event*); -template StatusCode Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, + const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // General rank-1 matrix update: SGER/DGER template @@ -865,20 +866,20 @@ StatusCode Ger(const Layout layout, Buffer(y_buffer), y_offset, y_inc, Buffer(a_buffer), a_offset, a_ld); } -template StatusCode Ger(const Layout, - const size_t, const size_t, - const float, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Ger(const Layout, - const size_t, 
const size_t, - const double, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Ger(const Layout, + const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Ger(const Layout, + const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // General rank-1 complex matrix update: CGERU/ZGERU template @@ -901,20 +902,20 @@ StatusCode Geru(const Layout layout, Buffer(y_buffer), y_offset, y_inc, Buffer(a_buffer), a_offset, a_ld); } -template StatusCode Geru(const Layout, - const size_t, const size_t, - const float2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Geru(const Layout, - const size_t, const size_t, - const double2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Geru(const Layout, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Geru(const Layout, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // General rank-1 complex conjugated matrix update: CGERC/ZGERC template @@ -937,20 +938,20 @@ 
StatusCode Gerc(const Layout layout, Buffer(y_buffer), y_offset, y_inc, Buffer(a_buffer), a_offset, a_ld); } -template StatusCode Gerc(const Layout, - const size_t, const size_t, - const float2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Gerc(const Layout, - const size_t, const size_t, - const double2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Gerc(const Layout, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Gerc(const Layout, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // Hermitian rank-1 matrix update: CHER/ZHER template @@ -971,18 +972,18 @@ StatusCode Her(const Layout layout, const Triangle triangle, Buffer>(x_buffer), x_offset, x_inc, Buffer>(a_buffer), a_offset, a_ld); } -template StatusCode Her(const Layout, const Triangle, - const size_t, - const float, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Her(const Layout, const Triangle, - const size_t, - const double, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Her(const Layout, const Triangle, + const size_t, + const float, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Her(const Layout, const 
Triangle, + const size_t, + const double, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // Hermitian packed rank-1 matrix update: CHPR/ZHPR template @@ -1003,18 +1004,18 @@ StatusCode Hpr(const Layout layout, const Triangle triangle, Buffer>(x_buffer), x_offset, x_inc, Buffer>(ap_buffer), ap_offset); } -template StatusCode Hpr(const Layout, const Triangle, - const size_t, - const float, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Hpr(const Layout, const Triangle, - const size_t, - const double, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Hpr(const Layout, const Triangle, + const size_t, + const float, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Hpr(const Layout, const Triangle, + const size_t, + const double, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); // Hermitian rank-2 matrix update: CHER2/ZHER2 template @@ -1037,20 +1038,20 @@ StatusCode Her2(const Layout layout, const Triangle triangle, Buffer(y_buffer), y_offset, y_inc, Buffer(a_buffer), a_offset, a_ld); } -template StatusCode Her2(const Layout, const Triangle, - const size_t, - const float2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Her2(const Layout, const Triangle, - const size_t, - const double2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Her2(const Layout, const Triangle, + const size_t, + const float2, + const cl_mem, const size_t, const size_t, + 
const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Her2(const Layout, const Triangle, + const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 template @@ -1073,20 +1074,20 @@ StatusCode Hpr2(const Layout layout, const Triangle triangle, Buffer(y_buffer), y_offset, y_inc, Buffer(ap_buffer), ap_offset); } -template StatusCode Hpr2(const Layout, const Triangle, - const size_t, - const float2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Hpr2(const Layout, const Triangle, - const size_t, - const double2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Hpr2(const Layout, const Triangle, + const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Hpr2(const Layout, const Triangle, + const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); // Symmetric rank-1 matrix update: SSYR/DSYR template @@ -1107,18 +1108,18 @@ StatusCode Syr(const Layout layout, const Triangle triangle, Buffer(x_buffer), x_offset, x_inc, Buffer(a_buffer), a_offset, a_ld); } -template StatusCode Syr(const Layout, const Triangle, - const size_t, - const float, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Syr(const Layout, 
const Triangle, - const size_t, - const double, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Syr(const Layout, const Triangle, + const size_t, + const float, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Syr(const Layout, const Triangle, + const size_t, + const double, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // Symmetric packed rank-1 matrix update: SSPR/DSPR template @@ -1139,18 +1140,18 @@ StatusCode Spr(const Layout layout, const Triangle triangle, Buffer(x_buffer), x_offset, x_inc, Buffer(ap_buffer), ap_offset); } -template StatusCode Spr(const Layout, const Triangle, - const size_t, - const float, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Spr(const Layout, const Triangle, - const size_t, - const double, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Spr(const Layout, const Triangle, + const size_t, + const float, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Spr(const Layout, const Triangle, + const size_t, + const double, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); // Symmetric rank-2 matrix update: SSYR2/DSYR2 template @@ -1173,20 +1174,20 @@ StatusCode Syr2(const Layout layout, const Triangle triangle, Buffer(y_buffer), y_offset, y_inc, Buffer(a_buffer), a_offset, a_ld); } -template StatusCode Syr2(const Layout, const Triangle, - const size_t, - const float, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const 
size_t, - cl_command_queue*, cl_event*); -template StatusCode Syr2(const Layout, const Triangle, - const size_t, - const double, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Syr2(const Layout, const Triangle, + const size_t, + const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Syr2(const Layout, const Triangle, + const size_t, + const double, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // Symmetric packed rank-2 matrix update: SSPR2/DSPR2 template @@ -1209,20 +1210,20 @@ StatusCode Spr2(const Layout layout, const Triangle triangle, Buffer(y_buffer), y_offset, y_inc, Buffer(ap_buffer), ap_offset); } -template StatusCode Spr2(const Layout, const Triangle, - const size_t, - const float, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Spr2(const Layout, const Triangle, - const size_t, - const double, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Spr2(const Layout, const Triangle, + const size_t, + const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Spr2(const Layout, const Triangle, + const size_t, + const double, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); // 
================================================================================================= // BLAS level-3 (matrix-matrix) routines @@ -1251,38 +1252,38 @@ StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpos beta, Buffer(c_buffer), c_offset, c_ld); } -template StatusCode Gemm(const Layout, const Transpose, const Transpose, - const size_t, const size_t, const size_t, - const float, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const float, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Gemm(const Layout, const Transpose, const Transpose, - const size_t, const size_t, const size_t, - const double, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const double, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Gemm(const Layout, const Transpose, const Transpose, - const size_t, const size_t, const size_t, - const float2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const float2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Gemm(const Layout, const Transpose, const Transpose, - const size_t, const size_t, const size_t, - const double2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const double2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const double, + const 
cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, + const size_t, const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM template @@ -1307,38 +1308,38 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, beta, Buffer(c_buffer), c_offset, c_ld); } -template StatusCode Symm(const Layout, const Side, const Triangle, - const size_t, const size_t, - const float, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const float, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Symm(const Layout, const Side, const Triangle, - const size_t, const size_t, - const double, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const double, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Symm(const Layout, const Side, const Triangle, - const size_t, const size_t, - const float2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const float2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Symm(const Layout, const Side, const Triangle, - const size_t, const size_t, - 
const double2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const double2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, + const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, + const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // Hermitian matrix-matrix multiplication: CHEMM/ZHEMM template @@ -1363,22 +1364,22 @@ StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle, beta, Buffer(c_buffer), c_offset, c_ld); } -template StatusCode Hemm(const Layout, const Side, const Triangle, - const size_t, const size_t, - const float2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const float2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Hemm(const Layout, const Side, const Triangle, - const size_t, const size_t, - const 
double2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const double2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Hemm(const Layout, const Side, const Triangle, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Hemm(const Layout, const Side, const Triangle, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK template @@ -1401,34 +1402,34 @@ StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_ beta, Buffer(c_buffer), c_offset, c_ld); } -template StatusCode Syrk(const Layout, const Triangle, const Transpose, - const size_t, const size_t, - const float, - const cl_mem, const size_t, const size_t, - const float, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Syrk(const Layout, const Triangle, const Transpose, - const size_t, const size_t, - const double, - const cl_mem, const size_t, const size_t, - const double, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Syrk(const Layout, const Triangle, const Transpose, - const size_t, const size_t, - const float2, - const cl_mem, const size_t, const size_t, - const float2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Syrk(const Layout, const Triangle, const Transpose, - const size_t, const size_t, - const double2, - const cl_mem, const size_t, const size_t, - const double2, - cl_mem, const size_t, const size_t, - cl_command_queue*, 
cl_event*); +template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const float2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const double2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // Rank-K update of a hermitian matrix: CHERK/ZHERK template @@ -1451,20 +1452,20 @@ StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_ beta, Buffer>(c_buffer), c_offset, c_ld); } -template StatusCode Herk(const Layout, const Triangle, const Transpose, - const size_t, const size_t, - const float, - const cl_mem, const size_t, const size_t, - const float, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Herk(const Layout, const Triangle, const Transpose, - const size_t, const size_t, - const double, - const cl_mem, const size_t, const size_t, - const double, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Herk(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + const float, + cl_mem, const size_t, const size_t, + 
cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Herk(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K template @@ -1489,38 +1490,38 @@ StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose a beta, Buffer(c_buffer), c_offset, c_ld); } -template StatusCode Syr2k(const Layout, const Triangle, const Transpose, - const size_t, const size_t, - const float, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const float, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Syr2k(const Layout, const Triangle, const Transpose, - const size_t, const size_t, - const double, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const double, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Syr2k(const Layout, const Triangle, const Transpose, - const size_t, const size_t, - const float2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const float2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Syr2k(const Layout, const Triangle, const Transpose, - const size_t, const size_t, - const double2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const double2, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, 
cl_event*); +template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // Rank-2K update of a hermitian matrix: CHER2K/ZHER2K template @@ -1545,22 +1546,22 @@ StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose a beta, Buffer(c_buffer), c_offset, c_ld); } -template StatusCode Her2k(const Layout, const Triangle, const Transpose, - const size_t, const size_t, - const float2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const float, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Her2k(const Layout, const Triangle, const Transpose, - const size_t, const size_t, - const double2, - const cl_mem, const size_t, const size_t, - const cl_mem, const size_t, const size_t, - const double, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Her2k(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const float, + cl_mem, const size_t, const 
size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Her2k(const Layout, const Triangle, const Transpose, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM template @@ -1581,30 +1582,30 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, c Buffer(a_buffer), a_offset, a_ld, Buffer(b_buffer), b_offset, b_ld); } -template StatusCode Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const float, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const double, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const float2, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const double2, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Triangle, 
const Transpose, const Diagonal, + const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM template @@ -1616,30 +1617,30 @@ StatusCode Trsm(const Layout, const Side, const Triangle, const Transpose, const cl_command_queue*, cl_event*) { return StatusCode::kNotImplemented; } -template StatusCode Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const float, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const double, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const float2, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); -template StatusCode Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const double2, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const 
size_t, - cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const float, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const double, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const float2, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, + const size_t, const size_t, + const double2, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // ================================================================================================= } // namespace clblast diff --git a/src/clblast_c.cc b/src/clblast_c.cc index fcec0951..3b07ebc9 100644 --- a/src/clblast_c.cc +++ b/src/clblast_c.cc @@ -17,6 +17,7 @@ extern "C" { #include "clblast_c.h" } #include "clblast.h" +#include "internal/public_api.h" #include "internal/utilities.h" // Shortcuts to the clblast namespace @@ -28,40 +29,40 @@ using double2 = clblast::double2; // ================================================================================================= // SWAP -StatusCode CLBlastSswap(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastSswap(const size_t n, + cl_mem 
x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Swap(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, queue, event); return static_cast(status); } -StatusCode CLBlastDswap(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastDswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Swap(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, queue, event); return static_cast(status); } -StatusCode CLBlastCswap(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastCswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Swap(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, queue, event); return static_cast(status); } -StatusCode CLBlastZswap(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastZswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Swap(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, @@ -70,40 +71,40 @@ StatusCode CLBlastZswap(const size_t n, } // 
SCAL -StatusCode CLBlastSscal(const size_t n, - const float alpha, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastSscal(const size_t n, + const float alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Scal(n, alpha, x_buffer, x_offset, x_inc, queue, event); return static_cast(status); } -StatusCode CLBlastDscal(const size_t n, - const double alpha, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastDscal(const size_t n, + const double alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Scal(n, alpha, x_buffer, x_offset, x_inc, queue, event); return static_cast(status); } -StatusCode CLBlastCscal(const size_t n, - const cl_float2 alpha, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastCscal(const size_t n, + const cl_float2 alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Scal(n, float2{alpha.s[0], alpha.s[1]}, x_buffer, x_offset, x_inc, queue, event); return static_cast(status); } -StatusCode CLBlastZscal(const size_t n, - const cl_double2 alpha, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastZscal(const size_t n, + const cl_double2 alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Scal(n, double2{alpha.s[0], alpha.s[1]}, x_buffer, x_offset, x_inc, @@ -112,40 +113,40 @@ StatusCode CLBlastZscal(const size_t n, } // COPY -StatusCode CLBlastScopy(const size_t n, - const 
cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastScopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Copy(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, queue, event); return static_cast(status); } -StatusCode CLBlastDcopy(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastDcopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Copy(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, queue, event); return static_cast(status); } -StatusCode CLBlastCcopy(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastCcopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Copy(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, queue, event); return static_cast(status); } -StatusCode CLBlastZcopy(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastZcopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const 
size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Copy(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, @@ -154,11 +155,11 @@ StatusCode CLBlastZcopy(const size_t n, } // AXPY -StatusCode CLBlastSaxpy(const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastSaxpy(const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Axpy(n, alpha, x_buffer, x_offset, x_inc, @@ -166,11 +167,11 @@ StatusCode CLBlastSaxpy(const size_t n, queue, event); return static_cast(status); } -StatusCode CLBlastDaxpy(const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastDaxpy(const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Axpy(n, alpha, x_buffer, x_offset, x_inc, @@ -178,11 +179,11 @@ StatusCode CLBlastDaxpy(const size_t n, queue, event); return static_cast(status); } -StatusCode CLBlastCaxpy(const size_t n, - const cl_float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastCaxpy(const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem 
y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Axpy(n, float2{alpha.s[0], alpha.s[1]}, x_buffer, x_offset, x_inc, @@ -190,11 +191,11 @@ StatusCode CLBlastCaxpy(const size_t n, queue, event); return static_cast(status); } -StatusCode CLBlastZaxpy(const size_t n, - const cl_double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastZaxpy(const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Axpy(n, double2{alpha.s[0], alpha.s[1]}, x_buffer, x_offset, x_inc, @@ -204,11 +205,11 @@ StatusCode CLBlastZaxpy(const size_t n, } // DOT -StatusCode CLBlastSdot(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastSdot(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Dot(n, dot_buffer, dot_offset, x_buffer, x_offset, x_inc, @@ -216,11 +217,11 @@ StatusCode CLBlastSdot(const size_t n, queue, event); return static_cast(status); } -StatusCode CLBlastDdot(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastDdot(const 
size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Dot(n, dot_buffer, dot_offset, x_buffer, x_offset, x_inc, @@ -230,11 +231,11 @@ StatusCode CLBlastDdot(const size_t n, } // DOTU -StatusCode CLBlastCdotu(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastCdotu(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Dotu(n, dot_buffer, dot_offset, x_buffer, x_offset, x_inc, @@ -242,11 +243,11 @@ StatusCode CLBlastCdotu(const size_t n, queue, event); return static_cast(status); } -StatusCode CLBlastZdotu(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastZdotu(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Dotu(n, dot_buffer, dot_offset, x_buffer, x_offset, x_inc, @@ -256,11 +257,11 @@ StatusCode CLBlastZdotu(const size_t n, } // DOTC -StatusCode CLBlastCdotc(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const 
cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastCdotc(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Dotc(n, dot_buffer, dot_offset, x_buffer, x_offset, x_inc, @@ -268,11 +269,11 @@ StatusCode CLBlastCdotc(const size_t n, queue, event); return static_cast(status); } -StatusCode CLBlastZdotc(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastZdotc(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Dotc(n, dot_buffer, dot_offset, x_buffer, x_offset, x_inc, @@ -286,14 +287,14 @@ StatusCode CLBlastZdotc(const size_t n, // ================================================================================================= // GEMV -StatusCode CLBlastSgemv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastSgemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem 
x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Gemv(static_cast(layout), static_cast(a_transpose), m, n, @@ -305,14 +306,14 @@ StatusCode CLBlastSgemv(const Layout layout, const Transpose a_transpose, queue, event); return static_cast(status); } -StatusCode CLBlastDgemv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastDgemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Gemv(static_cast(layout), static_cast(a_transpose), m, n, @@ -324,14 +325,14 @@ StatusCode CLBlastDgemv(const Layout layout, const Transpose a_transpose, queue, event); return static_cast(status); } -StatusCode CLBlastCgemv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastCgemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, 
const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Gemv(static_cast(layout), static_cast(a_transpose), m, n, @@ -343,14 +344,14 @@ StatusCode CLBlastCgemv(const Layout layout, const Transpose a_transpose, queue, event); return static_cast(status); } -StatusCode CLBlastZgemv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastZgemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Gemv(static_cast(layout), static_cast(a_transpose), m, n, @@ -364,14 +365,14 @@ StatusCode CLBlastZgemv(const Layout layout, const Transpose a_transpose, } // GBMV -StatusCode CLBlastSgbmv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, const size_t kl, const size_t ku, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastSgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, 
const size_t n, const size_t kl, const size_t ku, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Gbmv(static_cast(layout), static_cast(a_transpose), m, n, kl, ku, @@ -383,14 +384,14 @@ StatusCode CLBlastSgbmv(const Layout layout, const Transpose a_transpose, queue, event); return static_cast(status); } -StatusCode CLBlastDgbmv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, const size_t kl, const size_t ku, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastDgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Gbmv(static_cast(layout), static_cast(a_transpose), m, n, kl, ku, @@ -402,14 +403,14 @@ StatusCode CLBlastDgbmv(const Layout layout, const Transpose a_transpose, queue, event); return static_cast(status); } -StatusCode CLBlastCgbmv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, const size_t kl, const size_t ku, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_float2 beta, - cl_mem 
y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastCgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Gbmv(static_cast(layout), static_cast(a_transpose), m, n, kl, ku, @@ -421,14 +422,14 @@ StatusCode CLBlastCgbmv(const Layout layout, const Transpose a_transpose, queue, event); return static_cast(status); } -StatusCode CLBlastZgbmv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, const size_t kl, const size_t ku, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastZgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Gbmv(static_cast(layout), static_cast(a_transpose), m, n, kl, ku, @@ -442,14 +443,14 @@ StatusCode CLBlastZgbmv(const Layout layout, const Transpose a_transpose, } // HEMV -StatusCode CLBlastChemv(const Layout layout, const Triangle triangle, - const size_t n, - const cl_float2 alpha, - const cl_mem a_buffer, const 
size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastChemv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Hemv(static_cast(layout), static_cast(triangle), n, @@ -461,14 +462,14 @@ StatusCode CLBlastChemv(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode CLBlastZhemv(const Layout layout, const Triangle triangle, - const size_t n, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastZhemv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Hemv(static_cast(layout), static_cast(triangle), n, @@ -482,14 +483,14 @@ StatusCode CLBlastZhemv(const Layout layout, const Triangle triangle, } // HBMV -StatusCode CLBlastChbmv(const Layout layout, const Triangle triangle, - const size_t n, const size_t k, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const 
cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastChbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Hbmv(static_cast(layout), static_cast(triangle), n, k, @@ -501,14 +502,14 @@ StatusCode CLBlastChbmv(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode CLBlastZhbmv(const Layout layout, const Triangle triangle, - const size_t n, const size_t k, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastZhbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Hbmv(static_cast(layout), static_cast(triangle), n, k, @@ -522,14 +523,14 @@ StatusCode CLBlastZhbmv(const Layout layout, const Triangle triangle, } // HPMV -StatusCode CLBlastChpmv(const Layout layout, const Triangle triangle, - const size_t n, - const cl_float2 alpha, - const cl_mem ap_buffer, const size_t ap_offset, - const cl_mem x_buffer, const 
size_t x_offset, const size_t x_inc, - const cl_float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastChpmv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_float2 alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Hpmv(static_cast(layout), static_cast(triangle), n, @@ -541,14 +542,14 @@ StatusCode CLBlastChpmv(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode CLBlastZhpmv(const Layout layout, const Triangle triangle, - const size_t n, - const cl_double2 alpha, - const cl_mem ap_buffer, const size_t ap_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastZhpmv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_double2 alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Hpmv(static_cast(layout), static_cast(triangle), n, @@ -562,14 +563,14 @@ StatusCode CLBlastZhpmv(const Layout layout, const Triangle triangle, } // SYMV -StatusCode CLBlastSsymv(const Layout layout, const Triangle triangle, - const size_t n, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t 
y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastSsymv(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Symv(static_cast(layout), static_cast(triangle), n, @@ -581,14 +582,14 @@ StatusCode CLBlastSsymv(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode CLBlastDsymv(const Layout layout, const Triangle triangle, - const size_t n, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastDsymv(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Symv(static_cast(layout), static_cast(triangle), n, @@ -602,14 +603,14 @@ StatusCode CLBlastDsymv(const Layout layout, const Triangle triangle, } // SBMV -StatusCode CLBlastSsbmv(const Layout layout, const Triangle triangle, - const size_t n, const size_t k, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API 
CLBlastSsbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Sbmv(static_cast(layout), static_cast(triangle), n, k, @@ -621,14 +622,14 @@ StatusCode CLBlastSsbmv(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode CLBlastDsbmv(const Layout layout, const Triangle triangle, - const size_t n, const size_t k, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastDsbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Sbmv(static_cast(layout), static_cast(triangle), n, k, @@ -642,14 +643,14 @@ StatusCode CLBlastDsbmv(const Layout layout, const Triangle triangle, } // SPMV -StatusCode CLBlastSspmv(const Layout layout, const Triangle triangle, - const size_t n, - const float alpha, - const cl_mem ap_buffer, const size_t ap_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastSspmv(const Layout layout, const Triangle 
triangle, + const size_t n, + const float alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Spmv(static_cast(layout), static_cast(triangle), n, @@ -661,14 +662,14 @@ StatusCode CLBlastSspmv(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode CLBlastDspmv(const Layout layout, const Triangle triangle, - const size_t n, - const double alpha, - const cl_mem ap_buffer, const size_t ap_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastDspmv(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Spmv(static_cast(layout), static_cast(triangle), n, @@ -682,11 +683,11 @@ StatusCode CLBlastDspmv(const Layout layout, const Triangle triangle, } // TRMV -StatusCode CLBlastStrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastStrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const 
size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Trmv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -697,11 +698,11 @@ StatusCode CLBlastStrmv(const Layout layout, const Triangle triangle, const Tran queue, event); return static_cast(status); } -StatusCode CLBlastDtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastDtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Trmv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -712,11 +713,11 @@ StatusCode CLBlastDtrmv(const Layout layout, const Triangle triangle, const Tran queue, event); return static_cast(status); } -StatusCode CLBlastCtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastCtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Trmv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -727,11 +728,11 @@ StatusCode CLBlastCtrmv(const 
Layout layout, const Triangle triangle, const Tran queue, event); return static_cast(status); } -StatusCode CLBlastZtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastZtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Trmv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -744,11 +745,11 @@ StatusCode CLBlastZtrmv(const Layout layout, const Triangle triangle, const Tran } // TBMV -StatusCode CLBlastStbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastStbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Tbmv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -759,11 +760,11 @@ StatusCode CLBlastStbmv(const Layout layout, const Triangle triangle, const Tran queue, event); return static_cast(status); } -StatusCode CLBlastDtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal 
diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastDtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Tbmv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -774,11 +775,11 @@ StatusCode CLBlastDtbmv(const Layout layout, const Triangle triangle, const Tran queue, event); return static_cast(status); } -StatusCode CLBlastCtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastCtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Tbmv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -789,11 +790,11 @@ StatusCode CLBlastCtbmv(const Layout layout, const Triangle triangle, const Tran queue, event); return static_cast(status); } -StatusCode CLBlastZtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t 
x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastZtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Tbmv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -806,11 +807,11 @@ StatusCode CLBlastZtbmv(const Layout layout, const Triangle triangle, const Tran } // TPMV -StatusCode CLBlastStpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastStpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Tpmv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -821,11 +822,11 @@ StatusCode CLBlastStpmv(const Layout layout, const Triangle triangle, const Tran queue, event); return static_cast(status); } -StatusCode CLBlastDtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastDtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + 
cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Tpmv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -836,11 +837,11 @@ StatusCode CLBlastDtpmv(const Layout layout, const Triangle triangle, const Tran queue, event); return static_cast(status); } -StatusCode CLBlastCtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastCtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Tpmv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -851,11 +852,11 @@ StatusCode CLBlastCtpmv(const Layout layout, const Triangle triangle, const Tran queue, event); return static_cast(status); } -StatusCode CLBlastZtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastZtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Tpmv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -868,11 +869,11 @@ StatusCode CLBlastZtpmv(const Layout layout, const 
Triangle triangle, const Tran } // TRSV -StatusCode CLBlastStrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastStrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Trsv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -883,11 +884,11 @@ StatusCode CLBlastStrsv(const Layout layout, const Triangle triangle, const Tran queue, event); return static_cast(status); } -StatusCode CLBlastDtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastDtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Trsv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -898,11 +899,11 @@ StatusCode CLBlastDtrsv(const Layout layout, const Triangle triangle, const Tran queue, event); return static_cast(status); } -StatusCode CLBlastCtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem a_buffer, const 
size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastCtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Trsv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -913,11 +914,11 @@ StatusCode CLBlastCtrsv(const Layout layout, const Triangle triangle, const Tran queue, event); return static_cast(status); } -StatusCode CLBlastZtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastZtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Trsv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -930,11 +931,11 @@ StatusCode CLBlastZtrsv(const Layout layout, const Triangle triangle, const Tran } // TBSV -StatusCode CLBlastStbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastStbsv(const Layout layout, const Triangle triangle, const 
Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Tbsv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -945,11 +946,11 @@ StatusCode CLBlastStbsv(const Layout layout, const Triangle triangle, const Tran queue, event); return static_cast(status); } -StatusCode CLBlastDtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastDtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Tbsv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -960,11 +961,11 @@ StatusCode CLBlastDtbsv(const Layout layout, const Triangle triangle, const Tran queue, event); return static_cast(status); } -StatusCode CLBlastCtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastCtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, 
+ cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Tbsv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -975,11 +976,11 @@ StatusCode CLBlastCtbsv(const Layout layout, const Triangle triangle, const Tran queue, event); return static_cast(status); } -StatusCode CLBlastZtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastZtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Tbsv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -992,11 +993,11 @@ StatusCode CLBlastZtbsv(const Layout layout, const Triangle triangle, const Tran } // TPSV -StatusCode CLBlastStpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastStpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Tpsv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -1007,11 +1008,11 @@ StatusCode 
CLBlastStpsv(const Layout layout, const Triangle triangle, const Tran queue, event); return static_cast(status); } -StatusCode CLBlastDtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastDtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Tpsv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -1022,11 +1023,11 @@ StatusCode CLBlastDtpsv(const Layout layout, const Triangle triangle, const Tran queue, event); return static_cast(status); } -StatusCode CLBlastCtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastCtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Tpsv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -1037,11 +1038,11 @@ StatusCode CLBlastCtpsv(const Layout layout, const Triangle triangle, const Tran queue, event); return static_cast(status); } -StatusCode CLBlastZtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem 
ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastZtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Tpsv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -1054,13 +1055,13 @@ StatusCode CLBlastZtpsv(const Layout layout, const Triangle triangle, const Tran } // GER -StatusCode CLBlastSger(const Layout layout, - const size_t m, const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastSger(const Layout layout, + const size_t m, const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Ger(static_cast(layout), m, n, alpha, @@ -1070,13 +1071,13 @@ StatusCode CLBlastSger(const Layout layout, queue, event); return static_cast(status); } -StatusCode CLBlastDger(const Layout layout, - const size_t m, const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastDger(const Layout layout, + const size_t m, const size_t n, + const double alpha, + 
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Ger(static_cast(layout), m, n, alpha, @@ -1088,13 +1089,13 @@ StatusCode CLBlastDger(const Layout layout, } // GERU -StatusCode CLBlastCgeru(const Layout layout, - const size_t m, const size_t n, - const cl_float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastCgeru(const Layout layout, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Geru(static_cast(layout), m, n, float2{alpha.s[0], alpha.s[1]}, @@ -1104,13 +1105,13 @@ StatusCode CLBlastCgeru(const Layout layout, queue, event); return static_cast(status); } -StatusCode CLBlastZgeru(const Layout layout, - const size_t m, const size_t n, - const cl_double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastZgeru(const Layout layout, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { 
auto status = clblast::Geru(static_cast(layout), m, n, double2{alpha.s[0], alpha.s[1]}, @@ -1122,13 +1123,13 @@ StatusCode CLBlastZgeru(const Layout layout, } // GERC -StatusCode CLBlastCgerc(const Layout layout, - const size_t m, const size_t n, - const cl_float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastCgerc(const Layout layout, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Gerc(static_cast(layout), m, n, float2{alpha.s[0], alpha.s[1]}, @@ -1138,13 +1139,13 @@ StatusCode CLBlastCgerc(const Layout layout, queue, event); return static_cast(status); } -StatusCode CLBlastZgerc(const Layout layout, - const size_t m, const size_t n, - const cl_double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastZgerc(const Layout layout, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Gerc(static_cast(layout), m, n, double2{alpha.s[0], alpha.s[1]}, @@ -1156,12 +1157,12 @@ StatusCode CLBlastZgerc(const Layout layout, } // HER -StatusCode CLBlastCher(const Layout layout, 
const Triangle triangle, - const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastCher(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Her(static_cast(layout), static_cast(triangle), n, @@ -1171,12 +1172,12 @@ StatusCode CLBlastCher(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode CLBlastZher(const Layout layout, const Triangle triangle, - const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastZher(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Her(static_cast(layout), static_cast(triangle), n, @@ -1188,12 +1189,12 @@ StatusCode CLBlastZher(const Layout layout, const Triangle triangle, } // HPR -StatusCode CLBlastChpr(const Layout layout, const Triangle triangle, - const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastChpr(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t 
ap_offset, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Hpr(static_cast(layout), static_cast(triangle), n, @@ -1203,12 +1204,12 @@ StatusCode CLBlastChpr(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode CLBlastZhpr(const Layout layout, const Triangle triangle, - const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastZhpr(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Hpr(static_cast(layout), static_cast(triangle), n, @@ -1220,13 +1221,13 @@ StatusCode CLBlastZhpr(const Layout layout, const Triangle triangle, } // HER2 -StatusCode CLBlastCher2(const Layout layout, const Triangle triangle, - const size_t n, - const cl_float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastCher2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Her2(static_cast(layout), static_cast(triangle), n, @@ -1237,13 +1238,13 @@ StatusCode CLBlastCher2(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode CLBlastZher2(const Layout layout, const Triangle 
triangle, - const size_t n, - const cl_double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastZher2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Her2(static_cast(layout), static_cast(triangle), n, @@ -1256,13 +1257,13 @@ StatusCode CLBlastZher2(const Layout layout, const Triangle triangle, } // HPR2 -StatusCode CLBlastChpr2(const Layout layout, const Triangle triangle, - const size_t n, - const cl_float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastChpr2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Hpr2(static_cast(layout), static_cast(triangle), n, @@ -1273,13 +1274,13 @@ StatusCode CLBlastChpr2(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode CLBlastZhpr2(const Layout layout, const Triangle triangle, - const size_t n, - const cl_double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t 
y_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastZhpr2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Hpr2(static_cast(layout), static_cast(triangle), n, @@ -1292,12 +1293,12 @@ StatusCode CLBlastZhpr2(const Layout layout, const Triangle triangle, } // SYR -StatusCode CLBlastSsyr(const Layout layout, const Triangle triangle, - const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastSsyr(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Syr(static_cast(layout), static_cast(triangle), n, @@ -1307,12 +1308,12 @@ StatusCode CLBlastSsyr(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode CLBlastDsyr(const Layout layout, const Triangle triangle, - const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastDsyr(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { 
auto status = clblast::Syr(static_cast(layout), static_cast(triangle), n, @@ -1324,12 +1325,12 @@ StatusCode CLBlastDsyr(const Layout layout, const Triangle triangle, } // SPR -StatusCode CLBlastSspr(const Layout layout, const Triangle triangle, - const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastSspr(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Spr(static_cast(layout), static_cast(triangle), n, @@ -1339,12 +1340,12 @@ StatusCode CLBlastSspr(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode CLBlastDspr(const Layout layout, const Triangle triangle, - const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastDspr(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Spr(static_cast(layout), static_cast(triangle), n, @@ -1356,13 +1357,13 @@ StatusCode CLBlastDspr(const Layout layout, const Triangle triangle, } // SYR2 -StatusCode CLBlastSsyr2(const Layout layout, const Triangle triangle, - const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, 
cl_event* event) { +StatusCode PUBLIC_API CLBlastSsyr2(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Syr2(static_cast(layout), static_cast(triangle), n, @@ -1373,13 +1374,13 @@ StatusCode CLBlastSsyr2(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode CLBlastDsyr2(const Layout layout, const Triangle triangle, - const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastDsyr2(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Syr2(static_cast(layout), static_cast(triangle), n, @@ -1392,13 +1393,13 @@ StatusCode CLBlastDsyr2(const Layout layout, const Triangle triangle, } // SPR2 -StatusCode CLBlastSspr2(const Layout layout, const Triangle triangle, - const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastSspr2(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, 
const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Spr2(static_cast(layout), static_cast(triangle), n, @@ -1409,13 +1410,13 @@ StatusCode CLBlastSspr2(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode CLBlastDspr2(const Layout layout, const Triangle triangle, - const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastDspr2(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Spr2(static_cast(layout), static_cast(triangle), n, @@ -1432,14 +1433,14 @@ StatusCode CLBlastDspr2(const Layout layout, const Triangle triangle, // ================================================================================================= // GEMM -StatusCode CLBlastSgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, - const size_t m, const size_t n, const size_t k, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastSgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const float alpha, + const cl_mem 
a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Gemm(static_cast(layout), static_cast(a_transpose), static_cast(b_transpose), @@ -1452,14 +1453,14 @@ StatusCode CLBlastSgemm(const Layout layout, const Transpose a_transpose, const queue, event); return static_cast(status); } -StatusCode CLBlastDgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, - const size_t m, const size_t n, const size_t k, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastDgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Gemm(static_cast(layout), static_cast(a_transpose), static_cast(b_transpose), @@ -1472,14 +1473,14 @@ StatusCode CLBlastDgemm(const Layout layout, const Transpose a_transpose, const queue, event); return static_cast(status); } -StatusCode CLBlastCgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, - const size_t m, const size_t n, const size_t k, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_float2 beta, - cl_mem c_buffer, const 
size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastCgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_float2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Gemm(static_cast(layout), static_cast(a_transpose), static_cast(b_transpose), @@ -1492,14 +1493,14 @@ StatusCode CLBlastCgemm(const Layout layout, const Transpose a_transpose, const queue, event); return static_cast(status); } -StatusCode CLBlastZgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, - const size_t m, const size_t n, const size_t k, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastZgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_double2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Gemm(static_cast(layout), static_cast(a_transpose), static_cast(b_transpose), @@ -1514,14 +1515,14 @@ StatusCode CLBlastZgemm(const Layout layout, const Transpose a_transpose, const } // SYMM -StatusCode CLBlastSsymm(const Layout layout, const Side side, const Triangle triangle, - 
const size_t m, const size_t n, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastSsymm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Symm(static_cast(layout), static_cast(side), static_cast(triangle), @@ -1534,14 +1535,14 @@ StatusCode CLBlastSsymm(const Layout layout, const Side side, const Triangle tri queue, event); return static_cast(status); } -StatusCode CLBlastDsymm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastDsymm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Symm(static_cast(layout), static_cast(side), static_cast(triangle), @@ -1554,14 +1555,14 @@ StatusCode CLBlastDsymm(const Layout layout, const Side side, const Triangle tri queue, 
event); return static_cast(status); } -StatusCode CLBlastCsymm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastCsymm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_float2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Symm(static_cast(layout), static_cast(side), static_cast(triangle), @@ -1574,14 +1575,14 @@ StatusCode CLBlastCsymm(const Layout layout, const Side side, const Triangle tri queue, event); return static_cast(status); } -StatusCode CLBlastZsymm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastZsymm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_double2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = 
clblast::Symm(static_cast(layout), static_cast(side), static_cast(triangle), @@ -1596,14 +1597,14 @@ StatusCode CLBlastZsymm(const Layout layout, const Side side, const Triangle tri } // HEMM -StatusCode CLBlastChemm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastChemm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_float2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Hemm(static_cast(layout), static_cast(side), static_cast(triangle), @@ -1616,14 +1617,14 @@ StatusCode CLBlastChemm(const Layout layout, const Side side, const Triangle tri queue, event); return static_cast(status); } -StatusCode CLBlastZhemm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastZhemm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_double2 beta, 
+ cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Hemm(static_cast(layout), static_cast(side), static_cast(triangle), @@ -1638,13 +1639,13 @@ StatusCode CLBlastZhemm(const Layout layout, const Side side, const Triangle tri } // SYRK -StatusCode CLBlastSsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastSsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Syrk(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -1656,13 +1657,13 @@ StatusCode CLBlastSsyrk(const Layout layout, const Triangle triangle, const Tran queue, event); return static_cast(status); } -StatusCode CLBlastDsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastDsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = 
clblast::Syrk(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -1674,13 +1675,13 @@ StatusCode CLBlastDsyrk(const Layout layout, const Triangle triangle, const Tran queue, event); return static_cast(status); } -StatusCode CLBlastCsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastCsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_float2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Syrk(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -1692,13 +1693,13 @@ StatusCode CLBlastCsyrk(const Layout layout, const Triangle triangle, const Tran queue, event); return static_cast(status); } -StatusCode CLBlastZsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastZsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_double2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Syrk(static_cast(layout), 
static_cast(triangle), static_cast(a_transpose), @@ -1712,13 +1713,13 @@ StatusCode CLBlastZsyrk(const Layout layout, const Triangle triangle, const Tran } // HERK -StatusCode CLBlastCherk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastCherk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Herk(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -1730,13 +1731,13 @@ StatusCode CLBlastCherk(const Layout layout, const Triangle triangle, const Tran queue, event); return static_cast(status); } -StatusCode CLBlastZherk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastZherk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Herk(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -1750,14 +1751,14 @@ StatusCode CLBlastZherk(const Layout 
layout, const Triangle triangle, const Tran } // SYR2K -StatusCode CLBlastSsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastSsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Syr2k(static_cast(layout), static_cast(triangle), static_cast(ab_transpose), @@ -1770,14 +1771,14 @@ StatusCode CLBlastSsyr2k(const Layout layout, const Triangle triangle, const Tra queue, event); return static_cast(status); } -StatusCode CLBlastDsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastDsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* 
event) { auto status = clblast::Syr2k(static_cast(layout), static_cast(triangle), static_cast(ab_transpose), @@ -1790,14 +1791,14 @@ StatusCode CLBlastDsyr2k(const Layout layout, const Triangle triangle, const Tra queue, event); return static_cast(status); } -StatusCode CLBlastCsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastCsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_float2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Syr2k(static_cast(layout), static_cast(triangle), static_cast(ab_transpose), @@ -1810,14 +1811,14 @@ StatusCode CLBlastCsyr2k(const Layout layout, const Triangle triangle, const Tra queue, event); return static_cast(status); } -StatusCode CLBlastZsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastZsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const cl_double2 alpha, + const cl_mem a_buffer, 
const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_double2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Syr2k(static_cast(layout), static_cast(triangle), static_cast(ab_transpose), @@ -1832,14 +1833,14 @@ StatusCode CLBlastZsyr2k(const Layout layout, const Triangle triangle, const Tra } // HER2K -StatusCode CLBlastCher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastCher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Her2k(static_cast(layout), static_cast(triangle), static_cast(ab_transpose), @@ -1852,14 +1853,14 @@ StatusCode CLBlastCher2k(const Layout layout, const Triangle triangle, const Tra queue, event); return static_cast(status); } -StatusCode CLBlastZher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode 
PUBLIC_API CLBlastZher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Her2k(static_cast(layout), static_cast(triangle), static_cast(ab_transpose), @@ -1874,12 +1875,12 @@ StatusCode CLBlastZher2k(const Layout layout, const Triangle triangle, const Tra } // TRMM -StatusCode CLBlastStrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastStrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Trmm(static_cast(layout), static_cast(side), static_cast(triangle), @@ -1892,12 +1893,12 @@ StatusCode CLBlastStrmm(const Layout layout, const Side side, const Triangle tri queue, event); return static_cast(status); } -StatusCode CLBlastDtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* 
event) { +StatusCode PUBLIC_API CLBlastDtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Trmm(static_cast(layout), static_cast(side), static_cast(triangle), @@ -1910,12 +1911,12 @@ StatusCode CLBlastDtrmm(const Layout layout, const Side side, const Triangle tri queue, event); return static_cast(status); } -StatusCode CLBlastCtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastCtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Trmm(static_cast(layout), static_cast(side), static_cast(triangle), @@ -1928,12 +1929,12 @@ StatusCode CLBlastCtrmm(const Layout layout, const Side side, const Triangle tri queue, event); return static_cast(status); } -StatusCode CLBlastZtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, 
cl_event* event) { +StatusCode PUBLIC_API CLBlastZtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Trmm(static_cast(layout), static_cast(side), static_cast(triangle), @@ -1948,12 +1949,12 @@ StatusCode CLBlastZtrmm(const Layout layout, const Side side, const Triangle tri } // TRSM -StatusCode CLBlastStrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastStrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Trsm(static_cast(layout), static_cast(side), static_cast(triangle), @@ -1966,12 +1967,12 @@ StatusCode CLBlastStrsm(const Layout layout, const Side side, const Triangle tri queue, event); return static_cast(status); } -StatusCode CLBlastDtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode 
PUBLIC_API CLBlastDtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Trsm(static_cast(layout), static_cast(side), static_cast(triangle), @@ -1984,12 +1985,12 @@ StatusCode CLBlastDtrsm(const Layout layout, const Side side, const Triangle tri queue, event); return static_cast(status); } -StatusCode CLBlastCtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode PUBLIC_API CLBlastCtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Trsm(static_cast(layout), static_cast(side), static_cast(triangle), @@ -2002,12 +2003,12 @@ StatusCode CLBlastCtrsm(const Layout layout, const Side side, const Triangle tri queue, event); return static_cast(status); } -StatusCode CLBlastZtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event) { 
+StatusCode PUBLIC_API CLBlastZtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Trsm(static_cast(layout), static_cast(side), static_cast(triangle), From 706c6987c6044d49770874e5752968a3384a5300 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 23 Mar 2016 20:31:25 +0100 Subject: [PATCH 04/60] Fixed compilation of the two SGEMM samples --- CMakeLists.txt | 8 ++++++++ samples/sgemm.c | 1 + samples/sgemm.cc | 2 +- 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d26a2843..65debdf4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -73,6 +73,14 @@ else () endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}") +# C compiler settings (for the sample) +if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") + set(CFLAGS "/Ox") +else () + set(CFLAGS "-O3 -std=c99") +endif() +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CFLAGS}") + # ================================================================================================== # Package scripts location diff --git a/samples/sgemm.c b/samples/sgemm.c index f43fb147..d528db0a 100644 --- a/samples/sgemm.c +++ b/samples/sgemm.c @@ -15,6 +15,7 @@ // // ================================================================================================= +#include #include #include diff --git a/samples/sgemm.cc b/samples/sgemm.cc index f4015278..8f33b6ad 100644 --- a/samples/sgemm.cc +++ b/samples/sgemm.cc @@ -22,7 +22,7 @@ // Includes the C++ OpenCL API. 
If not yet available, it can be found here: // https://www.khronos.org/registry/cl/api/1.1/cl.hpp -#include +#include // Includes the CLBlast library #include From 49822c8ead3313e88a08f31162870e88f8ad2bb5 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 23 Mar 2016 20:49:28 +0100 Subject: [PATCH 05/60] Fixed the C-api export to be able to properly build a DLL on Windows --- CHANGELOG | 1 + include/clblast_c.h | 1501 ++++++++++++++++---------------- include/internal/public_api.h | 2 + scripts/generator/generator.py | 6 +- src/clblast_c.cc | 1489 ++++++++++++++++--------------- 5 files changed, 1508 insertions(+), 1491 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 1606168c..f45a35dd 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,5 +1,6 @@ Development version (next release) +- Added exports to be able to create a DLL on Windows (thanks to Marco Hutter) - Made the library thread-safe Version 0.6.0 diff --git a/include/clblast_c.h b/include/clblast_c.h index fac39a58..c5395e51 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -22,6 +22,19 @@ #include #endif +// Exports library functions under Windows when building a DLL. See also: +// https://msdn.microsoft.com/en-us/library/a90k134d.aspx +#ifdef _WIN32 + #define PUBLIC_API __declspec(dllexport) +#else + #define PUBLIC_API +#endif + +// The C interface +#ifdef __cplusplus +extern "C" { +#endif + // ================================================================================================= // Status codes. These codes can be returned by functions declared in this header file. 
The error @@ -84,838 +97,842 @@ typedef enum Precision_ { kHalf = 16, kSingle = 32, kDouble = 64, // ================================================================================================= // Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP -StatusCode CLBlastSswap(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastDswap(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastCswap(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastZswap(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastSswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastCswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastZswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); // Vector scaling: 
SSCAL/DSCAL/CSCAL/ZSCAL -StatusCode CLBlastSscal(const size_t n, - const float alpha, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastDscal(const size_t n, - const double alpha, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastCscal(const size_t n, - const cl_float2 alpha, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastZscal(const size_t n, - const cl_double2 alpha, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastSscal(const size_t n, + const float alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDscal(const size_t n, + const double alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastCscal(const size_t n, + const cl_float2 alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastZscal(const size_t n, + const cl_double2 alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); // Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY -StatusCode CLBlastScopy(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastDcopy(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastCcopy(const size_t n, - const cl_mem x_buffer, const size_t 
x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastZcopy(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastScopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDcopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastCcopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastZcopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); // Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY -StatusCode CLBlastSaxpy(const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastDaxpy(const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastCaxpy(const size_t n, - const cl_float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - 
cl_command_queue* queue, cl_event* event); -StatusCode CLBlastZaxpy(const size_t n, - const cl_double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastSaxpy(const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDaxpy(const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastCaxpy(const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastZaxpy(const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); // Dot product of two vectors: SDOT/DDOT -StatusCode CLBlastSdot(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastDdot(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastSdot(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t 
x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDdot(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); // Dot product of two complex vectors: CDOTU/ZDOTU -StatusCode CLBlastCdotu(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastZdotu(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastCdotu(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastZdotu(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); // Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC -StatusCode CLBlastCdotc(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastZdotc(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const 
cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastCdotc(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastZdotc(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= // General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV -StatusCode CLBlastSgemv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastDgemv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastCgemv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t 
a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastZgemv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastSgemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDgemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastCgemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastZgemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const cl_double2 alpha, + 
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); // General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV -StatusCode CLBlastSgbmv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, const size_t kl, const size_t ku, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastDgbmv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, const size_t kl, const size_t ku, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastCgbmv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, const size_t kl, const size_t ku, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastZgbmv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, const size_t kl, const size_t ku, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_double2 beta, - cl_mem y_buffer, const size_t y_offset, 
const size_t y_inc, - cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastSgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastCgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastZgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); // Hermitian matrix-vector multiplication: CHEMV/ZHEMV -StatusCode CLBlastChemv(const Layout layout, const Triangle triangle, - const size_t n, - const cl_float2 
alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastZhemv(const Layout layout, const Triangle triangle, - const size_t n, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastChemv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastZhemv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); // Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV -StatusCode CLBlastChbmv(const Layout layout, const Triangle triangle, - const size_t n, const size_t k, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastZhbmv(const Layout layout, const Triangle triangle, - const size_t n, const size_t 
k, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastChbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastZhbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); // Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV -StatusCode CLBlastChpmv(const Layout layout, const Triangle triangle, - const size_t n, - const cl_float2 alpha, - const cl_mem ap_buffer, const size_t ap_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastZhpmv(const Layout layout, const Triangle triangle, - const size_t n, - const cl_double2 alpha, - const cl_mem ap_buffer, const size_t ap_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastChpmv(const Layout layout, const Triangle triangle, + const size_t 
n, + const cl_float2 alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastZhpmv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_double2 alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); // Symmetric matrix-vector multiplication: SSYMV/DSYMV -StatusCode CLBlastSsymv(const Layout layout, const Triangle triangle, - const size_t n, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastDsymv(const Layout layout, const Triangle triangle, - const size_t n, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastSsymv(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDsymv(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t 
a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); // Symmetric banded matrix-vector multiplication: SSBMV/DSBMV -StatusCode CLBlastSsbmv(const Layout layout, const Triangle triangle, - const size_t n, const size_t k, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastDsbmv(const Layout layout, const Triangle triangle, - const size_t n, const size_t k, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastSsbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDsbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); // Symmetric packed matrix-vector multiplication: SSPMV/DSPMV -StatusCode CLBlastSspmv(const Layout layout, const Triangle triangle, - 
const size_t n, - const float alpha, - const cl_mem ap_buffer, const size_t ap_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastDspmv(const Layout layout, const Triangle triangle, - const size_t n, - const double alpha, - const cl_mem ap_buffer, const size_t ap_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastSspmv(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDspmv(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); // Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV -StatusCode CLBlastStrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastDtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t 
x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastCtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastZtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastStrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastCtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastZtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* 
queue, cl_event* event); // Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV -StatusCode CLBlastStbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastDtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastCtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastZtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastStbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const 
size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastCtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastZtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); // Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV -StatusCode CLBlastStpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastDtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastCtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastZtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, 
const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastStpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastCtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastZtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); // Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV -StatusCode CLBlastStrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastDtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - 
cl_command_queue* queue, cl_event* event); -StatusCode CLBlastCtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastZtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastStrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastCtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastZtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); // Solves 
a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV -StatusCode CLBlastStbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastDtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastCtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastZtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastStbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, 
const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastCtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastZtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); // Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV -StatusCode CLBlastStpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastDtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastCtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastZtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t 
x_inc, - cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastStpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastCtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastZtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); // General rank-1 matrix update: SGER/DGER -StatusCode CLBlastSger(const Layout layout, - const size_t m, const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastDger(const Layout layout, - const size_t m, const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* 
queue, cl_event* event); +StatusCode PUBLIC_API CLBlastSger(const Layout layout, + const size_t m, const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDger(const Layout layout, + const size_t m, const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); // General rank-1 complex matrix update: CGERU/ZGERU -StatusCode CLBlastCgeru(const Layout layout, - const size_t m, const size_t n, - const cl_float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastZgeru(const Layout layout, - const size_t m, const size_t n, - const cl_double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastCgeru(const Layout layout, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastZgeru(const Layout layout, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t 
x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); // General rank-1 complex conjugated matrix update: CGERC/ZGERC -StatusCode CLBlastCgerc(const Layout layout, - const size_t m, const size_t n, - const cl_float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastZgerc(const Layout layout, - const size_t m, const size_t n, - const cl_double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastCgerc(const Layout layout, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastZgerc(const Layout layout, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); // Hermitian rank-1 matrix update: CHER/ZHER -StatusCode CLBlastCher(const Layout layout, const Triangle triangle, - const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* 
event); -StatusCode CLBlastZher(const Layout layout, const Triangle triangle, - const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastCher(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastZher(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); // Hermitian packed rank-1 matrix update: CHPR/ZHPR -StatusCode CLBlastChpr(const Layout layout, const Triangle triangle, - const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastZhpr(const Layout layout, const Triangle triangle, - const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastChpr(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastZhpr(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* 
queue, cl_event* event); // Hermitian rank-2 matrix update: CHER2/ZHER2 -StatusCode CLBlastCher2(const Layout layout, const Triangle triangle, - const size_t n, - const cl_float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastZher2(const Layout layout, const Triangle triangle, - const size_t n, - const cl_double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastCher2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastZher2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); // Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 -StatusCode CLBlastChpr2(const Layout layout, const Triangle triangle, - const size_t n, - const cl_float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastZhpr2(const Layout layout, const Triangle triangle, - const size_t 
n, - const cl_double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastChpr2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastZhpr2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); // Symmetric rank-1 matrix update: SSYR/DSYR -StatusCode CLBlastSsyr(const Layout layout, const Triangle triangle, - const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastDsyr(const Layout layout, const Triangle triangle, - const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastSsyr(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDsyr(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem 
x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); // Symmetric packed rank-1 matrix update: SSPR/DSPR -StatusCode CLBlastSspr(const Layout layout, const Triangle triangle, - const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastDspr(const Layout layout, const Triangle triangle, - const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastSspr(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDspr(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); // Symmetric rank-2 matrix update: SSYR2/DSYR2 -StatusCode CLBlastSsyr2(const Layout layout, const Triangle triangle, - const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastDsyr2(const Layout layout, const Triangle triangle, - const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t 
a_ld, - cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastSsyr2(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDsyr2(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); // Symmetric packed rank-2 matrix update: SSPR2/DSPR2 -StatusCode CLBlastSspr2(const Layout layout, const Triangle triangle, - const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastDspr2(const Layout layout, const Triangle triangle, - const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastSspr2(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDspr2(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, 
const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); // ================================================================================================= // BLAS level-3 (matrix-matrix) routines // ================================================================================================= // General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM -StatusCode CLBlastSgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, - const size_t m, const size_t n, const size_t k, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastDgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, - const size_t m, const size_t n, const size_t k, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastCgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, - const size_t m, const size_t n, const size_t k, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastZgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, - const size_t m, const size_t n, const size_t k, - const cl_double2 
alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastSgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastCgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_float2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastZgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_double2 beta, + cl_mem c_buffer, const 
size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event); // Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM -StatusCode CLBlastSsymm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastDsymm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastCsymm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastZsymm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastSsymm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t 
a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDsymm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastCsymm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_float2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastZsymm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_double2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event); // Hermitian matrix-matrix multiplication: CHEMM/ZHEMM -StatusCode CLBlastChemm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastZhemm(const Layout layout, const Side side, const 
Triangle triangle, - const size_t m, const size_t n, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastChemm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_float2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastZhemm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_double2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event); // Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK -StatusCode CLBlastSsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastDsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastCsyrk(const Layout layout, 
const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastZsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastSsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastCsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_float2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastZsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + 
const cl_double2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event); // Rank-K update of a hermitian matrix: CHERK/ZHERK -StatusCode CLBlastCherk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastZherk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastCherk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastZherk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event); // Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K -StatusCode CLBlastSsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const float beta, - cl_mem c_buffer, 
const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastDsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastCsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastZsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastSsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, const size_t 
a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastCsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_float2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastZsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_double2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event); // Rank-2K update of a hermitian matrix: CHER2K/ZHER2K -StatusCode CLBlastCher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastZher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); 
+StatusCode PUBLIC_API CLBlastCher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastZher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event); // Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM -StatusCode CLBlastStrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastDtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastCtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t 
b_ld, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastZtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastStrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastCtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastZtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event); // Solves a triangular system of 
equations: STRSM/DTRSM/CTRSM/ZTRSM -StatusCode CLBlastStrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastDtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastCtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event); -StatusCode CLBlastZtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastStrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDtrsm(const Layout layout, const Side side, const Triangle 
triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastCtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastZtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event); // ================================================================================================= +#ifdef __cplusplus +} // extern "C" +#endif + // CLBLAST_CLBLAST_C_H_ #endif diff --git a/include/internal/public_api.h b/include/internal/public_api.h index ed3f1e29..08a55c6a 100644 --- a/include/internal/public_api.h +++ b/include/internal/public_api.h @@ -8,6 +8,8 @@ // Cedric Nugteren // // This file provides macro's to define the public API. This is needed when building a Windows DLL. +// Note: this is only used for the C++ interface, the C interface has its own definition included in +// the header file itself. 
// // ================================================================================================= diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 867fe520..8ff5e130 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -189,7 +189,7 @@ def clblast_c_h(routines): for routine in routines: result += "\n// "+routine.description+": "+routine.ShortNames()+"\n" for flavour in routine.flavours: - result += routine.RoutineHeaderC(flavour, 20, "")+";\n" + result += routine.RoutineHeaderC(flavour, 31, " PUBLIC_API")+";\n" return result # The C API implementation (.cc) @@ -200,7 +200,7 @@ def clblast_c_cc(routines): for flavour in routine.flavours: template = "<"+flavour.template+">" if routine.NoScalars() else "" indent = " "*(26 + routine.Length() + len(template)) - result += routine.RoutineHeaderC(flavour, 31, " PUBLIC_API")+" {\n" + result += routine.RoutineHeaderC(flavour, 20, "")+" {\n" result += " auto status = clblast::"+routine.name.capitalize()+template+"(" result += (",\n"+indent).join([a for a in routine.ArgumentsCast(flavour, indent)]) result += ",\n"+indent+"queue, event);" @@ -247,7 +247,7 @@ files = [ path_clblast+"/src/clblast_c.cc", path_clblast+"/test/wrapper_clblas.h", ] -header_lines = [84, 64, 80, 25, 22] +header_lines = [84, 64, 88, 24, 22] footer_lines = [6, 3, 5, 2, 6] # Checks whether the command-line arguments are valid; exists otherwise diff --git a/src/clblast_c.cc b/src/clblast_c.cc index 3b07ebc9..66d16f6d 100644 --- a/src/clblast_c.cc +++ b/src/clblast_c.cc @@ -13,11 +13,8 @@ #include -extern "C" { - #include "clblast_c.h" -} +#include "clblast_c.h" #include "clblast.h" -#include "internal/public_api.h" #include "internal/utilities.h" // Shortcuts to the clblast namespace @@ -29,40 +26,40 @@ using double2 = clblast::double2; // ================================================================================================= // SWAP -StatusCode PUBLIC_API CLBlastSswap(const 
size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastSswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Swap(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastDswap(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastDswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Swap(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastCswap(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastCswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Swap(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastZswap(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastZswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const 
size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Swap(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, @@ -71,40 +68,40 @@ StatusCode PUBLIC_API CLBlastZswap(const size_t n, } // SCAL -StatusCode PUBLIC_API CLBlastSscal(const size_t n, - const float alpha, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastSscal(const size_t n, + const float alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Scal(n, alpha, x_buffer, x_offset, x_inc, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastDscal(const size_t n, - const double alpha, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastDscal(const size_t n, + const double alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Scal(n, alpha, x_buffer, x_offset, x_inc, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastCscal(const size_t n, - const cl_float2 alpha, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastCscal(const size_t n, + const cl_float2 alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Scal(n, float2{alpha.s[0], alpha.s[1]}, x_buffer, x_offset, x_inc, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastZscal(const size_t n, - const cl_double2 alpha, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastZscal(const size_t n, + const cl_double2 alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + 
cl_command_queue* queue, cl_event* event) { auto status = clblast::Scal(n, double2{alpha.s[0], alpha.s[1]}, x_buffer, x_offset, x_inc, @@ -113,40 +110,40 @@ StatusCode PUBLIC_API CLBlastZscal(const size_t n, } // COPY -StatusCode PUBLIC_API CLBlastScopy(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastScopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Copy(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastDcopy(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastDcopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Copy(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastCcopy(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastCcopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Copy(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastZcopy(const size_t n, - 
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastZcopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Copy(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, @@ -155,11 +152,11 @@ StatusCode PUBLIC_API CLBlastZcopy(const size_t n, } // AXPY -StatusCode PUBLIC_API CLBlastSaxpy(const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastSaxpy(const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Axpy(n, alpha, x_buffer, x_offset, x_inc, @@ -167,11 +164,11 @@ StatusCode PUBLIC_API CLBlastSaxpy(const size_t n, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastDaxpy(const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastDaxpy(const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Axpy(n, alpha, x_buffer, x_offset, x_inc, @@ -179,11 +176,11 @@ StatusCode PUBLIC_API CLBlastDaxpy(const size_t n, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastCaxpy(const size_t n, - const cl_float2 alpha, - const cl_mem 
x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastCaxpy(const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Axpy(n, float2{alpha.s[0], alpha.s[1]}, x_buffer, x_offset, x_inc, @@ -191,11 +188,11 @@ StatusCode PUBLIC_API CLBlastCaxpy(const size_t n, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastZaxpy(const size_t n, - const cl_double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastZaxpy(const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Axpy(n, double2{alpha.s[0], alpha.s[1]}, x_buffer, x_offset, x_inc, @@ -205,11 +202,11 @@ StatusCode PUBLIC_API CLBlastZaxpy(const size_t n, } // DOT -StatusCode PUBLIC_API CLBlastSdot(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastSdot(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Dot(n, dot_buffer, dot_offset, x_buffer, x_offset, x_inc, @@ -217,11 +214,11 @@ StatusCode PUBLIC_API CLBlastSdot(const size_t n, queue, event); 
return static_cast(status); } -StatusCode PUBLIC_API CLBlastDdot(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastDdot(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Dot(n, dot_buffer, dot_offset, x_buffer, x_offset, x_inc, @@ -231,11 +228,11 @@ StatusCode PUBLIC_API CLBlastDdot(const size_t n, } // DOTU -StatusCode PUBLIC_API CLBlastCdotu(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastCdotu(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Dotu(n, dot_buffer, dot_offset, x_buffer, x_offset, x_inc, @@ -243,11 +240,11 @@ StatusCode PUBLIC_API CLBlastCdotu(const size_t n, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastZdotu(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastZdotu(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + 
cl_command_queue* queue, cl_event* event) { auto status = clblast::Dotu(n, dot_buffer, dot_offset, x_buffer, x_offset, x_inc, @@ -257,11 +254,11 @@ StatusCode PUBLIC_API CLBlastZdotu(const size_t n, } // DOTC -StatusCode PUBLIC_API CLBlastCdotc(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastCdotc(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Dotc(n, dot_buffer, dot_offset, x_buffer, x_offset, x_inc, @@ -269,11 +266,11 @@ StatusCode PUBLIC_API CLBlastCdotc(const size_t n, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastZdotc(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastZdotc(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Dotc(n, dot_buffer, dot_offset, x_buffer, x_offset, x_inc, @@ -287,14 +284,14 @@ StatusCode PUBLIC_API CLBlastZdotc(const size_t n, // ================================================================================================= // GEMV -StatusCode PUBLIC_API CLBlastSgemv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem 
x_buffer, const size_t x_offset, const size_t x_inc, - const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastSgemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Gemv(static_cast(layout), static_cast(a_transpose), m, n, @@ -306,14 +303,14 @@ StatusCode PUBLIC_API CLBlastSgemv(const Layout layout, const Transpose a_transp queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastDgemv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastDgemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Gemv(static_cast(layout), static_cast(a_transpose), m, n, @@ -325,14 +322,14 @@ StatusCode PUBLIC_API CLBlastDgemv(const Layout layout, const Transpose a_transp queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastCgemv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const cl_float2 alpha, - const cl_mem 
a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastCgemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Gemv(static_cast(layout), static_cast(a_transpose), m, n, @@ -344,14 +341,14 @@ StatusCode PUBLIC_API CLBlastCgemv(const Layout layout, const Transpose a_transp queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastZgemv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastZgemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Gemv(static_cast(layout), static_cast(a_transpose), m, n, @@ -365,14 +362,14 @@ StatusCode PUBLIC_API CLBlastZgemv(const Layout layout, const Transpose a_transp } // GBMV -StatusCode PUBLIC_API CLBlastSgbmv(const Layout layout, const Transpose a_transpose, - const size_t m, 
const size_t n, const size_t kl, const size_t ku, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastSgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Gbmv(static_cast(layout), static_cast(a_transpose), m, n, kl, ku, @@ -384,14 +381,14 @@ StatusCode PUBLIC_API CLBlastSgbmv(const Layout layout, const Transpose a_transp queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastDgbmv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, const size_t kl, const size_t ku, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastDgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Gbmv(static_cast(layout), static_cast(a_transpose), m, n, kl, ku, @@ -403,14 +400,14 @@ StatusCode PUBLIC_API 
CLBlastDgbmv(const Layout layout, const Transpose a_transp queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastCgbmv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, const size_t kl, const size_t ku, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastCgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Gbmv(static_cast(layout), static_cast(a_transpose), m, n, kl, ku, @@ -422,14 +419,14 @@ StatusCode PUBLIC_API CLBlastCgbmv(const Layout layout, const Transpose a_transp queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastZgbmv(const Layout layout, const Transpose a_transpose, - const size_t m, const size_t n, const size_t kl, const size_t ku, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastZgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, 
+ cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Gbmv(static_cast(layout), static_cast(a_transpose), m, n, kl, ku, @@ -443,14 +440,14 @@ StatusCode PUBLIC_API CLBlastZgbmv(const Layout layout, const Transpose a_transp } // HEMV -StatusCode PUBLIC_API CLBlastChemv(const Layout layout, const Triangle triangle, - const size_t n, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastChemv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Hemv(static_cast(layout), static_cast(triangle), n, @@ -462,14 +459,14 @@ StatusCode PUBLIC_API CLBlastChemv(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastZhemv(const Layout layout, const Triangle triangle, - const size_t n, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastZhemv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const 
size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Hemv(static_cast(layout), static_cast(triangle), n, @@ -483,14 +480,14 @@ StatusCode PUBLIC_API CLBlastZhemv(const Layout layout, const Triangle triangle, } // HBMV -StatusCode PUBLIC_API CLBlastChbmv(const Layout layout, const Triangle triangle, - const size_t n, const size_t k, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastChbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Hbmv(static_cast(layout), static_cast(triangle), n, k, @@ -502,14 +499,14 @@ StatusCode PUBLIC_API CLBlastChbmv(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastZhbmv(const Layout layout, const Triangle triangle, - const size_t n, const size_t k, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastZhbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 
beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Hbmv(static_cast(layout), static_cast(triangle), n, k, @@ -523,14 +520,14 @@ StatusCode PUBLIC_API CLBlastZhbmv(const Layout layout, const Triangle triangle, } // HPMV -StatusCode PUBLIC_API CLBlastChpmv(const Layout layout, const Triangle triangle, - const size_t n, - const cl_float2 alpha, - const cl_mem ap_buffer, const size_t ap_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastChpmv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_float2 alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Hpmv(static_cast(layout), static_cast(triangle), n, @@ -542,14 +539,14 @@ StatusCode PUBLIC_API CLBlastChpmv(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastZhpmv(const Layout layout, const Triangle triangle, - const size_t n, - const cl_double2 alpha, - const cl_mem ap_buffer, const size_t ap_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastZhpmv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_double2 alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, 
cl_event* event) { auto status = clblast::Hpmv(static_cast(layout), static_cast(triangle), n, @@ -563,14 +560,14 @@ StatusCode PUBLIC_API CLBlastZhpmv(const Layout layout, const Triangle triangle, } // SYMV -StatusCode PUBLIC_API CLBlastSsymv(const Layout layout, const Triangle triangle, - const size_t n, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastSsymv(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Symv(static_cast(layout), static_cast(triangle), n, @@ -582,14 +579,14 @@ StatusCode PUBLIC_API CLBlastSsymv(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastDsymv(const Layout layout, const Triangle triangle, - const size_t n, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastDsymv(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Symv(static_cast(layout), 
static_cast(triangle), n, @@ -603,14 +600,14 @@ StatusCode PUBLIC_API CLBlastDsymv(const Layout layout, const Triangle triangle, } // SBMV -StatusCode PUBLIC_API CLBlastSsbmv(const Layout layout, const Triangle triangle, - const size_t n, const size_t k, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastSsbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Sbmv(static_cast(layout), static_cast(triangle), n, k, @@ -622,14 +619,14 @@ StatusCode PUBLIC_API CLBlastSsbmv(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastDsbmv(const Layout layout, const Triangle triangle, - const size_t n, const size_t k, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastDsbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Sbmv(static_cast(layout), 
static_cast(triangle), n, k, @@ -643,14 +640,14 @@ StatusCode PUBLIC_API CLBlastDsbmv(const Layout layout, const Triangle triangle, } // SPMV -StatusCode PUBLIC_API CLBlastSspmv(const Layout layout, const Triangle triangle, - const size_t n, - const float alpha, - const cl_mem ap_buffer, const size_t ap_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastSspmv(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Spmv(static_cast(layout), static_cast(triangle), n, @@ -662,14 +659,14 @@ StatusCode PUBLIC_API CLBlastSspmv(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastDspmv(const Layout layout, const Triangle triangle, - const size_t n, - const double alpha, - const cl_mem ap_buffer, const size_t ap_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastDspmv(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Spmv(static_cast(layout), static_cast(triangle), n, @@ -683,11 +680,11 @@ StatusCode PUBLIC_API CLBlastDspmv(const Layout layout, const Triangle triangle, } // 
TRMV -StatusCode PUBLIC_API CLBlastStrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastStrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Trmv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -698,11 +695,11 @@ StatusCode PUBLIC_API CLBlastStrmv(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastDtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastDtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Trmv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -713,11 +710,11 @@ StatusCode PUBLIC_API CLBlastDtrmv(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastCtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const 
size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastCtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Trmv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -728,11 +725,11 @@ StatusCode PUBLIC_API CLBlastCtrmv(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastZtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastZtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Trmv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -745,11 +742,11 @@ StatusCode PUBLIC_API CLBlastZtrmv(const Layout layout, const Triangle triangle, } // TBMV -StatusCode PUBLIC_API CLBlastStbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastStbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const 
Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Tbmv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -760,11 +757,11 @@ StatusCode PUBLIC_API CLBlastStbmv(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastDtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastDtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Tbmv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -775,11 +772,11 @@ StatusCode PUBLIC_API CLBlastDtbmv(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastCtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastCtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const 
size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Tbmv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -790,11 +787,11 @@ StatusCode PUBLIC_API CLBlastCtbmv(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastZtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastZtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Tbmv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -807,11 +804,11 @@ StatusCode PUBLIC_API CLBlastZtbmv(const Layout layout, const Triangle triangle, } // TPMV -StatusCode PUBLIC_API CLBlastStpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastStpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Tpmv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -822,11 +819,11 @@ StatusCode PUBLIC_API CLBlastStpmv(const Layout 
layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastDtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastDtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Tpmv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -837,11 +834,11 @@ StatusCode PUBLIC_API CLBlastDtpmv(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastCtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastCtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Tpmv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -852,11 +849,11 @@ StatusCode PUBLIC_API CLBlastCtpmv(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastZtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, 
- cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastZtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Tpmv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -869,11 +866,11 @@ StatusCode PUBLIC_API CLBlastZtpmv(const Layout layout, const Triangle triangle, } // TRSV -StatusCode PUBLIC_API CLBlastStrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastStrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Trsv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -884,11 +881,11 @@ StatusCode PUBLIC_API CLBlastStrsv(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastDtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastDtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem 
a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Trsv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -899,11 +896,11 @@ StatusCode PUBLIC_API CLBlastDtrsv(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastCtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastCtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Trsv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -914,11 +911,11 @@ StatusCode PUBLIC_API CLBlastCtrsv(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastZtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastZtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Trsv(static_cast(layout), 
static_cast(triangle), static_cast(a_transpose), @@ -931,11 +928,11 @@ StatusCode PUBLIC_API CLBlastZtrsv(const Layout layout, const Triangle triangle, } // TBSV -StatusCode PUBLIC_API CLBlastStbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastStbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Tbsv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -946,11 +943,11 @@ StatusCode PUBLIC_API CLBlastStbsv(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastDtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastDtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Tbsv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -961,11 +958,11 @@ StatusCode PUBLIC_API CLBlastDtbsv(const Layout layout, const Triangle triangle, queue, event); return 
static_cast(status); } -StatusCode PUBLIC_API CLBlastCtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastCtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Tbsv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -976,11 +973,11 @@ StatusCode PUBLIC_API CLBlastCtbsv(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastZtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastZtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Tbsv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -993,11 +990,11 @@ StatusCode PUBLIC_API CLBlastZtbsv(const Layout layout, const Triangle triangle, } // TPSV -StatusCode PUBLIC_API CLBlastStpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const 
cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastStpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Tpsv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -1008,11 +1005,11 @@ StatusCode PUBLIC_API CLBlastStpsv(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastDtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastDtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Tpsv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -1023,11 +1020,11 @@ StatusCode PUBLIC_API CLBlastDtpsv(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastCtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastCtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + 
const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Tpsv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -1038,11 +1035,11 @@ StatusCode PUBLIC_API CLBlastCtpsv(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastZtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastZtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Tpsv(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -1055,13 +1052,13 @@ StatusCode PUBLIC_API CLBlastZtpsv(const Layout layout, const Triangle triangle, } // GER -StatusCode PUBLIC_API CLBlastSger(const Layout layout, - const size_t m, const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastSger(const Layout layout, + const size_t m, const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Ger(static_cast(layout), m, n, 
alpha, @@ -1071,13 +1068,13 @@ StatusCode PUBLIC_API CLBlastSger(const Layout layout, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastDger(const Layout layout, - const size_t m, const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastDger(const Layout layout, + const size_t m, const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Ger(static_cast(layout), m, n, alpha, @@ -1089,13 +1086,13 @@ StatusCode PUBLIC_API CLBlastDger(const Layout layout, } // GERU -StatusCode PUBLIC_API CLBlastCgeru(const Layout layout, - const size_t m, const size_t n, - const cl_float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastCgeru(const Layout layout, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Geru(static_cast(layout), m, n, float2{alpha.s[0], alpha.s[1]}, @@ -1105,13 +1102,13 @@ StatusCode PUBLIC_API CLBlastCgeru(const Layout layout, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastZgeru(const Layout layout, - const size_t m, const size_t n, - const 
cl_double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastZgeru(const Layout layout, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Geru(static_cast(layout), m, n, double2{alpha.s[0], alpha.s[1]}, @@ -1123,13 +1120,13 @@ StatusCode PUBLIC_API CLBlastZgeru(const Layout layout, } // GERC -StatusCode PUBLIC_API CLBlastCgerc(const Layout layout, - const size_t m, const size_t n, - const cl_float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastCgerc(const Layout layout, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Gerc(static_cast(layout), m, n, float2{alpha.s[0], alpha.s[1]}, @@ -1139,13 +1136,13 @@ StatusCode PUBLIC_API CLBlastCgerc(const Layout layout, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastZgerc(const Layout layout, - const size_t m, const size_t n, - const cl_double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t 
a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastZgerc(const Layout layout, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Gerc(static_cast(layout), m, n, double2{alpha.s[0], alpha.s[1]}, @@ -1157,12 +1154,12 @@ StatusCode PUBLIC_API CLBlastZgerc(const Layout layout, } // HER -StatusCode PUBLIC_API CLBlastCher(const Layout layout, const Triangle triangle, - const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastCher(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Her(static_cast(layout), static_cast(triangle), n, @@ -1172,12 +1169,12 @@ StatusCode PUBLIC_API CLBlastCher(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastZher(const Layout layout, const Triangle triangle, - const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastZher(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { auto status = 
clblast::Her(static_cast(layout), static_cast(triangle), n, @@ -1189,12 +1186,12 @@ StatusCode PUBLIC_API CLBlastZher(const Layout layout, const Triangle triangle, } // HPR -StatusCode PUBLIC_API CLBlastChpr(const Layout layout, const Triangle triangle, - const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastChpr(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Hpr(static_cast(layout), static_cast(triangle), n, @@ -1204,12 +1201,12 @@ StatusCode PUBLIC_API CLBlastChpr(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastZhpr(const Layout layout, const Triangle triangle, - const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastZhpr(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Hpr(static_cast(layout), static_cast(triangle), n, @@ -1221,13 +1218,13 @@ StatusCode PUBLIC_API CLBlastZhpr(const Layout layout, const Triangle triangle, } // HER2 -StatusCode PUBLIC_API CLBlastCher2(const Layout layout, const Triangle triangle, - const size_t n, - const cl_float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - 
cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastCher2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Her2(static_cast(layout), static_cast(triangle), n, @@ -1238,13 +1235,13 @@ StatusCode PUBLIC_API CLBlastCher2(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastZher2(const Layout layout, const Triangle triangle, - const size_t n, - const cl_double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastZher2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Her2(static_cast(layout), static_cast(triangle), n, @@ -1257,13 +1254,13 @@ StatusCode PUBLIC_API CLBlastZher2(const Layout layout, const Triangle triangle, } // HPR2 -StatusCode PUBLIC_API CLBlastChpr2(const Layout layout, const Triangle triangle, - const size_t n, - const cl_float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastChpr2(const Layout layout, const Triangle triangle, + const size_t n, + const 
cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Hpr2(static_cast(layout), static_cast(triangle), n, @@ -1274,13 +1271,13 @@ StatusCode PUBLIC_API CLBlastChpr2(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastZhpr2(const Layout layout, const Triangle triangle, - const size_t n, - const cl_double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastZhpr2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Hpr2(static_cast(layout), static_cast(triangle), n, @@ -1293,12 +1290,12 @@ StatusCode PUBLIC_API CLBlastZhpr2(const Layout layout, const Triangle triangle, } // SYR -StatusCode PUBLIC_API CLBlastSsyr(const Layout layout, const Triangle triangle, - const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastSsyr(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Syr(static_cast(layout), static_cast(triangle), 
n, @@ -1308,12 +1305,12 @@ StatusCode PUBLIC_API CLBlastSsyr(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastDsyr(const Layout layout, const Triangle triangle, - const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastDsyr(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Syr(static_cast(layout), static_cast(triangle), n, @@ -1325,12 +1322,12 @@ StatusCode PUBLIC_API CLBlastDsyr(const Layout layout, const Triangle triangle, } // SPR -StatusCode PUBLIC_API CLBlastSspr(const Layout layout, const Triangle triangle, - const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastSspr(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Spr(static_cast(layout), static_cast(triangle), n, @@ -1340,12 +1337,12 @@ StatusCode PUBLIC_API CLBlastSspr(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastDspr(const Layout layout, const Triangle triangle, - const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastDspr(const 
Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Spr(static_cast(layout), static_cast(triangle), n, @@ -1357,13 +1354,13 @@ StatusCode PUBLIC_API CLBlastDspr(const Layout layout, const Triangle triangle, } // SYR2 -StatusCode PUBLIC_API CLBlastSsyr2(const Layout layout, const Triangle triangle, - const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastSsyr2(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Syr2(static_cast(layout), static_cast(triangle), n, @@ -1374,13 +1371,13 @@ StatusCode PUBLIC_API CLBlastSsyr2(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastDsyr2(const Layout layout, const Triangle triangle, - const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastDsyr2(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, 
const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Syr2(static_cast(layout), static_cast(triangle), n, @@ -1393,13 +1390,13 @@ StatusCode PUBLIC_API CLBlastDsyr2(const Layout layout, const Triangle triangle, } // SPR2 -StatusCode PUBLIC_API CLBlastSspr2(const Layout layout, const Triangle triangle, - const size_t n, - const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastSspr2(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Spr2(static_cast(layout), static_cast(triangle), n, @@ -1410,13 +1407,13 @@ StatusCode PUBLIC_API CLBlastSspr2(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastDspr2(const Layout layout, const Triangle triangle, - const size_t n, - const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastDspr2(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Spr2(static_cast(layout), static_cast(triangle), n, @@ -1433,14 +1430,14 @@ StatusCode PUBLIC_API 
CLBlastDspr2(const Layout layout, const Triangle triangle, // ================================================================================================= // GEMM -StatusCode PUBLIC_API CLBlastSgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, - const size_t m, const size_t n, const size_t k, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastSgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Gemm(static_cast(layout), static_cast(a_transpose), static_cast(b_transpose), @@ -1453,14 +1450,14 @@ StatusCode PUBLIC_API CLBlastSgemm(const Layout layout, const Transpose a_transp queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastDgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, - const size_t m, const size_t n, const size_t k, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastDgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + 
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Gemm(static_cast(layout), static_cast(a_transpose), static_cast(b_transpose), @@ -1473,14 +1470,14 @@ StatusCode PUBLIC_API CLBlastDgemm(const Layout layout, const Transpose a_transp queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastCgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, - const size_t m, const size_t n, const size_t k, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastCgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_float2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Gemm(static_cast(layout), static_cast(a_transpose), static_cast(b_transpose), @@ -1493,14 +1490,14 @@ StatusCode PUBLIC_API CLBlastCgemm(const Layout layout, const Transpose a_transp queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastZgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, - const size_t m, const size_t n, const size_t k, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t 
c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastZgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_double2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Gemm(static_cast(layout), static_cast(a_transpose), static_cast(b_transpose), @@ -1515,14 +1512,14 @@ StatusCode PUBLIC_API CLBlastZgemm(const Layout layout, const Transpose a_transp } // SYMM -StatusCode PUBLIC_API CLBlastSsymm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastSsymm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Symm(static_cast(layout), static_cast(side), static_cast(triangle), @@ -1535,14 +1532,14 @@ StatusCode PUBLIC_API CLBlastSsymm(const Layout layout, const Side side, const T queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastDsymm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - 
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastDsymm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Symm(static_cast(layout), static_cast(side), static_cast(triangle), @@ -1555,14 +1552,14 @@ StatusCode PUBLIC_API CLBlastDsymm(const Layout layout, const Side side, const T queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastCsymm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastCsymm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_float2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Symm(static_cast(layout), static_cast(side), static_cast(triangle), @@ -1575,14 +1572,14 @@ StatusCode PUBLIC_API CLBlastCsymm(const Layout layout, const Side side, const T queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastZsymm(const Layout layout, const Side side, const 
Triangle triangle, - const size_t m, const size_t n, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastZsymm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_double2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Symm(static_cast(layout), static_cast(side), static_cast(triangle), @@ -1597,14 +1594,14 @@ StatusCode PUBLIC_API CLBlastZsymm(const Layout layout, const Side side, const T } // HEMM -StatusCode PUBLIC_API CLBlastChemm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastChemm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_float2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Hemm(static_cast(layout), static_cast(side), static_cast(triangle), @@ -1617,14 +1614,14 @@ StatusCode PUBLIC_API CLBlastChemm(const Layout layout, const Side side, const T 
queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastZhemm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastZhemm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_double2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Hemm(static_cast(layout), static_cast(side), static_cast(triangle), @@ -1639,13 +1636,13 @@ StatusCode PUBLIC_API CLBlastZhemm(const Layout layout, const Side side, const T } // SYRK -StatusCode PUBLIC_API CLBlastSsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastSsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Syrk(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -1657,13 +1654,13 @@ StatusCode PUBLIC_API CLBlastSsyrk(const Layout layout, const Triangle 
triangle, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastDsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastDsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Syrk(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -1675,13 +1672,13 @@ StatusCode PUBLIC_API CLBlastDsyrk(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastCsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastCsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_float2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Syrk(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -1693,13 +1690,13 @@ StatusCode PUBLIC_API CLBlastCsyrk(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } 
-StatusCode PUBLIC_API CLBlastZsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastZsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_double2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Syrk(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -1713,13 +1710,13 @@ StatusCode PUBLIC_API CLBlastZsyrk(const Layout layout, const Triangle triangle, } // HERK -StatusCode PUBLIC_API CLBlastCherk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const size_t n, const size_t k, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastCherk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Herk(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -1731,13 +1728,13 @@ StatusCode PUBLIC_API CLBlastCherk(const Layout layout, const Triangle triangle, queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastZherk(const Layout layout, const Triangle triangle, const Transpose 
a_transpose, - const size_t n, const size_t k, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastZherk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Herk(static_cast(layout), static_cast(triangle), static_cast(a_transpose), @@ -1751,14 +1748,14 @@ StatusCode PUBLIC_API CLBlastZherk(const Layout layout, const Triangle triangle, } // SYR2K -StatusCode PUBLIC_API CLBlastSsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastSsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Syr2k(static_cast(layout), static_cast(triangle), static_cast(ab_transpose), @@ -1771,14 +1768,14 @@ StatusCode PUBLIC_API CLBlastSsyr2k(const Layout layout, const Triangle triangle queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastDsyr2k(const Layout layout, const Triangle 
triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastDsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Syr2k(static_cast(layout), static_cast(triangle), static_cast(ab_transpose), @@ -1791,14 +1788,14 @@ StatusCode PUBLIC_API CLBlastDsyr2k(const Layout layout, const Triangle triangle queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastCsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastCsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_float2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Syr2k(static_cast(layout), static_cast(triangle), static_cast(ab_transpose), @@ 
-1811,14 +1808,14 @@ StatusCode PUBLIC_API CLBlastCsyr2k(const Layout layout, const Triangle triangle queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastZsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const cl_double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastZsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_double2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Syr2k(static_cast(layout), static_cast(triangle), static_cast(ab_transpose), @@ -1833,14 +1830,14 @@ StatusCode PUBLIC_API CLBlastZsyr2k(const Layout layout, const Triangle triangle } // HER2K -StatusCode PUBLIC_API CLBlastCher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastCher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + cl_mem c_buffer, 
const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Her2k(static_cast(layout), static_cast(triangle), static_cast(ab_transpose), @@ -1853,14 +1850,14 @@ StatusCode PUBLIC_API CLBlastCher2k(const Layout layout, const Triangle triangle queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastZher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const size_t n, const size_t k, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastZher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Her2k(static_cast(layout), static_cast(triangle), static_cast(ab_transpose), @@ -1875,12 +1872,12 @@ StatusCode PUBLIC_API CLBlastZher2k(const Layout layout, const Triangle triangle } // TRMM -StatusCode PUBLIC_API CLBlastStrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastStrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const float alpha, + const 
cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Trmm(static_cast(layout), static_cast(side), static_cast(triangle), @@ -1893,12 +1890,12 @@ StatusCode PUBLIC_API CLBlastStrmm(const Layout layout, const Side side, const T queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastDtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastDtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Trmm(static_cast(layout), static_cast(side), static_cast(triangle), @@ -1911,12 +1908,12 @@ StatusCode PUBLIC_API CLBlastDtrmm(const Layout layout, const Side side, const T queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastCtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastCtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const cl_float2 alpha, + const 
cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Trmm(static_cast(layout), static_cast(side), static_cast(triangle), @@ -1929,12 +1926,12 @@ StatusCode PUBLIC_API CLBlastCtrmm(const Layout layout, const Side side, const T queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastZtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastZtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Trmm(static_cast(layout), static_cast(side), static_cast(triangle), @@ -1949,12 +1946,12 @@ StatusCode PUBLIC_API CLBlastZtrmm(const Layout layout, const Side side, const T } // TRSM -StatusCode PUBLIC_API CLBlastStrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastStrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t 
a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Trsm(static_cast(layout), static_cast(side), static_cast(triangle), @@ -1967,12 +1964,12 @@ StatusCode PUBLIC_API CLBlastStrsm(const Layout layout, const Side side, const T queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastDtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastDtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Trsm(static_cast(layout), static_cast(side), static_cast(triangle), @@ -1985,12 +1982,12 @@ StatusCode PUBLIC_API CLBlastDtrsm(const Layout layout, const Side side, const T queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastCtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const cl_float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastCtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t 
a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Trsm(static_cast(layout), static_cast(side), static_cast(triangle), @@ -2003,12 +2000,12 @@ StatusCode PUBLIC_API CLBlastCtrsm(const Layout layout, const Side side, const T queue, event); return static_cast(status); } -StatusCode PUBLIC_API CLBlastZtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const cl_double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event) { +StatusCode CLBlastZtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { auto status = clblast::Trsm(static_cast(layout), static_cast(side), static_cast(triangle), From 3876096c30ad4eed5769dbc88dbfe75b7571718a Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Fri, 25 Mar 2016 10:00:40 +0100 Subject: [PATCH 06/60] Added prototypes for SNRM2/DNRM2 routines --- include/clblast.h | 7 +++++ include/clblast_c.h | 10 ++++++ scripts/generator/generator.py | 5 +-- scripts/generator/routine.py | 14 ++++----- src/clblast.cc | 17 ++++++++++ src/clblast_c.cc | 22 +++++++++++++ test/correctness/routines/level1/xnrm2.cc | 26 ++++++++++++++++ test/performance/routines/level1/xnrm2.cc | 33 ++++++++++++++++++++ test/wrapper_clblas.h | 38 +++++++++++++++++++++++ 9 files changed, 163 insertions(+), 9 deletions(-) create mode 100644 test/correctness/routines/level1/xnrm2.cc create mode 100644 test/performance/routines/level1/xnrm2.cc 
diff --git a/include/clblast.h b/include/clblast.h index 70a3b5bc..905de774 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -140,6 +140,13 @@ StatusCode Dotc(const size_t n, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); +// Euclidian norm of a vector: SNRM2/DNRM2 +template +StatusCode Nrm2(const size_t n, + cl_mem nrm2_buffer, const size_t nrm2_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= diff --git a/include/clblast_c.h b/include/clblast_c.h index c5395e51..1e4be1ab 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -208,6 +208,16 @@ StatusCode PUBLIC_API CLBlastZdotc(const size_t n, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); +// Euclidian norm of a vector: SNRM2/DNRM2 +StatusCode PUBLIC_API CLBlastSnrm2(const size_t n, + cl_mem nrm2_buffer, const size_t nrm2_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDnrm2(const size_t n, + cl_mem nrm2_buffer, const size_t nrm2_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 8ff5e130..2c22a6fd 100644 --- a/scripts/generator/generator.py +++ 
b/scripts/generator/generator.py @@ -61,6 +61,7 @@ routines = [ Routine(True, "1", "dot", T, [S,D], ["n"], [], ["x","y"], ["dot"], [], True, "Dot product of two vectors"), Routine(True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], True, "Dot product of two complex vectors"), Routine(True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], True, "Dot product of two complex vectors, one conjugated"), + Routine(False, "1", "nrm2", T, [S,D], ["n"], [], ["x"], ["nrm2"], [], True, "Euclidian norm of a vector"), ], [ # Level 2: matrix-vector Routine(True, "2a", "gemv", T, [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], False, "General matrix-vector multiplication"), @@ -247,8 +248,8 @@ files = [ path_clblast+"/src/clblast_c.cc", path_clblast+"/test/wrapper_clblas.h", ] -header_lines = [84, 64, 88, 24, 22] -footer_lines = [6, 3, 5, 2, 6] +header_lines = [84, 64, 93, 22, 22] +footer_lines = [6, 3, 9, 2, 6] # Checks whether the command-line arguments are valid; exists otherwise for f in files: diff --git a/scripts/generator/routine.py b/scripts/generator/routine.py index 60b9fcc5..ecfe6798 100644 --- a/scripts/generator/routine.py +++ b/scripts/generator/routine.py @@ -40,7 +40,7 @@ def OptionToWrapper(x): }[x] # Buffers without 'ld' or 'inc' parameter -NO_LD_INC = ["dot","ap"] +NO_LD_INC = ["dot","nrm2","ap"] # ================================================================================================== @@ -252,7 +252,7 @@ class Routine(): # Retrieves a combination of all the argument names, with Claduc casts def ArgumentsCladuc(self, flavour, indent): - return (self.Options() + self.Sizes() + self.BufferCladuc("dot") + + return (self.Options() + self.Sizes() + self.BufferCladuc("dot") + self.BufferCladuc("nrm2") + self.Scalar("alpha") + list(chain(*[self.BufferCladuc(b) for b in self.BuffersFirst()])) + self.Scalar("beta") + @@ -261,7 +261,7 @@ class Routine(): # Retrieves a combination of all the argument names, 
with CLBlast casts def ArgumentsCast(self, flavour, indent): - return (self.OptionsCast(indent) + self.Sizes() + self.Buffer("dot") + + return (self.OptionsCast(indent) + self.Sizes() + self.Buffer("dot") + self.Buffer("nrm2") + self.ScalarUse("alpha", flavour) + list(chain(*[self.Buffer(b) for b in self.BuffersFirst()])) + self.ScalarUse("beta", flavour) + @@ -270,7 +270,7 @@ class Routine(): # As above, but for the clBLAS wrapper def ArgumentsWrapper(self, flavour): - return (self.Options() + self.Sizes() + self.BufferWrapper("dot") + + return (self.Options() + self.Sizes() + self.BufferWrapper("dot") + self.BufferWrapper("nrm2") + self.ScalarUseWrapper("alpha", flavour) + list(chain(*[self.BufferWrapper(b) for b in self.BuffersFirst()])) + self.ScalarUseWrapper("beta", flavour) + @@ -279,7 +279,7 @@ class Routine(): # Retrieves a combination of all the argument definitions def ArgumentsDef(self, flavour): - return (self.OptionsDef() + self.SizesDef() + self.BufferDef("dot") + + return (self.OptionsDef() + self.SizesDef() + self.BufferDef("dot") + self.BufferDef("nrm2") + self.ScalarDef("alpha", flavour) + list(chain(*[self.BufferDef(b) for b in self.BuffersFirst()])) + self.ScalarDef("beta", flavour) + @@ -288,7 +288,7 @@ class Routine(): # As above, but clBLAS wrapper plain datatypes def ArgumentsDefWrapper(self, flavour): - return (self.OptionsDefWrapper() + self.SizesDef() + self.BufferDef("dot") + + return (self.OptionsDefWrapper() + self.SizesDef() + self.BufferDef("dot") + self.BufferDef("nrm2") + self.ScalarDefPlain("alpha", flavour) + list(chain(*[self.BufferDef(b) for b in self.BuffersFirst()])) + self.ScalarDefPlain("beta", flavour) + @@ -297,7 +297,7 @@ class Routine(): # Retrieves a combination of all the argument types def ArgumentsType(self, flavour): - return (self.OptionsType() + self.SizesType() + self.BufferType("dot") + + return (self.OptionsType() + self.SizesType() + self.BufferType("dot") + self.BufferType("nrm2") + self.ScalarType("alpha", 
flavour) + list(chain(*[self.BufferType(b) for b in self.BuffersFirst()])) + self.ScalarType("beta", flavour) + diff --git a/src/clblast.cc b/src/clblast.cc index e7f2477f..9079355a 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -285,6 +285,23 @@ template StatusCode PUBLIC_API Dotc(const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +// Euclidian norm of a vector: SNRM2/DNRM2 +template +StatusCode Nrm2(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Nrm2(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Nrm2(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= diff --git a/src/clblast_c.cc b/src/clblast_c.cc index 66d16f6d..d735fa35 100644 --- a/src/clblast_c.cc +++ b/src/clblast_c.cc @@ -279,6 +279,28 @@ StatusCode CLBlastZdotc(const size_t n, return static_cast(status); } +// NRM2 +StatusCode CLBlastSnrm2(const size_t n, + cl_mem nrm2_buffer, const size_t nrm2_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Nrm2(n, + nrm2_buffer, nrm2_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDnrm2(const size_t n, + cl_mem nrm2_buffer, const size_t nrm2_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Nrm2(n, + nrm2_buffer, nrm2_offset, + x_buffer, 
x_offset, x_inc, + queue, event); + return static_cast(status); +} + // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= diff --git a/test/correctness/routines/level1/xnrm2.cc b/test/correctness/routines/level1/xnrm2.cc new file mode 100644 index 00000000..8238e868 --- /dev/null +++ b/test/correctness/routines/level1/xnrm2.cc @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level1/xnrm2.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "SNRM2"); + clblast::RunTests, double, double>(argc, argv, true, "DNRM2"); + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level1/xnrm2.cc b/test/performance/routines/level1/xnrm2.cc new file mode 100644 index 00000000..d5ae348b --- /dev/null +++ b/test/performance/routines/level1/xnrm2.cc @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level1/xnrm2.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); + } + return 0; +} + +// ================================================================================================= diff --git a/test/wrapper_clblas.h b/test/wrapper_clblas.h index 23a02a45..501f0bc5 100644 --- a/test/wrapper_clblas.h +++ b/test/wrapper_clblas.h @@ -350,6 +350,44 @@ clblasStatus clblasXdotc(const size_t n, num_queues, queues, num_wait_events, wait_events, events); } +// Forwards the clBLAS calls for SNRM2/DNRM2 +template +clblasStatus clblasXnrm2(const size_t n, + cl_mem nrm2_buffer, const size_t nrm2_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); +template <> +clblasStatus clblasXnrm2(const size_t n, + cl_mem nrm2_buffer, const size_t nrm2_offset, + const cl_mem x_buffer, const size_t x_offset, const 
size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + return clblasSnrm2(n, + nrm2_buffer, nrm2_offset, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXnrm2(const size_t n, + cl_mem nrm2_buffer, const size_t nrm2_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + return clblasDnrm2(n, + nrm2_buffer, nrm2_offset, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} + // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= From 1d5a702d9d31afa320a15ed9fa79471aec314f4a Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Fri, 25 Mar 2016 10:30:38 +0100 Subject: [PATCH 07/60] Added prototypes for ScNRM2/DzNRM2 routines --- include/clblast.h | 2 +- include/clblast_c.h | 10 ++++++- scripts/generator/datatype.py | 3 ++- scripts/generator/generator.py | 28 +++++++++++--------- src/clblast.cc | 10 ++++++- src/clblast_c.cc | 20 ++++++++++++++ test/correctness/routines/level1/xnrm2.cc | 2 ++ test/performance/routines/level1/xnrm2.cc | 6 +++-- test/wrapper_clblas.h | 32 ++++++++++++++++++++++- 9 files changed, 93 insertions(+), 20 deletions(-) diff --git a/include/clblast.h b/include/clblast.h index 905de774..d837cb71 
100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -140,7 +140,7 @@ StatusCode Dotc(const size_t n, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); -// Euclidian norm of a vector: SNRM2/DNRM2 +// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2 template StatusCode Nrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, diff --git a/include/clblast_c.h b/include/clblast_c.h index 1e4be1ab..e93ee465 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -208,7 +208,7 @@ StatusCode PUBLIC_API CLBlastZdotc(const size_t n, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); -// Euclidian norm of a vector: SNRM2/DNRM2 +// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2 StatusCode PUBLIC_API CLBlastSnrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, @@ -217,6 +217,14 @@ StatusCode PUBLIC_API CLBlastDnrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastScnrm2(const size_t n, + cl_mem nrm2_buffer, const size_t nrm2_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDznrm2(const size_t n, + cl_mem nrm2_buffer, const size_t nrm2_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); // ================================================================================================= // BLAS level-2 (matrix-vector) routines diff --git a/scripts/generator/datatype.py b/scripts/generator/datatype.py index 0aa27197..9323bc4d 100644 --- a/scripts/generator/datatype.py +++ b/scripts/generator/datatype.py @@ -22,7 +22,8 @@ D2CL = 
"cl_double2" # Structure holding data-type and precision information class DataType(): - def __init__(self, name, template, scalars, buffertype): + def __init__(self, precision_name, name, template, scalars, buffertype): + self.precision_name = precision_name self.name = name self.template = template self.alpha_cpp = scalars[0] diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 2c22a6fd..253f1a92 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -31,21 +31,23 @@ from datatype import DataType, FLT, DBL, FLT2, DBL2, F2CL, D2CL # ================================================================================================== # Regular data-types -S = DataType("S", FLT, [FLT, FLT, FLT, FLT], FLT ) # single (32) -D = DataType("D", DBL, [DBL, DBL, DBL, DBL], DBL ) # double (64) -C = DataType("C", FLT2, [FLT2, FLT2, F2CL, F2CL], FLT2) # single-complex (3232) -Z = DataType("Z", DBL2, [DBL2, DBL2, D2CL, D2CL], DBL2) # double-complex (6464) +S = DataType("S", "S", FLT, [FLT, FLT, FLT, FLT], FLT ) # single (32) +D = DataType("D", "D", DBL, [DBL, DBL, DBL, DBL], DBL ) # double (64) +C = DataType("C", "C", FLT2, [FLT2, FLT2, F2CL, F2CL], FLT2) # single-complex (3232) +Z = DataType("Z", "Z", DBL2, [DBL2, DBL2, D2CL, D2CL], DBL2) # double-complex (6464) # Special cases -Css = DataType("C", FLT, [FLT, FLT, FLT, FLT], FLT2) # As C, but with constants from S -Zdd = DataType("Z", DBL, [DBL, DBL, DBL, DBL], DBL2) # As Z, but with constants from D -Ccs = DataType("C", FLT2+","+FLT, [FLT2, FLT, F2CL, FLT], FLT2) # As C, but with one constant from S -Zzd = DataType("Z", DBL2+","+DBL, [DBL2, DBL, D2CL, DBL], DBL2) # As Z, but with one constant from D +Sc = DataType("C", "Sc", FLT2, [FLT2, FLT2, FLT2, FLT2], FLT2) # As C, but with real output +Dz = DataType("Z", "Dz", DBL2, [DBL2, DBL2, DBL2, DBL2], DBL2) # As Z, but with real output +Css = DataType("C", "C", FLT, [FLT, FLT, FLT, FLT], FLT2) # As C, but with constants from 
S +Zdd = DataType("Z", "Z", DBL, [DBL, DBL, DBL, DBL], DBL2) # As Z, but with constants from D +Ccs = DataType("C", "C", FLT2+","+FLT, [FLT2, FLT, F2CL, FLT], FLT2) # As C, but with one constant from S +Zzd = DataType("Z", "Z", DBL2+","+DBL, [DBL2, DBL, D2CL, DBL], DBL2) # As Z, but with one constant from D # C++ template data-types -T = DataType("typename T", "T", ["T", "T", "T", "T"], "T") # regular routine -Tc = DataType("typename T", "std::complex,T", ["T", "T", "T", "T"], "std::complex") # for herk -TU = DataType("typename T, typename U", "T,U", ["T", "U", "T", "U"], "T") # for her2k +T = DataType("T", "typename T", "T", ["T", "T", "T", "T"], "T") # regular routine +Tc = DataType("Tc", "typename T", "std::complex,T", ["T", "T", "T", "T"], "std::complex") # for herk +TU = DataType("TU", "typename T, typename U", "T,U", ["T", "U", "T", "U"], "T") # for her2k # ================================================================================================== @@ -61,7 +63,7 @@ routines = [ Routine(True, "1", "dot", T, [S,D], ["n"], [], ["x","y"], ["dot"], [], True, "Dot product of two vectors"), Routine(True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], True, "Dot product of two complex vectors"), Routine(True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], True, "Dot product of two complex vectors, one conjugated"), - Routine(False, "1", "nrm2", T, [S,D], ["n"], [], ["x"], ["nrm2"], [], True, "Euclidian norm of a vector"), + Routine(False, "1", "nrm2", T, [S,D,Sc,Dz],["n"], [], ["x"], ["nrm2"], [], True, "Euclidian norm of a vector"), ], [ # Level 2: matrix-vector Routine(True, "2a", "gemv", T, [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], False, "General matrix-vector multiplication"), @@ -332,7 +334,7 @@ for level in [1,2,3]: body += " case clblast::Precision::k"+PrecisionToFullName(precision)+":" found = False for flavour in routine.flavours: - if flavour.name == precision: + if 
flavour.precision_name == precision: body += "\n clblast::RunClient(const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -// Euclidian norm of a vector: SNRM2/DNRM2 +// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2 template StatusCode Nrm2(const size_t, cl_mem, const size_t, @@ -301,6 +301,14 @@ template StatusCode PUBLIC_API Nrm2(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Nrm2(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Nrm2(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // ================================================================================================= // BLAS level-2 (matrix-vector) routines diff --git a/src/clblast_c.cc b/src/clblast_c.cc index d735fa35..fa25d4a7 100644 --- a/src/clblast_c.cc +++ b/src/clblast_c.cc @@ -300,6 +300,26 @@ StatusCode CLBlastDnrm2(const size_t n, queue, event); return static_cast(status); } +StatusCode CLBlastScnrm2(const size_t n, + cl_mem nrm2_buffer, const size_t nrm2_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Nrm2(n, + nrm2_buffer, nrm2_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDznrm2(const size_t n, + cl_mem nrm2_buffer, const size_t nrm2_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Nrm2(n, + nrm2_buffer, nrm2_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} // ================================================================================================= // BLAS level-2 (matrix-vector) routines diff 
--git a/test/correctness/routines/level1/xnrm2.cc b/test/correctness/routines/level1/xnrm2.cc index 8238e868..97fb0ad6 100644 --- a/test/correctness/routines/level1/xnrm2.cc +++ b/test/correctness/routines/level1/xnrm2.cc @@ -20,6 +20,8 @@ using double2 = clblast::double2; int main(int argc, char *argv[]) { clblast::RunTests, float, float>(argc, argv, false, "SNRM2"); clblast::RunTests, double, double>(argc, argv, true, "DNRM2"); + clblast::RunTests, float2, float2>(argc, argv, true, "ScNRM2"); + clblast::RunTests, double2, double2>(argc, argv, true, "DzNRM2"); return 0; } diff --git a/test/performance/routines/level1/xnrm2.cc b/test/performance/routines/level1/xnrm2.cc index d5ae348b..db6ec9ad 100644 --- a/test/performance/routines/level1/xnrm2.cc +++ b/test/performance/routines/level1/xnrm2.cc @@ -24,8 +24,10 @@ int main(int argc, char *argv[]) { clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: clblast::RunClient, double, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; } return 0; } diff --git a/test/wrapper_clblas.h b/test/wrapper_clblas.h index 501f0bc5..37d9eee5 100644 --- a/test/wrapper_clblas.h +++ b/test/wrapper_clblas.h @@ -350,7 +350,7 @@ clblasStatus clblasXdotc(const size_t n, num_queues, queues, num_wait_events, wait_events, events); } -// Forwards the clBLAS calls for SNRM2/DNRM2 +// Forwards the clBLAS calls for SNRM2/DNRM2/ScNRM2/DzNRM2 template clblasStatus clblasXnrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, @@ -387,6 +387,36 @@ clblasStatus clblasXnrm2(const size_t n, scratch_buffer(), num_queues, queues, 
num_wait_events, wait_events, events); } +template <> +clblasStatus clblasXnrm2(const size_t n, + cl_mem nrm2_buffer, const size_t nrm2_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + return clblasScnrm2(n, + nrm2_buffer, nrm2_offset, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXnrm2(const size_t n, + cl_mem nrm2_buffer, const size_t nrm2_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + return clblasDznrm2(n, + nrm2_buffer, nrm2_offset, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} // ================================================================================================= // BLAS level-2 (matrix-vector) routines From aaa687ca984b18bd1ea499c92285b490fd78e2a3 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Mon, 28 Mar 2016 23:00:44 +0200 Subject: [PATCH 08/60] Added preliminary support for the xNRM2 routines --- CMakeLists.txt | 2 +- include/internal/routines/level1/xnrm2.h | 55 +++++++++++ include/internal/utilities.h | 6 +- scripts/generator/generator.py | 92 ++++++++--------- src/clblast.cc | 18 +++- src/kernels/level1/xnrm2.opencl | 120 +++++++++++++++++++++++ src/routines/level1/xnrm2.cc | 107 ++++++++++++++++++++ test/correctness/testblas.cc | 28 +++--- test/correctness/testblas.h | 14 ++- 
test/performance/client.cc | 11 ++- test/routines/level1/xdot.h | 10 +- test/routines/level1/xdotc.h | 10 +- test/routines/level1/xdotu.h | 10 +- test/routines/level1/xnrm2.h | 117 ++++++++++++++++++++++ test/wrapper_clblas.h | 44 ++++----- 15 files changed, 529 insertions(+), 115 deletions(-) create mode 100644 include/internal/routines/level1/xnrm2.h create mode 100644 src/kernels/level1/xnrm2.opencl create mode 100644 src/routines/level1/xnrm2.cc create mode 100644 test/routines/level1/xnrm2.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 65debdf4..db73c83e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -118,7 +118,7 @@ include_directories(${clblast_SOURCE_DIR}/include ${OPENCL_INCLUDE_DIRS}) set(KERNELS copy pad transpose padtranspose xaxpy xdot xger xgemm xgemv) set(SAMPLE_PROGRAMS_CPP sgemm) set(SAMPLE_PROGRAMS_C sgemm) -set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc) +set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2) set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2) set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm) diff --git a/include/internal/routines/level1/xnrm2.h b/include/internal/routines/level1/xnrm2.h new file mode 100644 index 00000000..b3fffef6 --- /dev/null +++ b/include/internal/routines/level1/xnrm2.h @@ -0,0 +1,55 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xnrm2 routine. The precision is implemented using a template argument. 
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XNRM2_H_ +#define CLBLAST_ROUTINES_XNRM2_H_ + +#include "internal/routine.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xnrm2: public Routine { + public: + + // Members and methods from the base class + using Routine::db_; + using Routine::source_string_; + using Routine::queue_; + using Routine::context_; + using Routine::GetProgramFromCache; + using Routine::TestVectorX; + using Routine::TestVectorDot; + using Routine::RunKernel; + using Routine::ErrorIn; + + // Constructor + Xnrm2(Queue &queue, Event &event, const std::string &name = "NRM2"); + + // Templated-precision implementation of the routine + StatusCode DoNrm2(const size_t n, + const Buffer &nrm2_buffer, const size_t nrm2_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc); + + private: + // Static variable to get the precision + const static Precision precision_; +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XNRM2_H_ +#endif diff --git a/include/internal/utilities.h b/include/internal/utilities.h index b6307a85..35f76722 100644 --- a/include/internal/utilities.h +++ b/include/internal/utilities.h @@ -61,6 +61,7 @@ constexpr auto kArgBOffset = "offb"; constexpr auto kArgCOffset = "offc"; constexpr auto kArgAPOffset = "offap"; constexpr auto kArgDotOffset = "offdot"; +constexpr auto kArgNrm2Offset = "offnrm2"; constexpr auto kArgAlpha = "alpha"; constexpr auto kArgBeta = "beta"; @@ -113,6 +114,7 @@ struct Arguments { size_t c_offset = 0; size_t ap_offset = 0; size_t dot_offset = 0; + size_t nrm2_offset = 0; T alpha = T{1.0}; T beta = T{1.0}; size_t x_size = 1; @@ -121,7 +123,7 @@ 
struct Arguments { size_t b_size = 1; size_t c_size = 1; size_t ap_size = 1; - size_t dot_size = 1; + size_t scalar_size = 1; // Tuner-specific arguments double fraction = 1.0; // Client-specific arguments @@ -149,7 +151,7 @@ struct Buffers { Buffer b_mat; Buffer c_mat; Buffer ap_mat; - Buffer dot; + Buffer scalar; }; // ================================================================================================= diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 253f1a92..c316bea6 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -54,55 +54,55 @@ TU = DataType("TU", "typename T, typename U", "T,U", ["T", "U", "T", "U"], "T") # Populates a list of routines routines = [ [ # Level 1: vector-vector - #Routine(False, "1", "rotg", T, [S,D], [], [], [], [], ["a","b","c","s"], False, "Generate plane rotation"), - #Routine(False, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["c","s"], False, "Apply plane rotation"), - Routine(True, "1", "swap", T, [S,D,C,Z], ["n"], [], [], ["x","y"], [], False, "Swap two vectors"), - Routine(True, "1", "scal", T, [S,D,C,Z], ["n"], [], [], ["x"], ["alpha"], False, "Vector scaling"), - Routine(True, "1", "copy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], [], False, "Vector copy"), - Routine(True, "1", "axpy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], ["alpha"], False, "Vector-times-constant plus vector"), - Routine(True, "1", "dot", T, [S,D], ["n"], [], ["x","y"], ["dot"], [], True, "Dot product of two vectors"), - Routine(True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], True, "Dot product of two complex vectors"), - Routine(True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], True, "Dot product of two complex vectors, one conjugated"), - Routine(False, "1", "nrm2", T, [S,D,Sc,Dz],["n"], [], ["x"], ["nrm2"], [], True, "Euclidian norm of a vector"), + #Routine(False, "1", "rotg", T, [S,D], [], [], [], [], ["a","b","c","s"], "", "Generate plane 
rotation"), + #Routine(False, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["c","s"], "", "Apply plane rotation"), + Routine(True, "1", "swap", T, [S,D,C,Z], ["n"], [], [], ["x","y"], [], "", "Swap two vectors"), + Routine(True, "1", "scal", T, [S,D,C,Z], ["n"], [], [], ["x"], ["alpha"], "", "Vector scaling"), + Routine(True, "1", "copy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], [], "", "Vector copy"), + Routine(True, "1", "axpy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], ["alpha"], "", "Vector-times-constant plus vector"), + Routine(True, "1", "dot", T, [S,D], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two vectors"), + Routine(True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors"), + Routine(True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated"), + Routine(True, "1", "nrm2", T, [S,D,Sc,Dz],["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector"), ], [ # Level 2: matrix-vector - Routine(True, "2a", "gemv", T, [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], False, "General matrix-vector multiplication"), - Routine(True, "2a", "gbmv", T, [S,D,C,Z], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], False, "General banded matrix-vector multiplication"), - Routine(True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Hermitian matrix-vector multiplication"), - Routine(True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Hermitian banded matrix-vector multiplication"), - Routine(True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], False, "Hermitian packed matrix-vector multiplication"), - Routine(True, "2a", "symv", T, [S,D], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Symmetric 
matrix-vector multiplication"), - Routine(True, "2a", "sbmv", T, [S,D], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Symmetric banded matrix-vector multiplication"), - Routine(True, "2a", "spmv", T, [S,D], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], False, "Symmetric packed matrix-vector multiplication"), - Routine(True, "2a", "trmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], True, "Triangular matrix-vector multiplication"), - Routine(True, "2a", "tbmv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], True, "Triangular banded matrix-vector multiplication"), - Routine(True, "2a", "tpmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], True, "Triangular packed matrix-vector multiplication"), - Routine(False, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], False, "Solves a triangular system of equations"), - Routine(False, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], False, "Solves a banded triangular system of equations"), - Routine(False, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], False, "Solves a packed triangular system of equations"), + Routine(True, "2a", "gemv", T, [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication"), + Routine(True, "2a", "gbmv", T, [S,D,C,Z], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General banded matrix-vector multiplication"), + Routine(True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian matrix-vector multiplication"), + Routine(True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], 
["alpha","beta"], "", "Hermitian banded matrix-vector multiplication"), + Routine(True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication"), + Routine(True, "2a", "symv", T, [S,D], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric matrix-vector multiplication"), + Routine(True, "2a", "sbmv", T, [S,D], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication"), + Routine(True, "2a", "spmv", T, [S,D], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication"), + Routine(True, "2a", "trmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular matrix-vector multiplication"), + Routine(True, "2a", "tbmv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular banded matrix-vector multiplication"), + Routine(True, "2a", "tpmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "n", "Triangular packed matrix-vector multiplication"), + Routine(False, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a triangular system of equations"), + Routine(False, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a banded triangular system of equations"), + Routine(False, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "", "Solves a packed triangular system of equations"), # Level 2: matrix update - Routine(True, "2b", "ger", T, [S,D], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 matrix update"), - Routine(True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, 
"General rank-1 complex matrix update"), - Routine(True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 complex conjugated matrix update"), - Routine(True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], False, "Hermitian rank-1 matrix update"), - Routine(True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], False, "Hermitian packed rank-1 matrix update"), - Routine(True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], False, "Hermitian rank-2 matrix update"), - Routine(True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], False, "Hermitian packed rank-2 matrix update"), - Routine(True, "2b", "syr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], False, "Symmetric rank-1 matrix update"), - Routine(True, "2b", "spr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], False, "Symmetric packed rank-1 matrix update"), - Routine(True, "2b", "syr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], False, "Symmetric rank-2 matrix update"), - Routine(True, "2b", "spr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], False, "Symmetric packed rank-2 matrix update"), + Routine(True, "2b", "ger", T, [S,D], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 matrix update"), + Routine(True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex matrix update"), + Routine(True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex conjugated matrix update"), + Routine(True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Hermitian rank-1 matrix update"), + Routine(True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", 
"Hermitian packed rank-1 matrix update"), + Routine(True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Hermitian rank-2 matrix update"), + Routine(True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Hermitian packed rank-2 matrix update"), + Routine(True, "2b", "syr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Symmetric rank-1 matrix update"), + Routine(True, "2b", "spr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Symmetric packed rank-1 matrix update"), + Routine(True, "2b", "syr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Symmetric rank-2 matrix update"), + Routine(True, "2b", "spr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Symmetric packed rank-2 matrix update"), ], [ # Level 3: matrix-matrix - Routine(True, "3", "gemm", T, [S,D,C,Z], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], False, "General matrix-matrix multiplication"), - Routine(True, "3", "symm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], False, "Symmetric matrix-matrix multiplication"), - Routine(True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], False, "Hermitian matrix-matrix multiplication"), - Routine(True, "3", "syrk", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], False, "Rank-K update of a symmetric matrix"), - Routine(True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], False, "Rank-K update of a hermitian matrix"), - Routine(True, "3", "syr2k", T, [S,D,C,Z], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], False, "Rank-2K update of a symmetric matrix"), - Routine(True, "3", "her2k", TU, [Ccs,Zzd], 
["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], False, "Rank-2K update of a hermitian matrix"), - Routine(True, "3", "trmm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], False, "Triangular matrix-matrix multiplication"), - Routine(False, "3", "trsm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], False, "Solves a triangular system of equations"), + Routine(True, "3", "gemm", T, [S,D,C,Z], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "General matrix-matrix multiplication"), + Routine(True, "3", "symm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication"), + Routine(True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication"), + Routine(True, "3", "syrk", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a symmetric matrix"), + Routine(True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a hermitian matrix"), + Routine(True, "3", "syr2k", T, [S,D,C,Z], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix"), + Routine(True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix"), + Routine(True, "3", "trmm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Triangular matrix-matrix multiplication"), + Routine(False, "3", "trsm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], 
["b"], ["alpha"], "", "Solves a triangular system of equations"), ]] # ================================================================================================== @@ -226,7 +226,7 @@ def wrapper_clblas(routines): if routine.scratch: result += " auto queue = Queue(queues[0]);\n" result += " auto context = queue.GetContext();\n" - result += " auto scratch_buffer = Buffer<"+flavour.template+">(context, n*x_inc + x_offset);\n" + result += " auto scratch_buffer = Buffer<"+flavour.template+">(context, "+routine.scratch+");\n" arguments += ["scratch_buffer()"] result += " return clblas"+flavour.name+routine.name+"(" result += (",\n"+indent).join([a for a in arguments]) @@ -250,7 +250,7 @@ files = [ path_clblast+"/src/clblast_c.cc", path_clblast+"/test/wrapper_clblas.h", ] -header_lines = [84, 64, 93, 22, 22] +header_lines = [84, 65, 93, 22, 22] footer_lines = [6, 3, 9, 2, 6] # Checks whether the command-line arguments are valid; exists otherwise diff --git a/src/clblast.cc b/src/clblast.cc index 2c940380..6f75540d 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -26,6 +26,7 @@ #include "internal/routines/level1/xdot.h" #include "internal/routines/level1/xdotu.h" #include "internal/routines/level1/xdotc.h" +#include "internal/routines/level1/xnrm2.h" // BLAS level-2 includes #include "internal/routines/level2/xgemv.h" @@ -287,11 +288,18 @@ template StatusCode PUBLIC_API Dotc(const size_t, // Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2 template -StatusCode Nrm2(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; +StatusCode Nrm2(const size_t n, + cl_mem nrm2_buffer, const size_t nrm2_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto event_cpp = Event(*event); + auto routine = Xnrm2(queue_cpp, event_cpp); + auto status = routine.SetUp(); + if 
(status != StatusCode::kSuccess) { return status; } + return routine.DoNrm2(n, + Buffer(nrm2_buffer), nrm2_offset, + Buffer(x_buffer), x_offset, x_inc); } template StatusCode PUBLIC_API Nrm2(const size_t, cl_mem, const size_t, diff --git a/src/kernels/level1/xnrm2.opencl b/src/kernels/level1/xnrm2.opencl new file mode 100644 index 00000000..c50d7d63 --- /dev/null +++ b/src/kernels/level1/xnrm2.opencl @@ -0,0 +1,120 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains the Xnrm2 kernel. It implements a squared-norm computation using reduction +// kernels. Reduction is split in two parts. In the first (main) kernel each X value is multiplied +// by its conjugated copy, followed by a per-thread and a per-workgroup reduction. The second +// (epilogue) kernel is executed with a single workgroup only, taking the square root of the sum. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// Parameters set by the tuner or by the database. Here they are given a basic default value in case +// this kernel file is used outside of the CLBlast library.
+#ifndef WGS1 + #define WGS1 64 // The local work-group size of the main kernel +#endif +#ifndef WGS2 + #define WGS2 64 // The local work-group size of the epilogue kernel +#endif + +// ================================================================================================= + +// The main reduction kernel, performing the multiplication and the majority of the sum operation +__attribute__((reqd_work_group_size(WGS1, 1, 1))) +__kernel void Xnrm2(const int n, + const __global real* restrict xgm, const int x_offset, const int x_inc, + __global real* output) { + __local real lm[WGS1]; + const int lid = get_local_id(0); + const int wgid = get_group_id(0); + const int num_groups = get_num_groups(0); + + // Performs multiplication and the first steps of the reduction + real acc; + SetToZero(acc); + int id = wgid*WGS1 + lid; + while (id < n) { + real x1 = xgm[id*x_inc + x_offset]; + real x2 = x1; + COMPLEX_CONJUGATE(x2); + MultiplyAdd(acc, x1, x2); + id += WGS1*num_groups; + } + lm[lid] = acc; + barrier(CLK_LOCAL_MEM_FENCE); + + // Performs reduction in local memory + #pragma unroll + for (int s=WGS1/2; s>0; s=s>>1) { + if (lid < s) { + Add(lm[lid], lm[lid], lm[lid + s]); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + // Stores the per-workgroup result + if (lid == 0) { + output[wgid] = lm[0]; + } +} + +// ================================================================================================= + +// Computes the square root; precisions 3232/6464 use sqrt(|z|) * (z + |z|) / |z + |z|| +inline real SquareRoot(const real z) { + #if PRECISION == 3232 || PRECISION == 6464 + double r = sqrt(z.x * z.x + z.y * z.y); + real zpr; zpr.x = z.x + r; zpr.y = z.y; + double zprabs = sqrt(zpr.x * zpr.x + zpr.y * zpr.y); + real result; + result.x = sqrt(r) * zpr.x / zprabs; + result.y = sqrt(r) * zpr.y / zprabs; + return result; + #else + return sqrt(z); + #endif +} + +// The epilogue reduction kernel, performing the final bit of the sum operation. This kernel has to +// be launched with a single workgroup only.
+__attribute__((reqd_work_group_size(WGS2, 1, 1))) +__kernel void Xnrm2Epilogue(const __global real* restrict input, + __global real* nrm2, const int nrm2_offset) { + __local real lm[WGS2]; + const int lid = get_local_id(0); + + // Performs the first step of the reduction while loading the data + Add(lm[lid], input[lid], input[lid + WGS2]); + barrier(CLK_LOCAL_MEM_FENCE); + + // Performs reduction in local memory + #pragma unroll + for (int s=WGS2/2; s>0; s=s>>1) { + if (lid < s) { + Add(lm[lid], lm[lid], lm[lid + s]); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + // Computes the square root and stores the final result + if (lid == 0) { + nrm2[nrm2_offset] = SquareRoot(lm[0]); + } +} + +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/routines/level1/xnrm2.cc b/src/routines/level1/xnrm2.cc new file mode 100644 index 00000000..064e68bf --- /dev/null +++ b/src/routines/level1/xnrm2.cc @@ -0,0 +1,107 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xnrm2 class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "internal/routines/level1/xnrm2.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Specific implementations to get the memory-type based on a template argument +template <> const Precision Xnrm2::precision_ = Precision::kSingle; +template <> const Precision Xnrm2::precision_ = Precision::kDouble; +template <> const Precision Xnrm2::precision_ = Precision::kComplexSingle; +template <> const Precision Xnrm2::precision_ = Precision::kComplexDouble; + +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xnrm2::Xnrm2(Queue &queue, Event &event, const std::string &name): + Routine(queue, event, name, {"Xdot"}, precision_) { + source_string_ = + #include "../../kernels/level1/xnrm2.opencl" + ; +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xnrm2::DoNrm2(const size_t n, + const Buffer &nrm2_buffer, const size_t nrm2_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { + + // Makes sure all dimensions are larger than zero + if (n == 0) { return StatusCode::kInvalidDimension; } + + // Tests the vectors for validity + auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestVectorDot(1, nrm2_buffer, nrm2_offset, sizeof(T)); + if (ErrorIn(status)) { return status; } + + // Retrieves the Xnrm2 kernels from the compiled binary + try { + auto& program = GetProgramFromCache(); + auto kernel1 = Kernel(program, "Xnrm2"); + auto kernel2 = Kernel(program, "Xnrm2Epilogue"); + + // Creates the buffer for intermediate values + auto temp_size = 2*db_["WGS2"]; + 
auto temp_buffer = Buffer(context_, temp_size); + + // Sets the kernel arguments + kernel1.SetArgument(0, static_cast(n)); + kernel1.SetArgument(1, x_buffer()); + kernel1.SetArgument(2, static_cast(x_offset)); + kernel1.SetArgument(3, static_cast(x_inc)); + kernel1.SetArgument(4, temp_buffer()); + + // Launches the main kernel + auto global1 = std::vector{db_["WGS1"]*temp_size}; + auto local1 = std::vector{db_["WGS1"]}; + status = RunKernel(kernel1, global1, local1); + if (ErrorIn(status)) { return status; } + + // Sets the arguments for the epilogue kernel + kernel2.SetArgument(0, temp_buffer()); + kernel2.SetArgument(1, nrm2_buffer()); + kernel2.SetArgument(2, static_cast(nrm2_offset)); + + // Launches the epilogue kernel + auto global2 = std::vector{db_["WGS2"]}; + auto local2 = std::vector{db_["WGS2"]}; + status = RunKernel(kernel2, global2, local2); + if (ErrorIn(status)) { return status; } + + // Waits for all kernels to finish + queue_.Finish(); + + // Succesfully finished the computation + return StatusCode::kSuccess; + } catch (...) 
{ return StatusCode::kInvalidKernel; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xnrm2; +template class Xnrm2; +template class Xnrm2; +template class Xnrm2; + +// ================================================================================================= +} // namespace clblast diff --git a/test/correctness/testblas.cc b/test/correctness/testblas.cc index febd7504..1329b2c5 100644 --- a/test/correctness/testblas.cc +++ b/test/correctness/testblas.cc @@ -58,14 +58,14 @@ TestBlas::TestBlas(int argc, char *argv[], const bool silent, b_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset); c_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset); ap_source_.resize(std::max(max_mat, max_matvec)*std::max(max_mat, max_matvec) + max_offset); - dot_source_.resize(std::max(max_mat, max_matvec) + max_offset); + scalar_source_.resize(std::max(max_mat, max_matvec) + max_offset); PopulateVector(x_source_); PopulateVector(y_source_); PopulateVector(a_source_); PopulateVector(b_source_); PopulateVector(c_source_); PopulateVector(ap_source_); - PopulateVector(dot_source_); + PopulateVector(scalar_source_); } // =============================================================================================== @@ -86,15 +86,15 @@ void TestBlas::TestRegular(std::vector> &test_vector, const st auto b_mat1 = Buffer(context_, args.b_size); auto c_mat1 = Buffer(context_, args.c_size); auto ap_mat1 = Buffer(context_, args.ap_size); - auto dot1 = Buffer(context_, args.dot_size); + auto scalar1 = Buffer(context_, args.scalar_size); x_vec1.Write(queue_, args.x_size, x_source_); y_vec1.Write(queue_, args.y_size, y_source_); a_mat1.Write(queue_, args.a_size, a_source_); b_mat1.Write(queue_, args.b_size, b_source_); c_mat1.Write(queue_, args.c_size, c_source_); ap_mat1.Write(queue_, args.ap_size, ap_source_); - 
dot1.Write(queue_, args.dot_size, dot_source_); - auto buffers1 = Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, dot1}; + scalar1.Write(queue_, args.scalar_size, scalar_source_); + auto buffers1 = Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1}; auto status1 = run_reference_(args, buffers1, queue_); // Runs the CLBlast code @@ -104,15 +104,15 @@ void TestBlas::TestRegular(std::vector> &test_vector, const st auto b_mat2 = Buffer(context_, args.b_size); auto c_mat2 = Buffer(context_, args.c_size); auto ap_mat2 = Buffer(context_, args.ap_size); - auto dot2 = Buffer(context_, args.dot_size); + auto scalar2 = Buffer(context_, args.scalar_size); x_vec2.Write(queue_, args.x_size, x_source_); y_vec2.Write(queue_, args.y_size, y_source_); a_mat2.Write(queue_, args.a_size, a_source_); b_mat2.Write(queue_, args.b_size, b_source_); c_mat2.Write(queue_, args.c_size, c_source_); ap_mat2.Write(queue_, args.ap_size, ap_source_); - dot2.Write(queue_, args.dot_size, dot_source_); - auto buffers2 = Buffers{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, dot2}; + scalar2.Write(queue_, args.scalar_size, scalar_source_); + auto buffers2 = Buffers{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2}; auto status2 = run_routine_(args, buffers2, queue_); // Tests for equality of the two status codes @@ -162,32 +162,32 @@ void TestBlas::TestInvalid(std::vector> &test_vector, const st auto b1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr); auto c1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr); auto ap1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.ap_size*sizeof(T), nullptr,nullptr); - auto d1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.dot_size*sizeof(T), nullptr,nullptr); + auto d1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.scalar_size*sizeof(T), nullptr,nullptr); auto x_vec1 = Buffer(x1); auto y_vec1 = Buffer(y1); auto a_mat1 = Buffer(a1); auto 
b_mat1 = Buffer(b1); auto c_mat1 = Buffer(c1); auto ap_mat1 = Buffer(ap1); - auto dot1 = Buffer(d1); + auto scalar1 = Buffer(d1); auto x2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr); auto y2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr); auto a2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr); auto b2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr); auto c2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr); auto ap2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.ap_size*sizeof(T), nullptr,nullptr); - auto d2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.dot_size*sizeof(T), nullptr,nullptr); + auto d2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.scalar_size*sizeof(T), nullptr,nullptr); auto x_vec2 = Buffer(x2); auto y_vec2 = Buffer(y2); auto a_mat2 = Buffer(a2); auto b_mat2 = Buffer(b2); auto c_mat2 = Buffer(c2); auto ap_mat2 = Buffer(ap2); - auto dot2 = Buffer(d2); + auto scalar2 = Buffer(d2); // Runs the two routines - auto buffers1 = Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, dot1}; - auto buffers2 = Buffers{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, dot2}; + auto buffers1 = Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1}; + auto buffers2 = Buffers{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2}; auto status1 = run_reference_(args, buffers1, queue_); auto status2 = run_routine_(args, buffers2, queue_); diff --git a/test/correctness/testblas.h b/test/correctness/testblas.h index bfd1763c..7c9032bd 100644 --- a/test/correctness/testblas.h +++ b/test/correctness/testblas.h @@ -92,7 +92,7 @@ class TestBlas: public Tester { std::vector b_source_; std::vector c_source_; std::vector ap_source_; - std::vector dot_source_; + std::vector scalar_source_; // The routine-specific functions passed to the 
tester Routine run_routine_; @@ -143,6 +143,7 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name auto c_offsets = std::vector{args.c_offset}; auto ap_offsets = std::vector{args.ap_offset}; auto dot_offsets = std::vector{args.dot_offset}; + auto nrm2_offsets = std::vector{args.nrm2_offset}; auto alphas = std::vector{args.alpha}; auto betas = std::vector{args.beta}; auto x_sizes = std::vector{args.x_size}; @@ -182,6 +183,7 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name if (option == kArgCOffset) { c_offsets = tester.kOffsets; } if (option == kArgAPOffset) { ap_offsets = tester.kOffsets; } if (option == kArgDotOffset) { dot_offsets = tester.kOffsets; } + if (option == kArgNrm2Offset) { nrm2_offsets = tester.kOffsets; } if (option == kArgAlpha) { alphas = tester.kAlphaValues; } if (option == kArgBeta) { betas = tester.kBetaValues; } @@ -221,10 +223,12 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name for (auto &c_offset: c_offsets) { r_args.c_offset = c_offset; for (auto &ap_offset: ap_offsets) { r_args.ap_offset = ap_offset; for (auto &dot_offset: dot_offsets) { r_args.dot_offset = dot_offset; - for (auto &alpha: alphas) { r_args.alpha = alpha; - for (auto &beta: betas) { r_args.beta = beta; - C::SetSizes(r_args); - regular_test_vector.push_back(r_args); + for (auto &nrm2_offset: nrm2_offsets) { r_args.nrm2_offset = nrm2_offset; + for (auto &alpha: alphas) { r_args.alpha = alpha; + for (auto &beta: betas) { r_args.beta = beta; + C::SetSizes(r_args); + regular_test_vector.push_back(r_args); + } } } } diff --git a/test/performance/client.cc b/test/performance/client.cc index ebfad3a6..17f54231 100644 --- a/test/performance/client.cc +++ b/test/performance/client.cc @@ -136,14 +136,14 @@ void Client::PerformanceTest(Arguments &args, const SetMetric set_sizes) std::vector b_source(args.b_size); std::vector c_source(args.c_size); std::vector ap_source(args.ap_size); - 
std::vector dot_source(args.dot_size); + std::vector scalar_source(args.scalar_size); PopulateVector(x_source); PopulateVector(y_source); PopulateVector(a_source); PopulateVector(b_source); PopulateVector(c_source); PopulateVector(ap_source); - PopulateVector(dot_source); + PopulateVector(scalar_source); // Creates the matrices on the device auto x_vec = Buffer(context, args.x_size); @@ -152,15 +152,15 @@ void Client::PerformanceTest(Arguments &args, const SetMetric set_sizes) auto b_mat = Buffer(context, args.b_size); auto c_mat = Buffer(context, args.c_size); auto ap_mat = Buffer(context, args.ap_size); - auto dot = Buffer(context, args.dot_size); + auto scalar = Buffer(context, args.scalar_size); x_vec.Write(queue, args.x_size, x_source); y_vec.Write(queue, args.y_size, y_source); a_mat.Write(queue, args.a_size, a_source); b_mat.Write(queue, args.b_size, b_source); c_mat.Write(queue, args.c_size, c_source); ap_mat.Write(queue, args.ap_size, ap_source); - dot.Write(queue, args.dot_size, dot_source); - auto buffers = Buffers{x_vec, y_vec, a_mat, b_mat, c_mat, ap_mat, dot}; + scalar.Write(queue, args.scalar_size, scalar_source); + auto buffers = Buffers{x_vec, y_vec, a_mat, b_mat, c_mat, ap_mat, scalar}; // Runs the routines and collects the timings auto timings = std::vector>(); @@ -267,6 +267,7 @@ void Client::PrintTableRow(const Arguments& args, else if (o == kArgCOffset) { integers.push_back(args.c_offset); } else if (o == kArgAPOffset) { integers.push_back(args.ap_offset); } else if (o == kArgDotOffset) {integers.push_back(args.dot_offset); } + else if (o == kArgNrm2Offset){integers.push_back(args.nrm2_offset); } } auto strings = std::vector{}; for (auto &o: options_) { diff --git a/test/routines/level1/xdot.h b/test/routines/level1/xdot.h index bfcfdaff..04669f52 100644 --- a/test/routines/level1/xdot.h +++ b/test/routines/level1/xdot.h @@ -54,7 +54,7 @@ class TestXdot { static void SetSizes(Arguments &args) { args.x_size = GetSizeX(args); args.y_size = 
GetSizeY(args); - args.dot_size = GetSizeDot(args); + args.scalar_size = GetSizeDot(args); } // Describes what the default values of the leading dimensions of the matrices are @@ -72,7 +72,7 @@ class TestXdot { auto queue_plain = queue(); auto event = cl_event{}; auto status = Dot(args.n, - buffers.dot(), args.dot_offset, + buffers.scalar(), args.dot_offset, buffers.x_vec(), args.x_offset, args.x_inc, buffers.y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); @@ -85,7 +85,7 @@ class TestXdot { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXdot(args.n, - buffers.dot(), args.dot_offset, + buffers.scalar(), args.dot_offset, buffers.x_vec(), args.x_offset, args.x_inc, buffers.y_vec(), args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); @@ -95,8 +95,8 @@ class TestXdot { // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector result(args.dot_size, static_cast(0)); - buffers.dot.Read(queue, args.dot_size, result); + std::vector result(args.scalar_size, static_cast(0)); + buffers.scalar.Read(queue, args.scalar_size, result); return result; } diff --git a/test/routines/level1/xdotc.h b/test/routines/level1/xdotc.h index e403ba4c..e5b42ef4 100644 --- a/test/routines/level1/xdotc.h +++ b/test/routines/level1/xdotc.h @@ -54,7 +54,7 @@ class TestXdotc { static void SetSizes(Arguments &args) { args.x_size = GetSizeX(args); args.y_size = GetSizeY(args); - args.dot_size = GetSizeDot(args); + args.scalar_size = GetSizeDot(args); } // Describes what the default values of the leading dimensions of the matrices are @@ -72,7 +72,7 @@ class TestXdotc { auto queue_plain = queue(); auto event = cl_event{}; auto status = Dotc(args.n, - buffers.dot(), args.dot_offset, + buffers.scalar(), args.dot_offset, buffers.x_vec(), args.x_offset, args.x_inc, buffers.y_vec(), args.y_offset, args.y_inc, 
&queue_plain, &event); @@ -85,7 +85,7 @@ class TestXdotc { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXdotc(args.n, - buffers.dot(), args.dot_offset, + buffers.scalar(), args.dot_offset, buffers.x_vec(), args.x_offset, args.x_inc, buffers.y_vec(), args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); @@ -95,8 +95,8 @@ class TestXdotc { // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector result(args.dot_size, static_cast(0)); - buffers.dot.Read(queue, args.dot_size, result); + std::vector result(args.scalar_size, static_cast(0)); + buffers.scalar.Read(queue, args.scalar_size, result); return result; } diff --git a/test/routines/level1/xdotu.h b/test/routines/level1/xdotu.h index 8b2c65a8..6430148c 100644 --- a/test/routines/level1/xdotu.h +++ b/test/routines/level1/xdotu.h @@ -54,7 +54,7 @@ class TestXdotu { static void SetSizes(Arguments &args) { args.x_size = GetSizeX(args); args.y_size = GetSizeY(args); - args.dot_size = GetSizeDot(args); + args.scalar_size = GetSizeDot(args); } // Describes what the default values of the leading dimensions of the matrices are @@ -72,7 +72,7 @@ class TestXdotu { auto queue_plain = queue(); auto event = cl_event{}; auto status = Dotu(args.n, - buffers.dot(), args.dot_offset, + buffers.scalar(), args.dot_offset, buffers.x_vec(), args.x_offset, args.x_inc, buffers.y_vec(), args.y_offset, args.y_inc, &queue_plain, &event); @@ -85,7 +85,7 @@ class TestXdotu { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXdotu(args.n, - buffers.dot(), args.dot_offset, + buffers.scalar(), args.dot_offset, buffers.x_vec(), args.x_offset, args.x_inc, buffers.y_vec(), args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); @@ -95,8 +95,8 @@ class TestXdotu { // Describes how to download the results of the computation (more importantly: 
which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { - std::vector result(args.dot_size, static_cast(0)); - buffers.dot.Read(queue, args.dot_size, result); + std::vector result(args.scalar_size, static_cast(0)); + buffers.scalar.Read(queue, args.scalar_size, result); return result; } diff --git a/test/routines/level1/xnrm2.h b/test/routines/level1/xnrm2.h new file mode 100644 index 00000000..e3f77ee4 --- /dev/null +++ b/test/routines/level1/xnrm2.h @@ -0,0 +1,117 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements a class with static methods to describe the Xnrm2 routine. Examples of +// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. 
+// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XNRM2_H_ +#define CLBLAST_TEST_ROUTINES_XNRM2_H_ + +#include +#include + +#include "wrapper_clblas.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TestXnrm2 { + public: + + // The BLAS level: 1, 2, or 3 + static size_t BLASLevel() { return 1; } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { + return {kArgN, + kArgXInc, + kArgXOffset, kArgNrm2Offset}; + } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &args) { + return args.n * args.x_inc + args.x_offset; + } + static size_t GetSizeNrm2(const Arguments &args) { + return 1 + args.nrm2_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments &args) { + args.x_size = GetSizeX(args); + args.scalar_size = GetSizeNrm2(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine + + // Describes which transpose options are relevant for this routine + using Transposes = std::vector; + static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine + static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Nrm2(args.n, + buffers.scalar(), 
args.nrm2_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + &queue_plain, &event); + clWaitForEvents(1, &event); + return status; + } + + // Describes how to run the clBLAS routine (for correctness/performance comparison) + static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXnrm2(args.n, + buffers.scalar(), args.nrm2_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + + // Describes how to download the results of the computation (more importantly: which buffer) + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector result(args.scalar_size, static_cast(0)); + buffers.scalar.Read(queue, args.scalar_size, result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments &) { return 1; } // N/A for this routine + static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine + static size_t GetResultIndex(const Arguments &args, const size_t, const size_t) { + return args.nrm2_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments &args) { + return 2 * args.n; + } + static size_t GetBytes(const Arguments &args) { + return ((args.n) + 1) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XNRM2_H_ +#endif diff --git a/test/wrapper_clblas.h b/test/wrapper_clblas.h index 37d9eee5..42bb8f92 100644 --- a/test/wrapper_clblas.h +++ b/test/wrapper_clblas.h @@ -238,7 +238,7 @@ clblasStatus clblasXdot(const size_t n, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = 
queue.GetContext(); - auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + auto scratch_buffer = Buffer(context, n); return clblasSdot(n, dot_buffer, dot_offset, x_buffer, x_offset, static_cast(x_inc), @@ -255,7 +255,7 @@ clblasStatus clblasXdot(const size_t n, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + auto scratch_buffer = Buffer(context, n); return clblasDdot(n, dot_buffer, dot_offset, x_buffer, x_offset, static_cast(x_inc), @@ -281,7 +281,7 @@ clblasStatus clblasXdotu(const size_t n, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + auto scratch_buffer = Buffer(context, n); return clblasCdotu(n, dot_buffer, dot_offset, x_buffer, x_offset, static_cast(x_inc), @@ -298,7 +298,7 @@ clblasStatus clblasXdotu(const size_t n, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + auto scratch_buffer = Buffer(context, n); return clblasZdotu(n, dot_buffer, dot_offset, x_buffer, x_offset, static_cast(x_inc), @@ -324,7 +324,7 @@ clblasStatus clblasXdotc(const size_t n, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + auto scratch_buffer = Buffer(context, n); return clblasCdotc(n, dot_buffer, dot_offset, x_buffer, x_offset, static_cast(x_inc), @@ -341,7 +341,7 @@ clblasStatus clblasXdotc(const size_t n, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer 
= Buffer(context, n*x_inc + x_offset); + auto scratch_buffer = Buffer(context, n); return clblasZdotc(n, dot_buffer, dot_offset, x_buffer, x_offset, static_cast(x_inc), @@ -365,7 +365,7 @@ clblasStatus clblasXnrm2(const size_t n, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + auto scratch_buffer = Buffer(context, 2*n); return clblasSnrm2(n, nrm2_buffer, nrm2_offset, x_buffer, x_offset, static_cast(x_inc), @@ -380,7 +380,7 @@ clblasStatus clblasXnrm2(const size_t n, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + auto scratch_buffer = Buffer(context, 2*n); return clblasDnrm2(n, nrm2_buffer, nrm2_offset, x_buffer, x_offset, static_cast(x_inc), @@ -395,7 +395,7 @@ clblasStatus clblasXnrm2(const size_t n, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + auto scratch_buffer = Buffer(context, 2*n); return clblasScnrm2(n, nrm2_buffer, nrm2_offset, x_buffer, x_offset, static_cast(x_inc), @@ -410,7 +410,7 @@ clblasStatus clblasXnrm2(const size_t n, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + auto scratch_buffer = Buffer(context, 2*n); return clblasDznrm2(n, nrm2_buffer, nrm2_offset, x_buffer, x_offset, static_cast(x_inc), @@ -815,7 +815,7 @@ clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo trian cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto 
scratch_buffer = Buffer(context, n*x_inc + x_offset); + auto scratch_buffer = Buffer(context, n); return clblasStrmv(layout, triangle, a_transpose, diagonal, n, a_buffer, a_offset, a_ld, @@ -832,7 +832,7 @@ clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo tria cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + auto scratch_buffer = Buffer(context, n); return clblasDtrmv(layout, triangle, a_transpose, diagonal, n, a_buffer, a_offset, a_ld, @@ -849,7 +849,7 @@ clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo tria cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + auto scratch_buffer = Buffer(context, n); return clblasCtrmv(layout, triangle, a_transpose, diagonal, n, a_buffer, a_offset, a_ld, @@ -866,7 +866,7 @@ clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo tri cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + auto scratch_buffer = Buffer(context, n); return clblasZtrmv(layout, triangle, a_transpose, diagonal, n, a_buffer, a_offset, a_ld, @@ -892,7 +892,7 @@ clblasStatus clblasXtbmv(const clblasOrder layout, const clblasUplo trian cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + auto scratch_buffer = Buffer(context, n); return clblasStbmv(layout, triangle, a_transpose, diagonal, n, k, a_buffer, a_offset, a_ld, @@ -909,7 +909,7 @@ clblasStatus clblasXtbmv(const clblasOrder layout, const clblasUplo tria cl_uint 
num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + auto scratch_buffer = Buffer(context, n); return clblasDtbmv(layout, triangle, a_transpose, diagonal, n, k, a_buffer, a_offset, a_ld, @@ -926,7 +926,7 @@ clblasStatus clblasXtbmv(const clblasOrder layout, const clblasUplo tria cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + auto scratch_buffer = Buffer(context, n); return clblasCtbmv(layout, triangle, a_transpose, diagonal, n, k, a_buffer, a_offset, a_ld, @@ -943,7 +943,7 @@ clblasStatus clblasXtbmv(const clblasOrder layout, const clblasUplo tri cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + auto scratch_buffer = Buffer(context, n); return clblasZtbmv(layout, triangle, a_transpose, diagonal, n, k, a_buffer, a_offset, a_ld, @@ -969,7 +969,7 @@ clblasStatus clblasXtpmv(const clblasOrder layout, const clblasUplo trian cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + auto scratch_buffer = Buffer(context, n); return clblasStpmv(layout, triangle, a_transpose, diagonal, n, ap_buffer, ap_offset, @@ -986,7 +986,7 @@ clblasStatus clblasXtpmv(const clblasOrder layout, const clblasUplo tria cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + auto scratch_buffer = Buffer(context, n); return clblasDtpmv(layout, triangle, a_transpose, 
diagonal, n, ap_buffer, ap_offset, @@ -1003,7 +1003,7 @@ clblasStatus clblasXtpmv(const clblasOrder layout, const clblasUplo tria cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + auto scratch_buffer = Buffer(context, n); return clblasCtpmv(layout, triangle, a_transpose, diagonal, n, ap_buffer, ap_offset, @@ -1020,7 +1020,7 @@ clblasStatus clblasXtpmv(const clblasOrder layout, const clblasUplo tri cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer(context, n*x_inc + x_offset); + auto scratch_buffer = Buffer(context, n); return clblasZtpmv(layout, triangle, a_transpose, diagonal, n, ap_buffer, ap_offset, From 2429ad50251fbb28a7ce2f517b4b81129ada8882 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 30 Mar 2016 11:34:23 +0200 Subject: [PATCH 09/60] Fixed properly passing of OpenCL events to CLBlast functions --- include/internal/clpp11.h | 18 +++++----- src/clblast.cc | 74 +++++++++++++++++++-------------------- 2 files changed, 46 insertions(+), 46 deletions(-) diff --git a/include/internal/clpp11.h b/include/internal/clpp11.h index 104a6436..a705c6b7 100644 --- a/include/internal/clpp11.h +++ b/include/internal/clpp11.h @@ -73,30 +73,30 @@ class Event { public: // Constructor based on the regular OpenCL data-type - explicit Event(const cl_event event): event_(event) { } + explicit Event(cl_event* event): event_(event) { } - // Regular constructor + // Constructor based on a non-existant event explicit Event(): event_(nullptr) { } // Retrieves the elapsed time of the last recorded event. 
Note that no error checking is done on // the 'clGetEventProfilingInfo' function, since there is a bug in Apple's OpenCL implementation: // http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx float GetElapsedTime() const { - CheckError(clWaitForEvents(1, &event_)); + CheckError(clWaitForEvents(1, event_)); auto bytes = size_t{0}; - clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes); + clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes); auto time_start = size_t{0}; - clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr); - clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, 0, nullptr, &bytes); + clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr); + clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, 0, nullptr, &bytes); auto time_end = size_t{0}; - clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr); + clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr); return (time_end - time_start) * 1.0e-6f; } // Accessor to the private data-member - cl_event& operator()() { return event_; } + cl_event& operator()() { return *event_; } private: - cl_event event_; + cl_event* event_; }; // ================================================================================================= diff --git a/src/clblast.cc b/src/clblast.cc index e7f2477f..0b8de40a 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -74,7 +74,7 @@ StatusCode Swap(const size_t n, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(*event); + auto event_cpp = Event(event); auto routine = Xswap(queue_cpp, event_cpp); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } @@ -106,7 +106,7 @@ StatusCode Scal(const size_t n, cl_mem 
x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(*event); + auto event_cpp = Event(event); auto routine = Xscal(queue_cpp, event_cpp); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } @@ -138,7 +138,7 @@ StatusCode Copy(const size_t n, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(*event); + auto event_cpp = Event(event); auto routine = Xcopy(queue_cpp, event_cpp); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } @@ -171,7 +171,7 @@ StatusCode Axpy(const size_t n, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(*event); + auto event_cpp = Event(event); auto routine = Xaxpy(queue_cpp, event_cpp); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } @@ -209,7 +209,7 @@ StatusCode Dot(const size_t n, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(*event); + auto event_cpp = Event(event); auto routine = Xdot(queue_cpp, event_cpp); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } @@ -237,7 +237,7 @@ StatusCode Dotu(const size_t n, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(*event); + auto event_cpp = Event(event); auto routine = Xdotu(queue_cpp, event_cpp); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } @@ -265,7 +265,7 @@ StatusCode Dotc(const size_t n, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, 
cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(*event); + auto event_cpp = Event(event); auto routine = Xdotc(queue_cpp, event_cpp); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } @@ -300,7 +300,7 @@ StatusCode Gemv(const Layout layout, const Transpose a_transpose, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(*event); + auto event_cpp = Event(event); auto routine = Xgemv(queue_cpp, event_cpp); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } @@ -356,7 +356,7 @@ StatusCode Gbmv(const Layout layout, const Transpose a_transpose, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(*event); + auto event_cpp = Event(event); auto routine = Xgbmv(queue_cpp, event_cpp); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } @@ -412,7 +412,7 @@ StatusCode Hemv(const Layout layout, const Triangle triangle, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(*event); + auto event_cpp = Event(event); auto routine = Xhemv(queue_cpp, event_cpp); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } @@ -452,7 +452,7 @@ StatusCode Hbmv(const Layout layout, const Triangle triangle, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(*event); + auto event_cpp = Event(event); auto routine = Xhbmv(queue_cpp, event_cpp); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } @@ -492,7 +492,7 @@ StatusCode Hpmv(const Layout layout, 
const Triangle triangle, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(*event); + auto event_cpp = Event(event); auto routine = Xhpmv(queue_cpp, event_cpp); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } @@ -532,7 +532,7 @@ StatusCode Symv(const Layout layout, const Triangle triangle, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(*event); + auto event_cpp = Event(event); auto routine = Xsymv(queue_cpp, event_cpp); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } @@ -572,7 +572,7 @@ StatusCode Sbmv(const Layout layout, const Triangle triangle, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(*event); + auto event_cpp = Event(event); auto routine = Xsbmv(queue_cpp, event_cpp); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } @@ -612,7 +612,7 @@ StatusCode Spmv(const Layout layout, const Triangle triangle, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(*event); + auto event_cpp = Event(event); auto routine = Xspmv(queue_cpp, event_cpp); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } @@ -649,7 +649,7 @@ StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_ cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(*event); + auto event_cpp = Event(event); auto routine = Xtrmv(queue_cpp, event_cpp); auto status = routine.SetUp(); if (status != 
StatusCode::kSuccess) { return status; } @@ -687,7 +687,7 @@ StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_ cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(*event); + auto event_cpp = Event(event); auto routine = Xtbmv(queue_cpp, event_cpp); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } @@ -725,7 +725,7 @@ StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_ cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(*event); + auto event_cpp = Event(event); auto routine = Xtpmv(queue_cpp, event_cpp); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } @@ -855,7 +855,7 @@ StatusCode Ger(const Layout layout, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(*event); + auto event_cpp = Event(event); auto routine = Xger(queue_cpp, event_cpp); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } @@ -891,7 +891,7 @@ StatusCode Geru(const Layout layout, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(*event); + auto event_cpp = Event(event); auto routine = Xgeru(queue_cpp, event_cpp); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } @@ -927,7 +927,7 @@ StatusCode Gerc(const Layout layout, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(*event); + auto event_cpp = Event(event); auto routine = Xgerc(queue_cpp, event_cpp); auto status 
= routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } @@ -962,7 +962,7 @@ StatusCode Her(const Layout layout, const Triangle triangle, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(*event); + auto event_cpp = Event(event); auto routine = Xher,T>(queue_cpp, event_cpp); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } @@ -994,7 +994,7 @@ StatusCode Hpr(const Layout layout, const Triangle triangle, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(*event); + auto event_cpp = Event(event); auto routine = Xhpr,T>(queue_cpp, event_cpp); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } @@ -1027,7 +1027,7 @@ StatusCode Her2(const Layout layout, const Triangle triangle, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(*event); + auto event_cpp = Event(event); auto routine = Xher2(queue_cpp, event_cpp); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } @@ -1063,7 +1063,7 @@ StatusCode Hpr2(const Layout layout, const Triangle triangle, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(*event); + auto event_cpp = Event(event); auto routine = Xhpr2(queue_cpp, event_cpp); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } @@ -1098,7 +1098,7 @@ StatusCode Syr(const Layout layout, const Triangle triangle, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(*event); + auto event_cpp = Event(event); auto routine 
= Xsyr(queue_cpp, event_cpp); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } @@ -1130,7 +1130,7 @@ StatusCode Spr(const Layout layout, const Triangle triangle, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(*event); + auto event_cpp = Event(event); auto routine = Xspr(queue_cpp, event_cpp); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } @@ -1163,7 +1163,7 @@ StatusCode Syr2(const Layout layout, const Triangle triangle, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(*event); + auto event_cpp = Event(event); auto routine = Xsyr2(queue_cpp, event_cpp); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } @@ -1199,7 +1199,7 @@ StatusCode Spr2(const Layout layout, const Triangle triangle, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(*event); + auto event_cpp = Event(event); auto routine = Xspr2(queue_cpp, event_cpp); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } @@ -1240,7 +1240,7 @@ StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpos cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(*event); + auto event_cpp = Event(event); auto routine = Xgemm(queue_cpp, event_cpp); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } @@ -1296,7 +1296,7 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); 
- auto event_cpp = Event(*event); + auto event_cpp = Event(event); auto routine = Xsymm(queue_cpp, event_cpp); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } @@ -1352,7 +1352,7 @@ StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(*event); + auto event_cpp = Event(event); auto routine = Xhemm(queue_cpp, event_cpp); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } @@ -1391,7 +1391,7 @@ StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_ cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(*event); + auto event_cpp = Event(event); auto routine = Xsyrk(queue_cpp, event_cpp); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } @@ -1441,7 +1441,7 @@ StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_ cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(*event); + auto event_cpp = Event(event); auto routine = Xherk,T>(queue_cpp, event_cpp); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } @@ -1478,7 +1478,7 @@ StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose a cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(*event); + auto event_cpp = Event(event); auto routine = Xsyr2k(queue_cpp, event_cpp); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } @@ -1534,7 +1534,7 @@ StatusCode Her2k(const Layout layout, 
const Triangle triangle, const Transpose a cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(*event); + auto event_cpp = Event(event); auto routine = Xher2k(queue_cpp, event_cpp); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } @@ -1572,7 +1572,7 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, c cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(*event); + auto event_cpp = Event(event); auto routine = Xtrmm(queue_cpp, event_cpp); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } From 6f561abada9abb54987ecea0cf10f547e2e88422 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 30 Mar 2016 15:05:59 +0200 Subject: [PATCH 10/60] Added missing newline to the end of the public API file --- include/internal/public_api.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/internal/public_api.h b/include/internal/public_api.h index 08a55c6a..d0732297 100644 --- a/include/internal/public_api.h +++ b/include/internal/public_api.h @@ -31,4 +31,4 @@ namespace clblast { } // namespace clblast // CLBLAST_PUBLIC_API_H_ -#endif \ No newline at end of file +#endif From 6e5f558746eec09eda6132754649419430a86f41 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 30 Mar 2016 15:31:45 +0200 Subject: [PATCH 11/60] Made event an optional argument in the CLBlast C++ API --- include/clblast.h | 82 +++++++++++++++++----------------- include/internal/clpp11.h | 3 -- scripts/generator/generator.py | 10 ++--- scripts/generator/routine.py | 4 +- 4 files changed, 48 insertions(+), 51 deletions(-) diff --git a/include/clblast.h b/include/clblast.h index 70a3b5bc..2d03b096 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -92,21 +92,21 
@@ template StatusCode Swap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event = nullptr); // Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL template StatusCode Scal(const size_t n, const T alpha, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event = nullptr); // Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY template StatusCode Copy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event = nullptr); // Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY template @@ -114,7 +114,7 @@ StatusCode Axpy(const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event = nullptr); // Dot product of two vectors: SDOT/DDOT template @@ -122,7 +122,7 @@ StatusCode Dot(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event = nullptr); // Dot product of two complex vectors: CDOTU/ZDOTU template @@ -130,7 +130,7 @@ StatusCode Dotu(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event = nullptr); // Dot product of two complex vectors, 
one conjugated: CDOTC/ZDOTC template @@ -138,7 +138,7 @@ StatusCode Dotc(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event = nullptr); // ================================================================================================= // BLAS level-2 (matrix-vector) routines @@ -153,7 +153,7 @@ StatusCode Gemv(const Layout layout, const Transpose a_transpose, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event = nullptr); // General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV template @@ -164,7 +164,7 @@ StatusCode Gbmv(const Layout layout, const Transpose a_transpose, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event = nullptr); // Hermitian matrix-vector multiplication: CHEMV/ZHEMV template @@ -175,7 +175,7 @@ StatusCode Hemv(const Layout layout, const Triangle triangle, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event = nullptr); // Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV template @@ -186,7 +186,7 @@ StatusCode Hbmv(const Layout layout, const Triangle triangle, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, 
cl_event* event = nullptr); // Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV template @@ -197,7 +197,7 @@ StatusCode Hpmv(const Layout layout, const Triangle triangle, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event = nullptr); // Symmetric matrix-vector multiplication: SSYMV/DSYMV template @@ -208,7 +208,7 @@ StatusCode Symv(const Layout layout, const Triangle triangle, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event = nullptr); // Symmetric banded matrix-vector multiplication: SSBMV/DSBMV template @@ -219,7 +219,7 @@ StatusCode Sbmv(const Layout layout, const Triangle triangle, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event = nullptr); // Symmetric packed matrix-vector multiplication: SSPMV/DSPMV template @@ -230,7 +230,7 @@ StatusCode Spmv(const Layout layout, const Triangle triangle, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event = nullptr); // Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV template @@ -238,7 +238,7 @@ StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_ const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event 
= nullptr); // Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV template @@ -246,7 +246,7 @@ StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_ const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event = nullptr); // Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV template @@ -254,7 +254,7 @@ StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_ const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event = nullptr); // Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV template @@ -262,7 +262,7 @@ StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_ const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event = nullptr); // Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV template @@ -270,7 +270,7 @@ StatusCode Tbsv(const Layout layout, const Triangle triangle, const Transpose a_ const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event = nullptr); // Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV template @@ -278,7 +278,7 @@ StatusCode Tpsv(const Layout layout, const Triangle triangle, const Transpose a_ const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const 
size_t x_offset, const size_t x_inc, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event = nullptr); // General rank-1 matrix update: SGER/DGER template @@ -288,7 +288,7 @@ StatusCode Ger(const Layout layout, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event = nullptr); // General rank-1 complex matrix update: CGERU/ZGERU template @@ -298,7 +298,7 @@ StatusCode Geru(const Layout layout, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event = nullptr); // General rank-1 complex conjugated matrix update: CGERC/ZGERC template @@ -308,7 +308,7 @@ StatusCode Gerc(const Layout layout, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event = nullptr); // Hermitian rank-1 matrix update: CHER/ZHER template @@ -317,7 +317,7 @@ StatusCode Her(const Layout layout, const Triangle triangle, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event = nullptr); // Hermitian packed rank-1 matrix update: CHPR/ZHPR template @@ -326,7 +326,7 @@ StatusCode Hpr(const Layout layout, const Triangle triangle, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem ap_buffer, const size_t 
ap_offset, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event = nullptr); // Hermitian rank-2 matrix update: CHER2/ZHER2 template @@ -336,7 +336,7 @@ StatusCode Her2(const Layout layout, const Triangle triangle, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event = nullptr); // Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 template @@ -346,7 +346,7 @@ StatusCode Hpr2(const Layout layout, const Triangle triangle, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event = nullptr); // Symmetric rank-1 matrix update: SSYR/DSYR template @@ -355,7 +355,7 @@ StatusCode Syr(const Layout layout, const Triangle triangle, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event = nullptr); // Symmetric packed rank-1 matrix update: SSPR/DSPR template @@ -364,7 +364,7 @@ StatusCode Spr(const Layout layout, const Triangle triangle, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event = nullptr); // Symmetric rank-2 matrix update: SSYR2/DSYR2 template @@ -374,7 +374,7 @@ StatusCode Syr2(const Layout layout, const Triangle triangle, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t 
a_offset, const size_t a_ld, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event = nullptr); // Symmetric packed rank-2 matrix update: SSPR2/DSPR2 template @@ -384,7 +384,7 @@ StatusCode Spr2(const Layout layout, const Triangle triangle, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem ap_buffer, const size_t ap_offset, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event = nullptr); // ================================================================================================= // BLAS level-3 (matrix-matrix) routines @@ -399,7 +399,7 @@ StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpos const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event = nullptr); // Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM template @@ -410,7 +410,7 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event = nullptr); // Hermitian matrix-matrix multiplication: CHEMM/ZHEMM template @@ -421,7 +421,7 @@ StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event = nullptr); // Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK template @@ -431,7 +431,7 @@ StatusCode Syrk(const Layout layout, const Triangle triangle, 
const Transpose a_ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event = nullptr); // Rank-K update of a hermitian matrix: CHERK/ZHERK template @@ -441,7 +441,7 @@ StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event = nullptr); // Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K template @@ -452,7 +452,7 @@ StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose a const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event = nullptr); // Rank-2K update of a hermitian matrix: CHER2K/ZHER2K template @@ -463,7 +463,7 @@ StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose a const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const U beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event = nullptr); // Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM template @@ -472,7 +472,7 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, c const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event = nullptr); // Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM template @@ -481,7 +481,7 @@ 
StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, c const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event); + cl_command_queue* queue, cl_event* event = nullptr); // ================================================================================================= } // namespace clblast diff --git a/include/internal/clpp11.h b/include/internal/clpp11.h index a705c6b7..aac66396 100644 --- a/include/internal/clpp11.h +++ b/include/internal/clpp11.h @@ -75,9 +75,6 @@ class Event { // Constructor based on the regular OpenCL data-type explicit Event(cl_event* event): event_(event) { } - // Constructor based on a non-existant event - explicit Event(): event_(nullptr) { } - // Retrieves the elapsed time of the last recorded event. Note that no error checking is done on // the 'clGetEventProfilingInfo' function, since there is a bug in Apple's OpenCL implementation: // http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 8ff5e130..5163b1ca 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -151,7 +151,7 @@ def clblast_h(routines): result = "" for routine in routines: result += "\n// "+routine.description+": "+routine.ShortNames()+"\n" - result += routine.RoutineHeaderCPP(12)+";\n" + result += routine.RoutineHeaderCPP(12, " = nullptr")+";\n" return result # The C++ API implementation (.cc) @@ -161,9 +161,9 @@ def clblast_cc(routines): indent1 = " "*(20 + routine.Length()) result += "\n// "+routine.description+": "+routine.ShortNames()+"\n" if routine.implemented: - result += routine.RoutineHeaderCPP(12)+" {\n" + result += routine.RoutineHeaderCPP(12, "")+" {\n" result += " auto queue_cpp = Queue(*queue);\n" - result += " auto event_cpp = Event(*event);\n" + result += " auto 
event_cpp = Event(event);\n" result += " auto routine = X"+routine.name+"<"+routine.template.template+">(queue_cpp, event_cpp);\n" result += " auto status = routine.SetUp();\n" result += " if (status != StatusCode::kSuccess) { return status; }\n" @@ -247,8 +247,8 @@ files = [ path_clblast+"/src/clblast_c.cc", path_clblast+"/test/wrapper_clblas.h", ] -header_lines = [84, 64, 88, 24, 22] -footer_lines = [6, 3, 5, 2, 6] +header_lines = [84, 64, 93, 22, 22] +footer_lines = [6, 3, 9, 2, 6] # Checks whether the command-line arguments are valid; exists otherwise for f in files: diff --git a/scripts/generator/routine.py b/scripts/generator/routine.py index 60b9fcc5..1086cecc 100644 --- a/scripts/generator/routine.py +++ b/scripts/generator/routine.py @@ -308,12 +308,12 @@ class Routine(): # ============================================================================================== # Retrieves the C++ templated definition for a routine - def RoutineHeaderCPP(self, spaces): + def RoutineHeaderCPP(self, spaces, default_event): indent = " "*(spaces + self.Length()) result = "template <"+self.template.name+">\n" result += "StatusCode "+self.name.capitalize()+"(" result += (",\n"+indent).join([a for a in self.ArgumentsDef(self.template)]) - result += ",\n"+indent+"cl_command_queue* queue, cl_event* event)" + result += ",\n"+indent+"cl_command_queue* queue, cl_event* event"+default_event+")" return result # As above, but now without variable names From 6ecc0d089c80296cce3089734771279a30783f81 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 30 Mar 2016 16:17:16 +0200 Subject: [PATCH 12/60] Added prototypes for the xROT and xROTG functions --- include/clblast.h | 17 ++++++ include/clblast_c.h | 26 +++++++++ scripts/generator/generator.py | 4 +- scripts/generator/routine.py | 49 ++++++++++------- src/clblast.cc | 43 +++++++++++++++ src/clblast_c.cc | 56 +++++++++++++++++++ test/correctness/routines/level1/xrot.cc | 26 +++++++++ test/correctness/routines/level1/xrotg.cc 
| 26 +++++++++ test/performance/routines/level1/xrot.cc | 33 ++++++++++++ test/performance/routines/level1/xrotg.cc | 33 ++++++++++++ test/wrapper_clblas.h | 65 +++++++++++++++++++++++ 11 files changed, 357 insertions(+), 21 deletions(-) create mode 100644 test/correctness/routines/level1/xrot.cc create mode 100644 test/correctness/routines/level1/xrotg.cc create mode 100644 test/performance/routines/level1/xrot.cc create mode 100644 test/performance/routines/level1/xrotg.cc diff --git a/include/clblast.h b/include/clblast.h index 2d03b096..a5fd30f8 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -87,6 +87,23 @@ enum class Precision { kHalf = 16, kSingle = 32, kDouble = 64, // BLAS level-1 (vector-vector) routines // ================================================================================================= +// Generate plane rotation: SROTG/DROTG +template +StatusCode Rotg(cl_mem SA_buffer, const size_t SA_offset, + cl_mem SB_buffer, const size_t SB_offset, + cl_mem C_buffer, const size_t C_offset, + cl_mem S_buffer, const size_t S_offset, + cl_command_queue* queue, cl_event* event = nullptr); + +// Apply plane rotation: SROT/DROT +template +StatusCode Rot(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const T C, + const T S, + cl_command_queue* queue, cl_event* event = nullptr); + // Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP template StatusCode Swap(const size_t n, diff --git a/include/clblast_c.h b/include/clblast_c.h index c5395e51..be5bab57 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -96,6 +96,32 @@ typedef enum Precision_ { kHalf = 16, kSingle = 32, kDouble = 64, // BLAS level-1 (vector-vector) routines // ================================================================================================= +// Generate plane rotation: SROTG/DROTG +StatusCode PUBLIC_API CLBlastSrotg(cl_mem SA_buffer, const size_t SA_offset, + cl_mem 
SB_buffer, const size_t SB_offset, + cl_mem C_buffer, const size_t C_offset, + cl_mem S_buffer, const size_t S_offset, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDrotg(cl_mem SA_buffer, const size_t SA_offset, + cl_mem SB_buffer, const size_t SB_offset, + cl_mem C_buffer, const size_t C_offset, + cl_mem S_buffer, const size_t S_offset, + cl_command_queue* queue, cl_event* event); + +// Apply plane rotation: SROT/DROT +StatusCode PUBLIC_API CLBlastSrot(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const float C, + const float S, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDrot(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const double C, + const double S, + cl_command_queue* queue, cl_event* event); + // Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP StatusCode PUBLIC_API CLBlastSswap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 5163b1ca..7191cba1 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -52,8 +52,8 @@ TU = DataType("typename T, typename U", "T,U", ["T", "U", "T", "U"], "T") # for # Populates a list of routines routines = [ [ # Level 1: vector-vector - #Routine(False, "1", "rotg", T, [S,D], [], [], [], [], ["a","b","c","s"], False, "Generate plane rotation"), - #Routine(False, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["c","s"], False, "Apply plane rotation"), + Routine(False, "1", "rotg", T, [S,D], [], [], [], ["SA","SB","C","S"], [], False, "Generate plane rotation"), + Routine(False, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["C","S"], False, "Apply plane rotation"), Routine(True, "1", "swap", T, [S,D,C,Z], ["n"], [], [], ["x","y"], [], False, "Swap 
two vectors"), Routine(True, "1", "scal", T, [S,D,C,Z], ["n"], [], [], ["x"], ["alpha"], False, "Vector scaling"), Routine(True, "1", "copy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], [], False, "Vector copy"), diff --git a/scripts/generator/routine.py b/scripts/generator/routine.py index 1086cecc..d74def25 100644 --- a/scripts/generator/routine.py +++ b/scripts/generator/routine.py @@ -39,9 +39,6 @@ def OptionToWrapper(x): 'diagonal': "clblasDiag", }[x] -# Buffers without 'ld' or 'inc' parameter -NO_LD_INC = ["dot","ap"] - # ================================================================================================== # Class holding routine-specific information (e.g. name, which arguments, which precisions) @@ -61,6 +58,14 @@ class Routine(): self.scratch = scratch # Scratch buffer (e.g. for xDOT) self.description = description + # List of scalar buffers + def ScalarBuffers(self): + return ["SA","SB","C","S","dot"] + + # List of buffers without 'inc' or 'ld' + def BuffersWithoutLdInc(self): + return self.ScalarBuffers() + ["ap"] + # Retrieves the number of characters in the routine's name def Length(self): return len(self.name) @@ -94,7 +99,7 @@ class Routine(): if (name in self.inputs) or (name in self.outputs): a = [name+"_buffer"] b = [name+"_offset"] - c = [name+"_"+self.Postfix(name)] if (name not in NO_LD_INC) else [] + c = [name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else [] return [", ".join(a+b+c)] return [] @@ -104,7 +109,7 @@ class Routine(): if (name in self.inputs) or (name in self.outputs): a = [prefix+"cl_mem "+name+"_buffer"] b = ["const size_t "+name+"_offset"] - c = ["const size_t "+name+"_"+self.Postfix(name)] if (name not in NO_LD_INC) else [] + c = ["const size_t "+name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else [] return [", ".join(a+b+c)] return [] @@ -113,7 +118,7 @@ class Routine(): if (name in self.inputs) or (name in self.outputs): a = 
["Buffer<"+self.template.buffertype+">("+name+"_buffer)"] b = [name+"_offset"] - c = [name+"_"+self.Postfix(name)] if (name not in NO_LD_INC) else [] + c = [name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else [] return [", ".join(a+b+c)] return [] @@ -136,7 +141,7 @@ class Routine(): if (name in self.inputs) or (name in self.outputs): a = [prefix+"cl_mem"] b = ["const size_t"] - c = ["const size_t"] if (name not in NO_LD_INC) else [] + c = ["const size_t"] if (name not in self.BuffersWithoutLdInc()) else [] return [", ".join(a+b+c)] return [] @@ -252,57 +257,63 @@ class Routine(): # Retrieves a combination of all the argument names, with Claduc casts def ArgumentsCladuc(self, flavour, indent): - return (self.Options() + self.Sizes() + self.BufferCladuc("dot") + + return (self.Options() + self.Sizes() + + list(chain(*[self.BufferCladuc(b) for b in self.ScalarBuffers()])) + self.Scalar("alpha") + list(chain(*[self.BufferCladuc(b) for b in self.BuffersFirst()])) + self.Scalar("beta") + list(chain(*[self.BufferCladuc(b) for b in self.BuffersSecond()])) + - list(chain(*[self.Scalar(s) for s in ["d1","d2","a","b","c","s"]]))) + list(chain(*[self.Scalar(s) for s in ["C","S"]]))) # Retrieves a combination of all the argument names, with CLBlast casts def ArgumentsCast(self, flavour, indent): - return (self.OptionsCast(indent) + self.Sizes() + self.Buffer("dot") + + return (self.OptionsCast(indent) + self.Sizes() + + list(chain(*[self.Buffer(b) for b in self.ScalarBuffers()])) + self.ScalarUse("alpha", flavour) + list(chain(*[self.Buffer(b) for b in self.BuffersFirst()])) + self.ScalarUse("beta", flavour) + list(chain(*[self.Buffer(b) for b in self.BuffersSecond()])) + - list(chain(*[self.ScalarUse(s, flavour) for s in ["d1","d2","a","b","c","s"]]))) + list(chain(*[self.ScalarUse(s, flavour) for s in ["C","S"]]))) # As above, but for the clBLAS wrapper def ArgumentsWrapper(self, flavour): - return (self.Options() + self.Sizes() + 
self.BufferWrapper("dot") + + return (self.Options() + self.Sizes() + + list(chain(*[self.BufferWrapper(b) for b in self.ScalarBuffers()])) + self.ScalarUseWrapper("alpha", flavour) + list(chain(*[self.BufferWrapper(b) for b in self.BuffersFirst()])) + self.ScalarUseWrapper("beta", flavour) + list(chain(*[self.BufferWrapper(b) for b in self.BuffersSecond()])) + - list(chain(*[self.ScalarUseWrapper(s, flavour) for s in ["d1","d2","a","b","c","s"]]))) + list(chain(*[self.ScalarUseWrapper(s, flavour) for s in ["C","S"]]))) # Retrieves a combination of all the argument definitions def ArgumentsDef(self, flavour): - return (self.OptionsDef() + self.SizesDef() + self.BufferDef("dot") + + return (self.OptionsDef() + self.SizesDef() + + list(chain(*[self.BufferDef(b) for b in self.ScalarBuffers()])) + self.ScalarDef("alpha", flavour) + list(chain(*[self.BufferDef(b) for b in self.BuffersFirst()])) + self.ScalarDef("beta", flavour) + list(chain(*[self.BufferDef(b) for b in self.BuffersSecond()])) + - list(chain(*[self.ScalarDef(s, flavour) for s in ["d1","d2","a","b","c","s"]]))) + list(chain(*[self.ScalarDef(s, flavour) for s in ["C","S"]]))) # As above, but clBLAS wrapper plain datatypes def ArgumentsDefWrapper(self, flavour): - return (self.OptionsDefWrapper() + self.SizesDef() + self.BufferDef("dot") + + return (self.OptionsDefWrapper() + self.SizesDef() + + list(chain(*[self.BufferDef(b) for b in self.ScalarBuffers()])) + self.ScalarDefPlain("alpha", flavour) + list(chain(*[self.BufferDef(b) for b in self.BuffersFirst()])) + self.ScalarDefPlain("beta", flavour) + list(chain(*[self.BufferDef(b) for b in self.BuffersSecond()])) + - list(chain(*[self.ScalarDefPlain(s, flavour) for s in ["d1","d2","a","b","c","s"]]))) + list(chain(*[self.ScalarDefPlain(s, flavour) for s in ["C","S"]]))) # Retrieves a combination of all the argument types def ArgumentsType(self, flavour): - return (self.OptionsType() + self.SizesType() + self.BufferType("dot") + + return (self.OptionsType() 
+ self.SizesType() + + list(chain(*[self.BufferType(b) for b in self.ScalarBuffers()])) + self.ScalarType("alpha", flavour) + list(chain(*[self.BufferType(b) for b in self.BuffersFirst()])) + self.ScalarType("beta", flavour) + list(chain(*[self.BufferType(b) for b in self.BuffersSecond()])) + - list(chain(*[self.ScalarType(s, flavour) for s in ["d1","d2","a","b","c","s"]]))) + list(chain(*[self.ScalarType(s, flavour) for s in ["C","S"]]))) # ============================================================================================== diff --git a/src/clblast.cc b/src/clblast.cc index 0b8de40a..8f7abfd6 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -67,6 +67,49 @@ namespace clblast { // BLAS level-1 (vector-vector) routines // ================================================================================================= +// Generate plane rotation: SROTG/DROTG +template +StatusCode Rotg(cl_mem, const size_t, + cl_mem, const size_t, + cl_mem, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Rotg(cl_mem, const size_t, + cl_mem, const size_t, + cl_mem, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Rotg(cl_mem, const size_t, + cl_mem, const size_t, + cl_mem, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); + +// Apply plane rotation: SROT/DROT +template +StatusCode Rot(const size_t, + cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + const T, + const T, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Rot(const size_t, + cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + const float, + const float, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Rot(const size_t, + cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + const double, + const 
double, + cl_command_queue*, cl_event*); + // Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP template StatusCode Swap(const size_t n, diff --git a/src/clblast_c.cc b/src/clblast_c.cc index 66d16f6d..d36b2695 100644 --- a/src/clblast_c.cc +++ b/src/clblast_c.cc @@ -25,6 +25,62 @@ using double2 = clblast::double2; // BLAS level-1 (vector-vector) routines // ================================================================================================= +// ROTG +StatusCode CLBlastSrotg(cl_mem SA_buffer, const size_t SA_offset, + cl_mem SB_buffer, const size_t SB_offset, + cl_mem C_buffer, const size_t C_offset, + cl_mem S_buffer, const size_t S_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Rotg(SA_buffer, SA_offset, + SB_buffer, SB_offset, + C_buffer, C_offset, + S_buffer, S_offset, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDrotg(cl_mem SA_buffer, const size_t SA_offset, + cl_mem SB_buffer, const size_t SB_offset, + cl_mem C_buffer, const size_t C_offset, + cl_mem S_buffer, const size_t S_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Rotg(SA_buffer, SA_offset, + SB_buffer, SB_offset, + C_buffer, C_offset, + S_buffer, S_offset, + queue, event); + return static_cast(status); +} + +// ROT +StatusCode CLBlastSrot(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const float C, + const float S, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Rot(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + C, + S, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDrot(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const double C, + const double S, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Rot(n, + x_buffer, x_offset, x_inc, + 
y_buffer, y_offset, y_inc, + C, + S, + queue, event); + return static_cast(status); +} + // SWAP StatusCode CLBlastSswap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, diff --git a/test/correctness/routines/level1/xrot.cc b/test/correctness/routines/level1/xrot.cc new file mode 100644 index 00000000..4020ff13 --- /dev/null +++ b/test/correctness/routines/level1/xrot.cc @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level1/xrot.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "SROT"); + clblast::RunTests, double, double>(argc, argv, true, "DROT"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level1/xrotg.cc b/test/correctness/routines/level1/xrotg.cc new file mode 100644 index 00000000..dd068992 --- /dev/null +++ b/test/correctness/routines/level1/xrotg.cc @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level1/xrotg.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "SROTG"); + clblast::RunTests, double, double>(argc, argv, true, "DROTG"); + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level1/xrot.cc b/test/performance/routines/level1/xrot.cc new file mode 100644 index 00000000..3ff59ace --- /dev/null +++ b/test/performance/routines/level1/xrot.cc @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level1/xrot.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level1/xrotg.cc b/test/performance/routines/level1/xrotg.cc new file mode 100644 index 00000000..0320c314 --- /dev/null +++ b/test/performance/routines/level1/xrotg.cc @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level1/xrotg.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); + } + return 0; +} + +// ================================================================================================= diff --git a/test/wrapper_clblas.h b/test/wrapper_clblas.h index 23a02a45..553e3e66 100644 --- a/test/wrapper_clblas.h +++ b/test/wrapper_clblas.h @@ -25,6 +25,71 @@ namespace clblast { // BLAS level-1 (vector-vector) routines // ================================================================================================= +// Forwards the clBLAS calls for SROTG/DROTG +template +clblasStatus clblasXrotg(cl_mem SA_buffer, const size_t SA_offset, + cl_mem SB_buffer, const size_t SB_offset, + cl_mem C_buffer, const size_t C_offset, + cl_mem S_buffer, const size_t S_offset, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); +template <> +clblasStatus clblasXrotg(cl_mem SA_buffer, const size_t SA_offset, + cl_mem SB_buffer, const size_t SB_offset, + cl_mem C_buffer, const size_t C_offset, + cl_mem S_buffer, const size_t S_offset, 
+ cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSrotg(SA_buffer, SA_offset, + SB_buffer, SB_offset, + C_buffer, C_offset, + S_buffer, S_offset, + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXrotg(cl_mem SA_buffer, const size_t SA_offset, + cl_mem SB_buffer, const size_t SB_offset, + cl_mem C_buffer, const size_t C_offset, + cl_mem S_buffer, const size_t S_offset, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDrotg(SA_buffer, SA_offset, + SB_buffer, SB_offset, + C_buffer, C_offset, + S_buffer, S_offset, + num_queues, queues, num_wait_events, wait_events, events); +} + +// Forwards the clBLAS calls for SROT/DROT +clblasStatus clblasXrot(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const float C, + const float S, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSrot(n, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + C, + S, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXrot(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const double C, + const double S, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDrot(n, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + C, + S, + num_queues, queues, num_wait_events, wait_events, events); +} + // Forwards the clBLAS calls for SSWAP/DSWAP/CSWAP/ZSWAP template clblasStatus clblasXswap(const size_t n, From 
c1df78676471a8a26ea8ec5a092734566d490db4 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 30 Mar 2016 17:32:13 +0200 Subject: [PATCH 13/60] Added prototypes for the xROTM and xROTMG routines --- include/clblast.h | 33 ++++-- include/clblast_c.h | 54 ++++++--- scripts/generator/generator.py | 6 +- scripts/generator/routine.py | 42 ++++--- src/clblast.cc | 47 +++++++- src/clblast_c.cc | 104 ++++++++++++---- test/correctness/routines/level1/xrotm.cc | 26 ++++ test/correctness/routines/level1/xrotmg.cc | 26 ++++ test/performance/routines/level1/xrotm.cc | 33 ++++++ test/performance/routines/level1/xrotmg.cc | 33 ++++++ test/wrapper_clblas.h | 131 ++++++++++++++++----- 11 files changed, 442 insertions(+), 93 deletions(-) create mode 100644 test/correctness/routines/level1/xrotm.cc create mode 100644 test/correctness/routines/level1/xrotmg.cc create mode 100644 test/performance/routines/level1/xrotm.cc create mode 100644 test/performance/routines/level1/xrotmg.cc diff --git a/include/clblast.h b/include/clblast.h index a5fd30f8..ac16188f 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -87,23 +87,40 @@ enum class Precision { kHalf = 16, kSingle = 32, kDouble = 64, // BLAS level-1 (vector-vector) routines // ================================================================================================= -// Generate plane rotation: SROTG/DROTG +// Generate givens plane rotation: SROTG/DROTG template -StatusCode Rotg(cl_mem SA_buffer, const size_t SA_offset, - cl_mem SB_buffer, const size_t SB_offset, - cl_mem C_buffer, const size_t C_offset, - cl_mem S_buffer, const size_t S_offset, +StatusCode Rotg(cl_mem sa_buffer, const size_t sa_offset, + cl_mem sb_buffer, const size_t sb_offset, + cl_mem sc_buffer, const size_t sc_offset, + cl_mem ss_buffer, const size_t ss_offset, cl_command_queue* queue, cl_event* event = nullptr); -// Apply plane rotation: SROT/DROT +// Generate modified givens plane rotation: SROTMG/DROTMG +template +StatusCode Rotmg(cl_mem 
sd1_buffer, const size_t sd1_offset, + cl_mem sd2_buffer, const size_t sd2_offset, + cl_mem sx1_buffer, const size_t sx1_offset, + cl_mem sy1_buffer, const size_t sy1_offset, + cl_mem sparam_buffer, const size_t sparam_offset, + cl_command_queue* queue, cl_event* event = nullptr); + +// Apply givens plane rotation: SROT/DROT template StatusCode Rot(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - const T C, - const T S, + const T cos, + const T sin, cl_command_queue* queue, cl_event* event = nullptr); +// Apply modified givens plane rotation: SROTM/DROTM +template +StatusCode Rotm(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem sparam_buffer, const size_t sparam_offset, + cl_command_queue* queue, cl_event* event = nullptr); + // Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP template StatusCode Swap(const size_t n, diff --git a/include/clblast_c.h b/include/clblast_c.h index be5bab57..a5563951 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -96,32 +96,58 @@ typedef enum Precision_ { kHalf = 16, kSingle = 32, kDouble = 64, // BLAS level-1 (vector-vector) routines // ================================================================================================= -// Generate plane rotation: SROTG/DROTG -StatusCode PUBLIC_API CLBlastSrotg(cl_mem SA_buffer, const size_t SA_offset, - cl_mem SB_buffer, const size_t SB_offset, - cl_mem C_buffer, const size_t C_offset, - cl_mem S_buffer, const size_t S_offset, +// Generate givens plane rotation: SROTG/DROTG +StatusCode PUBLIC_API CLBlastSrotg(cl_mem sa_buffer, const size_t sa_offset, + cl_mem sb_buffer, const size_t sb_offset, + cl_mem sc_buffer, const size_t sc_offset, + cl_mem ss_buffer, const size_t ss_offset, cl_command_queue* queue, cl_event* event); -StatusCode PUBLIC_API CLBlastDrotg(cl_mem SA_buffer, const size_t 
SA_offset, - cl_mem SB_buffer, const size_t SB_offset, - cl_mem C_buffer, const size_t C_offset, - cl_mem S_buffer, const size_t S_offset, +StatusCode PUBLIC_API CLBlastDrotg(cl_mem sa_buffer, const size_t sa_offset, + cl_mem sb_buffer, const size_t sb_offset, + cl_mem sc_buffer, const size_t sc_offset, + cl_mem ss_buffer, const size_t ss_offset, cl_command_queue* queue, cl_event* event); -// Apply plane rotation: SROT/DROT +// Generate modified givens plane rotation: SROTMG/DROTMG +StatusCode PUBLIC_API CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset, + cl_mem sd2_buffer, const size_t sd2_offset, + cl_mem sx1_buffer, const size_t sx1_offset, + cl_mem sy1_buffer, const size_t sy1_offset, + cl_mem sparam_buffer, const size_t sparam_offset, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDrotmg(cl_mem sd1_buffer, const size_t sd1_offset, + cl_mem sd2_buffer, const size_t sd2_offset, + cl_mem sx1_buffer, const size_t sx1_offset, + cl_mem sy1_buffer, const size_t sy1_offset, + cl_mem sparam_buffer, const size_t sparam_offset, + cl_command_queue* queue, cl_event* event); + +// Apply givens plane rotation: SROT/DROT StatusCode PUBLIC_API CLBlastSrot(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - const float C, - const float S, + const float cos, + const float sin, cl_command_queue* queue, cl_event* event); StatusCode PUBLIC_API CLBlastDrot(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - const double C, - const double S, + const double cos, + const double sin, cl_command_queue* queue, cl_event* event); +// Apply modified givens plane rotation: SROTM/DROTM +StatusCode PUBLIC_API CLBlastSrotm(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem sparam_buffer, const 
size_t sparam_offset, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDrotm(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem sparam_buffer, const size_t sparam_offset, + cl_command_queue* queue, cl_event* event); + // Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP StatusCode PUBLIC_API CLBlastSswap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 7191cba1..1eada753 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -52,8 +52,10 @@ TU = DataType("typename T, typename U", "T,U", ["T", "U", "T", "U"], "T") # for # Populates a list of routines routines = [ [ # Level 1: vector-vector - Routine(False, "1", "rotg", T, [S,D], [], [], [], ["SA","SB","C","S"], [], False, "Generate plane rotation"), - Routine(False, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["C","S"], False, "Apply plane rotation"), + Routine(False, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], [], False, "Generate givens plane rotation"), + Routine(False, "1", "rotmg", T, [S,D], [], [], [], ["sd1","sd2","sx1","sy1","sparam"], [], False, "Generate modified givens plane rotation"), + Routine(False, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["cos","sin"], False, "Apply givens plane rotation"), + Routine(False, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [], False, "Apply modified givens plane rotation"), Routine(True, "1", "swap", T, [S,D,C,Z], ["n"], [], [], ["x","y"], [], False, "Swap two vectors"), Routine(True, "1", "scal", T, [S,D,C,Z], ["n"], [], [], ["x"], ["alpha"], False, "Vector scaling"), Routine(True, "1", "copy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], [], False, "Vector copy"), diff --git a/scripts/generator/routine.py b/scripts/generator/routine.py index d74def25..0a61490b 100644 --- 
a/scripts/generator/routine.py +++ b/scripts/generator/routine.py @@ -59,12 +59,18 @@ class Routine(): self.description = description # List of scalar buffers - def ScalarBuffers(self): - return ["SA","SB","C","S","dot"] + def ScalarBuffersFirst(self): + return ["dot"] + def ScalarBuffersSecond(self): + return ["sa","sb","sc","ss","sd1","sd2","sx1","sy1","sparam"] + + # List of scalars other than alpha and beta + def OtherScalars(self): + return ["cos","sin"] # List of buffers without 'inc' or 'ld' def BuffersWithoutLdInc(self): - return self.ScalarBuffers() + ["ap"] + return self.ScalarBuffersFirst() + self.ScalarBuffersSecond() + ["ap"] # Retrieves the number of characters in the routine's name def Length(self): @@ -258,62 +264,68 @@ class Routine(): # Retrieves a combination of all the argument names, with Claduc casts def ArgumentsCladuc(self, flavour, indent): return (self.Options() + self.Sizes() + - list(chain(*[self.BufferCladuc(b) for b in self.ScalarBuffers()])) + + list(chain(*[self.BufferCladuc(b) for b in self.ScalarBuffersFirst()])) + self.Scalar("alpha") + list(chain(*[self.BufferCladuc(b) for b in self.BuffersFirst()])) + self.Scalar("beta") + list(chain(*[self.BufferCladuc(b) for b in self.BuffersSecond()])) + - list(chain(*[self.Scalar(s) for s in ["C","S"]]))) + list(chain(*[self.BufferCladuc(b) for b in self.ScalarBuffersSecond()])) + + list(chain(*[self.Scalar(s) for s in self.OtherScalars()]))) # Retrieves a combination of all the argument names, with CLBlast casts def ArgumentsCast(self, flavour, indent): return (self.OptionsCast(indent) + self.Sizes() + - list(chain(*[self.Buffer(b) for b in self.ScalarBuffers()])) + + list(chain(*[self.Buffer(b) for b in self.ScalarBuffersFirst()])) + self.ScalarUse("alpha", flavour) + list(chain(*[self.Buffer(b) for b in self.BuffersFirst()])) + self.ScalarUse("beta", flavour) + list(chain(*[self.Buffer(b) for b in self.BuffersSecond()])) + - list(chain(*[self.ScalarUse(s, flavour) for s in ["C","S"]]))) + 
list(chain(*[self.Buffer(b) for b in self.ScalarBuffersSecond()])) + + list(chain(*[self.ScalarUse(s, flavour) for s in self.OtherScalars()]))) # As above, but for the clBLAS wrapper def ArgumentsWrapper(self, flavour): return (self.Options() + self.Sizes() + - list(chain(*[self.BufferWrapper(b) for b in self.ScalarBuffers()])) + + list(chain(*[self.BufferWrapper(b) for b in self.ScalarBuffersFirst()])) + self.ScalarUseWrapper("alpha", flavour) + list(chain(*[self.BufferWrapper(b) for b in self.BuffersFirst()])) + self.ScalarUseWrapper("beta", flavour) + list(chain(*[self.BufferWrapper(b) for b in self.BuffersSecond()])) + - list(chain(*[self.ScalarUseWrapper(s, flavour) for s in ["C","S"]]))) + list(chain(*[self.BufferWrapper(b) for b in self.ScalarBuffersSecond()])) + + list(chain(*[self.ScalarUseWrapper(s, flavour) for s in self.OtherScalars()]))) # Retrieves a combination of all the argument definitions def ArgumentsDef(self, flavour): return (self.OptionsDef() + self.SizesDef() + - list(chain(*[self.BufferDef(b) for b in self.ScalarBuffers()])) + + list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersFirst()])) + self.ScalarDef("alpha", flavour) + list(chain(*[self.BufferDef(b) for b in self.BuffersFirst()])) + self.ScalarDef("beta", flavour) + list(chain(*[self.BufferDef(b) for b in self.BuffersSecond()])) + - list(chain(*[self.ScalarDef(s, flavour) for s in ["C","S"]]))) + list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersSecond()])) + + list(chain(*[self.ScalarDef(s, flavour) for s in self.OtherScalars()]))) # As above, but clBLAS wrapper plain datatypes def ArgumentsDefWrapper(self, flavour): return (self.OptionsDefWrapper() + self.SizesDef() + - list(chain(*[self.BufferDef(b) for b in self.ScalarBuffers()])) + + list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersFirst()])) + self.ScalarDefPlain("alpha", flavour) + list(chain(*[self.BufferDef(b) for b in self.BuffersFirst()])) + self.ScalarDefPlain("beta", flavour) + 
list(chain(*[self.BufferDef(b) for b in self.BuffersSecond()])) + - list(chain(*[self.ScalarDefPlain(s, flavour) for s in ["C","S"]]))) + list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersSecond()])) + + list(chain(*[self.ScalarDefPlain(s, flavour) for s in self.OtherScalars()]))) # Retrieves a combination of all the argument types def ArgumentsType(self, flavour): return (self.OptionsType() + self.SizesType() + - list(chain(*[self.BufferType(b) for b in self.ScalarBuffers()])) + + list(chain(*[self.BufferType(b) for b in self.ScalarBuffersFirst()])) + self.ScalarType("alpha", flavour) + list(chain(*[self.BufferType(b) for b in self.BuffersFirst()])) + self.ScalarType("beta", flavour) + list(chain(*[self.BufferType(b) for b in self.BuffersSecond()])) + - list(chain(*[self.ScalarType(s, flavour) for s in ["C","S"]]))) + list(chain(*[self.BufferType(b) for b in self.ScalarBuffersSecond()])) + + list(chain(*[self.ScalarType(s, flavour) for s in self.OtherScalars()]))) # ============================================================================================== diff --git a/src/clblast.cc b/src/clblast.cc index 8f7abfd6..1b2c3a12 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -67,7 +67,7 @@ namespace clblast { // BLAS level-1 (vector-vector) routines // ================================================================================================= -// Generate plane rotation: SROTG/DROTG +// Generate givens plane rotation: SROTG/DROTG template StatusCode Rotg(cl_mem, const size_t, cl_mem, const size_t, @@ -87,7 +87,30 @@ template StatusCode PUBLIC_API Rotg(cl_mem, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*); -// Apply plane rotation: SROT/DROT +// Generate modified givens plane rotation: SROTMG/DROTMG +template +StatusCode Rotmg(cl_mem, const size_t, + cl_mem, const size_t, + cl_mem, const size_t, + cl_mem, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template 
StatusCode PUBLIC_API Rotmg(cl_mem, const size_t, + cl_mem, const size_t, + cl_mem, const size_t, + cl_mem, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Rotmg(cl_mem, const size_t, + cl_mem, const size_t, + cl_mem, const size_t, + cl_mem, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); + +// Apply givens plane rotation: SROT/DROT template StatusCode Rot(const size_t, cl_mem, const size_t, const size_t, @@ -110,6 +133,26 @@ template StatusCode PUBLIC_API Rot(const size_t, const double, cl_command_queue*, cl_event*); +// Apply modified givens plane rotation: SROTM/DROTM +template +StatusCode Rotm(const size_t, + cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Rotm(const size_t, + cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Rotm(const size_t, + cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); + // Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP template StatusCode Swap(const size_t n, diff --git a/src/clblast_c.cc b/src/clblast_c.cc index d36b2695..b530732c 100644 --- a/src/clblast_c.cc +++ b/src/clblast_c.cc @@ -26,61 +26,117 @@ using double2 = clblast::double2; // ================================================================================================= // ROTG -StatusCode CLBlastSrotg(cl_mem SA_buffer, const size_t SA_offset, - cl_mem SB_buffer, const size_t SB_offset, - cl_mem C_buffer, const size_t C_offset, - cl_mem S_buffer, const size_t S_offset, +StatusCode CLBlastSrotg(cl_mem sa_buffer, const size_t sa_offset, + cl_mem sb_buffer, const size_t sb_offset, + cl_mem sc_buffer, const size_t sc_offset, + cl_mem ss_buffer, const 
size_t ss_offset, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Rotg(SA_buffer, SA_offset, - SB_buffer, SB_offset, - C_buffer, C_offset, - S_buffer, S_offset, + auto status = clblast::Rotg(sa_buffer, sa_offset, + sb_buffer, sb_offset, + sc_buffer, sc_offset, + ss_buffer, ss_offset, queue, event); return static_cast(status); } -StatusCode CLBlastDrotg(cl_mem SA_buffer, const size_t SA_offset, - cl_mem SB_buffer, const size_t SB_offset, - cl_mem C_buffer, const size_t C_offset, - cl_mem S_buffer, const size_t S_offset, +StatusCode CLBlastDrotg(cl_mem sa_buffer, const size_t sa_offset, + cl_mem sb_buffer, const size_t sb_offset, + cl_mem sc_buffer, const size_t sc_offset, + cl_mem ss_buffer, const size_t ss_offset, cl_command_queue* queue, cl_event* event) { - auto status = clblast::Rotg(SA_buffer, SA_offset, - SB_buffer, SB_offset, - C_buffer, C_offset, - S_buffer, S_offset, + auto status = clblast::Rotg(sa_buffer, sa_offset, + sb_buffer, sb_offset, + sc_buffer, sc_offset, + ss_buffer, ss_offset, queue, event); return static_cast(status); } +// ROTMG +StatusCode CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset, + cl_mem sd2_buffer, const size_t sd2_offset, + cl_mem sx1_buffer, const size_t sx1_offset, + cl_mem sy1_buffer, const size_t sy1_offset, + cl_mem sparam_buffer, const size_t sparam_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Rotmg(sd1_buffer, sd1_offset, + sd2_buffer, sd2_offset, + sx1_buffer, sx1_offset, + sy1_buffer, sy1_offset, + sparam_buffer, sparam_offset, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDrotmg(cl_mem sd1_buffer, const size_t sd1_offset, + cl_mem sd2_buffer, const size_t sd2_offset, + cl_mem sx1_buffer, const size_t sx1_offset, + cl_mem sy1_buffer, const size_t sy1_offset, + cl_mem sparam_buffer, const size_t sparam_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Rotmg(sd1_buffer, sd1_offset, + sd2_buffer, sd2_offset, + 
sx1_buffer, sx1_offset, + sy1_buffer, sy1_offset, + sparam_buffer, sparam_offset, + queue, event); + return static_cast(status); +} + // ROT StatusCode CLBlastSrot(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - const float C, - const float S, + const float cos, + const float sin, cl_command_queue* queue, cl_event* event) { auto status = clblast::Rot(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, - C, - S, + cos, + sin, queue, event); return static_cast(status); } StatusCode CLBlastDrot(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - const double C, - const double S, + const double cos, + const double sin, cl_command_queue* queue, cl_event* event) { auto status = clblast::Rot(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, - C, - S, + cos, + sin, queue, event); return static_cast(status); } +// ROTM +StatusCode CLBlastSrotm(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem sparam_buffer, const size_t sparam_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Rotm(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + sparam_buffer, sparam_offset, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDrotm(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem sparam_buffer, const size_t sparam_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Rotm(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + sparam_buffer, sparam_offset, + queue, event); + return static_cast(status); +} + // SWAP StatusCode CLBlastSswap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, diff 
--git a/test/correctness/routines/level1/xrotm.cc b/test/correctness/routines/level1/xrotm.cc new file mode 100644 index 00000000..869056ef --- /dev/null +++ b/test/correctness/routines/level1/xrotm.cc @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level1/xrotm.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "SROTM"); + clblast::RunTests, double, double>(argc, argv, true, "DROTM"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/level1/xrotmg.cc b/test/correctness/routines/level1/xrotmg.cc new file mode 100644 index 00000000..29f8b0e1 --- /dev/null +++ b/test/correctness/routines/level1/xrotmg.cc @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level1/xrotmg.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "SROTMG"); + clblast::RunTests, double, double>(argc, argv, true, "DROTMG"); + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level1/xrotm.cc b/test/performance/routines/level1/xrotm.cc new file mode 100644 index 00000000..7af94d0f --- /dev/null +++ b/test/performance/routines/level1/xrotm.cc @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level1/xrotm.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); + } + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level1/xrotmg.cc b/test/performance/routines/level1/xrotmg.cc new file mode 100644 index 00000000..a326347b --- /dev/null +++ b/test/performance/routines/level1/xrotmg.cc @@ -0,0 +1,33 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level1/xrotmg.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); + } + return 0; +} + +// ================================================================================================= diff --git a/test/wrapper_clblas.h b/test/wrapper_clblas.h index 553e3e66..259aa27c 100644 --- a/test/wrapper_clblas.h +++ b/test/wrapper_clblas.h @@ -27,69 +27,144 @@ namespace clblast { // Forwards the clBLAS calls for SROTG/DROTG template -clblasStatus clblasXrotg(cl_mem SA_buffer, const size_t SA_offset, - cl_mem SB_buffer, const size_t SB_offset, - cl_mem C_buffer, const size_t C_offset, - cl_mem S_buffer, const size_t S_offset, +clblasStatus clblasXrotg(cl_mem sa_buffer, const size_t sa_offset, + cl_mem sb_buffer, const size_t sb_offset, + cl_mem sc_buffer, const size_t sc_offset, + cl_mem ss_buffer, const size_t ss_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> -clblasStatus clblasXrotg(cl_mem SA_buffer, const size_t SA_offset, - cl_mem SB_buffer, const size_t SB_offset, - cl_mem C_buffer, const 
size_t C_offset, - cl_mem S_buffer, const size_t S_offset, +clblasStatus clblasXrotg(cl_mem sa_buffer, const size_t sa_offset, + cl_mem sb_buffer, const size_t sb_offset, + cl_mem sc_buffer, const size_t sc_offset, + cl_mem ss_buffer, const size_t ss_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasSrotg(SA_buffer, SA_offset, - SB_buffer, SB_offset, - C_buffer, C_offset, - S_buffer, S_offset, + return clblasSrotg(sa_buffer, sa_offset, + sb_buffer, sb_offset, + sc_buffer, sc_offset, + ss_buffer, ss_offset, num_queues, queues, num_wait_events, wait_events, events); } template <> -clblasStatus clblasXrotg(cl_mem SA_buffer, const size_t SA_offset, - cl_mem SB_buffer, const size_t SB_offset, - cl_mem C_buffer, const size_t C_offset, - cl_mem S_buffer, const size_t S_offset, +clblasStatus clblasXrotg(cl_mem sa_buffer, const size_t sa_offset, + cl_mem sb_buffer, const size_t sb_offset, + cl_mem sc_buffer, const size_t sc_offset, + cl_mem ss_buffer, const size_t ss_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasDrotg(SA_buffer, SA_offset, - SB_buffer, SB_offset, - C_buffer, C_offset, - S_buffer, S_offset, + return clblasDrotg(sa_buffer, sa_offset, + sb_buffer, sb_offset, + sc_buffer, sc_offset, + ss_buffer, ss_offset, num_queues, queues, num_wait_events, wait_events, events); } +// Forwards the clBLAS calls for SROTMG/DROTMG +template +clblasStatus clblasXrotmg(cl_mem sd1_buffer, const size_t sd1_offset, + cl_mem sd2_buffer, const size_t sd2_offset, + cl_mem sx1_buffer, const size_t sx1_offset, + cl_mem sy1_buffer, const size_t sy1_offset, + cl_mem sparam_buffer, const size_t sparam_offset, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); +template <> +clblasStatus clblasXrotmg(cl_mem sd1_buffer, const size_t 
sd1_offset, + cl_mem sd2_buffer, const size_t sd2_offset, + cl_mem sx1_buffer, const size_t sx1_offset, + cl_mem sy1_buffer, const size_t sy1_offset, + cl_mem sparam_buffer, const size_t sparam_offset, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSrotmg(sd1_buffer, sd1_offset, + sd2_buffer, sd2_offset, + sx1_buffer, sx1_offset, + sy1_buffer, sy1_offset, + sparam_buffer, sparam_offset, + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXrotmg(cl_mem sd1_buffer, const size_t sd1_offset, + cl_mem sd2_buffer, const size_t sd2_offset, + cl_mem sx1_buffer, const size_t sx1_offset, + cl_mem sy1_buffer, const size_t sy1_offset, + cl_mem sparam_buffer, const size_t sparam_offset, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDrotmg(sd1_buffer, sd1_offset, + sd2_buffer, sd2_offset, + sx1_buffer, sx1_offset, + sy1_buffer, sy1_offset, + sparam_buffer, sparam_offset, + num_queues, queues, num_wait_events, wait_events, events); +} + // Forwards the clBLAS calls for SROT/DROT clblasStatus clblasXrot(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - const float C, - const float S, + const float cos, + const float sin, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSrot(n, x_buffer, x_offset, static_cast(x_inc), y_buffer, y_offset, static_cast(y_inc), - C, - S, + cos, + sin, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXrot(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - const double C, - const double S, + const double cos, + const double sin, cl_uint 
num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDrot(n, x_buffer, x_offset, static_cast(x_inc), y_buffer, y_offset, static_cast(y_inc), - C, - S, + cos, + sin, num_queues, queues, num_wait_events, wait_events, events); } +// Forwards the clBLAS calls for SROTM/DROTM +template +clblasStatus clblasXrotm(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem sparam_buffer, const size_t sparam_offset, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); +template <> +clblasStatus clblasXrotm(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem sparam_buffer, const size_t sparam_offset, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSrotm(n, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + sparam_buffer, sparam_offset, + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXrotm(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem sparam_buffer, const size_t sparam_offset, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDrotm(n, + x_buffer, x_offset, static_cast(x_inc), + y_buffer, y_offset, static_cast(y_inc), + sparam_buffer, sparam_offset, + num_queues, queues, num_wait_events, wait_events, events); +} + // Forwards the clBLAS calls for SSWAP/DSWAP/CSWAP/ZSWAP template clblasStatus clblasXswap(const size_t n, From 6578102ae996ce0aa52b45704f38c1cd5a10d3c0 Mon Sep 17 00:00:00 2001 From: 
cnugteren Date: Wed, 30 Mar 2016 16:24:38 -0700 Subject: [PATCH 14/60] CMake now downloads the cl.hpp header from the Khronos website when building the samples --- .gitignore | 3 ++- CMakeLists.txt | 3 +++ samples/sgemm.cc | 6 +++--- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 6bc958fc..bcb32754 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,5 @@ build stash .* *.pyc -*.db \ No newline at end of file +*.db +cl.hpp \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 65debdf4..d3ad6889 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -164,6 +164,9 @@ endif() # This section contains all the code related to the examples if(SAMPLES) + # Downloads the cl.hpp file from Khronos + file(DOWNLOAD https://www.khronos.org/registry/cl/api/1.1/cl.hpp ${clblast_SOURCE_DIR}/samples/cl.hpp) + # Adds sample programs (C++) foreach(SAMPLE ${SAMPLE_PROGRAMS_CPP}) add_executable(clblast_sample_${SAMPLE} samples/${SAMPLE}.cc) diff --git a/samples/sgemm.cc b/samples/sgemm.cc index 8f33b6ad..785b051c 100644 --- a/samples/sgemm.cc +++ b/samples/sgemm.cc @@ -8,8 +8,8 @@ // Cedric Nugteren // // This file demonstrates the use of the SGEMM routine. It is a stand-alone example, but it does -// requires the Khronos C++ OpenCL API header file (not included). The example uses C++ features, -// but CLBlast can also be used using the regular C-style OpenCL API. +// require the Khronos C++ OpenCL API header file (downloaded by CMake). The example uses C++ +// features, but CLBlast can also be used using the regular C-style OpenCL API. // // Note that this example is meant for illustration purposes only. CLBlast provides other programs // for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx'). @@ -22,7 +22,7 @@ // Includes the C++ OpenCL API. 
If not yet available, it can be found here: // https://www.khronos.org/registry/cl/api/1.1/cl.hpp -#include +#include "cl.hpp" // Includes the CLBlast library #include From 5409f349a17f60ba68133fd0cc9789fb2918f790 Mon Sep 17 00:00:00 2001 From: cnugteren Date: Wed, 30 Mar 2016 21:32:04 -0700 Subject: [PATCH 15/60] Fixed the nrm2 kernel for complex data-types --- src/kernels/level1/xnrm2.opencl | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/src/kernels/level1/xnrm2.opencl b/src/kernels/level1/xnrm2.opencl index c50d7d63..cf579457 100644 --- a/src/kernels/level1/xnrm2.opencl +++ b/src/kernels/level1/xnrm2.opencl @@ -70,21 +70,6 @@ __kernel void Xnrm2(const int n, // ================================================================================================= -// Computes the square root -inline real SquareRoot(const real z) { - #if PRECISION == 3232 || PRECISION == 6464 - double r = sqrt(z.x * z.x + z.y * z.y); - real zpr; zpr.x = z.x + r; zpr.y = z.y; - double zprabs = sqrt(zpr.x * zpr.x + zpr.y + zpr.y); - real result; - result.x = sqrt(r) * zpr.x / zprabs; - result.y = sqrt(r) * zpr.y / zprabs; - return result; - #else - return sqrt(z); - #endif -} - // The epilogue reduction kernel, performing the final bit of the sum operation. This kernel has to // be launched with a single workgroup only. 
__attribute__((reqd_work_group_size(WGS2, 1, 1))) @@ -108,7 +93,11 @@ __kernel void Xnrm2Epilogue(const __global real* restrict input, // Computes the square root and stores the final result if (lid == 0) { - nrm2[nrm2_offset] = SquareRoot(lm[0]); + #if PRECISION == 3232 || PRECISION == 6464 + nrm2[nrm2_offset].x = sqrt(lm[0].x); // the result is a non-complex number + #else + nrm2[nrm2_offset] = sqrt(lm[0]); + #endif } } From 8217b017028412594f663a66187f99c3ee0878c9 Mon Sep 17 00:00:00 2001 From: cnugteren Date: Thu, 31 Mar 2016 20:20:32 -0700 Subject: [PATCH 16/60] Updated the documentation --- CHANGELOG | 3 +++ README.md | 9 ++++----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index f45a35dd..c52e041d 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,9 @@ Development version (next release) - Added exports to be able to create a DLL on Windows (thanks to Marco Hutter) - Made the library thread-safe +- Fixed the use of events within the library +- Added level-1 routines: + * SNRM2/DNRM2/ScNRM2/DzNRM2 Version 0.6.0 - Added support for MSVC (Visual Studio) 2015 diff --git a/README.md b/README.md index d3d77c34..ac614026 100644 --- a/README.md +++ b/README.md @@ -170,14 +170,14 @@ CLBlast is in active development but already supports almost all the BLAS routin | xROT | | | - | - | | | xROTM | | | - | - | | | xSWAP | ✔ | ✔ | ✔ | ✔ | | -| xSCAL | ✔ | ✔ | ✔ | ✔ | +CS +ZD | +| xSCAL | ✔ | ✔ | ✔ | ✔ | | | xCOPY | ✔ | ✔ | ✔ | ✔ | | | xAXPY | ✔ | ✔ | ✔ | ✔ | | | xDOT | ✔ | ✔ | - | - | | | xDOTU | - | - | ✔ | ✔ | | | xDOTC | - | - | ✔ | ✔ | | -| xNRM2 | | | - | - | +SC +DZ | -| xASUM | | | - | - | +SC +DZ | +| xNRM2 | ✔ | ✔ | ✔ | ✔ | | +| xASUM | | | | | | | IxAMAX | | | | | | | Level-2 | S | D | C | Z | Notes | @@ -234,7 +234,7 @@ Tuning and testing on a variety of OpenCL devices was made possible by: * [TU/e ES research group](http://www.es.ele.tue.nl/) * [ASCI DAS4 and DAS5](http://www.cs.vu.nl/das4/) -* 
[Dividiti](http://www.dividiti.com) +* [dividiti](http://www.dividiti.com) * [SURFsara HPC center](http://www.surfsara.com) Support us @@ -250,4 +250,3 @@ To-do list before release of version 1.0 - Allow the user control over events and synchronization - Add half-precision routines (e.g. HGEMM) - Enable correctness and performance testing against a CPU-based BLAS library -- Test in multi-threaded environments From a2056f2216526989f423a74e4bcd016dac9424f4 Mon Sep 17 00:00:00 2001 From: cnugteren Date: Thu, 31 Mar 2016 22:22:29 -0700 Subject: [PATCH 17/60] Create a first version of CPU BLAS detection in CMake --- CMakeLists.txt | 32 +++++++++++----- cmake/Modules/FindCBLAS.cmake | 70 +++++++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+), 9 deletions(-) create mode 100644 cmake/Modules/FindCBLAS.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 8316a49a..48aaefe9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -66,7 +66,7 @@ else () set(FLAGS "${FLAGS} -Wno-attributes -Wno-unused-variable") endif() elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") - set(FLAGS "${FLAGS} -Weverything -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded") + set(FLAGS "${FLAGS} -Wall -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded") set(FLAGS "${FLAGS} -Wno-missing-prototypes -Wno-float-equal -Wno-switch-enum -Wno-switch") set(FLAGS "${FLAGS} -Wno-exit-time-destructors -Wno-global-constructors -Wno-missing-noreturn") endif() @@ -98,11 +98,13 @@ if(TUNERS) endif() endif() -# Locates the clBLAS library in case the tests need to be compiled. "FindclBLAS.cmake" is included. +# Locates the reference BLAS libraries in case the tests need to be compiled. The "FindclBLAS.cmake" +# and "FindCBLAS.cmake" are included. 
if(TESTS) find_package(clBLAS) - if(NOT CLBLAS_FOUND) - message(STATUS "Could NOT find clBLAS, disabling the compilation of the tests") + find_package(CBLAS) + if(NOT CLBLAS_FOUND AND NOT CBLAS_FOUND) + message(STATUS "Could NOT find clBLAS nor a CPU BLAS, disabling the compilation of the tests") set(TESTS OFF) endif() endif() @@ -215,11 +217,23 @@ endif() # ================================================================================================== # Down from here is all test (performance and correctness) related. Note that these tests require -# the presence of the clBLAS library to act as a reference. +# the presence of clBLAS and/or a BLAS library to act as a reference. if(TESTS) - # Adds new include directories for the reference clBLAS - include_directories(${clblast_SOURCE_DIR}/test ${CLBLAS_INCLUDE_DIRS}) + # Sets the specifics for the reference BLAS libraries + set(REF_INCLUDES ) + set(REF_LIBRARIES ) + if(CLBLAS_FOUND) + set(REF_INCLUDES ${REF_INCLUDES} ${CLBLAS_INCLUDE_DIRS}) + set(REF_LIBRARIES ${REF_LIBRARIES} ${CLBLAS_LIBRARIES}) + endif() + if(CBLAS_FOUND) + set(REF_INCLUDES ${REF_INCLUDES} ${CBLAS_INCLUDE_DIRS}) + set(REF_LIBRARIES ${REF_LIBRARIES} ${CBLAS_LIBRARIES}) + endif() + + # Sets the include directories + include_directories(${clblast_SOURCE_DIR}/test ${REF_INCLUDES}) # Creates the common correctness-tests objects (requires CMake 2.8.8) add_library(test_correctness_common OBJECT @@ -239,7 +253,7 @@ if(TESTS) test/correctness/routines/level3/${ROUTINE}.cc) endforeach() foreach(ROUTINE ${ROUTINES}) - target_link_libraries(clblast_test_${ROUTINE} clblast ${CLBLAS_LIBRARIES} ${OPENCL_LIBRARIES}) + target_link_libraries(clblast_test_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES}) install(TARGETS clblast_test_${ROUTINE} DESTINATION bin) endforeach() @@ -269,7 +283,7 @@ if(TESTS) test/performance/routines/level3/${ROUTINE}.cc) endforeach() foreach(ROUTINE ${ROUTINES}) - target_link_libraries(clblast_client_${ROUTINE} clblast 
${CLBLAS_LIBRARIES} ${OPENCL_LIBRARIES}) + target_link_libraries(clblast_client_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES}) install(TARGETS clblast_client_${ROUTINE} DESTINATION bin) endforeach() diff --git a/cmake/Modules/FindCBLAS.cmake b/cmake/Modules/FindCBLAS.cmake new file mode 100644 index 00000000..16dce243 --- /dev/null +++ b/cmake/Modules/FindCBLAS.cmake @@ -0,0 +1,70 @@ + +# ================================================================================================== +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +# project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +# width of 100 characters per line. +# +# Author(s): +# Cedric Nugteren +# +# ================================================================================================== +# +# Defines the following variables: +# CBLAS_FOUND Boolean holding whether or not the Netlib BLAS library was found +# CBLAS_INCLUDE_DIRS The Netlib BLAS include directory +# CBLAS_LIBRARIES The Netlib BLAS library +# +# In case BLAS is not installed in the default directory, set the CBLAS_ROOT variable to point to +# the root of BLAS, such that 'cblas.h' can be found in $CBLAS_ROOT/include. This can either be +# done using an environmental variable (e.g. export CBLAS_ROOT=/path/to/BLAS) or using a CMake +# variable (e.g. cmake -DCBLAS_ROOT=/path/to/BLAS ..). 
+# +# ================================================================================================== + +# Sets the possible install locations +set(CBLAS_HINTS + ${CBLAS_ROOT} + $ENV{CBLAS_ROOT} +) +set(CBLAS_PATHS + /usr + /usr/local + /usr/local/opt + /System/Library/Frameworks +) + +# Finds the include directories +find_path(CBLAS_INCLUDE_DIRS + NAMES cblas.h + HINTS ${CBLAS_HINTS} + PATH_SUFFIXES include inc include/x86_64 include/x64 openblas/include + PATHS ${CBLAS_PATHS} + DOC "Netlib BLAS include header cblas.h" +) +mark_as_advanced(CBLAS_INCLUDE_DIRS) + +# Finds the library +find_library(CBLAS_LIBRARIES + NAMES blas openblas atlas mkl accelerate + HINTS ${CBLAS_HINTS} + PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 lib/import lib64/import openblas/lib + PATHS ${CBLAS_PATHS} + DOC "Netlib BLAS library" +) +mark_as_advanced(CBLAS_LIBRARIES) + +# ================================================================================================== + +# Notification messages +if(NOT CBLAS_INCLUDE_DIRS) + message(STATUS "Could NOT find 'cblas.h', install a CPU Netlib BLAS or set CBLAS_ROOT") +endif() +if(NOT CBLAS_LIBRARIES) + message(STATUS "Could NOT find a CPU Netlib BLAS library, install it or set CBLAS_ROOT") +endif() + +# Determines whether or not BLAS was found +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(CBLAS DEFAULT_MSG CBLAS_INCLUDE_DIRS CBLAS_LIBRARIES) + +# ================================================================================================== From 5c83217cf256984573924e8f89c46f393a5fcfcd Mon Sep 17 00:00:00 2001 From: cnugteren Date: Fri, 1 Apr 2016 22:36:39 -0700 Subject: [PATCH 18/60] Added a wrapper for CBLAS libraries for performance/correctness testing --- include/clblast.h | 2 +- include/clblast_c.h | 4 +- scripts/generator/datatype.py | 5 + scripts/generator/generator.py | 53 +- scripts/generator/routine.py | 109 ++- src/clblast.cc | 6 +- src/clblast_c.cc | 4 +- test/wrapper_cblas.h | 
1667 ++++++++++++++++++++++++++++++++ test/wrapper_clblas.h | 6 +- 9 files changed, 1825 insertions(+), 31 deletions(-) create mode 100644 test/wrapper_cblas.h diff --git a/include/clblast.h b/include/clblast.h index 5e5c5a46..431f2510 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -100,7 +100,7 @@ template StatusCode Rotmg(cl_mem sd1_buffer, const size_t sd1_offset, cl_mem sd2_buffer, const size_t sd2_offset, cl_mem sx1_buffer, const size_t sx1_offset, - cl_mem sy1_buffer, const size_t sy1_offset, + const cl_mem sy1_buffer, const size_t sy1_offset, cl_mem sparam_buffer, const size_t sparam_offset, cl_command_queue* queue, cl_event* event = nullptr); diff --git a/include/clblast_c.h b/include/clblast_c.h index dcb3ae3a..f72cff3a 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -112,13 +112,13 @@ StatusCode PUBLIC_API CLBlastDrotg(cl_mem sa_buffer, const size_t sa_offset, StatusCode PUBLIC_API CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset, cl_mem sd2_buffer, const size_t sd2_offset, cl_mem sx1_buffer, const size_t sx1_offset, - cl_mem sy1_buffer, const size_t sy1_offset, + const cl_mem sy1_buffer, const size_t sy1_offset, cl_mem sparam_buffer, const size_t sparam_offset, cl_command_queue* queue, cl_event* event); StatusCode PUBLIC_API CLBlastDrotmg(cl_mem sd1_buffer, const size_t sd1_offset, cl_mem sd2_buffer, const size_t sd2_offset, cl_mem sx1_buffer, const size_t sx1_offset, - cl_mem sy1_buffer, const size_t sy1_offset, + const cl_mem sy1_buffer, const size_t sy1_offset, cl_mem sparam_buffer, const size_t sparam_offset, cl_command_queue* queue, cl_event* event); diff --git a/scripts/generator/datatype.py b/scripts/generator/datatype.py index 9323bc4d..5a58ab53 100644 --- a/scripts/generator/datatype.py +++ b/scripts/generator/datatype.py @@ -58,5 +58,10 @@ class DataType(): return "<"+self.buffertype+","+self.beta_cpp+">, "+self.buffertype+", "+self.beta_cpp return "<"+self.buffertype+">, "+self.buffertype+", "+self.beta_cpp + # 
Current scalar is complex + def IsComplex(self, scalar): + return ((scalar == "alpha" and self.alpha_cpp in [FLT2, DBL2]) or + (scalar == "beta" and self.beta_cpp in [FLT2, DBL2])) + # ================================================================================================== diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 6e2b2ed2..36a9bf40 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -8,12 +8,13 @@ # Cedric Nugteren # # This script automatically generates the bodies of the following files, creating the full CLBlast -# API interface and implementation (C, C++, and clBLAS wrapper): +# API interface and implementation (C, C++, and reference BLAS wrappers): # clblast.h # clblast.cc # clblast_c.h # clblast_c.cc # wrapper_clblas.h +# wrapper_cblas.h # It also generates the main functions for the correctness and performance tests as found in # test/correctness/routines/levelX/xYYYY.cc # test/performance/routines/levelX/xYYYY.cc @@ -55,7 +56,7 @@ TU = DataType("TU", "typename T, typename U", "T,U", ["T", "U", "T", "U"], "T") routines = [ [ # Level 1: vector-vector Routine(False, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation"), - Routine(False, "1", "rotmg", T, [S,D], [], [], [], ["sd1","sd2","sx1","sy1","sparam"], [], "", "Generate modified givens plane rotation"), + Routine(False, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], [], "", "Generate modified givens plane rotation"), Routine(False, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation"), Routine(False, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [], "", "Apply modified givens plane rotation"), Routine(True, "1", "swap", T, [S,D,C,Z], ["n"], [], [], ["x","y"], [], "", "Swap two vectors"), @@ -220,11 +221,11 @@ def wrapper_clblas(routines): for routine in routines: result += "\n// Forwards the clBLAS 
calls for %s\n" % (routine.ShortNames()) if routine.NoScalars(): - result += routine.RoutineHeaderWrapper(routine.template, True, 21)+";\n" + result += routine.RoutineHeaderWrapperCL(routine.template, True, 21)+";\n" for flavour in routine.flavours: indent = " "*(17 + routine.Length()) - result += routine.RoutineHeaderWrapper(flavour, False, 21)+" {\n" - arguments = routine.ArgumentsWrapper(flavour) + result += routine.RoutineHeaderWrapperCL(flavour, False, 21)+" {\n" + arguments = routine.ArgumentsWrapperCL(flavour) if routine.scratch: result += " auto queue = Queue(queues[0]);\n" result += " auto context = queue.GetContext();\n" @@ -236,6 +237,41 @@ def wrapper_clblas(routines): result += "\n}\n" return result +# The wrapper to the reference CBLAS routines (for performance/correctness testing) +def wrapper_cblas(routines): + result = "" + for routine in routines: + result += "\n// Forwards the Netlib BLAS calls for %s\n" % (routine.ShortNames()) + for flavour in routine.flavours: + indent = " "*(10 + routine.Length()) + result += routine.RoutineHeaderWrapperC(flavour, False, 12)+" {\n" + arguments = routine.ArgumentsWrapperC(flavour) + + # Double-precision scalars + for scalar in routine.scalars: + if flavour.IsComplex(scalar): + result += " const auto "+scalar+"_array = std::vector<"+flavour.buffertype[:-1]+">{"+scalar+".real(), "+scalar+".imag()};\n" + + # Special case for scalar outputs + assignment = "" + postfix = "" + extra_argument = "" + for output_buffer in routine.outputs: + if output_buffer in routine.ScalarBuffersFirst(): + if flavour in [C,Z]: + postfix += "_sub" + indent += " " + extra_argument += ",\n"+indent+"reinterpret_cast(&"+output_buffer+"_buffer["+output_buffer+"_offset])" + else: + assignment = output_buffer+"_buffer["+output_buffer+"_offset] = " + indent += " "*len(assignment) + + result += " "+assignment+"cblas_"+flavour.name.lower()+routine.name+postfix+"(" + result += (",\n"+indent).join([a for a in arguments]) + result += 
extra_argument+");" + result += "\n}\n" + return result + # ================================================================================================== # Checks for the number of command-line arguments @@ -251,9 +287,10 @@ files = [ path_clblast+"/include/clblast_c.h", path_clblast+"/src/clblast_c.cc", path_clblast+"/test/wrapper_clblas.h", + path_clblast+"/test/wrapper_cblas.h", ] -header_lines = [84, 65, 93, 22, 22] -footer_lines = [6, 3, 9, 2, 6] +header_lines = [84, 65, 93, 22, 22, 31] +footer_lines = [6, 3, 9, 2, 6, 6] # Checks whether the command-line arguments are valid; exists otherwise for f in files: @@ -287,6 +324,8 @@ for i in xrange(0,len(files)): body += clblast_c_cc(routines[level-1]) if i == 4: body += wrapper_clblas(routines[level-1]) + if i == 5: + body += wrapper_cblas(routines[level-1]) f.write("".join(file_header)) f.write(body) f.write("".join(file_footer)) diff --git a/scripts/generator/routine.py b/scripts/generator/routine.py index 02040583..fffa19f6 100644 --- a/scripts/generator/routine.py +++ b/scripts/generator/routine.py @@ -28,7 +28,7 @@ def OptionToCLBlast(x): }[x] # As above, but for clBLAS data-types -def OptionToWrapper(x): +def OptionToWrapperCL(x): return { 'layout': "clblasOrder", 'a_transpose': "clblasTranspose", @@ -39,6 +39,18 @@ def OptionToWrapper(x): 'diagonal': "clblasDiag", }[x] +# As above, but for CBLAS data-types +def OptionToWrapperC(x): + return { + 'layout': "CBLAS_ORDER", + 'a_transpose': "CBLAS_TRANSPOSE", + 'b_transpose': "CBLAS_TRANSPOSE", + 'ab_transpose': "CBLAS_TRANSPOSE", + 'side': "CBLAS_SIDE", + 'triangle': "CBLAS_UPLO", + 'diagonal': "CBLAS_DIAG", + }[x] + # ================================================================================================== # Class holding routine-specific information (e.g. 
name, which arguments, which precisions) @@ -119,6 +131,16 @@ class Routine(): return [", ".join(a+b+c)] return [] + # As above but as vectors + def BufferDefVector(self, name, flavour): + prefix = "const " if (name in self.inputs) else "" + if (name in self.inputs) or (name in self.outputs): + a = [prefix+"std::vector<"+flavour.buffertype+">& "+name+"_buffer"] + b = ["const size_t "+name+"_offset"] + c = ["const size_t "+name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else [] + return [", ".join(a+b+c)] + return [] + # As above but with Claduc buffers def BufferCladuc(self, name): if (name in self.inputs) or (name in self.outputs): @@ -129,7 +151,7 @@ class Routine(): return [] # As above but with a static cast for clBLAS wrapper - def BufferWrapper(self, name): + def BufferWrapperCL(self, name): if (name in self.inputs) or (name in self.outputs): a = [name+"_buffer"] b = [name+"_offset"] @@ -141,6 +163,24 @@ class Routine(): return [", ".join(a+b+c)] return [] + # As above but with a static cast for CBLAS wrapper + def BufferWrapperC(self, name, flavour): + prefix = "const " if (name in self.inputs) else "" + if (name in self.inputs) or (name in self.outputs): + if name == "sy1": + a = [name+"_buffer["+name+"_offset]"] + elif flavour.precision_name in ["C","Z"]: + a = ["reinterpret_cast<"+prefix+flavour.buffertype[:-1]+"*>(&"+name+"_buffer["+name+"_offset])"] + else: + a = ["&"+name+"_buffer["+name+"_offset]"] + c = [] + if (name in ["x","y"]): + c = ["static_cast("+name+"_"+self.Postfix(name)+")"] + elif (name in ["a","b","c"]): + c = [name+"_"+self.Postfix(name)] + return [", ".join(a+c)] + return [] + # As above, but only data-types def BufferType(self, name): prefix = "const " if (name in self.inputs) else "" @@ -179,6 +219,14 @@ class Routine(): return [name] return [] + # Retrieves the use of a scalar for CBLAS (alpha/beta) + def ScalarUseWrapperC(self, name, flavour): + if name in self.scalars: + if flavour.IsComplex(name): + 
return [name+"_array.data()"] + return [name] + return [] + # Retrieves the definition of a scalar (alpha/beta) def ScalarDef(self, name, flavour): if name in self.scalars: @@ -246,9 +294,16 @@ class Routine(): return [] # As above, but now using clBLAS data-types - def OptionsDefWrapper(self): + def OptionsDefWrapperCL(self): if self.options: - definitions = ["const "+OptionToWrapper(o)+" "+o for o in self.options] + definitions = ["const "+OptionToWrapperCL(o)+" "+o for o in self.options] + return [", ".join(definitions)] + return [] + + # As above, but now using CBLAS data-types + def OptionsDefWrapperC(self): + if self.options: + definitions = ["const "+OptionToWrapperC(o)+" "+o for o in self.options] return [", ".join(definitions)] return [] @@ -284,16 +339,26 @@ class Routine(): list(chain(*[self.ScalarUse(s, flavour) for s in self.OtherScalars()]))) # As above, but for the clBLAS wrapper - def ArgumentsWrapper(self, flavour): + def ArgumentsWrapperCL(self, flavour): return (self.Options() + self.Sizes() + - list(chain(*[self.BufferWrapper(b) for b in self.ScalarBuffersFirst()])) + + list(chain(*[self.BufferWrapperCL(b) for b in self.ScalarBuffersFirst()])) + self.ScalarUseWrapper("alpha", flavour) + - list(chain(*[self.BufferWrapper(b) for b in self.BuffersFirst()])) + + list(chain(*[self.BufferWrapperCL(b) for b in self.BuffersFirst()])) + self.ScalarUseWrapper("beta", flavour) + - list(chain(*[self.BufferWrapper(b) for b in self.BuffersSecond()])) + - list(chain(*[self.BufferWrapper(b) for b in self.ScalarBuffersSecond()])) + + list(chain(*[self.BufferWrapperCL(b) for b in self.BuffersSecond()])) + + list(chain(*[self.BufferWrapperCL(b) for b in self.ScalarBuffersSecond()])) + list(chain(*[self.ScalarUseWrapper(s, flavour) for s in self.OtherScalars()]))) + # As above, but for the CBLAS wrapper + def ArgumentsWrapperC(self, flavour): + return (self.Options() + self.Sizes() + + self.ScalarUseWrapperC("alpha", flavour) + + list(chain(*[self.BufferWrapperC(b, 
flavour) for b in self.BuffersFirst()])) + + self.ScalarUseWrapperC("beta", flavour) + + list(chain(*[self.BufferWrapperC(b, flavour) for b in self.BuffersSecond()])) + + list(chain(*[self.BufferWrapperC(b, flavour) for b in self.ScalarBuffersSecond()])) + + list(chain(*[self.ScalarUseWrapperC(s, flavour) for s in self.OtherScalars()]))) + # Retrieves a combination of all the argument definitions def ArgumentsDef(self, flavour): return (self.OptionsDef() + self.SizesDef() + @@ -306,8 +371,8 @@ class Routine(): list(chain(*[self.ScalarDef(s, flavour) for s in self.OtherScalars()]))) # As above, but clBLAS wrapper plain datatypes - def ArgumentsDefWrapper(self, flavour): - return (self.OptionsDefWrapper() + self.SizesDef() + + def ArgumentsDefWrapperCL(self, flavour): + return (self.OptionsDefWrapperCL() + self.SizesDef() + list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersFirst()])) + self.ScalarDefPlain("alpha", flavour) + list(chain(*[self.BufferDef(b) for b in self.BuffersFirst()])) + @@ -315,6 +380,17 @@ class Routine(): list(chain(*[self.BufferDef(b) for b in self.BuffersSecond()])) + list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersSecond()])) + list(chain(*[self.ScalarDefPlain(s, flavour) for s in self.OtherScalars()]))) + + # As above, but CBLAS wrapper plain datatypes + def ArgumentsDefWrapperC(self, flavour): + return (self.OptionsDefWrapperC() + self.SizesDef() + + list(chain(*[self.BufferDefVector(b, flavour) for b in self.ScalarBuffersFirst()])) + + self.ScalarDefPlain("alpha", flavour) + + list(chain(*[self.BufferDefVector(b, flavour) for b in self.BuffersFirst()])) + + self.ScalarDefPlain("beta", flavour) + + list(chain(*[self.BufferDefVector(b, flavour) for b in self.BuffersSecond()])) + + list(chain(*[self.BufferDefVector(b, flavour) for b in self.ScalarBuffersSecond()])) + + list(chain(*[self.ScalarDefPlain(s, flavour) for s in self.OtherScalars()]))) # Retrieves a combination of all the argument types def ArgumentsType(self, 
flavour): @@ -356,7 +432,7 @@ class Routine(): return result # As above, but now for the clBLAS wrapper - def RoutineHeaderWrapper(self, flavour, def_only, spaces): + def RoutineHeaderWrapperCL(self, flavour, def_only, spaces): template = "<"+flavour.template+">" if self.NoScalars() and not def_only else "" indent = " "*(spaces + self.Length() + len(template)) result = "" @@ -366,9 +442,16 @@ class Routine(): result += flavour.name result += ">\n" result += "clblasStatus clblasX"+self.name+template+"(" - result += (",\n"+indent).join([a for a in self.ArgumentsDefWrapper(flavour)]) + result += (",\n"+indent).join([a for a in self.ArgumentsDefWrapperCL(flavour)]) result += ",\n"+indent+"cl_uint num_queues, cl_command_queue *queues" result += ",\n"+indent+"cl_uint num_wait_events, const cl_event *wait_events, cl_event *events)" return result + # As above, but now for the CBLAS wrapper + def RoutineHeaderWrapperC(self, flavour, def_only, spaces): + indent = " "*(spaces + self.Length()) + result = "void cblasX"+self.name+"(" + result += (",\n"+indent).join([a for a in self.ArgumentsDefWrapperC(flavour)])+")" + return result + # ================================================================================================== diff --git a/src/clblast.cc b/src/clblast.cc index fc50ffae..75893ee9 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -93,7 +93,7 @@ template StatusCode Rotmg(cl_mem, const size_t, cl_mem, const size_t, cl_mem, const size_t, - cl_mem, const size_t, + const cl_mem, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*) { return StatusCode::kNotImplemented; @@ -101,13 +101,13 @@ StatusCode Rotmg(cl_mem, const size_t, template StatusCode PUBLIC_API Rotmg(cl_mem, const size_t, cl_mem, const size_t, cl_mem, const size_t, - cl_mem, const size_t, + const cl_mem, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Rotmg(cl_mem, const size_t, cl_mem, const size_t, cl_mem, const size_t, - cl_mem, 
const size_t, + const cl_mem, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*); diff --git a/src/clblast_c.cc b/src/clblast_c.cc index 6d10c686..23e97bd5 100644 --- a/src/clblast_c.cc +++ b/src/clblast_c.cc @@ -55,7 +55,7 @@ StatusCode CLBlastDrotg(cl_mem sa_buffer, const size_t sa_offset, StatusCode CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset, cl_mem sd2_buffer, const size_t sd2_offset, cl_mem sx1_buffer, const size_t sx1_offset, - cl_mem sy1_buffer, const size_t sy1_offset, + const cl_mem sy1_buffer, const size_t sy1_offset, cl_mem sparam_buffer, const size_t sparam_offset, cl_command_queue* queue, cl_event* event) { auto status = clblast::Rotmg(sd1_buffer, sd1_offset, @@ -69,7 +69,7 @@ StatusCode CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset, StatusCode CLBlastDrotmg(cl_mem sd1_buffer, const size_t sd1_offset, cl_mem sd2_buffer, const size_t sd2_offset, cl_mem sx1_buffer, const size_t sx1_offset, - cl_mem sy1_buffer, const size_t sy1_offset, + const cl_mem sy1_buffer, const size_t sy1_offset, cl_mem sparam_buffer, const size_t sparam_offset, cl_command_queue* queue, cl_event* event) { auto status = clblast::Rotmg(sd1_buffer, sd1_offset, diff --git a/test/wrapper_cblas.h b/test/wrapper_cblas.h new file mode 100644 index 00000000..c690a45c --- /dev/null +++ b/test/wrapper_cblas.h @@ -0,0 +1,1667 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements a wrapper around a CPU BLAS library, such that its routines can be called +// in a similar way as the CLBlast routines: using alpha and beta to determine the precision. 
+// +// ================================================================================================= + +#ifndef CLBLAST_TEST_WRAPPER_CBLAS_H_ +#define CLBLAST_TEST_WRAPPER_CBLAS_H_ + +#include + +#include "internal/utilities.h" + +namespace clblast { + +// OpenBLAS is not fully Netlib CBLAS compatible +#ifdef OPENBLAS_VERSION + using return_pointer_float = openblas_complex_float*; + using return_pointer_double = openblas_complex_double*; +#else + using return_pointer_float = void*; + using return_pointer_double = void*; +#endif + +// ================================================================================================= +// BLAS level-1 (vector-vector) routines +// ================================================================================================= + +// Forwards the Netlib BLAS calls for SROTG/DROTG +void cblasXrotg(std::vector& sa_buffer, const size_t sa_offset, + std::vector& sb_buffer, const size_t sb_offset, + std::vector& sc_buffer, const size_t sc_offset, + std::vector& ss_buffer, const size_t ss_offset) { + cblas_srotg(&sa_buffer[sa_offset], + &sb_buffer[sb_offset], + &sc_buffer[sc_offset], + &ss_buffer[ss_offset]); +} +void cblasXrotg(std::vector& sa_buffer, const size_t sa_offset, + std::vector& sb_buffer, const size_t sb_offset, + std::vector& sc_buffer, const size_t sc_offset, + std::vector& ss_buffer, const size_t ss_offset) { + cblas_drotg(&sa_buffer[sa_offset], + &sb_buffer[sb_offset], + &sc_buffer[sc_offset], + &ss_buffer[ss_offset]); +} + +// Forwards the Netlib BLAS calls for SROTMG/DROTMG +void cblasXrotmg(std::vector& sd1_buffer, const size_t sd1_offset, + std::vector& sd2_buffer, const size_t sd2_offset, + std::vector& sx1_buffer, const size_t sx1_offset, + const std::vector& sy1_buffer, const size_t sy1_offset, + std::vector& sparam_buffer, const size_t sparam_offset) { + cblas_srotmg(&sd1_buffer[sd1_offset], + &sd2_buffer[sd2_offset], + &sx1_buffer[sx1_offset], + sy1_buffer[sy1_offset], + 
&sparam_buffer[sparam_offset]); +} +void cblasXrotmg(std::vector& sd1_buffer, const size_t sd1_offset, + std::vector& sd2_buffer, const size_t sd2_offset, + std::vector& sx1_buffer, const size_t sx1_offset, + const std::vector& sy1_buffer, const size_t sy1_offset, + std::vector& sparam_buffer, const size_t sparam_offset) { + cblas_drotmg(&sd1_buffer[sd1_offset], + &sd2_buffer[sd2_offset], + &sx1_buffer[sx1_offset], + sy1_buffer[sy1_offset], + &sparam_buffer[sparam_offset]); +} + +// Forwards the Netlib BLAS calls for SROT/DROT +void cblasXrot(const size_t n, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + const float cos, + const float sin) { + cblas_srot(n, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + cos, + sin); +} +void cblasXrot(const size_t n, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + const double cos, + const double sin) { + cblas_drot(n, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + cos, + sin); +} + +// Forwards the Netlib BLAS calls for SROTM/DROTM +void cblasXrotm(const size_t n, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& sparam_buffer, const size_t sparam_offset) { + cblas_srotm(n, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + &sparam_buffer[sparam_offset]); +} +void cblasXrotm(const size_t n, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& sparam_buffer, const size_t sparam_offset) { + cblas_drotm(n, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + &sparam_buffer[sparam_offset]); +} + +// Forwards 
the Netlib BLAS calls for SSWAP/DSWAP/CSWAP/ZSWAP +void cblasXswap(const size_t n, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_sswap(n, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXswap(const size_t n, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_dswap(n, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXswap(const size_t n, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_cswap(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} +void cblasXswap(const size_t n, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_zswap(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} + +// Forwards the Netlib BLAS calls for SSCAL/DSCAL/CSCAL/ZSCAL +void cblasXscal(const size_t n, + const float alpha, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_sscal(n, + alpha, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXscal(const size_t n, + const double alpha, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_dscal(n, + alpha, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXscal(const size_t n, + const float2 alpha, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_cscal(n, + alpha_array.data(), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} +void 
cblasXscal(const size_t n, + const double2 alpha, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_zscal(n, + alpha_array.data(), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} + +// Forwards the Netlib BLAS calls for SCOPY/DCOPY/CCOPY/ZCOPY +void cblasXcopy(const size_t n, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_scopy(n, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXcopy(const size_t n, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_dcopy(n, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXcopy(const size_t n, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_ccopy(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} +void cblasXcopy(const size_t n, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_zcopy(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} + +// Forwards the Netlib BLAS calls for SAXPY/DAXPY/CAXPY/ZAXPY +void cblasXaxpy(const size_t n, + const float alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_saxpy(n, + alpha, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXaxpy(const size_t n, + const double alpha, 
+ const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_daxpy(n, + alpha, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXaxpy(const size_t n, + const float2 alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_caxpy(n, + alpha_array.data(), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} +void cblasXaxpy(const size_t n, + const double2 alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_zaxpy(n, + alpha_array.data(), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} + +// Forwards the Netlib BLAS calls for SDOT/DDOT +void cblasXdot(const size_t n, + std::vector& dot_buffer, const size_t dot_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + dot_buffer[dot_offset] = cblas_sdot(n, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXdot(const size_t n, + std::vector& dot_buffer, const size_t dot_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + dot_buffer[dot_offset] = cblas_ddot(n, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc)); +} + +// Forwards the Netlib BLAS calls for CDOTU/ZDOTU +void cblasXdotu(const size_t n, + 
std::vector& dot_buffer, const size_t dot_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_cdotu_sub(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&dot_buffer[dot_offset])); +} +void cblasXdotu(const size_t n, + std::vector& dot_buffer, const size_t dot_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_zdotu_sub(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&dot_buffer[dot_offset])); +} + +// Forwards the Netlib BLAS calls for CDOTC/ZDOTC +void cblasXdotc(const size_t n, + std::vector& dot_buffer, const size_t dot_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_cdotc_sub(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&dot_buffer[dot_offset])); +} +void cblasXdotc(const size_t n, + std::vector& dot_buffer, const size_t dot_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_zdotc_sub(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&dot_buffer[dot_offset])); +} + +// Forwards the Netlib BLAS calls for SNRM2/DNRM2/ScNRM2/DzNRM2 +void cblasXnrm2(const size_t n, + std::vector& nrm2_buffer, const size_t nrm2_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + nrm2_buffer[nrm2_offset] = cblas_snrm2(n, 
+ &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXnrm2(const size_t n, + std::vector& nrm2_buffer, const size_t nrm2_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + nrm2_buffer[nrm2_offset] = cblas_dnrm2(n, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXnrm2(const size_t n, + std::vector& nrm2_buffer, const size_t nrm2_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + nrm2_buffer[nrm2_offset] = cblas_scnrm2(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} +void cblasXnrm2(const size_t n, + std::vector& nrm2_buffer, const size_t nrm2_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + nrm2_buffer[nrm2_offset] = cblas_dznrm2(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} + +// ================================================================================================= +// BLAS level-2 (matrix-vector) routines +// ================================================================================================= + +// Forwards the Netlib BLAS calls for SGEMV/DGEMV/CGEMV/ZGEMV +void cblasXgemv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, + const size_t m, const size_t n, + const float alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_sgemv(layout, a_transpose, + m, n, + alpha, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc), + beta, + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXgemv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, + const size_t m, const size_t n, + const double alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const 
double beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_dgemv(layout, a_transpose, + m, n, + alpha, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc), + beta, + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXgemv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, + const size_t m, const size_t n, + const float2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const float2 beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_cgemv(layout, a_transpose, + m, n, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + beta_array.data(), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} +void cblasXgemv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, + const size_t m, const size_t n, + const double2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const double2 beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_zgemv(layout, a_transpose, + m, n, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + beta_array.data(), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} + +// Forwards the Netlib BLAS calls for SGBMV/DGBMV/CGBMV/ZGBMV +void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t 
ku, + const float alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_sgbmv(layout, a_transpose, + m, n, kl, ku, + alpha, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc), + beta, + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const double alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_dgbmv(layout, a_transpose, + m, n, kl, ku, + alpha, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc), + beta, + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const float2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const float2 beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_cgbmv(layout, a_transpose, + m, n, kl, ku, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + beta_array.data(), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} +void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + 
const double2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const double2 beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_zgbmv(layout, a_transpose, + m, n, kl, ku, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + beta_array.data(), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} + +// Forwards the Netlib BLAS calls for CHEMV/ZHEMV +void cblasXhemv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const float2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const float2 beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_chemv(layout, triangle, + n, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + beta_array.data(), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} +void cblasXhemv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const double2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const double2 beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_zhemv(layout, triangle, + n, + alpha_array.data(), + 
reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + beta_array.data(), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} + +// Forwards the Netlib BLAS calls for CHBMV/ZHBMV +void cblasXhbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, const size_t k, + const float2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const float2 beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_chbmv(layout, triangle, + n, k, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + beta_array.data(), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} +void cblasXhbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, const size_t k, + const double2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const double2 beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_zhbmv(layout, triangle, + n, k, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + beta_array.data(), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} + +// Forwards the Netlib BLAS calls for CHPMV/ZHPMV +void cblasXhpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const float2 alpha, + const std::vector& ap_buffer, const size_t ap_offset, + const std::vector& 
x_buffer, const size_t x_offset, const size_t x_inc, + const float2 beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_chpmv(layout, triangle, + n, + alpha_array.data(), + reinterpret_cast(&ap_buffer[ap_offset]), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + beta_array.data(), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} +void cblasXhpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const double2 alpha, + const std::vector& ap_buffer, const size_t ap_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const double2 beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_zhpmv(layout, triangle, + n, + alpha_array.data(), + reinterpret_cast(&ap_buffer[ap_offset]), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + beta_array.data(), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} + +// Forwards the Netlib BLAS calls for SSYMV/DSYMV +void cblasXsymv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const float alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_ssymv(layout, triangle, + n, + alpha, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc), + beta, + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXsymv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const double alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + 
const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_dsymv(layout, triangle, + n, + alpha, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc), + beta, + &y_buffer[y_offset], static_cast(y_inc)); +} + +// Forwards the Netlib BLAS calls for SSBMV/DSBMV +void cblasXsbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, const size_t k, + const float alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_ssbmv(layout, triangle, + n, k, + alpha, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc), + beta, + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXsbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, const size_t k, + const double alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_dsbmv(layout, triangle, + n, k, + alpha, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc), + beta, + &y_buffer[y_offset], static_cast(y_inc)); +} + +// Forwards the Netlib BLAS calls for SSPMV/DSPMV +void cblasXspmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const float alpha, + const std::vector& ap_buffer, const size_t ap_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_sspmv(layout, triangle, + n, + alpha, + &ap_buffer[ap_offset], + &x_buffer[x_offset], static_cast(x_inc), + beta, + &y_buffer[y_offset], 
static_cast(y_inc)); +} +void cblasXspmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const double alpha, + const std::vector& ap_buffer, const size_t ap_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_dspmv(layout, triangle, + n, + alpha, + &ap_buffer[ap_offset], + &x_buffer[x_offset], static_cast(x_inc), + beta, + &y_buffer[y_offset], static_cast(y_inc)); +} + +// Forwards the Netlib BLAS calls for STRMV/DTRMV/CTRMV/ZTRMV +void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_strmv(layout, triangle, a_transpose, diagonal, + n, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_dtrmv(layout, triangle, a_transpose, diagonal, + n, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_ctrmv(layout, triangle, a_transpose, diagonal, + n, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} +void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const 
CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_ztrmv(layout, triangle, a_transpose, diagonal, + n, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} + +// Forwards the Netlib BLAS calls for STBMV/DTBMV/CTBMV/ZTBMV +void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, const size_t k, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_stbmv(layout, triangle, a_transpose, diagonal, + n, k, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, const size_t k, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_dtbmv(layout, triangle, a_transpose, diagonal, + n, k, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, const size_t k, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_ctbmv(layout, triangle, a_transpose, diagonal, + n, k, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} +void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, const size_t k, 
+ const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_ztbmv(layout, triangle, a_transpose, diagonal, + n, k, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} + +// Forwards the Netlib BLAS calls for STPMV/DTPMV/CTPMV/ZTPMV +void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& ap_buffer, const size_t ap_offset, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_stpmv(layout, triangle, a_transpose, diagonal, + n, + &ap_buffer[ap_offset], + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& ap_buffer, const size_t ap_offset, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_dtpmv(layout, triangle, a_transpose, diagonal, + n, + &ap_buffer[ap_offset], + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& ap_buffer, const size_t ap_offset, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_ctpmv(layout, triangle, a_transpose, diagonal, + n, + reinterpret_cast(&ap_buffer[ap_offset]), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} +void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& ap_buffer, const size_t ap_offset, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_ztpmv(layout, triangle, a_transpose, diagonal, + n, + 
reinterpret_cast(&ap_buffer[ap_offset]), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} + +// Forwards the Netlib BLAS calls for STRSV/DTRSV/CTRSV/ZTRSV +void cblasXtrsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_strsv(layout, triangle, a_transpose, diagonal, + n, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXtrsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_dtrsv(layout, triangle, a_transpose, diagonal, + n, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXtrsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_ctrsv(layout, triangle, a_transpose, diagonal, + n, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} +void cblasXtrsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_ztrsv(layout, triangle, a_transpose, diagonal, + n, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} + +// Forwards the Netlib BLAS calls for 
STBSV/DTBSV/CTBSV/ZTBSV +void cblasXtbsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, const size_t k, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_stbsv(layout, triangle, a_transpose, diagonal, + n, k, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXtbsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, const size_t k, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_dtbsv(layout, triangle, a_transpose, diagonal, + n, k, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXtbsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, const size_t k, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_ctbsv(layout, triangle, a_transpose, diagonal, + n, k, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} +void cblasXtbsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, const size_t k, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_ztbsv(layout, triangle, a_transpose, diagonal, + n, k, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} + +// Forwards the Netlib BLAS calls for STPSV/DTPSV/CTPSV/ZTPSV +void cblasXtpsv(const CBLAS_ORDER layout, 
const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& ap_buffer, const size_t ap_offset, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_stpsv(layout, triangle, a_transpose, diagonal, + n, + &ap_buffer[ap_offset], + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXtpsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& ap_buffer, const size_t ap_offset, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_dtpsv(layout, triangle, a_transpose, diagonal, + n, + &ap_buffer[ap_offset], + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXtpsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& ap_buffer, const size_t ap_offset, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_ctpsv(layout, triangle, a_transpose, diagonal, + n, + reinterpret_cast(&ap_buffer[ap_offset]), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} +void cblasXtpsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& ap_buffer, const size_t ap_offset, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_ztpsv(layout, triangle, a_transpose, diagonal, + n, + reinterpret_cast(&ap_buffer[ap_offset]), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} + +// Forwards the Netlib BLAS calls for SGER/DGER +void cblasXger(const CBLAS_ORDER layout, + const size_t m, const size_t n, + const float alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& a_buffer, const size_t 
a_offset, const size_t a_ld) { + cblas_sger(layout, + m, n, + alpha, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + &a_buffer[a_offset], a_ld); +} +void cblasXger(const CBLAS_ORDER layout, + const size_t m, const size_t n, + const double alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + cblas_dger(layout, + m, n, + alpha, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + &a_buffer[a_offset], a_ld); +} + +// Forwards the Netlib BLAS calls for CGERU/ZGERU +void cblasXgeru(const CBLAS_ORDER layout, + const size_t m, const size_t n, + const float2 alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_cgeru(layout, + m, n, + alpha_array.data(), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&a_buffer[a_offset]), a_ld); +} +void cblasXgeru(const CBLAS_ORDER layout, + const size_t m, const size_t n, + const double2 alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_zgeru(layout, + m, n, + alpha_array.data(), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&a_buffer[a_offset]), a_ld); +} + +// Forwards the Netlib BLAS calls for CGERC/ZGERC +void 
cblasXgerc(const CBLAS_ORDER layout, + const size_t m, const size_t n, + const float2 alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_cgerc(layout, + m, n, + alpha_array.data(), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&a_buffer[a_offset]), a_ld); +} +void cblasXgerc(const CBLAS_ORDER layout, + const size_t m, const size_t n, + const double2 alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_zgerc(layout, + m, n, + alpha_array.data(), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&a_buffer[a_offset]), a_ld); +} + +// Forwards the Netlib BLAS calls for CHER/ZHER +void cblasXher(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const float alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + cblas_cher(layout, triangle, + n, + alpha, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&a_buffer[a_offset]), a_ld); +} +void cblasXher(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const double alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + cblas_zher(layout, triangle, + n, + alpha, + 
reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&a_buffer[a_offset]), a_ld); +} + +// Forwards the Netlib BLAS calls for CHPR/ZHPR +void cblasXhpr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const float alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& ap_buffer, const size_t ap_offset) { + cblas_chpr(layout, triangle, + n, + alpha, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&ap_buffer[ap_offset])); +} +void cblasXhpr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const double alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& ap_buffer, const size_t ap_offset) { + cblas_zhpr(layout, triangle, + n, + alpha, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&ap_buffer[ap_offset])); +} + +// Forwards the Netlib BLAS calls for CHER2/ZHER2 +void cblasXher2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const float2 alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_cher2(layout, triangle, + n, + alpha_array.data(), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&a_buffer[a_offset]), a_ld); +} +void cblasXher2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const double2 alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + const auto alpha_array = std::vector{alpha.real(), 
alpha.imag()}; + cblas_zher2(layout, triangle, + n, + alpha_array.data(), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&a_buffer[a_offset]), a_ld); +} + +// Forwards the Netlib BLAS calls for CHPR2/ZHPR2 +void cblasXhpr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const float2 alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& ap_buffer, const size_t ap_offset) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_chpr2(layout, triangle, + n, + alpha_array.data(), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&ap_buffer[ap_offset])); +} +void cblasXhpr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const double2 alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& ap_buffer, const size_t ap_offset) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_zhpr2(layout, triangle, + n, + alpha_array.data(), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&ap_buffer[ap_offset])); +} + +// Forwards the Netlib BLAS calls for SSYR/DSYR +void cblasXsyr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const float alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + cblas_ssyr(layout, triangle, + n, + alpha, + &x_buffer[x_offset], static_cast(x_inc), + &a_buffer[a_offset], a_ld); +} +void cblasXsyr(const CBLAS_ORDER layout, const CBLAS_UPLO 
triangle, + const size_t n, + const double alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + cblas_dsyr(layout, triangle, + n, + alpha, + &x_buffer[x_offset], static_cast(x_inc), + &a_buffer[a_offset], a_ld); +} + +// Forwards the Netlib BLAS calls for SSPR/DSPR +void cblasXspr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const float alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& ap_buffer, const size_t ap_offset) { + cblas_sspr(layout, triangle, + n, + alpha, + &x_buffer[x_offset], static_cast(x_inc), + &ap_buffer[ap_offset]); +} +void cblasXspr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const double alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& ap_buffer, const size_t ap_offset) { + cblas_dspr(layout, triangle, + n, + alpha, + &x_buffer[x_offset], static_cast(x_inc), + &ap_buffer[ap_offset]); +} + +// Forwards the Netlib BLAS calls for SSYR2/DSYR2 +void cblasXsyr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const float alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + cblas_ssyr2(layout, triangle, + n, + alpha, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + &a_buffer[a_offset], a_ld); +} +void cblasXsyr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const double alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + cblas_dsyr2(layout, triangle, + n, + alpha, + 
&x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + &a_buffer[a_offset], a_ld); +} + +// Forwards the Netlib BLAS calls for SSPR2/DSPR2 +void cblasXspr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const float alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& ap_buffer, const size_t ap_offset) { + cblas_sspr2(layout, triangle, + n, + alpha, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + &ap_buffer[ap_offset]); +} +void cblasXspr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const double alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& ap_buffer, const size_t ap_offset) { + cblas_dspr2(layout, triangle, + n, + alpha, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + &ap_buffer[ap_offset]); +} + +// ================================================================================================= +// BLAS level-3 (matrix-matrix) routines +// ================================================================================================= + +// Forwards the Netlib BLAS calls for SGEMM/DGEMM/CGEMM/ZGEMM +void cblasXgemm(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const CBLAS_TRANSPOSE b_transpose, + const size_t m, const size_t n, const size_t k, + const float alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + cblas_sgemm(layout, a_transpose, b_transpose, + m, n, k, + alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld, + beta, + &c_buffer[c_offset], 
c_ld); +} +void cblasXgemm(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const CBLAS_TRANSPOSE b_transpose, + const size_t m, const size_t n, const size_t k, + const double alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + cblas_dgemm(layout, a_transpose, b_transpose, + m, n, k, + alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld, + beta, + &c_buffer[c_offset], c_ld); +} +void cblasXgemm(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const CBLAS_TRANSPOSE b_transpose, + const size_t m, const size_t n, const size_t k, + const float2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const float2 beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_cgemm(layout, a_transpose, b_transpose, + m, n, k, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld, + beta_array.data(), + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} +void cblasXgemm(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const CBLAS_TRANSPOSE b_transpose, + const size_t m, const size_t n, const size_t k, + const double2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const double2 beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_zgemm(layout, a_transpose, 
b_transpose, + m, n, k, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld, + beta_array.data(), + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} + +// Forwards the Netlib BLAS calls for SSYMM/DSYMM/CSYMM/ZSYMM +void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, + const size_t m, const size_t n, + const float alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + cblas_ssymm(layout, side, triangle, + m, n, + alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld, + beta, + &c_buffer[c_offset], c_ld); +} +void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, + const size_t m, const size_t n, + const double alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + cblas_dsymm(layout, side, triangle, + m, n, + alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld, + beta, + &c_buffer[c_offset], c_ld); +} +void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, + const size_t m, const size_t n, + const float2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const float2 beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_csymm(layout, side, triangle, + m, n, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + 
reinterpret_cast(&b_buffer[b_offset]), b_ld, + beta_array.data(), + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} +void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, + const size_t m, const size_t n, + const double2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const double2 beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_zsymm(layout, side, triangle, + m, n, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld, + beta_array.data(), + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} + +// Forwards the Netlib BLAS calls for CHEMM/ZHEMM +void cblasXhemm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, + const size_t m, const size_t n, + const float2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const float2 beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_chemm(layout, side, triangle, + m, n, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld, + beta_array.data(), + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} +void cblasXhemm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, + const size_t m, const size_t n, + const double2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const double2 beta, + 
std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_zhemm(layout, side, triangle, + m, n, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld, + beta_array.data(), + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} + +// Forwards the Netlib BLAS calls for SSYRK/DSYRK/CSYRK/ZSYRK +void cblasXsyrk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, + const size_t n, const size_t k, + const float alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const float beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + cblas_ssyrk(layout, triangle, a_transpose, + n, k, + alpha, + &a_buffer[a_offset], a_ld, + beta, + &c_buffer[c_offset], c_ld); +} +void cblasXsyrk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, + const size_t n, const size_t k, + const double alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const double beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + cblas_dsyrk(layout, triangle, a_transpose, + n, k, + alpha, + &a_buffer[a_offset], a_ld, + beta, + &c_buffer[c_offset], c_ld); +} +void cblasXsyrk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, + const size_t n, const size_t k, + const float2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const float2 beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_csyrk(layout, triangle, a_transpose, + n, k, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + 
beta_array.data(), + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} +void cblasXsyrk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, + const size_t n, const size_t k, + const double2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const double2 beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_zsyrk(layout, triangle, a_transpose, + n, k, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + beta_array.data(), + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} + +// Forwards the Netlib BLAS calls for CHERK/ZHERK +void cblasXherk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, + const size_t n, const size_t k, + const float alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const float beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + cblas_cherk(layout, triangle, a_transpose, + n, k, + alpha, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + beta, + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} +void cblasXherk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, + const size_t n, const size_t k, + const double alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const double beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + cblas_zherk(layout, triangle, a_transpose, + n, k, + alpha, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + beta, + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} + +// Forwards the Netlib BLAS calls for SSYR2K/DSYR2K/CSYR2K/ZSYR2K +void cblasXsyr2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE ab_transpose, + const size_t n, const size_t k, + const float alpha, + const 
std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + cblas_ssyr2k(layout, triangle, ab_transpose, + n, k, + alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld, + beta, + &c_buffer[c_offset], c_ld); +} +void cblasXsyr2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE ab_transpose, + const size_t n, const size_t k, + const double alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + cblas_dsyr2k(layout, triangle, ab_transpose, + n, k, + alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld, + beta, + &c_buffer[c_offset], c_ld); +} +void cblasXsyr2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE ab_transpose, + const size_t n, const size_t k, + const float2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const float2 beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_csyr2k(layout, triangle, ab_transpose, + n, k, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld, + beta_array.data(), + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} +void cblasXsyr2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE ab_transpose, + const size_t n, const size_t k, + const double2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, 
const size_t b_offset, const size_t b_ld, + const double2 beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_zsyr2k(layout, triangle, ab_transpose, + n, k, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld, + beta_array.data(), + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} + +// Forwards the Netlib BLAS calls for CHER2K/ZHER2K +void cblasXher2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE ab_transpose, + const size_t n, const size_t k, + const float2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_cher2k(layout, triangle, ab_transpose, + n, k, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld, + beta, + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} +void cblasXher2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE ab_transpose, + const size_t n, const size_t k, + const double2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_zher2k(layout, triangle, ab_transpose, + n, k, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld, + beta, + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} + +// Forwards the Netlib BLAS calls for 
STRMM/DTRMM/CTRMM/ZTRMM +void cblasXtrmm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t m, const size_t n, + const float alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& b_buffer, const size_t b_offset, const size_t b_ld) { + cblas_strmm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld); +} +void cblasXtrmm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t m, const size_t n, + const double alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& b_buffer, const size_t b_offset, const size_t b_ld) { + cblas_dtrmm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld); +} +void cblasXtrmm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t m, const size_t n, + const float2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& b_buffer, const size_t b_offset, const size_t b_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_ctrmm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld); +} +void cblasXtrmm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t m, const size_t n, + const double2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& b_buffer, const size_t b_offset, const size_t b_ld) { + const auto alpha_array = 
std::vector{alpha.real(), alpha.imag()}; + cblas_ztrmm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld); +} + +// Forwards the Netlib BLAS calls for STRSM/DTRSM/CTRSM/ZTRSM +void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t m, const size_t n, + const float alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& b_buffer, const size_t b_offset, const size_t b_ld) { + cblas_strsm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld); +} +void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t m, const size_t n, + const double alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& b_buffer, const size_t b_offset, const size_t b_ld) { + cblas_dtrsm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld); +} +void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t m, const size_t n, + const float2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& b_buffer, const size_t b_offset, const size_t b_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_ctrsm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld); +} +void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const 
CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t m, const size_t n, + const double2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& b_buffer, const size_t b_offset, const size_t b_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_ztrsm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld); +} + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_WRAPPER_CBLAS_H_ +#endif diff --git a/test/wrapper_clblas.h b/test/wrapper_clblas.h index fb6e83aa..89b708b8 100644 --- a/test/wrapper_clblas.h +++ b/test/wrapper_clblas.h @@ -65,7 +65,7 @@ template clblasStatus clblasXrotmg(cl_mem sd1_buffer, const size_t sd1_offset, cl_mem sd2_buffer, const size_t sd2_offset, cl_mem sx1_buffer, const size_t sx1_offset, - cl_mem sy1_buffer, const size_t sy1_offset, + const cl_mem sy1_buffer, const size_t sy1_offset, cl_mem sparam_buffer, const size_t sparam_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); @@ -73,7 +73,7 @@ template <> clblasStatus clblasXrotmg(cl_mem sd1_buffer, const size_t sd1_offset, cl_mem sd2_buffer, const size_t sd2_offset, cl_mem sx1_buffer, const size_t sx1_offset, - cl_mem sy1_buffer, const size_t sy1_offset, + const cl_mem sy1_buffer, const size_t sy1_offset, cl_mem sparam_buffer, const size_t sparam_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { @@ -88,7 +88,7 @@ template <> clblasStatus clblasXrotmg(cl_mem sd1_buffer, const size_t sd1_offset, cl_mem sd2_buffer, const size_t sd2_offset, cl_mem sx1_buffer, const size_t sx1_offset, - cl_mem sy1_buffer, const size_t sy1_offset, + const cl_mem sy1_buffer, 
const size_t sy1_offset, cl_mem sparam_buffer, const size_t sparam_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { From 1a82861a902e17f15486664b340c50530cce6542 Mon Sep 17 00:00:00 2001 From: cnugteren Date: Sat, 2 Apr 2016 11:58:00 -0700 Subject: [PATCH 19/60] Added support for testing (performance and correctness) against a CPU BLAS library --- CMakeLists.txt | 10 ++++++ include/internal/clpp11.h | 14 ++++---- include/internal/utilities.h | 5 +++ scripts/generator/generator.py | 2 +- test/correctness/testblas.cc | 45 ++++++++++++++---------- test/correctness/testblas.h | 47 ++++++++++++++++--------- test/correctness/tester.cc | 16 ++++++--- test/correctness/tester.h | 6 ++-- test/performance/client.cc | 40 ++++++++++++++++----- test/performance/client.h | 33 ++++++++++++++---- test/routines/level1/xaxpy.h | 46 ++++++++++++++++++------- test/routines/level1/xcopy.h | 46 ++++++++++++++++++------- test/routines/level1/xdot.h | 51 ++++++++++++++++++++------- test/routines/level1/xdotc.h | 51 ++++++++++++++++++++------- test/routines/level1/xdotu.h | 51 ++++++++++++++++++++------- test/routines/level1/xnrm2.h | 46 ++++++++++++++++++------- test/routines/level1/xscal.h | 41 ++++++++++++++++------ test/routines/level1/xswap.h | 47 ++++++++++++++++++------- test/routines/level2/xgbmv.h | 57 ++++++++++++++++++++++-------- test/routines/level2/xgemv.h | 57 ++++++++++++++++++++++-------- test/routines/level2/xger.h | 54 +++++++++++++++++++++-------- test/routines/level2/xgerc.h | 54 +++++++++++++++++++++-------- test/routines/level2/xgeru.h | 54 +++++++++++++++++++++-------- test/routines/level2/xhbmv.h | 57 ++++++++++++++++++++++-------- test/routines/level2/xhemv.h | 57 ++++++++++++++++++++++-------- test/routines/level2/xher.h | 52 ++++++++++++++++++++-------- test/routines/level2/xher2.h | 57 ++++++++++++++++++++++-------- test/routines/level2/xhpmv.h | 57 ++++++++++++++++++++++-------- 
test/routines/level2/xhpr.h | 52 ++++++++++++++++++++-------- test/routines/level2/xhpr2.h | 57 ++++++++++++++++++++++-------- test/routines/level2/xsbmv.h | 57 ++++++++++++++++++++++-------- test/routines/level2/xspmv.h | 57 ++++++++++++++++++++++-------- test/routines/level2/xspr.h | 52 ++++++++++++++++++++-------- test/routines/level2/xspr2.h | 57 ++++++++++++++++++++++-------- test/routines/level2/xsymv.h | 57 ++++++++++++++++++++++-------- test/routines/level2/xsyr.h | 52 ++++++++++++++++++++-------- test/routines/level2/xsyr2.h | 57 ++++++++++++++++++++++-------- test/routines/level2/xtbmv.h | 58 ++++++++++++++++++++++--------- test/routines/level2/xtpmv.h | 58 ++++++++++++++++++++++--------- test/routines/level2/xtrmv.h | 58 ++++++++++++++++++++++--------- test/routines/level3/xgemm.h | 60 +++++++++++++++++++++++--------- test/routines/level3/xhemm.h | 60 +++++++++++++++++++++++--------- test/routines/level3/xher2k.h | 63 +++++++++++++++++++++++++--------- test/routines/level3/xherk.h | 55 +++++++++++++++++++++-------- test/routines/level3/xsymm.h | 60 +++++++++++++++++++++++--------- test/routines/level3/xsyr2k.h | 60 +++++++++++++++++++++++--------- test/routines/level3/xsyrk.h | 55 +++++++++++++++++++++-------- test/routines/level3/xtrmm.h | 61 +++++++++++++++++++++++--------- test/wrapper_cblas.h | 7 ++++ 49 files changed, 1691 insertions(+), 615 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 48aaefe9..21254ded 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -226,10 +226,20 @@ if(TESTS) if(CLBLAS_FOUND) set(REF_INCLUDES ${REF_INCLUDES} ${CLBLAS_INCLUDE_DIRS}) set(REF_LIBRARIES ${REF_LIBRARIES} ${CLBLAS_LIBRARIES}) + if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") + add_definitions(" /DCLBLAST_REF_CLBLAS") + else() + add_definitions(" -DCLBLAST_REF_CLBLAS") + endif() endif() if(CBLAS_FOUND) set(REF_INCLUDES ${REF_INCLUDES} ${CBLAS_INCLUDE_DIRS}) set(REF_LIBRARIES ${REF_LIBRARIES} ${CBLAS_LIBRARIES}) + if("${CMAKE_CXX_COMPILER_ID}" 
STREQUAL "MSVC") + add_definitions(" /DCLBLAST_REF_CBLAS") + else() + add_definitions(" -DCLBLAST_REF_CBLAS") + endif() endif() # Sets the include directories diff --git a/include/internal/clpp11.h b/include/internal/clpp11.h index aac66396..00905ef7 100644 --- a/include/internal/clpp11.h +++ b/include/internal/clpp11.h @@ -465,31 +465,33 @@ class Buffer { } // Copies from device to host: reading the device buffer a-synchronously - void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) { + void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const { if (access_ == BufferAccess::kWriteOnly) { Error("reading from a write-only buffer"); } CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T), host, 0, nullptr, nullptr)); } void ReadAsync(const Queue &queue, const size_t size, std::vector &host, - const size_t offset = 0) { + const size_t offset = 0) const { if (host.size() < size) { Error("target host buffer is too small"); } ReadAsync(queue, size, host.data(), offset); } void ReadAsync(const Queue &queue, const size_t size, BufferHost &host, - const size_t offset = 0) { + const size_t offset = 0) const { if (host.size() < size) { Error("target host buffer is too small"); } ReadAsync(queue, size, host.data(), offset); } // Copies from device to host: reading the device buffer - void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) { + void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const { ReadAsync(queue, size, host, offset); queue.Finish(); } - void Read(const Queue &queue, const size_t size, std::vector &host, const size_t offset = 0) { + void Read(const Queue &queue, const size_t size, std::vector &host, + const size_t offset = 0) const { Read(queue, size, host.data(), offset); } - void Read(const Queue &queue, const size_t size, BufferHost &host, const size_t offset = 0) { + void Read(const 
Queue &queue, const size_t size, BufferHost &host, + const size_t offset = 0) const { Read(queue, size, host.data(), offset); } diff --git a/include/internal/utilities.h b/include/internal/utilities.h index 35f76722..6adc1d0a 100644 --- a/include/internal/utilities.h +++ b/include/internal/utilities.h @@ -35,6 +35,9 @@ using double2 = std::complex; const std::string kKhronosHalfPrecision = "cl_khr_fp16"; const std::string kKhronosDoublePrecision = "cl_khr_fp64"; +// Catched an unknown error +constexpr auto kUnknownError = -999; + // ================================================================================================= // The routine-specific arguments in string form @@ -70,6 +73,7 @@ constexpr auto kArgFraction = "fraction"; // The client-specific arguments in string form constexpr auto kArgCompareclblas = "clblas"; +constexpr auto kArgComparecblas = "cblas"; constexpr auto kArgStepSize = "step"; constexpr auto kArgNumSteps = "num_steps"; constexpr auto kArgNumRuns = "runs"; @@ -128,6 +132,7 @@ struct Arguments { double fraction = 1.0; // Client-specific arguments int compare_clblas = 1; + int compare_cblas = 1; size_t step = 1; size_t num_steps = 0; size_t num_runs = 10; diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 36a9bf40..bdf6b9d7 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -289,7 +289,7 @@ files = [ path_clblast+"/test/wrapper_clblas.h", path_clblast+"/test/wrapper_cblas.h", ] -header_lines = [84, 65, 93, 22, 22, 31] +header_lines = [84, 65, 93, 22, 22, 38] footer_lines = [6, 3, 9, 2, 6, 6] # Checks whether the command-line arguments are valid; exists otherwise diff --git a/test/correctness/testblas.cc b/test/correctness/testblas.cc index 1329b2c5..cc9a5adb 100644 --- a/test/correctness/testblas.cc +++ b/test/correctness/testblas.cc @@ -79,24 +79,6 @@ void TestBlas::TestRegular(std::vector> &test_vector, const st // Iterates over all the to-be-tested combinations of 
arguments for (auto &args: test_vector) { - // Runs the reference clBLAS code - auto x_vec1 = Buffer(context_, args.x_size); - auto y_vec1 = Buffer(context_, args.y_size); - auto a_mat1 = Buffer(context_, args.a_size); - auto b_mat1 = Buffer(context_, args.b_size); - auto c_mat1 = Buffer(context_, args.c_size); - auto ap_mat1 = Buffer(context_, args.ap_size); - auto scalar1 = Buffer(context_, args.scalar_size); - x_vec1.Write(queue_, args.x_size, x_source_); - y_vec1.Write(queue_, args.y_size, y_source_); - a_mat1.Write(queue_, args.a_size, a_source_); - b_mat1.Write(queue_, args.b_size, b_source_); - c_mat1.Write(queue_, args.c_size, c_source_); - ap_mat1.Write(queue_, args.ap_size, ap_source_); - scalar1.Write(queue_, args.scalar_size, scalar_source_); - auto buffers1 = Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1}; - auto status1 = run_reference_(args, buffers1, queue_); - // Runs the CLBlast code auto x_vec2 = Buffer(context_, args.x_size); auto y_vec2 = Buffer(context_, args.y_size); @@ -115,6 +97,33 @@ void TestBlas::TestRegular(std::vector> &test_vector, const st auto buffers2 = Buffers{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2}; auto status2 = run_routine_(args, buffers2, queue_); + #ifndef CLBLAST_REF_CLBLAS + // Don't continue with CBLAS if there are incorrect parameters + if (status2 != StatusCode::kSuccess) { + // TODO: Mark this as a skipped test instead of a succesfull test + TestErrorCodes(status2, status2, args); + continue; + } + #endif + + // Runs the reference BLAS code + auto x_vec1 = Buffer(context_, args.x_size); + auto y_vec1 = Buffer(context_, args.y_size); + auto a_mat1 = Buffer(context_, args.a_size); + auto b_mat1 = Buffer(context_, args.b_size); + auto c_mat1 = Buffer(context_, args.c_size); + auto ap_mat1 = Buffer(context_, args.ap_size); + auto scalar1 = Buffer(context_, args.scalar_size); + x_vec1.Write(queue_, args.x_size, x_source_); + y_vec1.Write(queue_, args.y_size, y_source_); + 
a_mat1.Write(queue_, args.a_size, a_source_); + b_mat1.Write(queue_, args.b_size, b_source_); + c_mat1.Write(queue_, args.c_size, c_source_); + ap_mat1.Write(queue_, args.ap_size, ap_source_); + scalar1.Write(queue_, args.scalar_size, scalar_source_); + auto buffers1 = Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1}; + auto status1 = run_reference_(args, buffers1, queue_); + // Tests for equality of the two status codes if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) { TestErrorCodes(status1, status2, args); diff --git a/test/correctness/testblas.h b/test/correctness/testblas.h index 7c9032bd..8181aaf6 100644 --- a/test/correctness/testblas.h +++ b/test/correctness/testblas.h @@ -68,7 +68,7 @@ class TestBlas: public Tester { static const std::vector kTransposes; // Data-type dependent, see .cc-file // Shorthand for the routine-specific functions passed to the tester - using Routine = std::function&, const Buffers&, Queue&)>; + using Routine = std::function&, Buffers&, Queue&)>; using ResultGet = std::function(const Arguments&, Buffers&, Queue&)>; using ResultIndex = std::function&, const size_t, const size_t)>; using ResultIterator = std::function&)>; @@ -76,8 +76,9 @@ class TestBlas: public Tester { // Constructor, initializes the base class tester and input data TestBlas(int argc, char *argv[], const bool silent, const std::string &name, const std::vector &options, - const Routine run_routine, const Routine run_reference, const ResultGet get_result, - const ResultIndex get_index, const ResultIterator get_id1, const ResultIterator get_id2); + const Routine run_routine, const Routine run_reference, + const ResultGet get_result, const ResultIndex get_index, + const ResultIterator get_id1, const ResultIterator get_id2); // The test functions, taking no inputs void TestRegular(std::vector> &test_vector, const std::string &name); @@ -110,9 +111,17 @@ class TestBlas: public Tester { template void RunTests(int argc, char *argv[], 
const bool silent, const std::string &name) { + // Sets the reference to test against + #ifdef CLBLAST_REF_CLBLAS + const auto reference_routine = C::RunReference1; // clBLAS when available + #else + const auto reference_routine = C::RunReference2; // otherwise CBLAS + #endif + // Creates a tester auto options = C::GetOptions(); - TestBlas tester{argc, argv, silent, name, options, C::RunRoutine, C::RunReference, + TestBlas tester{argc, argv, silent, name, options, + C::RunRoutine, reference_routine, C::DownloadResult, C::GetResultIndex, C::ResultID1, C::ResultID2}; // This variable holds the arguments relevant for this routine @@ -250,23 +259,25 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name } // Creates the arguments vector for the invalid-buffer tests - auto invalid_test_vector = std::vector>{}; - auto i_args = args; - i_args.m = i_args.n = i_args.k = i_args.kl = i_args.ku = tester.kBufferSize; - i_args.a_ld = i_args.b_ld = i_args.c_ld = tester.kBufferSize; - for (auto &x_size: x_sizes) { i_args.x_size = x_size; - for (auto &y_size: y_sizes) { i_args.y_size = y_size; - for (auto &a_size: a_sizes) { i_args.a_size = a_size; - for (auto &b_size: b_sizes) { i_args.b_size = b_size; - for (auto &c_size: c_sizes) { i_args.c_size = c_size; - for (auto &ap_size: ap_sizes) { i_args.ap_size = ap_size; - invalid_test_vector.push_back(i_args); + #ifdef CLBLAST_REF_CLBLAS + auto invalid_test_vector = std::vector>{}; + auto i_args = args; + i_args.m = i_args.n = i_args.k = i_args.kl = i_args.ku = tester.kBufferSize; + i_args.a_ld = i_args.b_ld = i_args.c_ld = tester.kBufferSize; + for (auto &x_size: x_sizes) { i_args.x_size = x_size; + for (auto &y_size: y_sizes) { i_args.y_size = y_size; + for (auto &a_size: a_sizes) { i_args.a_size = a_size; + for (auto &b_size: b_sizes) { i_args.b_size = b_size; + for (auto &c_size: c_sizes) { i_args.c_size = c_size; + for (auto &ap_size: ap_sizes) { i_args.ap_size = ap_size; + 
invalid_test_vector.push_back(i_args); + } } } } } } - } + #endif // Sets the name of this test-case auto names = std::vector{}; @@ -287,7 +298,9 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name // Runs the tests tester.TestRegular(regular_test_vector, case_name); - tester.TestInvalid(invalid_test_vector, case_name); + #ifdef CLBLAST_REF_CLBLAS + tester.TestInvalid(invalid_test_vector, case_name); + #endif } } } diff --git a/test/correctness/tester.cc b/test/correctness/tester.cc index 8169f700..872a131a 100644 --- a/test/correctness/tester.cc +++ b/test/correctness/tester.cc @@ -69,10 +69,12 @@ Tester::Tester(int argc, char *argv[], const bool silent, kUnsupportedPrecision.c_str()); // Initializes clBLAS - auto status = clblasSetup(); - if (status != CL_SUCCESS) { - throw std::runtime_error("clBLAS setup error: "+ToString(static_cast(status))); - } + #ifdef CLBLAST_REF_CLBLAS + auto status = clblasSetup(); + if (status != CL_SUCCESS) { + throw std::runtime_error("clBLAS setup error: "+ToString(static_cast(status))); + } + #endif } // Destructor prints the summary of the test cases and cleans-up the clBLAS library @@ -87,7 +89,11 @@ Tester::~Tester() { fprintf(stdout, " %zu test(s) failed%s\n", tests_failed_, kPrintEnd.c_str()); } fprintf(stdout, "\n"); - clblasTeardown(); + + // Cleans-up clBLAS + #ifdef CLBLAST_REF_CLBLAS + clblasTeardown(); + #endif } // ================================================================================================= diff --git a/test/correctness/tester.h b/test/correctness/tester.h index db714f3d..d489f829 100644 --- a/test/correctness/tester.h +++ b/test/correctness/tester.h @@ -23,7 +23,9 @@ #include // The libraries -#include +#ifdef CLBLAST_REF_CLBLAS + #include +#endif #include "clblast.h" #include "internal/utilities.h" @@ -92,7 +94,7 @@ class Tester { Queue queue_; // Whether or not to run the full test-suite or just a smoke test - bool full_test_; + const bool full_test_; // Retrieves 
the offset values to test with const std::vector GetOffsets() const; diff --git a/test/performance/client.cc b/test/performance/client.cc index 17f54231..56ab8c8d 100644 --- a/test/performance/client.cc +++ b/test/performance/client.cc @@ -24,11 +24,13 @@ namespace clblast { // Constructor template -Client::Client(const Routine run_routine, const Routine run_reference, +Client::Client(const Routine run_routine, + const Routine run_reference1, const Routine run_reference2, const std::vector &options, const GetMetric get_flops, const GetMetric get_bytes): run_routine_(run_routine), - run_reference_(run_reference), + run_reference1_(run_reference1), + run_reference2_(run_reference2), options_(options), get_flops_(get_flops), get_bytes_(get_bytes) { @@ -90,7 +92,16 @@ Arguments Client::ParseArguments(int argc, char *argv[], const GetMetric args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0}); args.device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0}); args.precision = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle); - args.compare_clblas = GetArgument(argc, argv, help, kArgCompareclblas, 1); + #ifdef CLBLAST_REF_CLBLAS + args.compare_clblas = GetArgument(argc, argv, help, kArgCompareclblas, 1); + #else + args.compare_clblas = 0; + #endif + #ifdef CLBLAST_REF_CBLAS + args.compare_cblas = GetArgument(argc, argv, help, kArgComparecblas, 1); + #else + args.compare_cblas = 0; + #endif args.step = GetArgument(argc, argv, help, kArgStepSize, size_t{1}); args.num_steps = GetArgument(argc, argv, help, kArgNumSteps, size_t{0}); args.num_runs = GetArgument(argc, argv, help, kArgNumRuns, size_t{10}); @@ -120,7 +131,9 @@ void Client::PerformanceTest(Arguments &args, const SetMetric set_sizes) auto device = Device(platform, args.device_id); auto context = Context(device); auto queue = Queue(context, device); - if (args.compare_clblas) { clblasSetup(); } + #ifdef CLBLAST_REF_CLBLAS + if (args.compare_clblas) { clblasSetup(); } + #endif 
// Iterates over all "num_step" values jumping by "step" each time auto s = size_t{0}; @@ -167,9 +180,13 @@ void Client::PerformanceTest(Arguments &args, const SetMetric set_sizes) auto ms_clblast = TimedExecution(args.num_runs, args, buffers, queue, run_routine_, "CLBlast"); timings.push_back(std::pair("CLBlast", ms_clblast)); if (args.compare_clblas) { - auto ms_clblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference_, "clBLAS"); + auto ms_clblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference1_, "clBLAS"); timings.push_back(std::pair("clBLAS", ms_clblas)); } + if (args.compare_cblas) { + auto ms_cblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference2_, "CPU BLAS"); + timings.push_back(std::pair("CPU BLAS", ms_cblas)); + } // Prints the performance of the tested libraries PrintTableRow(args, timings); @@ -186,7 +203,9 @@ void Client::PerformanceTest(Arguments &args, const SetMetric set_sizes) } // Cleans-up and returns - if (args.compare_clblas) { clblasTeardown(); } + #ifdef CLBLAST_REF_CLBLAS + if (args.compare_clblas) { clblasTeardown(); } + #endif } // ================================================================================================= @@ -196,14 +215,17 @@ void Client::PerformanceTest(Arguments &args, const SetMetric set_sizes) // value found in the vector of timing results. The return value is in milliseconds. template double Client::TimedExecution(const size_t num_runs, const Arguments &args, - const Buffers &buffers, Queue &queue, + Buffers &buffers, Queue &queue, Routine run_blas, const std::string &library_name) { auto timings = std::vector(num_runs); for (auto &timing: timings) { auto start_time = std::chrono::steady_clock::now(); // Executes the main computation - auto status = run_blas(args, buffers, queue); + auto status = StatusCode::kSuccess; + try { + status = run_blas(args, buffers, queue); + } catch (...) 
{ status = static_cast(kUnknownError); } if (status != StatusCode::kSuccess) { throw std::runtime_error(library_name+" error: "+ToString(static_cast(status))); } @@ -226,6 +248,7 @@ void Client::PrintTableHeader(const Arguments& args) { for (auto i=size_t{0}; i"); if (args.compare_clblas) { fprintf(stdout, " | <-- clBLAS -->"); } + if (args.compare_cblas) { fprintf(stdout, " | <-- CPU BLAS -->"); } fprintf(stdout, " |\n"); } @@ -233,6 +256,7 @@ void Client::PrintTableHeader(const Arguments& args) { for (auto &option: options_) { fprintf(stdout, "%9s;", option.c_str()); } fprintf(stdout, "%9s;%9s;%9s", "ms_1", "GFLOPS_1", "GBs_1"); if (args.compare_clblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_2", "GFLOPS_2", "GBs_2"); } + if (args.compare_cblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_3", "GFLOPS_3", "GBs_3"); } fprintf(stdout, "\n"); } diff --git a/test/performance/client.h b/test/performance/client.h index 5805b8a5..8d0597d7 100644 --- a/test/performance/client.h +++ b/test/performance/client.h @@ -26,7 +26,9 @@ #include // The libraries to test -#include +#ifdef CLBLAST_REF_CLBLAS + #include +#endif #include "clblast.h" #include "internal/utilities.h" @@ -40,12 +42,12 @@ class Client { public: // Shorthand for the routine-specific functions passed to the tester - using Routine = std::function&, const Buffers&, Queue&)>; + using Routine = std::function&, Buffers&, Queue&)>; using SetMetric = std::function&)>; using GetMetric = std::function&)>; // The constructor - Client(const Routine run_routine, const Routine run_reference, + Client(const Routine run_routine, const Routine run_reference1, const Routine run_reference2, const std::vector &options, const GetMetric get_flops, const GetMetric get_bytes); @@ -61,7 +63,7 @@ class Client { private: // Runs a function a given number of times and returns the execution time of the shortest instance - double TimedExecution(const size_t num_runs, const Arguments &args, const Buffers &buffers, + double TimedExecution(const 
size_t num_runs, const Arguments &args, Buffers &buffers, Queue &queue, Routine run_blas, const std::string &library_name); // Prints the header of a performance-data table @@ -73,7 +75,8 @@ class Client { // The routine-specific functions passed to the tester const Routine run_routine_; - const Routine run_reference_; + const Routine run_reference1_; + const Routine run_reference2_; const std::vector options_; const GetMetric get_flops_; const GetMetric get_bytes_; @@ -81,13 +84,31 @@ class Client { // ================================================================================================= +// Bogus reference function, in case a comparison library is not available +template +static StatusCode ReferenceNotAvailable(const Arguments &, Buffers &, Queue &) { + return StatusCode::kNotImplemented; +} + // The interface to the performance client. This is a separate function in the header such that it // is automatically compiled for each routine, templated by the parameter "C". template void RunClient(int argc, char *argv[]) { + // Sets the reference to test against + #ifdef CLBLAST_REF_CLBLAS + const auto reference1 = C::RunReference1; // clBLAS when available + #else + const auto reference1 = ReferenceNotAvailable; + #endif + #ifdef CLBLAST_REF_CBLAS + const auto reference2 = C::RunReference2; // CBLAS when available + #else + const auto reference2 = ReferenceNotAvailable; + #endif + // Creates a new client - auto client = Client(C::RunRoutine, C::RunReference, C::GetOptions(), + auto client = Client(C::RunRoutine, reference1, reference2, C::GetOptions(), C::GetFlops, C::GetBytes); // Simple command line argument parser with defaults diff --git a/test/routines/level1/xaxpy.h b/test/routines/level1/xaxpy.h index 50480f46..8f72f570 100644 --- a/test/routines/level1/xaxpy.h +++ b/test/routines/level1/xaxpy.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS 
+ #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -65,7 +70,7 @@ class TestXaxpy { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Axpy(args.n, args.alpha, @@ -77,16 +82,33 @@ class TestXaxpy { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXaxpy(args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXaxpy(args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXaxpy(args.n, 
args.alpha, + x_vec_cpu, args.x_offset, args.x_inc, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level1/xcopy.h b/test/routines/level1/xcopy.h index 8d324d88..0527ca6a 100644 --- a/test/routines/level1/xcopy.h +++ b/test/routines/level1/xcopy.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -64,7 +69,7 @@ class TestXcopy { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Copy(args.n, @@ -76,16 +81,33 @@ class TestXcopy { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXcopy(args.n, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = 
clblasXcopy(args.n, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXcopy(args.n, + x_vec_cpu, args.x_offset, args.x_inc, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level1/xdot.h b/test/routines/level1/xdot.h index 04669f52..d1c34c0f 100644 --- a/test/routines/level1/xdot.h +++ b/test/routines/level1/xdot.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -68,7 +73,7 @@ class TestXdot { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Dot(args.n, @@ -81,17 +86,37 @@ class TestXdot { } // 
Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXdot(args.n, - buffers.scalar(), args.dot_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXdot(args.n, + buffers.scalar(), args.dot_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector scalar_cpu(args.scalar_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXdot(args.n, + scalar_cpu, args.dot_offset, + x_vec_cpu, args.x_offset, args.x_inc, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level1/xdotc.h b/test/routines/level1/xdotc.h index 
e5b42ef4..a2742cb0 100644 --- a/test/routines/level1/xdotc.h +++ b/test/routines/level1/xdotc.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -68,7 +73,7 @@ class TestXdotc { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Dotc(args.n, @@ -81,17 +86,37 @@ class TestXdotc { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXdotc(args.n, - buffers.scalar(), args.dot_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXdotc(args.n, + buffers.scalar(), args.dot_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode 
RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector scalar_cpu(args.scalar_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXdotc(args.n, + scalar_cpu, args.dot_offset, + x_vec_cpu, args.x_offset, args.x_inc, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level1/xdotu.h b/test/routines/level1/xdotu.h index 6430148c..06ce979e 100644 --- a/test/routines/level1/xdotu.h +++ b/test/routines/level1/xdotu.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -68,7 +73,7 @@ class TestXdotu { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Dotu(args.n, @@ -81,17 +86,37 @@ class TestXdotu { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = 
queue(); - auto event = cl_event{}; - auto status = clblasXdotu(args.n, - buffers.scalar(), args.dot_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXdotu(args.n, + buffers.scalar(), args.dot_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector scalar_cpu(args.scalar_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXdotu(args.n, + scalar_cpu, args.dot_offset, + x_vec_cpu, args.x_offset, args.x_inc, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level1/xnrm2.h b/test/routines/level1/xnrm2.h index e3f77ee4..d8a0de4e 100644 --- a/test/routines/level1/xnrm2.h +++ b/test/routines/level1/xnrm2.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + 
#include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -64,7 +69,7 @@ class TestXnrm2 { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Nrm2(args.n, @@ -76,16 +81,33 @@ class TestXnrm2 { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXnrm2(args.n, - buffers.scalar(), args.nrm2_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXnrm2(args.n, + buffers.scalar(), args.nrm2_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector scalar_cpu(args.scalar_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); 
+ cblasXnrm2(args.n, + scalar_cpu, args.nrm2_offset, + x_vec_cpu, args.x_offset, args.x_inc); + buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level1/xscal.h b/test/routines/level1/xscal.h index d990afcc..35855dbd 100644 --- a/test/routines/level1/xscal.h +++ b/test/routines/level1/xscal.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -61,7 +66,7 @@ class TestXscal { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Scal(args.n, args.alpha, @@ -72,15 +77,29 @@ class TestXscal { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXscal(args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = 
clblasXscal(args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector x_vec_cpu(args.x_size, static_cast(0)); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + cblasXscal(args.n, args.alpha, + x_vec_cpu, args.x_offset, args.x_inc); + buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level1/xswap.h b/test/routines/level1/xswap.h index 2096a2c3..ae69d3be 100644 --- a/test/routines/level1/xswap.h +++ b/test/routines/level1/xswap.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -64,7 +69,7 @@ class TestXswap { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Swap(args.n, @@ -76,16 +81,34 @@ class TestXswap { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue 
&queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXswap(args.n, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXswap(args.n, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXswap(args.n, + x_vec_cpu, args.x_offset, args.x_inc, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xgbmv.h b/test/routines/level2/xgbmv.h index 0e238804..b875075d 100644 --- a/test/routines/level2/xgbmv.h +++ b/test/routines/level2/xgbmv.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // 
================================================================================================= @@ -76,7 +81,7 @@ class TestXgbmv { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Gbmv(args.layout, args.a_transpose, @@ -90,19 +95,41 @@ class TestXgbmv { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXgbmv(static_cast(args.layout), - static_cast(args.a_transpose), - args.m, args.n, args.kl, args.ku, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXgbmv(static_cast(args.layout), + static_cast(args.a_transpose), + args.m, args.n, args.kl, args.ku, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { 
+ std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXgbmv(convertToCBLAS(args.layout), + convertToCBLAS(args.a_transpose), + args.m, args.n, args.kl, args.ku, args.alpha, + a_mat_cpu, args.a_offset, args.a_ld, + x_vec_cpu, args.x_offset, args.x_inc, args.beta, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xgemv.h b/test/routines/level2/xgemv.h index 2924d498..a70ccd34 100644 --- a/test/routines/level2/xgemv.h +++ b/test/routines/level2/xgemv.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -76,7 +81,7 @@ class TestXgemv { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Gemv(args.layout, args.a_transpose, @@ -90,19 +95,41 @@ class TestXgemv { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const 
Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXgemv(static_cast(args.layout), - static_cast(args.a_transpose), - args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXgemv(static_cast(args.layout), + static_cast(args.a_transpose), + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXgemv(convertToCBLAS(args.layout), + convertToCBLAS(args.a_transpose), + args.m, args.n, args.alpha, + a_mat_cpu, args.a_offset, args.a_ld, + x_vec_cpu, args.x_offset, args.x_inc, args.beta, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector 
DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xger.h b/test/routines/level2/xger.h index 98296e92..32c2a505 100644 --- a/test/routines/level2/xger.h +++ b/test/routines/level2/xger.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -72,7 +77,7 @@ class TestXger { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Ger(args.layout, @@ -86,18 +91,39 @@ class TestXger { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXger(static_cast(args.layout), - args.m, args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXger(static_cast(args.layout), + args.m, args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + 1, 
&queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXger(convertToCBLAS(args.layout), + args.m, args.n, args.alpha, + x_vec_cpu, args.x_offset, args.x_inc, + y_vec_cpu, args.y_offset, args.y_inc, + a_mat_cpu, args.a_offset, args.a_ld); + buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xgerc.h b/test/routines/level2/xgerc.h index 77258d92..4b6954f6 100644 --- a/test/routines/level2/xgerc.h +++ b/test/routines/level2/xgerc.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -72,7 +77,7 @@ class TestXgerc { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = 
cl_event{}; auto status = Gerc(args.layout, @@ -86,18 +91,39 @@ class TestXgerc { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXgerc(static_cast(args.layout), - args.m, args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXgerc(static_cast(args.layout), + args.m, args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXgerc(convertToCBLAS(args.layout), + args.m, args.n, args.alpha, + x_vec_cpu, args.x_offset, args.x_inc, + y_vec_cpu, args.y_offset, args.y_inc, + a_mat_cpu, args.a_offset, args.a_ld); + buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to 
download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xgeru.h b/test/routines/level2/xgeru.h index e5f5f235..295e69e5 100644 --- a/test/routines/level2/xgeru.h +++ b/test/routines/level2/xgeru.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -72,7 +77,7 @@ class TestXgeru { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Geru(args.layout, @@ -86,18 +91,39 @@ class TestXgeru { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXgeru(static_cast(args.layout), - args.m, args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXgeru(static_cast(args.layout), + args.m, args.n, args.alpha, + buffers.x_vec(), args.x_offset, 
args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXgeru(convertToCBLAS(args.layout), + args.m, args.n, args.alpha, + x_vec_cpu, args.x_offset, args.x_inc, + y_vec_cpu, args.y_offset, args.y_inc, + a_mat_cpu, args.a_offset, args.a_ld); + buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xhbmv.h b/test/routines/level2/xhbmv.h index 34e1502f..e0bdc4da 100644 --- a/test/routines/level2/xhbmv.h +++ b/test/routines/level2/xhbmv.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -70,7 +75,7 @@ class TestXhbmv { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode 
RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Hbmv(args.layout, args.triangle, @@ -84,19 +89,41 @@ class TestXhbmv { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXhbmv(static_cast(args.layout), - static_cast(args.triangle), - args.n, args.kl, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXhbmv(static_cast(args.layout), + static_cast(args.triangle), + args.n, args.kl, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXhbmv(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.kl, args.alpha, 
+ a_mat_cpu, args.a_offset, args.a_ld, + x_vec_cpu, args.x_offset, args.x_inc, args.beta, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xhemv.h b/test/routines/level2/xhemv.h index 80e22157..fa242961 100644 --- a/test/routines/level2/xhemv.h +++ b/test/routines/level2/xhemv.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -70,7 +75,7 @@ class TestXhemv { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Hemv(args.layout, args.triangle, @@ -84,19 +89,41 @@ class TestXhemv { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXhemv(static_cast(args.layout), - static_cast(args.triangle), - args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + 
#ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXhemv(static_cast(args.layout), + static_cast(args.triangle), + args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXhemv(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.alpha, + a_mat_cpu, args.a_offset, args.a_ld, + x_vec_cpu, args.x_offset, args.x_inc, args.beta, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xher.h b/test/routines/level2/xher.h index 53c4200f..7d0e8cc3 100644 --- a/test/routines/level2/xher.h +++ b/test/routines/level2/xher.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // 
================================================================================================= @@ -66,7 +71,7 @@ class TestXher { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Her(args.layout, args.triangle, @@ -79,18 +84,37 @@ class TestXher { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXher(static_cast(args.layout), - static_cast(args.triangle), - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.a_mat(), args.a_offset, args.a_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXher(static_cast(args.layout), + static_cast(args.triangle), + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + 
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + cblasXher(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.alpha, + x_vec_cpu, args.x_offset, args.x_inc, + a_mat_cpu, args.a_offset, args.a_ld); + buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xher2.h b/test/routines/level2/xher2.h index c12ff827..445bba74 100644 --- a/test/routines/level2/xher2.h +++ b/test/routines/level2/xher2.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -70,7 +75,7 @@ class TestXher2 { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Her2(args.layout, args.triangle, @@ -84,19 +89,41 @@ class TestXher2 { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXher2(static_cast(args.layout), - static_cast(args.triangle), - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, - 1, &queue_plain, 0, 
nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXher2(static_cast(args.layout), + static_cast(args.triangle), + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXher2(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.alpha, + x_vec_cpu, args.x_offset, args.x_inc, + y_vec_cpu, args.y_offset, args.y_inc, + a_mat_cpu, args.a_offset, args.a_ld); + buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xhpmv.h b/test/routines/level2/xhpmv.h index 8fd85b62..406e564f 100644 --- a/test/routines/level2/xhpmv.h +++ b/test/routines/level2/xhpmv.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include 
"wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -70,7 +75,7 @@ class TestXhpmv { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Hpmv(args.layout, args.triangle, @@ -84,19 +89,41 @@ class TestXhpmv { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXhpmv(static_cast(args.layout), - static_cast(args.triangle), - args.n, args.alpha, - buffers.ap_mat(), args.ap_offset, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXhpmv(static_cast(args.layout), + static_cast(args.triangle), + args.n, args.alpha, + buffers.ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector 
ap_mat_cpu(args.ap_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXhpmv(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.alpha, + ap_mat_cpu, args.ap_offset, + x_vec_cpu, args.x_offset, args.x_inc, args.beta, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xhpr.h b/test/routines/level2/xhpr.h index 03599ddc..6f56d3f3 100644 --- a/test/routines/level2/xhpr.h +++ b/test/routines/level2/xhpr.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -66,7 +71,7 @@ class TestXhpr { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Hpr(args.layout, args.triangle, @@ -79,18 +84,37 @@ class TestXhpr { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); 
- auto event = cl_event{}; - auto status = clblasXhpr(static_cast(args.layout), - static_cast(args.triangle), - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.ap_mat(), args.ap_offset, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXhpr(static_cast(args.layout), + static_cast(args.triangle), + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.ap_mat(), args.ap_offset, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector ap_mat_cpu(args.ap_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + cblasXhpr(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.alpha, + x_vec_cpu, args.x_offset, args.x_inc, + ap_mat_cpu, args.ap_offset); + buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xhpr2.h b/test/routines/level2/xhpr2.h index 68fbc76d..43889cb9 100644 --- a/test/routines/level2/xhpr2.h +++ b/test/routines/level2/xhpr2.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef 
CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -70,7 +75,7 @@ class TestXhpr2 { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Hpr2(args.layout, args.triangle, @@ -84,19 +89,41 @@ class TestXhpr2 { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXhpr2(static_cast(args.layout), - static_cast(args.triangle), - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.ap_mat(), args.ap_offset, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXhpr2(static_cast(args.layout), + static_cast(args.triangle), + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.ap_mat(), args.ap_offset, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector 
ap_mat_cpu(args.ap_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXhpr2(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.alpha, + x_vec_cpu, args.x_offset, args.x_inc, + y_vec_cpu, args.y_offset, args.y_inc, + ap_mat_cpu, args.ap_offset); + buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xsbmv.h b/test/routines/level2/xsbmv.h index 5bc17e49..9a5c5140 100644 --- a/test/routines/level2/xsbmv.h +++ b/test/routines/level2/xsbmv.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -70,7 +75,7 @@ class TestXsbmv { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Sbmv(args.layout, args.triangle, @@ -84,19 +89,41 @@ class TestXsbmv { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); 
- auto event = cl_event{}; - auto status = clblasXsbmv(static_cast(args.layout), - static_cast(args.triangle), - args.n, args.kl, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXsbmv(static_cast(args.layout), + static_cast(args.triangle), + args.n, args.kl, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXsbmv(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.kl, args.alpha, + a_mat_cpu, args.a_offset, args.a_ld, + x_vec_cpu, args.x_offset, args.x_inc, args.beta, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff 
--git a/test/routines/level2/xspmv.h b/test/routines/level2/xspmv.h index e335da42..913af0cd 100644 --- a/test/routines/level2/xspmv.h +++ b/test/routines/level2/xspmv.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -70,7 +75,7 @@ class TestXspmv { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Spmv(args.layout, args.triangle, @@ -84,19 +89,41 @@ class TestXspmv { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXspmv(static_cast(args.layout), - static_cast(args.triangle), - args.n, args.alpha, - buffers.ap_mat(), args.ap_offset, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXspmv(static_cast(args.layout), + static_cast(args.triangle), + args.n, args.alpha, + buffers.ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, 
&queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector ap_mat_cpu(args.ap_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXspmv(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.alpha, + ap_mat_cpu, args.ap_offset, + x_vec_cpu, args.x_offset, args.x_inc, args.beta, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xspr.h b/test/routines/level2/xspr.h index 819b1ca8..bab5c541 100644 --- a/test/routines/level2/xspr.h +++ b/test/routines/level2/xspr.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -66,7 +71,7 @@ class TestXspr { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain 
= queue(); auto event = cl_event{}; auto status = Spr(args.layout, args.triangle, @@ -79,18 +84,37 @@ class TestXspr { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXspr(static_cast(args.layout), - static_cast(args.triangle), - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.ap_mat(), args.ap_offset, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXspr(static_cast(args.layout), + static_cast(args.triangle), + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.ap_mat(), args.ap_offset, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector ap_mat_cpu(args.ap_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + cblasXspr(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.alpha, + x_vec_cpu, args.x_offset, args.x_inc, + ap_mat_cpu, args.ap_offset); + buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue 
&queue) { diff --git a/test/routines/level2/xspr2.h b/test/routines/level2/xspr2.h index 43d66c9e..41a04cc0 100644 --- a/test/routines/level2/xspr2.h +++ b/test/routines/level2/xspr2.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -70,7 +75,7 @@ class TestXspr2 { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Spr2(args.layout, args.triangle, @@ -84,19 +89,41 @@ class TestXspr2 { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXspr2(static_cast(args.layout), - static_cast(args.triangle), - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.ap_mat(), args.ap_offset, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXspr2(static_cast(args.layout), + static_cast(args.triangle), + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.ap_mat(), args.ap_offset, + 1, &queue_plain, 
0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector ap_mat_cpu(args.ap_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXspr2(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.alpha, + x_vec_cpu, args.x_offset, args.x_inc, + y_vec_cpu, args.y_offset, args.y_inc, + ap_mat_cpu, args.ap_offset); + buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xsymv.h b/test/routines/level2/xsymv.h index 13473a3e..0576bc1f 100644 --- a/test/routines/level2/xsymv.h +++ b/test/routines/level2/xsymv.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -70,7 +75,7 @@ class TestXsymv { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto 
event = cl_event{}; auto status = Symv(args.layout, args.triangle, @@ -84,19 +89,41 @@ class TestXsymv { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXsymv(static_cast(args.layout), - static_cast(args.triangle), - args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXsymv(static_cast(args.layout), + static_cast(args.triangle), + args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXsymv(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.alpha, + a_mat_cpu, args.a_offset, args.a_ld, + x_vec_cpu, args.x_offset, args.x_inc, args.beta, + y_vec_cpu, args.y_offset, 
args.y_inc); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xsyr.h b/test/routines/level2/xsyr.h index 66b75c0c..062eea5a 100644 --- a/test/routines/level2/xsyr.h +++ b/test/routines/level2/xsyr.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -66,7 +71,7 @@ class TestXsyr { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Syr(args.layout, args.triangle, @@ -79,18 +84,37 @@ class TestXsyr { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXsyr(static_cast(args.layout), - static_cast(args.triangle), - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.a_mat(), args.a_offset, args.a_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status 
= clblasXsyr(static_cast(args.layout), + static_cast(args.triangle), + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + cblasXsyr(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.alpha, + x_vec_cpu, args.x_offset, args.x_inc, + a_mat_cpu, args.a_offset, args.a_ld); + buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xsyr2.h b/test/routines/level2/xsyr2.h index 32497a61..50bc3cea 100644 --- a/test/routines/level2/xsyr2.h +++ b/test/routines/level2/xsyr2.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -70,7 +75,7 @@ class TestXsyr2 { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers 
&buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Syr2(args.layout, args.triangle, @@ -84,19 +89,41 @@ class TestXsyr2 { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXsyr2(static_cast(args.layout), - static_cast(args.triangle), - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXsyr2(static_cast(args.layout), + static_cast(args.triangle), + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXsyr2(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.alpha, + x_vec_cpu, args.x_offset, args.x_inc, + y_vec_cpu, args.y_offset, args.y_inc, + 
a_mat_cpu, args.a_offset, args.a_ld); + buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xtbmv.h b/test/routines/level2/xtbmv.h index dbdddb65..600b4131 100644 --- a/test/routines/level2/xtbmv.h +++ b/test/routines/level2/xtbmv.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -65,7 +70,7 @@ class TestXtbmv { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Tbmv(args.layout, args.triangle, args.a_transpose, args.diagonal, @@ -78,20 +83,41 @@ class TestXtbmv { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXtbmv(static_cast(args.layout), - static_cast(args.triangle), - static_cast(args.a_transpose), - static_cast(args.diagonal), - args.n, args.kl, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode 
RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXtbmv(static_cast(args.layout), + static_cast(args.triangle), + static_cast(args.a_transpose), + static_cast(args.diagonal), + args.n, args.kl, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + cblasXtbmv(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + convertToCBLAS(args.a_transpose), + convertToCBLAS(args.diagonal), + args.n, args.kl, + a_mat_cpu, args.a_offset, args.a_ld, + x_vec_cpu, args.x_offset, args.x_inc); + buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xtpmv.h b/test/routines/level2/xtpmv.h index 4425765e..fc0cf393 100644 --- a/test/routines/level2/xtpmv.h +++ b/test/routines/level2/xtpmv.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -65,7 +70,7 @@ class TestXtpmv { static Transposes 
GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Tpmv(args.layout, args.triangle, args.a_transpose, args.diagonal, @@ -78,20 +83,41 @@ class TestXtpmv { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXtpmv(static_cast(args.layout), - static_cast(args.triangle), - static_cast(args.a_transpose), - static_cast(args.diagonal), - args.n, - buffers.ap_mat(), args.ap_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXtpmv(static_cast(args.layout), + static_cast(args.triangle), + static_cast(args.a_transpose), + static_cast(args.diagonal), + args.n, + buffers.ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector ap_mat_cpu(args.ap_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, 
x_vec_cpu); + cblasXtpmv(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + convertToCBLAS(args.a_transpose), + convertToCBLAS(args.diagonal), + args.n, + ap_mat_cpu, args.ap_offset, + x_vec_cpu, args.x_offset, args.x_inc); + buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xtrmv.h b/test/routines/level2/xtrmv.h index 1c0c6fd8..fec72124 100644 --- a/test/routines/level2/xtrmv.h +++ b/test/routines/level2/xtrmv.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -65,7 +70,7 @@ class TestXtrmv { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Trmv(args.layout, args.triangle, args.a_transpose, args.diagonal, @@ -78,20 +83,41 @@ class TestXtrmv { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXtrmv(static_cast(args.layout), - static_cast(args.triangle), - static_cast(args.a_transpose), - static_cast(args.diagonal), - args.n, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), 
args.x_offset, args.x_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXtrmv(static_cast(args.layout), + static_cast(args.triangle), + static_cast(args.a_transpose), + static_cast(args.diagonal), + args.n, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + cblasXtrmv(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + convertToCBLAS(args.a_transpose), + convertToCBLAS(args.diagonal), + args.n, + a_mat_cpu, args.a_offset, args.a_ld, + x_vec_cpu, args.x_offset, args.x_inc); + buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level3/xgemm.h b/test/routines/level3/xgemm.h index 695b58b7..49a92936 100644 --- a/test/routines/level3/xgemm.h +++ b/test/routines/level3/xgemm.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // 
================================================================================================= @@ -78,7 +83,7 @@ class TestXgemm { static Transposes GetBTransposes(const Transposes &all) { return all; } // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Gemm(args.layout, args.a_transpose, args.b_transpose, @@ -92,20 +97,43 @@ class TestXgemm { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXgemm(static_cast(args.layout), - static_cast(args.a_transpose), - static_cast(args.b_transpose), - args.m, args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXgemm(static_cast(args.layout), + static_cast(args.a_transpose), + static_cast(args.b_transpose), + args.m, args.n, args.k, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments 
&args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector b_mat_cpu(args.b_size, static_cast(0)); + std::vector c_mat_cpu(args.c_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); + buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + cblasXgemm(convertToCBLAS(args.layout), + convertToCBLAS(args.a_transpose), + convertToCBLAS(args.b_transpose), + args.m, args.n, args.k, args.alpha, + a_mat_cpu, args.a_offset, args.a_ld, + b_mat_cpu, args.b_offset, args.b_ld, args.beta, + c_mat_cpu, args.c_offset, args.c_ld); + buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level3/xhemm.h b/test/routines/level3/xhemm.h index 7b7134e5..40538417 100644 --- a/test/routines/level3/xhemm.h +++ b/test/routines/level3/xhemm.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -78,7 +83,7 @@ class TestXhemm { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Hemm(args.layout, args.side, args.triangle, @@ -92,20 +97,43 @@ class TestXhemm { } // Describes how to run the clBLAS routine (for correctness/performance 
comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXhemm(static_cast(args.layout), - static_cast(args.side), - static_cast(args.triangle), - args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXhemm(static_cast(args.layout), + static_cast(args.side), + static_cast(args.triangle), + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector b_mat_cpu(args.b_size, static_cast(0)); + std::vector c_mat_cpu(args.c_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); + buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + cblasXhemm(convertToCBLAS(args.layout), + convertToCBLAS(args.side), + convertToCBLAS(args.triangle), + args.m, args.n, args.alpha, + a_mat_cpu, args.a_offset, args.a_ld, + b_mat_cpu, args.b_offset, args.b_ld, args.beta, + c_mat_cpu, args.c_offset, args.c_ld); + buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + return 
StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level3/xher2k.h b/test/routines/level3/xher2k.h index a7fbfcbe..1ea2ad36 100644 --- a/test/routines/level3/xher2k.h +++ b/test/routines/level3/xher2k.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -76,7 +81,7 @@ class TestXher2k { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto alpha2 = T{args.alpha, args.alpha}; @@ -91,21 +96,45 @@ class TestXher2k { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto alpha2 = T{args.alpha, args.alpha}; - auto status = clblasXher2k(static_cast(args.layout), - static_cast(args.triangle), - static_cast(args.a_transpose), - args.n, args.k, alpha2, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + 
auto queue_plain = queue(); + auto event = cl_event{}; + auto alpha2 = T{args.alpha, args.alpha}; + auto status = clblasXher2k(static_cast(args.layout), + static_cast(args.triangle), + static_cast(args.a_transpose), + args.n, args.k, alpha2, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector b_mat_cpu(args.b_size, static_cast(0)); + std::vector c_mat_cpu(args.c_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); + buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + auto alpha2 = T{args.alpha, args.alpha}; + cblasXher2k(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + convertToCBLAS(args.a_transpose), + args.n, args.k, alpha2, + a_mat_cpu, args.a_offset, args.a_ld, + b_mat_cpu, args.b_offset, args.b_ld, args.beta, + c_mat_cpu, args.c_offset, args.c_ld); + buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level3/xherk.h b/test/routines/level3/xherk.h index f097672f..75a7c405 100644 --- a/test/routines/level3/xherk.h +++ b/test/routines/level3/xherk.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif 
namespace clblast { // ================================================================================================= @@ -69,7 +74,7 @@ class TestXherk { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Herk(args.layout, args.triangle, args.a_transpose, @@ -82,19 +87,39 @@ class TestXherk { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXherk(static_cast(args.layout), - static_cast(args.triangle), - static_cast(args.a_transpose), - args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXherk(static_cast(args.layout), + static_cast(args.triangle), + static_cast(args.a_transpose), + args.n, args.k, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector 
a_mat_cpu(args.a_size, static_cast(0)); + std::vector c_mat_cpu(args.c_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + cblasXherk(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + convertToCBLAS(args.a_transpose), + args.n, args.k, args.alpha, + a_mat_cpu, args.a_offset, args.a_ld, args.beta, + c_mat_cpu, args.c_offset, args.c_ld); + buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level3/xsymm.h b/test/routines/level3/xsymm.h index 03cf5de9..f867c238 100644 --- a/test/routines/level3/xsymm.h +++ b/test/routines/level3/xsymm.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -78,7 +83,7 @@ class TestXsymm { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Symm(args.layout, args.side, args.triangle, @@ -92,20 +97,43 @@ class TestXsymm { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXsymm(static_cast(args.layout), 
- static_cast(args.side), - static_cast(args.triangle), - args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXsymm(static_cast(args.layout), + static_cast(args.side), + static_cast(args.triangle), + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector b_mat_cpu(args.b_size, static_cast(0)); + std::vector c_mat_cpu(args.c_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); + buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + cblasXsymm(convertToCBLAS(args.layout), + convertToCBLAS(args.side), + convertToCBLAS(args.triangle), + args.m, args.n, args.alpha, + a_mat_cpu, args.a_offset, args.a_ld, + b_mat_cpu, args.b_offset, args.b_ld, args.beta, + c_mat_cpu, args.c_offset, args.c_ld); + buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git 
a/test/routines/level3/xsyr2k.h b/test/routines/level3/xsyr2k.h index 89e77f83..be4e1851 100644 --- a/test/routines/level3/xsyr2k.h +++ b/test/routines/level3/xsyr2k.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -76,7 +81,7 @@ class TestXsyr2k { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Syr2k(args.layout, args.triangle, args.a_transpose, @@ -90,20 +95,43 @@ class TestXsyr2k { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXsyr2k(static_cast(args.layout), - static_cast(args.triangle), - static_cast(args.a_transpose), - args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXsyr2k(static_cast(args.layout), + static_cast(args.triangle), + static_cast(args.a_transpose), + args.n, args.k, args.alpha, + buffers.a_mat(), args.a_offset, 
args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector b_mat_cpu(args.b_size, static_cast(0)); + std::vector c_mat_cpu(args.c_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); + buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + cblasXsyr2k(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + convertToCBLAS(args.a_transpose), + args.n, args.k, args.alpha, + a_mat_cpu, args.a_offset, args.a_ld, + b_mat_cpu, args.b_offset, args.b_ld, args.beta, + c_mat_cpu, args.c_offset, args.c_ld); + buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level3/xsyrk.h b/test/routines/level3/xsyrk.h index 8dacb5b3..7675e2aa 100644 --- a/test/routines/level3/xsyrk.h +++ b/test/routines/level3/xsyrk.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -69,7 +74,7 @@ class TestXsyrk { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode 
RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Syrk(args.layout, args.triangle, args.a_transpose, @@ -82,19 +87,39 @@ class TestXsyrk { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXsyrk(static_cast(args.layout), - static_cast(args.triangle), - static_cast(args.a_transpose), - args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXsyrk(static_cast(args.layout), + static_cast(args.triangle), + static_cast(args.a_transpose), + args.n, args.k, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector c_mat_cpu(args.c_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + cblasXsyrk(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + convertToCBLAS(args.a_transpose), + args.n, args.k, 
args.alpha, + a_mat_cpu, args.a_offset, args.a_ld, args.beta, + c_mat_cpu, args.c_offset, args.c_ld); + buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level3/xtrmm.h b/test/routines/level3/xtrmm.h index 152cdf58..a085cb15 100644 --- a/test/routines/level3/xtrmm.h +++ b/test/routines/level3/xtrmm.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -69,7 +74,7 @@ class TestXtrmm { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Trmm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal, @@ -82,21 +87,43 @@ class TestXtrmm { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXtrmm(static_cast(args.layout), - static_cast(args.side), - static_cast(args.triangle), - static_cast(args.a_transpose), - static_cast(args.diagonal), - args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, - 1, &queue_plain, 0, nullptr, &event); - 
clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXtrmm(static_cast(args.layout), + static_cast(args.side), + static_cast(args.triangle), + static_cast(args.a_transpose), + static_cast(args.diagonal), + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector b_mat_cpu(args.b_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); + cblasXtrmm(convertToCBLAS(args.layout), + convertToCBLAS(args.side), + convertToCBLAS(args.triangle), + convertToCBLAS(args.a_transpose), + convertToCBLAS(args.diagonal), + args.m, args.n, args.alpha, + a_mat_cpu, args.a_offset, args.a_ld, + b_mat_cpu, args.b_offset, args.b_ld); + buffers.b_mat.Write(queue, args.b_size, b_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/wrapper_cblas.h b/test/wrapper_cblas.h index c690a45c..dec272b0 100644 --- a/test/wrapper_cblas.h +++ b/test/wrapper_cblas.h @@ -21,6 +21,13 @@ namespace clblast { +// Conversions from CLBlast types +CBLAS_ORDER convertToCBLAS(const Layout v) { return (v == Layout::kRowMajor) ? 
CblasRowMajor : CblasColMajor; } +CBLAS_TRANSPOSE convertToCBLAS(const Transpose v) { return (v == Transpose::kNo) ? CblasNoTrans : (v == Transpose::kYes) ? CblasTrans : CblasConjTrans; } +CBLAS_UPLO convertToCBLAS(const Triangle v) { return (v == Triangle::kUpper) ? CblasUpper : CblasLower; } +CBLAS_DIAG convertToCBLAS(const Diagonal v) { return (v == Diagonal::kUnit) ? CblasUnit : CblasNonUnit; } +CBLAS_SIDE convertToCBLAS(const Side v) { return (v == Side::kLeft) ? CblasLeft : CblasRight; } + // OpenBLAS is not fully Netlib CBLAS compatible #ifdef OPENBLAS_VERSION using return_pointer_float = openblas_complex_float*; From cf841d1840593705d71325d6755fbe524f135627 Mon Sep 17 00:00:00 2001 From: cnugteren Date: Sun, 3 Apr 2016 15:51:03 -0700 Subject: [PATCH 20/60] Added support for detection of CPU BLAS libraries OpenBLAS, BLIS and Accelerate on OS X --- cmake/Modules/FindCBLAS.cmake | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/cmake/Modules/FindCBLAS.cmake b/cmake/Modules/FindCBLAS.cmake index 16dce243..86f14515 100644 --- a/cmake/Modules/FindCBLAS.cmake +++ b/cmake/Modules/FindCBLAS.cmake @@ -37,7 +37,10 @@ set(CBLAS_PATHS find_path(CBLAS_INCLUDE_DIRS NAMES cblas.h HINTS ${CBLAS_HINTS} - PATH_SUFFIXES include inc include/x86_64 include/x64 openblas/include + PATH_SUFFIXES + include inc include/x86_64 include/x64 + openblas/include include/blis blis/include blis/include/blis + Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Versions/Current/Headers PATHS ${CBLAS_PATHS} DOC "Netlib BLAS include header cblas.h" ) @@ -45,9 +48,11 @@ mark_as_advanced(CBLAS_INCLUDE_DIRS) # Finds the library find_library(CBLAS_LIBRARIES - NAMES blas openblas atlas mkl accelerate + NAMES blas mkl blis openblas atlas accelerate HINTS ${CBLAS_HINTS} - PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 lib/import lib64/import openblas/lib + PATH_SUFFIXES + lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 lib/import lib64/import + 
openblas/lib blis/lib PATHS ${CBLAS_PATHS} DOC "Netlib BLAS library" ) From c4ab9bda6321aab66e05fd3d00e7b58443c640ef Mon Sep 17 00:00:00 2001 From: cnugteren Date: Sun, 3 Apr 2016 16:07:25 -0700 Subject: [PATCH 21/60] Updated the documentation in light of the support for a reference CPU BLAS library --- CHANGELOG | 1 + README.md | 13 ++++++++++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index c52e041d..db14f037 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,7 @@ Development version (next release) - Added exports to be able to create a DLL on Windows (thanks to Marco Hutter) - Made the library thread-safe +- Performance and correctness tests can now (on top of clBLAS) be performed against CPU BLAS libraries - Fixed the use of events within the library - Added level-1 routines: * SNRM2/DNRM2/ScNRM2/DzNRM2 diff --git a/README.md b/README.md index ac614026..d69ad552 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,14 @@ The pre-requisites for compilation of CLBlast are: - Intel OpenCL - Beignet +Furthermore, to build the (optional) correctness and performance tests, another BLAS library is needed to serve as a reference. This can be either: + +* The OpenCL BLAS library [clBLAS](http://github.com/clMathLibraries/clBLAS) (maintained by AMD) +* A regular CPU Netlib BLAS library, e.g.: + - OpenBLAS + - BLIS + - Accelerate + An example of an out-of-source build (starting from the root of the CLBlast folder): mkdir build @@ -135,9 +143,9 @@ To make sure CLBlast is working correctly on your device (recommended), compile cmake -DTESTS=ON .. -Afterwards, executables in the form of `clblast_test_xxxxx` are available, in which `xxxxx` is the name of a routine (e.g. `xgemm`). Note that CLBlast is tested against [clBLAS](http://github.com/clMathLibraries/clBLAS) for correctness. The library clBLAS is therefore required to be installed on your system for the CLBlast tests. 
+Afterwards, executables in the form of `clblast_test_xxxxx` are available, in which `xxxxx` is the name of a routine (e.g. `xgemm`). Note that CLBlast is best tested against [clBLAS](http://github.com/clMathLibraries/clBLAS) for correctness. If the library clBLAS is not installed on your system, it will use a regular CPU BLAS library to test against. -With the `-DTESTS=ON` flag, additional performance tests are compiled. These come in the form of client executables named `clblast_client_xxxxx`, in which `xxxxx` is the name of a routine (e.g. `xgemm`). These clients take a bunch of configuration options and directly run both CLBlast and clBLAS in a head-to-head performance test. +With the `-DTESTS=ON` flag, additional performance tests are compiled. These come in the form of client executables named `clblast_client_xxxxx`, in which `xxxxx` is the name of a routine (e.g. `xgemm`). These clients take a bunch of configuration options and directly run CLBlast in a head-to-head performance test against clBLAS and/or a CPU BLAS library. Performance remarks @@ -249,4 +257,3 @@ To-do list before release of version 1.0 - Support all routines supported by clBLAS - Allow the user control over events and synchronization - Add half-precision routines (e.g. 
HGEMM) -- Enable correctness and performance testing against a CPU-based BLAS library From 90e237b97a42ebc71771e1d023f2f8f695c8fa59 Mon Sep 17 00:00:00 2001 From: cnugteren Date: Mon, 4 Apr 2016 08:38:31 -0700 Subject: [PATCH 22/60] Removed redundant queue synchronisation statements --- src/routines/level1/xaxpy.cc | 3 --- src/routines/level1/xcopy.cc | 3 --- src/routines/level1/xdot.cc | 3 --- src/routines/level1/xnrm2.cc | 3 --- src/routines/level1/xscal.cc | 3 --- src/routines/level1/xswap.cc | 3 --- src/routines/level2/xgemv.cc | 3 --- src/routines/level2/xger.cc | 3 --- src/routines/level2/xher.cc | 3 --- src/routines/level2/xher2.cc | 3 --- 10 files changed, 30 deletions(-) diff --git a/src/routines/level1/xaxpy.cc b/src/routines/level1/xaxpy.cc index f37a0724..c5acaf49 100644 --- a/src/routines/level1/xaxpy.cc +++ b/src/routines/level1/xaxpy.cc @@ -99,9 +99,6 @@ StatusCode Xaxpy::DoAxpy(const size_t n, const T alpha, } if (ErrorIn(status)) { return status; } - // Waits for all kernels to finish - queue_.Finish(); - // Succesfully finished the computation return StatusCode::kSuccess; } catch (...) { return StatusCode::kInvalidKernel; } diff --git a/src/routines/level1/xcopy.cc b/src/routines/level1/xcopy.cc index 2b00d43f..8c7f8671 100644 --- a/src/routines/level1/xcopy.cc +++ b/src/routines/level1/xcopy.cc @@ -97,9 +97,6 @@ StatusCode Xcopy::DoCopy(const size_t n, } if (ErrorIn(status)) { return status; } - // Waits for all kernels to finish - queue_.Finish(); - // Succesfully finished the computation return StatusCode::kSuccess; } catch (...) 
{ return StatusCode::kInvalidKernel; } diff --git a/src/routines/level1/xdot.cc b/src/routines/level1/xdot.cc index a0c1e756..e22b0f8b 100644 --- a/src/routines/level1/xdot.cc +++ b/src/routines/level1/xdot.cc @@ -95,9 +95,6 @@ StatusCode Xdot::DoDot(const size_t n, status = RunKernel(kernel2, global2, local2); if (ErrorIn(status)) { return status; } - // Waits for all kernels to finish - queue_.Finish(); - // Succesfully finished the computation return StatusCode::kSuccess; } catch (...) { return StatusCode::kInvalidKernel; } diff --git a/src/routines/level1/xnrm2.cc b/src/routines/level1/xnrm2.cc index 064e68bf..685eb29f 100644 --- a/src/routines/level1/xnrm2.cc +++ b/src/routines/level1/xnrm2.cc @@ -87,9 +87,6 @@ StatusCode Xnrm2::DoNrm2(const size_t n, status = RunKernel(kernel2, global2, local2); if (ErrorIn(status)) { return status; } - // Waits for all kernels to finish - queue_.Finish(); - // Succesfully finished the computation return StatusCode::kSuccess; } catch (...) { return StatusCode::kInvalidKernel; } diff --git a/src/routines/level1/xscal.cc b/src/routines/level1/xscal.cc index 3fc36b3d..57bbe9e8 100644 --- a/src/routines/level1/xscal.cc +++ b/src/routines/level1/xscal.cc @@ -91,9 +91,6 @@ StatusCode Xscal::DoScal(const size_t n, const T alpha, } if (ErrorIn(status)) { return status; } - // Waits for all kernels to finish - queue_.Finish(); - // Succesfully finished the computation return StatusCode::kSuccess; } catch (...) { return StatusCode::kInvalidKernel; } diff --git a/src/routines/level1/xswap.cc b/src/routines/level1/xswap.cc index 123977d3..c986b3fb 100644 --- a/src/routines/level1/xswap.cc +++ b/src/routines/level1/xswap.cc @@ -97,9 +97,6 @@ StatusCode Xswap::DoSwap(const size_t n, } if (ErrorIn(status)) { return status; } - // Waits for all kernels to finish - queue_.Finish(); - // Succesfully finished the computation return StatusCode::kSuccess; } catch (...) 
{ return StatusCode::kInvalidKernel; } diff --git a/src/routines/level2/xgemv.cc b/src/routines/level2/xgemv.cc index 79cf8248..bf7ae6fa 100644 --- a/src/routines/level2/xgemv.cc +++ b/src/routines/level2/xgemv.cc @@ -165,9 +165,6 @@ StatusCode Xgemv::MatVec(const Layout layout, const Transpose a_transpose, status = RunKernel(kernel, global, local); if (ErrorIn(status)) { return status; } - // Waits for all kernels to finish - queue_.Finish(); - // Succesfully finished the computation return StatusCode::kSuccess; } catch (...) { return StatusCode::kInvalidKernel; } diff --git a/src/routines/level2/xger.cc b/src/routines/level2/xger.cc index d8fb6b03..9ab21bfb 100644 --- a/src/routines/level2/xger.cc +++ b/src/routines/level2/xger.cc @@ -92,9 +92,6 @@ StatusCode Xger::DoGer(const Layout layout, status = RunKernel(kernel, global, local); if (ErrorIn(status)) { return status; } - // Waits for all kernels to finish - queue_.Finish(); - // Succesfully finished the computation return StatusCode::kSuccess; } catch (...) { return StatusCode::kInvalidKernel; } diff --git a/src/routines/level2/xher.cc b/src/routines/level2/xher.cc index c025117b..1aefa240 100644 --- a/src/routines/level2/xher.cc +++ b/src/routines/level2/xher.cc @@ -102,9 +102,6 @@ StatusCode Xher::DoHer(const Layout layout, const Triangle triangle, status = RunKernel(kernel, global, local); if (ErrorIn(status)) { return status; } - // Waits for all kernels to finish - queue_.Finish(); - // Succesfully finished the computation return StatusCode::kSuccess; } catch (...) 
{ return StatusCode::kInvalidKernel; } diff --git a/src/routines/level2/xher2.cc b/src/routines/level2/xher2.cc index bfa84d18..364add12 100644 --- a/src/routines/level2/xher2.cc +++ b/src/routines/level2/xher2.cc @@ -94,9 +94,6 @@ StatusCode Xher2::DoHer2(const Layout layout, const Triangle triangle, status = RunKernel(kernel, global, local); if (ErrorIn(status)) { return status; } - // Waits for all kernels to finish - queue_.Finish(); - // Succesfully finished the computation return StatusCode::kSuccess; } catch (...) { return StatusCode::kInvalidKernel; } From c2cfee76c4d8f7486d5b62b3e0a878867a32a070 Mon Sep 17 00:00:00 2001 From: cnugteren Date: Mon, 4 Apr 2016 08:39:13 -0700 Subject: [PATCH 23/60] Properly set warning flags for Clang --- CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 21254ded..a4eb5b85 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -66,9 +66,10 @@ else () set(FLAGS "${FLAGS} -Wno-attributes -Wno-unused-variable") endif() elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") - set(FLAGS "${FLAGS} -Wall -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded") + set(FLAGS "${FLAGS} -Wextra -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded") set(FLAGS "${FLAGS} -Wno-missing-prototypes -Wno-float-equal -Wno-switch-enum -Wno-switch") set(FLAGS "${FLAGS} -Wno-exit-time-destructors -Wno-global-constructors -Wno-missing-noreturn") + set(FLAGS "${FLAGS} -Wno-deprecated-declarations") endif() endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}") From 1d3d38a2618c5663bf1549b08805137fd85f2efa Mon Sep 17 00:00:00 2001 From: cnugteren Date: Sat, 9 Apr 2016 22:22:24 -0600 Subject: [PATCH 24/60] Events are now properly implemented using event waiting list and asking the user to wait for event completion --- README.md | 1 - include/internal/clpp11.h | 55 ++++++++--- include/internal/routine.h | 14 ++- include/internal/routines/level1/xaxpy.h | 3 +- 
include/internal/routines/level1/xcopy.h | 3 +- include/internal/routines/level1/xdot.h | 3 +- include/internal/routines/level1/xdotc.h | 2 +- include/internal/routines/level1/xdotu.h | 2 +- include/internal/routines/level1/xnrm2.h | 3 +- include/internal/routines/level1/xscal.h | 3 +- include/internal/routines/level1/xswap.h | 3 +- include/internal/routines/level2/xgbmv.h | 2 +- include/internal/routines/level2/xgemv.h | 3 +- include/internal/routines/level2/xger.h | 3 +- include/internal/routines/level2/xgerc.h | 2 +- include/internal/routines/level2/xgeru.h | 2 +- include/internal/routines/level2/xhbmv.h | 2 +- include/internal/routines/level2/xhemv.h | 2 +- include/internal/routines/level2/xher.h | 3 +- include/internal/routines/level2/xher2.h | 3 +- include/internal/routines/level2/xhpmv.h | 2 +- include/internal/routines/level2/xhpr.h | 2 +- include/internal/routines/level2/xhpr2.h | 2 +- include/internal/routines/level2/xsbmv.h | 2 +- include/internal/routines/level2/xspmv.h | 2 +- include/internal/routines/level2/xspr.h | 2 +- include/internal/routines/level2/xspr2.h | 2 +- include/internal/routines/level2/xsymv.h | 2 +- include/internal/routines/level2/xsyr.h | 2 +- include/internal/routines/level2/xsyr2.h | 2 +- include/internal/routines/level2/xtbmv.h | 2 +- include/internal/routines/level2/xtpmv.h | 2 +- include/internal/routines/level2/xtrmv.h | 2 +- include/internal/routines/level3/xgemm.h | 3 +- include/internal/routines/level3/xhemm.h | 2 +- include/internal/routines/level3/xher2k.h | 3 +- include/internal/routines/level3/xherk.h | 3 +- include/internal/routines/level3/xsymm.h | 2 +- include/internal/routines/level3/xsyr2k.h | 3 +- include/internal/routines/level3/xsyrk.h | 3 +- include/internal/routines/level3/xtrmm.h | 2 +- samples/sgemm.cc | 7 +- scripts/generator/generator.py | 3 +- src/clblast.cc | 114 ++++++++-------------- src/routine.cc | 31 +++--- src/routines/level1/xaxpy.cc | 6 +- src/routines/level1/xcopy.cc | 6 +- 
src/routines/level1/xdot.cc | 11 ++- src/routines/level1/xdotc.cc | 2 +- src/routines/level1/xdotu.cc | 2 +- src/routines/level1/xnrm2.cc | 10 +- src/routines/level1/xscal.cc | 6 +- src/routines/level1/xswap.cc | 6 +- src/routines/level2/xgbmv.cc | 2 +- src/routines/level2/xgemv.cc | 4 +- src/routines/level2/xger.cc | 4 +- src/routines/level2/xgerc.cc | 2 +- src/routines/level2/xgeru.cc | 2 +- src/routines/level2/xhbmv.cc | 2 +- src/routines/level2/xhemv.cc | 2 +- src/routines/level2/xher.cc | 4 +- src/routines/level2/xher2.cc | 4 +- src/routines/level2/xhpmv.cc | 2 +- src/routines/level2/xhpr.cc | 2 +- src/routines/level2/xhpr2.cc | 2 +- src/routines/level2/xsbmv.cc | 2 +- src/routines/level2/xspmv.cc | 2 +- src/routines/level2/xspr.cc | 2 +- src/routines/level2/xspr2.cc | 2 +- src/routines/level2/xsymv.cc | 2 +- src/routines/level2/xsyr.cc | 2 +- src/routines/level2/xsyr2.cc | 2 +- src/routines/level2/xtbmv.cc | 2 +- src/routines/level2/xtpmv.cc | 2 +- src/routines/level2/xtrmv.cc | 2 +- src/routines/level3/xgemm.cc | 28 ++++-- src/routines/level3/xhemm.cc | 8 +- src/routines/level3/xher2k.cc | 42 ++++++-- src/routines/level3/xherk.cc | 28 ++++-- src/routines/level3/xsymm.cc | 8 +- src/routines/level3/xsyr2k.cc | 32 ++++-- src/routines/level3/xsyrk.cc | 24 ++++- src/routines/level3/xtrmm.cc | 8 +- 83 files changed, 360 insertions(+), 235 deletions(-) diff --git a/README.md b/README.md index d69ad552..74d8c9cc 100644 --- a/README.md +++ b/README.md @@ -255,5 +255,4 @@ To-do list before release of version 1.0 ------------- - Support all routines supported by clBLAS -- Allow the user control over events and synchronization - Add half-precision routines (e.g. 
HGEMM) diff --git a/include/internal/clpp11.h b/include/internal/clpp11.h index 00905ef7..543d423a 100644 --- a/include/internal/clpp11.h +++ b/include/internal/clpp11.h @@ -73,29 +73,41 @@ class Event { public: // Constructor based on the regular OpenCL data-type - explicit Event(cl_event* event): event_(event) { } + explicit Event(const cl_event event): event_(event) { } + + // Regular constructor + explicit Event(): event_(nullptr) { } + + // Waits for completion of this event + void WaitForCompletion() const { + CheckError(clWaitForEvents(1, &event_)); + } // Retrieves the elapsed time of the last recorded event. Note that no error checking is done on // the 'clGetEventProfilingInfo' function, since there is a bug in Apple's OpenCL implementation: // http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx float GetElapsedTime() const { - CheckError(clWaitForEvents(1, event_)); + WaitForCompletion(); auto bytes = size_t{0}; - clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes); + clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes); auto time_start = size_t{0}; - clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr); - clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, 0, nullptr, &bytes); + clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr); + clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, 0, nullptr, &bytes); auto time_end = size_t{0}; - clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr); + clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr); return (time_end - time_start) * 1.0e-6f; } // Accessor to the private data-member - cl_event& operator()() { return *event_; } + cl_event& operator()() { return event_; } + cl_event* pointer() { return &event_; } private: - cl_event* event_; + cl_event event_; }; +// Pointer to an 
OpenCL event +using EventPointer = cl_event*; + // ================================================================================================= // C++11 version of 'cl_platform_id' @@ -600,17 +612,36 @@ class Kernel { // Launches a kernel onto the specified queue void Launch(const Queue &queue, const std::vector &global, - const std::vector &local, Event &event) { + const std::vector &local, EventPointer event) { CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast(global.size()), nullptr, global.data(), local.data(), - 0, nullptr, &(event()))); + 0, nullptr, event)); + } + + // As above, but with an event waiting list + void Launch(const Queue &queue, const std::vector &global, + const std::vector &local, EventPointer event, + std::vector& waitForEvents) { + if (waitForEvents.size() == 0) { return Launch(queue, global, local, event); } + + // Builds a plain version of the events waiting list + auto waitForEventsPlain = std::vector(); + for (auto &waitEvent : waitForEvents) { + waitForEventsPlain.push_back(waitEvent()); + } + + // Launches the kernel while waiting for other events + CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast(global.size()), + nullptr, global.data(), local.data(), + waitForEventsPlain.size(), waitForEventsPlain.data(), + event)); } // As above, but with the default local workgroup size - void Launch(const Queue &queue, const std::vector &global, Event &event) { + void Launch(const Queue &queue, const std::vector &global, EventPointer event) { CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast(global.size()), nullptr, global.data(), nullptr, - 0, nullptr, &(event()))); + 0, nullptr, event)); } // Accessor to the private data-member diff --git a/include/internal/routine.h b/include/internal/routine.h index 5f5b8211..b2b6f622 100644 --- a/include/internal/routine.h +++ b/include/internal/routine.h @@ -55,7 +55,7 @@ class Routine { static constexpr bool ErrorIn(const StatusCode s) { return (s != 
StatusCode::kSuccess); } // Base class constructor - explicit Routine(Queue &queue, Event &event, const std::string &name, + explicit Routine(Queue &queue, EventPointer event, const std::string &name, const std::vector &routines, const Precision precision); // Set-up phase of the kernel @@ -65,7 +65,12 @@ class Routine { // Runs a kernel given the global and local thread sizes StatusCode RunKernel(Kernel &kernel, std::vector &global, - const std::vector &local); + const std::vector &local, EventPointer event, + std::vector& waitForEvents); + + // As above, but without an event waiting list + StatusCode RunKernel(Kernel &kernel, std::vector &global, + const std::vector &local, EventPointer event); // Tests for valid inputs of matrices A, B, and C StatusCode TestMatrixA(const size_t one, const size_t two, const Buffer &buffer, @@ -87,7 +92,8 @@ class Routine { // Copies/transposes a matrix and padds/unpads it with zeroes. This method is also able to write // to symmetric and triangular matrices through optional arguments. 
- StatusCode PadCopyTransposeMatrix(const size_t src_one, const size_t src_two, + StatusCode PadCopyTransposeMatrix(EventPointer event, std::vector& waitForEvents, + const size_t src_one, const size_t src_two, const size_t src_ld, const size_t src_offset, const Buffer &src, const size_t dest_one, const size_t dest_two, @@ -114,7 +120,7 @@ class Routine { // The OpenCL objects, accessible only from derived classes Queue queue_; - Event event_; + EventPointer event_; const Context context_; const Device device_; diff --git a/include/internal/routines/level1/xaxpy.h b/include/internal/routines/level1/xaxpy.h index 689cf169..bc00c8e3 100644 --- a/include/internal/routines/level1/xaxpy.h +++ b/include/internal/routines/level1/xaxpy.h @@ -28,6 +28,7 @@ class Xaxpy: public Routine { using Routine::db_; using Routine::source_string_; using Routine::queue_; + using Routine::event_; using Routine::GetProgramFromCache; using Routine::TestVectorX; using Routine::TestVectorY; @@ -35,7 +36,7 @@ class Xaxpy: public Routine { using Routine::ErrorIn; // Constructor - Xaxpy(Queue &queue, Event &event, const std::string &name = "AXPY"); + Xaxpy(Queue &queue, EventPointer event, const std::string &name = "AXPY"); // Templated-precision implementation of the routine StatusCode DoAxpy(const size_t n, const T alpha, diff --git a/include/internal/routines/level1/xcopy.h b/include/internal/routines/level1/xcopy.h index 15f339aa..5786cb0f 100644 --- a/include/internal/routines/level1/xcopy.h +++ b/include/internal/routines/level1/xcopy.h @@ -28,6 +28,7 @@ class Xcopy: public Routine { using Routine::db_; using Routine::source_string_; using Routine::queue_; + using Routine::event_; using Routine::GetProgramFromCache; using Routine::TestVectorX; using Routine::TestVectorY; @@ -35,7 +36,7 @@ class Xcopy: public Routine { using Routine::ErrorIn; // Constructor - Xcopy(Queue &queue, Event &event, const std::string &name = "COPY"); + Xcopy(Queue &queue, EventPointer event, const std::string 
&name = "COPY"); // Templated-precision implementation of the routine StatusCode DoCopy(const size_t n, diff --git a/include/internal/routines/level1/xdot.h b/include/internal/routines/level1/xdot.h index 64b62945..95a7ad07 100644 --- a/include/internal/routines/level1/xdot.h +++ b/include/internal/routines/level1/xdot.h @@ -28,6 +28,7 @@ class Xdot: public Routine { using Routine::db_; using Routine::source_string_; using Routine::queue_; + using Routine::event_; using Routine::context_; using Routine::GetProgramFromCache; using Routine::TestVectorX; @@ -37,7 +38,7 @@ class Xdot: public Routine { using Routine::ErrorIn; // Constructor - Xdot(Queue &queue, Event &event, const std::string &name = "DOT"); + Xdot(Queue &queue, EventPointer event, const std::string &name = "DOT"); // Templated-precision implementation of the routine StatusCode DoDot(const size_t n, diff --git a/include/internal/routines/level1/xdotc.h b/include/internal/routines/level1/xdotc.h index 726cec7c..0dc2cfe9 100644 --- a/include/internal/routines/level1/xdotc.h +++ b/include/internal/routines/level1/xdotc.h @@ -28,7 +28,7 @@ class Xdotc: public Xdot { using Xdot::DoDot; // Constructor - Xdotc(Queue &queue, Event &event, const std::string &name = "DOTC"); + Xdotc(Queue &queue, EventPointer event, const std::string &name = "DOTC"); // Templated-precision implementation of the routine StatusCode DoDotc(const size_t n, diff --git a/include/internal/routines/level1/xdotu.h b/include/internal/routines/level1/xdotu.h index 825ebb78..98988744 100644 --- a/include/internal/routines/level1/xdotu.h +++ b/include/internal/routines/level1/xdotu.h @@ -28,7 +28,7 @@ class Xdotu: public Xdot { using Xdot::DoDot; // Constructor - Xdotu(Queue &queue, Event &event, const std::string &name = "DOTU"); + Xdotu(Queue &queue, EventPointer event, const std::string &name = "DOTU"); // Templated-precision implementation of the routine StatusCode DoDotu(const size_t n, diff --git 
a/include/internal/routines/level1/xnrm2.h b/include/internal/routines/level1/xnrm2.h index b3fffef6..6f6ca74f 100644 --- a/include/internal/routines/level1/xnrm2.h +++ b/include/internal/routines/level1/xnrm2.h @@ -28,6 +28,7 @@ class Xnrm2: public Routine { using Routine::db_; using Routine::source_string_; using Routine::queue_; + using Routine::event_; using Routine::context_; using Routine::GetProgramFromCache; using Routine::TestVectorX; @@ -36,7 +37,7 @@ class Xnrm2: public Routine { using Routine::ErrorIn; // Constructor - Xnrm2(Queue &queue, Event &event, const std::string &name = "NRM2"); + Xnrm2(Queue &queue, EventPointer event, const std::string &name = "NRM2"); // Templated-precision implementation of the routine StatusCode DoNrm2(const size_t n, diff --git a/include/internal/routines/level1/xscal.h b/include/internal/routines/level1/xscal.h index d97b5a07..e10a201d 100644 --- a/include/internal/routines/level1/xscal.h +++ b/include/internal/routines/level1/xscal.h @@ -28,13 +28,14 @@ class Xscal: public Routine { using Routine::db_; using Routine::source_string_; using Routine::queue_; + using Routine::event_; using Routine::GetProgramFromCache; using Routine::TestVectorX; using Routine::RunKernel; using Routine::ErrorIn; // Constructor - Xscal(Queue &queue, Event &event, const std::string &name = "SCAL"); + Xscal(Queue &queue, EventPointer event, const std::string &name = "SCAL"); // Templated-precision implementation of the routine StatusCode DoScal(const size_t n, const T alpha, diff --git a/include/internal/routines/level1/xswap.h b/include/internal/routines/level1/xswap.h index fe79882b..0f240763 100644 --- a/include/internal/routines/level1/xswap.h +++ b/include/internal/routines/level1/xswap.h @@ -28,6 +28,7 @@ class Xswap: public Routine { using Routine::db_; using Routine::source_string_; using Routine::queue_; + using Routine::event_; using Routine::GetProgramFromCache; using Routine::TestVectorX; using Routine::TestVectorY; @@ -35,7 +36,7 
@@ class Xswap: public Routine { using Routine::ErrorIn; // Constructor - Xswap(Queue &queue, Event &event, const std::string &name = "SWAP"); + Xswap(Queue &queue, EventPointer event, const std::string &name = "SWAP"); // Templated-precision implementation of the routine StatusCode DoSwap(const size_t n, diff --git a/include/internal/routines/level2/xgbmv.h b/include/internal/routines/level2/xgbmv.h index 27b033e9..bc94c77d 100644 --- a/include/internal/routines/level2/xgbmv.h +++ b/include/internal/routines/level2/xgbmv.h @@ -30,7 +30,7 @@ class Xgbmv: public Xgemv { using Xgemv::MatVec; // Constructor - Xgbmv(Queue &queue, Event &event, const std::string &name = "GBMV"); + Xgbmv(Queue &queue, EventPointer event, const std::string &name = "GBMV"); // Templated-precision implementation of the routine StatusCode DoGbmv(const Layout layout, const Transpose a_transpose, diff --git a/include/internal/routines/level2/xgemv.h b/include/internal/routines/level2/xgemv.h index b31565ec..0b2a8e66 100644 --- a/include/internal/routines/level2/xgemv.h +++ b/include/internal/routines/level2/xgemv.h @@ -28,6 +28,7 @@ class Xgemv: public Routine { using Routine::db_; using Routine::source_string_; using Routine::queue_; + using Routine::event_; using Routine::GetProgramFromCache; using Routine::TestVectorX; using Routine::TestVectorY; @@ -37,7 +38,7 @@ class Xgemv: public Routine { using Routine::ErrorIn; // Constructor - Xgemv(Queue &queue, Event &event, const std::string &name = "GEMV"); + Xgemv(Queue &queue, EventPointer event, const std::string &name = "GEMV"); // Templated-precision implementation of the routine StatusCode DoGemv(const Layout layout, const Transpose a_transpose, diff --git a/include/internal/routines/level2/xger.h b/include/internal/routines/level2/xger.h index 45ecea10..5ace9da6 100644 --- a/include/internal/routines/level2/xger.h +++ b/include/internal/routines/level2/xger.h @@ -28,6 +28,7 @@ class Xger: public Routine { using Routine::db_; using 
Routine::source_string_; using Routine::queue_; + using Routine::event_; using Routine::GetProgramFromCache; using Routine::TestVectorX; using Routine::TestVectorY; @@ -36,7 +37,7 @@ class Xger: public Routine { using Routine::ErrorIn; // Constructor - Xger(Queue &queue, Event &event, const std::string &name = "GER"); + Xger(Queue &queue, EventPointer event, const std::string &name = "GER"); // Templated-precision implementation of the routine StatusCode DoGer(const Layout layout, diff --git a/include/internal/routines/level2/xgerc.h b/include/internal/routines/level2/xgerc.h index 8e515a14..6d06ef94 100644 --- a/include/internal/routines/level2/xgerc.h +++ b/include/internal/routines/level2/xgerc.h @@ -28,7 +28,7 @@ class Xgerc: public Xger { using Xger::DoGer; // Constructor - Xgerc(Queue &queue, Event &event, const std::string &name = "GERC"); + Xgerc(Queue &queue, EventPointer event, const std::string &name = "GERC"); // Templated-precision implementation of the routine StatusCode DoGerc(const Layout layout, diff --git a/include/internal/routines/level2/xgeru.h b/include/internal/routines/level2/xgeru.h index ec485c37..45ce1cba 100644 --- a/include/internal/routines/level2/xgeru.h +++ b/include/internal/routines/level2/xgeru.h @@ -28,7 +28,7 @@ class Xgeru: public Xger { using Xger::DoGer; // Constructor - Xgeru(Queue &queue, Event &event, const std::string &name = "GERU"); + Xgeru(Queue &queue, EventPointer event, const std::string &name = "GERU"); // Templated-precision implementation of the routine StatusCode DoGeru(const Layout layout, diff --git a/include/internal/routines/level2/xhbmv.h b/include/internal/routines/level2/xhbmv.h index 65138424..f0a6212c 100644 --- a/include/internal/routines/level2/xhbmv.h +++ b/include/internal/routines/level2/xhbmv.h @@ -30,7 +30,7 @@ class Xhbmv: public Xgemv { using Xgemv::MatVec; // Constructor - Xhbmv(Queue &queue, Event &event, const std::string &name = "HBMV"); + Xhbmv(Queue &queue, EventPointer event, const 
std::string &name = "HBMV"); // Templated-precision implementation of the routine StatusCode DoHbmv(const Layout layout, const Triangle triangle, diff --git a/include/internal/routines/level2/xhemv.h b/include/internal/routines/level2/xhemv.h index b74db760..3daf2457 100644 --- a/include/internal/routines/level2/xhemv.h +++ b/include/internal/routines/level2/xhemv.h @@ -30,7 +30,7 @@ class Xhemv: public Xgemv { using Xgemv::MatVec; // Constructor - Xhemv(Queue &queue, Event &event, const std::string &name = "HEMV"); + Xhemv(Queue &queue, EventPointer event, const std::string &name = "HEMV"); // Templated-precision implementation of the routine StatusCode DoHemv(const Layout layout, const Triangle triangle, diff --git a/include/internal/routines/level2/xher.h b/include/internal/routines/level2/xher.h index 6322265b..861ba302 100644 --- a/include/internal/routines/level2/xher.h +++ b/include/internal/routines/level2/xher.h @@ -28,6 +28,7 @@ class Xher: public Routine { using Routine::db_; using Routine::source_string_; using Routine::queue_; + using Routine::event_; using Routine::GetProgramFromCache; using Routine::TestVectorX; using Routine::TestMatrixA; @@ -36,7 +37,7 @@ class Xher: public Routine { using Routine::ErrorIn; // Constructor - Xher(Queue &queue, Event &event, const std::string &name = "HER"); + Xher(Queue &queue, EventPointer event, const std::string &name = "HER"); // Translates alpha of type 'U' into type 'T' T GetAlpha(const U alpha); diff --git a/include/internal/routines/level2/xher2.h b/include/internal/routines/level2/xher2.h index 26f69046..9a23199e 100644 --- a/include/internal/routines/level2/xher2.h +++ b/include/internal/routines/level2/xher2.h @@ -28,6 +28,7 @@ class Xher2: public Routine { using Routine::db_; using Routine::source_string_; using Routine::queue_; + using Routine::event_; using Routine::GetProgramFromCache; using Routine::TestVectorX; using Routine::TestVectorY; @@ -37,7 +38,7 @@ class Xher2: public Routine { using 
Routine::ErrorIn; // Constructor - Xher2(Queue &queue, Event &event, const std::string &name = "HER2"); + Xher2(Queue &queue, EventPointer event, const std::string &name = "HER2"); // Templated-precision implementation of the routine StatusCode DoHer2(const Layout layout, const Triangle triangle, diff --git a/include/internal/routines/level2/xhpmv.h b/include/internal/routines/level2/xhpmv.h index 48f1ed3f..a1d5595a 100644 --- a/include/internal/routines/level2/xhpmv.h +++ b/include/internal/routines/level2/xhpmv.h @@ -30,7 +30,7 @@ class Xhpmv: public Xgemv { using Xgemv::MatVec; // Constructor - Xhpmv(Queue &queue, Event &event, const std::string &name = "HPMV"); + Xhpmv(Queue &queue, EventPointer event, const std::string &name = "HPMV"); // Templated-precision implementation of the routine StatusCode DoHpmv(const Layout layout, const Triangle triangle, diff --git a/include/internal/routines/level2/xhpr.h b/include/internal/routines/level2/xhpr.h index a0c3cb92..6554d74c 100644 --- a/include/internal/routines/level2/xhpr.h +++ b/include/internal/routines/level2/xhpr.h @@ -28,7 +28,7 @@ class Xhpr: public Xher { using Xher::DoHer; // Constructor - Xhpr(Queue &queue, Event &event, const std::string &name = "HPR"); + Xhpr(Queue &queue, EventPointer event, const std::string &name = "HPR"); // Templated-precision implementation of the routine StatusCode DoHpr(const Layout layout, const Triangle triangle, diff --git a/include/internal/routines/level2/xhpr2.h b/include/internal/routines/level2/xhpr2.h index fd243d33..d95e7b61 100644 --- a/include/internal/routines/level2/xhpr2.h +++ b/include/internal/routines/level2/xhpr2.h @@ -28,7 +28,7 @@ class Xhpr2: public Xher2 { using Xher2::DoHer2; // Constructor - Xhpr2(Queue &queue, Event &event, const std::string &name = "HPR2"); + Xhpr2(Queue &queue, EventPointer event, const std::string &name = "HPR2"); // Templated-precision implementation of the routine StatusCode DoHpr2(const Layout layout, const Triangle triangle, diff 
--git a/include/internal/routines/level2/xsbmv.h b/include/internal/routines/level2/xsbmv.h index bb24d8f4..4328e377 100644 --- a/include/internal/routines/level2/xsbmv.h +++ b/include/internal/routines/level2/xsbmv.h @@ -30,7 +30,7 @@ class Xsbmv: public Xgemv { using Xgemv::MatVec; // Constructor - Xsbmv(Queue &queue, Event &event, const std::string &name = "SBMV"); + Xsbmv(Queue &queue, EventPointer event, const std::string &name = "SBMV"); // Templated-precision implementation of the routine StatusCode DoSbmv(const Layout layout, const Triangle triangle, diff --git a/include/internal/routines/level2/xspmv.h b/include/internal/routines/level2/xspmv.h index 88f02a2f..ca3e28b6 100644 --- a/include/internal/routines/level2/xspmv.h +++ b/include/internal/routines/level2/xspmv.h @@ -30,7 +30,7 @@ class Xspmv: public Xgemv { using Xgemv::MatVec; // Constructor - Xspmv(Queue &queue, Event &event, const std::string &name = "SPMV"); + Xspmv(Queue &queue, EventPointer event, const std::string &name = "SPMV"); // Templated-precision implementation of the routine StatusCode DoSpmv(const Layout layout, const Triangle triangle, diff --git a/include/internal/routines/level2/xspr.h b/include/internal/routines/level2/xspr.h index 5b01d2cb..7e91abc5 100644 --- a/include/internal/routines/level2/xspr.h +++ b/include/internal/routines/level2/xspr.h @@ -28,7 +28,7 @@ class Xspr: public Xher { using Xher::DoHer; // Constructor - Xspr(Queue &queue, Event &event, const std::string &name = "SPR"); + Xspr(Queue &queue, EventPointer event, const std::string &name = "SPR"); // Templated-precision implementation of the routine StatusCode DoSpr(const Layout layout, const Triangle triangle, diff --git a/include/internal/routines/level2/xspr2.h b/include/internal/routines/level2/xspr2.h index 3d5f4992..a34be8e8 100644 --- a/include/internal/routines/level2/xspr2.h +++ b/include/internal/routines/level2/xspr2.h @@ -28,7 +28,7 @@ class Xspr2: public Xher2 { using Xher2::DoHer2; // Constructor - 
Xspr2(Queue &queue, Event &event, const std::string &name = "SPR2"); + Xspr2(Queue &queue, EventPointer event, const std::string &name = "SPR2"); // Templated-precision implementation of the routine StatusCode DoSpr2(const Layout layout, const Triangle triangle, diff --git a/include/internal/routines/level2/xsymv.h b/include/internal/routines/level2/xsymv.h index c7b92702..98a0ce88 100644 --- a/include/internal/routines/level2/xsymv.h +++ b/include/internal/routines/level2/xsymv.h @@ -30,7 +30,7 @@ class Xsymv: public Xgemv { using Xgemv::MatVec; // Constructor - Xsymv(Queue &queue, Event &event, const std::string &name = "SYMV"); + Xsymv(Queue &queue, EventPointer event, const std::string &name = "SYMV"); // Templated-precision implementation of the routine StatusCode DoSymv(const Layout layout, const Triangle triangle, diff --git a/include/internal/routines/level2/xsyr.h b/include/internal/routines/level2/xsyr.h index 9704a881..f88498ae 100644 --- a/include/internal/routines/level2/xsyr.h +++ b/include/internal/routines/level2/xsyr.h @@ -28,7 +28,7 @@ class Xsyr: public Xher { using Xher::DoHer; // Constructor - Xsyr(Queue &queue, Event &event, const std::string &name = "SYR"); + Xsyr(Queue &queue, EventPointer event, const std::string &name = "SYR"); // Templated-precision implementation of the routine StatusCode DoSyr(const Layout layout, const Triangle triangle, diff --git a/include/internal/routines/level2/xsyr2.h b/include/internal/routines/level2/xsyr2.h index f4dc9375..d2d3143a 100644 --- a/include/internal/routines/level2/xsyr2.h +++ b/include/internal/routines/level2/xsyr2.h @@ -28,7 +28,7 @@ class Xsyr2: public Xher2 { using Xher2::DoHer2; // Constructor - Xsyr2(Queue &queue, Event &event, const std::string &name = "SYR2"); + Xsyr2(Queue &queue, EventPointer event, const std::string &name = "SYR2"); // Templated-precision implementation of the routine StatusCode DoSyr2(const Layout layout, const Triangle triangle, diff --git 
a/include/internal/routines/level2/xtbmv.h b/include/internal/routines/level2/xtbmv.h index 89c90193..3b358080 100644 --- a/include/internal/routines/level2/xtbmv.h +++ b/include/internal/routines/level2/xtbmv.h @@ -34,7 +34,7 @@ class Xtbmv: public Xgemv { using Xgemv::MatVec; // Constructor - Xtbmv(Queue &queue, Event &event, const std::string &name = "TBMV"); + Xtbmv(Queue &queue, EventPointer event, const std::string &name = "TBMV"); // Templated-precision implementation of the routine StatusCode DoTbmv(const Layout layout, const Triangle triangle, diff --git a/include/internal/routines/level2/xtpmv.h b/include/internal/routines/level2/xtpmv.h index 183d3505..f306cf4a 100644 --- a/include/internal/routines/level2/xtpmv.h +++ b/include/internal/routines/level2/xtpmv.h @@ -34,7 +34,7 @@ class Xtpmv: public Xgemv { using Xgemv::MatVec; // Constructor - Xtpmv(Queue &queue, Event &event, const std::string &name = "TPMV"); + Xtpmv(Queue &queue, EventPointer event, const std::string &name = "TPMV"); // Templated-precision implementation of the routine StatusCode DoTpmv(const Layout layout, const Triangle triangle, diff --git a/include/internal/routines/level2/xtrmv.h b/include/internal/routines/level2/xtrmv.h index dadfbc98..cf0824a4 100644 --- a/include/internal/routines/level2/xtrmv.h +++ b/include/internal/routines/level2/xtrmv.h @@ -34,7 +34,7 @@ class Xtrmv: public Xgemv { using Xgemv::MatVec; // Constructor - Xtrmv(Queue &queue, Event &event, const std::string &name = "TRMV"); + Xtrmv(Queue &queue, EventPointer event, const std::string &name = "TRMV"); // Templated-precision implementation of the routine StatusCode DoTrmv(const Layout layout, const Triangle triangle, diff --git a/include/internal/routines/level3/xgemm.h b/include/internal/routines/level3/xgemm.h index 9b40a7fc..85fb0616 100644 --- a/include/internal/routines/level3/xgemm.h +++ b/include/internal/routines/level3/xgemm.h @@ -28,6 +28,7 @@ class Xgemm: public Routine { using Routine::db_; using 
Routine::source_string_; using Routine::queue_; + using Routine::event_; using Routine::context_; using Routine::GetProgramFromCache; using Routine::PadCopyTransposeMatrix; @@ -38,7 +39,7 @@ class Xgemm: public Routine { using Routine::ErrorIn; // Constructor - Xgemm(Queue &queue, Event &event, const std::string &name = "GEMM"); + Xgemm(Queue &queue, EventPointer event, const std::string &name = "GEMM"); // Templated-precision implementation of the routine StatusCode DoGemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, diff --git a/include/internal/routines/level3/xhemm.h b/include/internal/routines/level3/xhemm.h index ca38ca08..ec42b569 100644 --- a/include/internal/routines/level3/xhemm.h +++ b/include/internal/routines/level3/xhemm.h @@ -37,7 +37,7 @@ class Xhemm: public Xgemm { using Xgemm::DoGemm; // Constructor - Xhemm(Queue &queue, Event &event, const std::string &name = "HEMM"); + Xhemm(Queue &queue, EventPointer event, const std::string &name = "HEMM"); // Templated-precision implementation of the routine StatusCode DoHemm(const Layout layout, const Side side, const Triangle triangle, diff --git a/include/internal/routines/level3/xher2k.h b/include/internal/routines/level3/xher2k.h index 7113a172..623afd49 100644 --- a/include/internal/routines/level3/xher2k.h +++ b/include/internal/routines/level3/xher2k.h @@ -30,6 +30,7 @@ class Xher2k: public Routine { using Routine::db_; using Routine::source_string_; using Routine::queue_; + using Routine::event_; using Routine::context_; using Routine::GetProgramFromCache; using Routine::PadCopyTransposeMatrix; @@ -40,7 +41,7 @@ class Xher2k: public Routine { using Routine::ErrorIn; // Constructor - Xher2k(Queue &queue, Event &event, const std::string &name = "HER2K"); + Xher2k(Queue &queue, EventPointer event, const std::string &name = "HER2K"); // Templated-precision implementation of the routine StatusCode DoHer2k(const Layout layout, const Triangle triangle, const Transpose 
ab_transpose, diff --git a/include/internal/routines/level3/xherk.h b/include/internal/routines/level3/xherk.h index 47112c2c..629695ff 100644 --- a/include/internal/routines/level3/xherk.h +++ b/include/internal/routines/level3/xherk.h @@ -30,6 +30,7 @@ class Xherk: public Routine { using Routine::db_; using Routine::source_string_; using Routine::queue_; + using Routine::event_; using Routine::context_; using Routine::GetProgramFromCache; using Routine::PadCopyTransposeMatrix; @@ -39,7 +40,7 @@ class Xherk: public Routine { using Routine::ErrorIn; // Constructor - Xherk(Queue &queue, Event &event, const std::string &name = "HERK"); + Xherk(Queue &queue, EventPointer event, const std::string &name = "HERK"); // Templated-precision implementation of the routine StatusCode DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose, diff --git a/include/internal/routines/level3/xsymm.h b/include/internal/routines/level3/xsymm.h index 9fc80eb4..16ad6f53 100644 --- a/include/internal/routines/level3/xsymm.h +++ b/include/internal/routines/level3/xsymm.h @@ -39,7 +39,7 @@ class Xsymm: public Xgemm { using Xgemm::DoGemm; // Constructor - Xsymm(Queue &queue, Event &event, const std::string &name = "SYMM"); + Xsymm(Queue &queue, EventPointer event, const std::string &name = "SYMM"); // Templated-precision implementation of the routine StatusCode DoSymm(const Layout layout, const Side side, const Triangle triangle, diff --git a/include/internal/routines/level3/xsyr2k.h b/include/internal/routines/level3/xsyr2k.h index c4679028..88669626 100644 --- a/include/internal/routines/level3/xsyr2k.h +++ b/include/internal/routines/level3/xsyr2k.h @@ -30,6 +30,7 @@ class Xsyr2k: public Routine { using Routine::db_; using Routine::source_string_; using Routine::queue_; + using Routine::event_; using Routine::context_; using Routine::GetProgramFromCache; using Routine::PadCopyTransposeMatrix; @@ -40,7 +41,7 @@ class Xsyr2k: public Routine { using Routine::ErrorIn; 
// Constructor - Xsyr2k(Queue &queue, Event &event, const std::string &name = "SYR2K"); + Xsyr2k(Queue &queue, EventPointer event, const std::string &name = "SYR2K"); // Templated-precision implementation of the routine StatusCode DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, diff --git a/include/internal/routines/level3/xsyrk.h b/include/internal/routines/level3/xsyrk.h index abf6b681..e95c7c1c 100644 --- a/include/internal/routines/level3/xsyrk.h +++ b/include/internal/routines/level3/xsyrk.h @@ -32,6 +32,7 @@ class Xsyrk: public Routine { using Routine::db_; using Routine::source_string_; using Routine::queue_; + using Routine::event_; using Routine::context_; using Routine::GetProgramFromCache; using Routine::PadCopyTransposeMatrix; @@ -41,7 +42,7 @@ class Xsyrk: public Routine { using Routine::ErrorIn; // Constructor - Xsyrk(Queue &queue, Event &event, const std::string &name = "SYRK"); + Xsyrk(Queue &queue, EventPointer event, const std::string &name = "SYRK"); // Templated-precision implementation of the routine StatusCode DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, diff --git a/include/internal/routines/level3/xtrmm.h b/include/internal/routines/level3/xtrmm.h index a1f4d15c..01f6594d 100644 --- a/include/internal/routines/level3/xtrmm.h +++ b/include/internal/routines/level3/xtrmm.h @@ -38,7 +38,7 @@ class Xtrmm: public Xgemm { using Xgemm::DoGemm; // Constructor - Xtrmm(Queue &queue, Event &event, const std::string &name = "TRMM"); + Xtrmm(Queue &queue, EventPointer event, const std::string &name = "TRMM"); // Templated-precision implementation of the routine StatusCode DoTrmm(const Layout layout, const Side side, const Triangle triangle, diff --git a/samples/sgemm.cc b/samples/sgemm.cc index 785b051c..78f2dee8 100644 --- a/samples/sgemm.cc +++ b/samples/sgemm.cc @@ -61,7 +61,7 @@ int main() { // Creates the OpenCL context, queue, and an event auto context = cl::Context({device}); 
auto queue = cl::CommandQueue(context, device); - auto event = cl::Event(); + auto event = cl_event{nullptr}; // Populate host matrices with some example data auto host_a = std::vector(m*k); @@ -84,7 +84,6 @@ int main() { // Call the SGEMM routine. Note that the type of alpha and beta (float) determine the precision. auto queue_plain = queue(); - auto event_plain = event(); auto status = Gemm(clblast::Layout::kRowMajor, clblast::Transpose::kNo, clblast::Transpose::kNo, m, n, k, @@ -93,10 +92,10 @@ int main() { device_b(), 0, b_ld, beta, device_c(), 0, c_ld, - &queue_plain, &event_plain); + &queue_plain, &event); // Record the execution time - event.wait(); + clWaitForEvents(1, &event); auto elapsed_time = std::chrono::steady_clock::now() - start_time; auto time_ms = std::chrono::duration(elapsed_time).count(); diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index bdf6b9d7..8cd35f95 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -169,8 +169,7 @@ def clblast_cc(routines): if routine.implemented: result += routine.RoutineHeaderCPP(12, "")+" {\n" result += " auto queue_cpp = Queue(*queue);\n" - result += " auto event_cpp = Event(event);\n" - result += " auto routine = X"+routine.name+"<"+routine.template.template+">(queue_cpp, event_cpp);\n" + result += " auto routine = X"+routine.name+"<"+routine.template.template+">(queue_cpp, event);\n" result += " auto status = routine.SetUp();\n" result += " if (status != StatusCode::kSuccess) { return status; }\n" result += " return routine.Do"+routine.name.capitalize()+"(" diff --git a/src/clblast.cc b/src/clblast.cc index 75893ee9..4f4b6078 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -161,8 +161,7 @@ StatusCode Swap(const size_t n, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xswap(queue_cpp, event_cpp); + auto 
routine = Xswap(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoSwap(n, @@ -193,8 +192,7 @@ StatusCode Scal(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xscal(queue_cpp, event_cpp); + auto routine = Xscal(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoScal(n, @@ -225,8 +223,7 @@ StatusCode Copy(const size_t n, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xcopy(queue_cpp, event_cpp); + auto routine = Xcopy(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoCopy(n, @@ -258,8 +255,7 @@ StatusCode Axpy(const size_t n, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xaxpy(queue_cpp, event_cpp); + auto routine = Xaxpy(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoAxpy(n, @@ -296,8 +292,7 @@ StatusCode Dot(const size_t n, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xdot(queue_cpp, event_cpp); + auto routine = Xdot(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoDot(n, @@ -324,8 +319,7 @@ StatusCode Dotu(const size_t n, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto 
queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xdotu(queue_cpp, event_cpp); + auto routine = Xdotu(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoDotu(n, @@ -352,8 +346,7 @@ StatusCode Dotc(const size_t n, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xdotc(queue_cpp, event_cpp); + auto routine = Xdotc(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoDotc(n, @@ -379,8 +372,7 @@ StatusCode Nrm2(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xnrm2(queue_cpp, event_cpp); + auto routine = Xnrm2(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoNrm2(n, @@ -419,8 +411,7 @@ StatusCode Gemv(const Layout layout, const Transpose a_transpose, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xgemv(queue_cpp, event_cpp); + auto routine = Xgemv(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoGemv(layout, a_transpose, @@ -475,8 +466,7 @@ StatusCode Gbmv(const Layout layout, const Transpose a_transpose, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xgbmv(queue_cpp, event_cpp); + auto routine = Xgbmv(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { 
return status; } return routine.DoGbmv(layout, a_transpose, @@ -531,8 +521,7 @@ StatusCode Hemv(const Layout layout, const Triangle triangle, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xhemv(queue_cpp, event_cpp); + auto routine = Xhemv(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoHemv(layout, triangle, @@ -571,8 +560,7 @@ StatusCode Hbmv(const Layout layout, const Triangle triangle, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xhbmv(queue_cpp, event_cpp); + auto routine = Xhbmv(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoHbmv(layout, triangle, @@ -611,8 +599,7 @@ StatusCode Hpmv(const Layout layout, const Triangle triangle, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xhpmv(queue_cpp, event_cpp); + auto routine = Xhpmv(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoHpmv(layout, triangle, @@ -651,8 +638,7 @@ StatusCode Symv(const Layout layout, const Triangle triangle, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xsymv(queue_cpp, event_cpp); + auto routine = Xsymv(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoSymv(layout, triangle, @@ -691,8 +677,7 @@ StatusCode Sbmv(const Layout layout, const Triangle 
triangle, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xsbmv(queue_cpp, event_cpp); + auto routine = Xsbmv(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoSbmv(layout, triangle, @@ -731,8 +716,7 @@ StatusCode Spmv(const Layout layout, const Triangle triangle, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xspmv(queue_cpp, event_cpp); + auto routine = Xspmv(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoSpmv(layout, triangle, @@ -768,8 +752,7 @@ StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_ cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xtrmv(queue_cpp, event_cpp); + auto routine = Xtrmv(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoTrmv(layout, triangle, a_transpose, diagonal, @@ -806,8 +789,7 @@ StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_ cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xtbmv(queue_cpp, event_cpp); + auto routine = Xtbmv(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoTbmv(layout, triangle, a_transpose, diagonal, @@ -844,8 +826,7 @@ StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_ cl_mem x_buffer, const 
size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xtpmv(queue_cpp, event_cpp); + auto routine = Xtpmv(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoTpmv(layout, triangle, a_transpose, diagonal, @@ -974,8 +955,7 @@ StatusCode Ger(const Layout layout, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xger(queue_cpp, event_cpp); + auto routine = Xger(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoGer(layout, @@ -1010,8 +990,7 @@ StatusCode Geru(const Layout layout, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xgeru(queue_cpp, event_cpp); + auto routine = Xgeru(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoGeru(layout, @@ -1046,8 +1025,7 @@ StatusCode Gerc(const Layout layout, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xgerc(queue_cpp, event_cpp); + auto routine = Xgerc(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoGerc(layout, @@ -1081,8 +1059,7 @@ StatusCode Her(const Layout layout, const Triangle triangle, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xher,T>(queue_cpp, event_cpp); + auto routine = 
Xher,T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoHer(layout, triangle, @@ -1113,8 +1090,7 @@ StatusCode Hpr(const Layout layout, const Triangle triangle, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xhpr,T>(queue_cpp, event_cpp); + auto routine = Xhpr,T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoHpr(layout, triangle, @@ -1146,8 +1122,7 @@ StatusCode Her2(const Layout layout, const Triangle triangle, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xher2(queue_cpp, event_cpp); + auto routine = Xher2(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoHer2(layout, triangle, @@ -1182,8 +1157,7 @@ StatusCode Hpr2(const Layout layout, const Triangle triangle, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xhpr2(queue_cpp, event_cpp); + auto routine = Xhpr2(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoHpr2(layout, triangle, @@ -1217,8 +1191,7 @@ StatusCode Syr(const Layout layout, const Triangle triangle, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xsyr(queue_cpp, event_cpp); + auto routine = Xsyr(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoSyr(layout, triangle, @@ -1249,8 +1222,7 
@@ StatusCode Spr(const Layout layout, const Triangle triangle, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xspr(queue_cpp, event_cpp); + auto routine = Xspr(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoSpr(layout, triangle, @@ -1282,8 +1254,7 @@ StatusCode Syr2(const Layout layout, const Triangle triangle, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xsyr2(queue_cpp, event_cpp); + auto routine = Xsyr2(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoSyr2(layout, triangle, @@ -1318,8 +1289,7 @@ StatusCode Spr2(const Layout layout, const Triangle triangle, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xspr2(queue_cpp, event_cpp); + auto routine = Xspr2(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoSpr2(layout, triangle, @@ -1359,8 +1329,7 @@ StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpos cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xgemm(queue_cpp, event_cpp); + auto routine = Xgemm(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoGemm(layout, a_transpose, b_transpose, @@ -1415,8 +1384,7 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, cl_mem c_buffer, const size_t c_offset, const size_t 
c_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xsymm(queue_cpp, event_cpp); + auto routine = Xsymm(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoSymm(layout, side, triangle, @@ -1471,8 +1439,7 @@ StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xhemm(queue_cpp, event_cpp); + auto routine = Xhemm(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoHemm(layout, side, triangle, @@ -1510,8 +1477,7 @@ StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_ cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xsyrk(queue_cpp, event_cpp); + auto routine = Xsyrk(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoSyrk(layout, triangle, a_transpose, @@ -1560,8 +1526,7 @@ StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_ cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xherk,T>(queue_cpp, event_cpp); + auto routine = Xherk,T>(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoHerk(layout, triangle, a_transpose, @@ -1597,8 +1562,7 @@ StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose a cl_mem c_buffer, const size_t c_offset, const size_t c_ld, 
cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xsyr2k(queue_cpp, event_cpp); + auto routine = Xsyr2k(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoSyr2k(layout, triangle, ab_transpose, @@ -1653,8 +1617,7 @@ StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose a cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xher2k(queue_cpp, event_cpp); + auto routine = Xher2k(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoHer2k(layout, triangle, ab_transpose, @@ -1691,8 +1654,7 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, c cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) { auto queue_cpp = Queue(*queue); - auto event_cpp = Event(event); - auto routine = Xtrmm(queue_cpp, event_cpp); + auto routine = Xtrmm(queue_cpp, event); auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoTrmm(layout, side, triangle, a_transpose, diagonal, diff --git a/src/routine.cc b/src/routine.cc index ff7b3e1a..b5ba63eb 100644 --- a/src/routine.cc +++ b/src/routine.cc @@ -26,7 +26,7 @@ template std::mutex Routine::program_cache_mutex_; // Constructor: not much here, because no status codes can be returned template -Routine::Routine(Queue &queue, Event &event, const std::string &name, +Routine::Routine(Queue &queue, EventPointer event, const std::string &name, const std::vector &routines, const Precision precision): precision_(precision), routine_name_(name), @@ -117,7 +117,8 @@ StatusCode Routine::SetUp() { // Enqueues a kernel, waits for completion, and checks for errors template 
StatusCode Routine::RunKernel(Kernel &kernel, std::vector &global, - const std::vector &local) { + const std::vector &local, EventPointer event, + std::vector& waitForEvents) { // Tests for validity of the local thread sizes if (local.size() > max_work_item_dimensions_) { @@ -141,18 +142,21 @@ StatusCode Routine::RunKernel(Kernel &kernel, std::vector &global, // Launches the kernel (and checks for launch errors) try { - kernel.Launch(queue_, global, local, event_); + kernel.Launch(queue_, global, local, event, waitForEvents); } catch (...) { return StatusCode::kKernelLaunchError; } - // Waits for completion of the kernel - try { - queue_.Finish(event_); - } catch (...) { return StatusCode::kKernelRunError; } - // No errors, normal termination of this function return StatusCode::kSuccess; } +// As above, but without an event waiting list +template +StatusCode Routine::RunKernel(Kernel &kernel, std::vector &global, + const std::vector &local, EventPointer event) { + auto emptyWaitingList = std::vector(); + return RunKernel(kernel, global, local, event, emptyWaitingList); +} + // ================================================================================================= // Tests matrix A for validity: checks for a valid OpenCL buffer, a valid lead-dimension, and for a @@ -258,7 +262,8 @@ StatusCode Routine::TestVectorDot(const size_t n, const Buffer &buffer, co // Copies or transposes a matrix and pads/unpads it with zeros template -StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t src_two, +StatusCode Routine::PadCopyTransposeMatrix(EventPointer event, std::vector& waitForEvents, + const size_t src_one, const size_t src_two, const size_t src_ld, const size_t src_offset, const Buffer &src, const size_t dest_one, const size_t dest_two, @@ -340,13 +345,13 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t auto global = std::vector{dest_one / db_["TRA_WPT"], dest_two / db_["TRA_WPT"]}; auto local = 
std::vector{db_["TRA_DIM"], db_["TRA_DIM"]}; - status = RunKernel(kernel, global, local); + status = RunKernel(kernel, global, local, event, waitForEvents); } else { auto global = std::vector{Ceil(CeilDiv(dest_one, db_["PADTRA_WPT"]), db_["PADTRA_TILE"]), Ceil(CeilDiv(dest_two, db_["PADTRA_WPT"]), db_["PADTRA_TILE"])}; auto local = std::vector{db_["PADTRA_TILE"], db_["PADTRA_TILE"]}; - status = RunKernel(kernel, global, local); + status = RunKernel(kernel, global, local, event, waitForEvents); } } else { @@ -354,13 +359,13 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t auto global = std::vector{dest_one / db_["COPY_VW"], dest_two / db_["COPY_WPT"]}; auto local = std::vector{db_["COPY_DIMX"], db_["COPY_DIMY"]}; - status = RunKernel(kernel, global, local); + status = RunKernel(kernel, global, local, event, waitForEvents); } else { auto global = std::vector{Ceil(CeilDiv(dest_one, db_["PAD_WPTX"]), db_["PAD_DIMX"]), Ceil(CeilDiv(dest_two, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; auto local = std::vector{db_["PAD_DIMX"], db_["PAD_DIMY"]}; - status = RunKernel(kernel, global, local); + status = RunKernel(kernel, global, local, event, waitForEvents); } } return status; diff --git a/src/routines/level1/xaxpy.cc b/src/routines/level1/xaxpy.cc index c5acaf49..37d23543 100644 --- a/src/routines/level1/xaxpy.cc +++ b/src/routines/level1/xaxpy.cc @@ -29,7 +29,7 @@ template <> const Precision Xaxpy::precision_ = Precision::kComplexDoub // Constructor: forwards to base class constructor template -Xaxpy::Xaxpy(Queue &queue, Event &event, const std::string &name): +Xaxpy::Xaxpy(Queue &queue, EventPointer event, const std::string &name): Routine(queue, event, name, {"Xaxpy"}, precision_) { source_string_ = #include "../../kernels/level1/level1.opencl" @@ -89,13 +89,13 @@ StatusCode Xaxpy::DoAxpy(const size_t n, const T alpha, if (use_fast_kernel) { auto global = std::vector{CeilDiv(n, db_["WPT"]*db_["VW"])}; auto local = std::vector{db_["WGS"]}; - 
status = RunKernel(kernel, global, local); + status = RunKernel(kernel, global, local, event_); } else { auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); auto global = std::vector{n_ceiled/db_["WPT"]}; auto local = std::vector{db_["WGS"]}; - status = RunKernel(kernel, global, local); + status = RunKernel(kernel, global, local, event_); } if (ErrorIn(status)) { return status; } diff --git a/src/routines/level1/xcopy.cc b/src/routines/level1/xcopy.cc index 8c7f8671..04508383 100644 --- a/src/routines/level1/xcopy.cc +++ b/src/routines/level1/xcopy.cc @@ -29,7 +29,7 @@ template <> const Precision Xcopy::precision_ = Precision::kComplexDoub // Constructor: forwards to base class constructor template -Xcopy::Xcopy(Queue &queue, Event &event, const std::string &name): +Xcopy::Xcopy(Queue &queue, EventPointer event, const std::string &name): Routine(queue, event, name, {"Xaxpy"}, precision_) { source_string_ = #include "../../kernels/level1/level1.opencl" @@ -87,13 +87,13 @@ StatusCode Xcopy::DoCopy(const size_t n, if (use_fast_kernel) { auto global = std::vector{CeilDiv(n, db_["WPT"]*db_["VW"])}; auto local = std::vector{db_["WGS"]}; - status = RunKernel(kernel, global, local); + status = RunKernel(kernel, global, local, event_); } else { auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); auto global = std::vector{n_ceiled/db_["WPT"]}; auto local = std::vector{db_["WGS"]}; - status = RunKernel(kernel, global, local); + status = RunKernel(kernel, global, local, event_); } if (ErrorIn(status)) { return status; } diff --git a/src/routines/level1/xdot.cc b/src/routines/level1/xdot.cc index e22b0f8b..4813a004 100644 --- a/src/routines/level1/xdot.cc +++ b/src/routines/level1/xdot.cc @@ -29,7 +29,7 @@ template <> const Precision Xdot::precision_ = Precision::kComplexDoubl // Constructor: forwards to base class constructor template -Xdot::Xdot(Queue &queue, Event &event, const std::string &name): +Xdot::Xdot(Queue &queue, EventPointer event, const std::string &name): Routine(queue, 
event, name, {"Xdot"}, precision_) { source_string_ = #include "../../kernels/level1/xdot.opencl" @@ -78,11 +78,16 @@ StatusCode Xdot::DoDot(const size_t n, kernel1.SetArgument(7, temp_buffer()); kernel1.SetArgument(8, static_cast(do_conjugate)); + // Event waiting list + auto eventWaitList = std::vector(); + // Launches the main kernel auto global1 = std::vector{db_["WGS1"]*temp_size}; auto local1 = std::vector{db_["WGS1"]}; - status = RunKernel(kernel1, global1, local1); + auto kernelEvent = Event(); + status = RunKernel(kernel1, global1, local1, kernelEvent.pointer()); if (ErrorIn(status)) { return status; } + eventWaitList.push_back(kernelEvent); // Sets the arguments for the epilogue kernel kernel2.SetArgument(0, temp_buffer()); @@ -92,7 +97,7 @@ StatusCode Xdot::DoDot(const size_t n, // Launches the epilogue kernel auto global2 = std::vector{db_["WGS2"]}; auto local2 = std::vector{db_["WGS2"]}; - status = RunKernel(kernel2, global2, local2); + status = RunKernel(kernel2, global2, local2, event_, eventWaitList); if (ErrorIn(status)) { return status; } // Succesfully finished the computation diff --git a/src/routines/level1/xdotc.cc b/src/routines/level1/xdotc.cc index f414f556..b3a01079 100644 --- a/src/routines/level1/xdotc.cc +++ b/src/routines/level1/xdotc.cc @@ -21,7 +21,7 @@ namespace clblast { // Constructor: forwards to base class constructor template -Xdotc::Xdotc(Queue &queue, Event &event, const std::string &name): +Xdotc::Xdotc(Queue &queue, EventPointer event, const std::string &name): Xdot(queue, event, name) { } diff --git a/src/routines/level1/xdotu.cc b/src/routines/level1/xdotu.cc index 28d9b730..8dded6e0 100644 --- a/src/routines/level1/xdotu.cc +++ b/src/routines/level1/xdotu.cc @@ -20,7 +20,7 @@ namespace clblast { // Constructor: forwards to base class constructor template -Xdotu::Xdotu(Queue &queue, Event &event, const std::string &name): +Xdotu::Xdotu(Queue &queue, EventPointer event, const std::string &name): Xdot(queue, event, name) { 
} diff --git a/src/routines/level1/xnrm2.cc b/src/routines/level1/xnrm2.cc index 685eb29f..04e4137c 100644 --- a/src/routines/level1/xnrm2.cc +++ b/src/routines/level1/xnrm2.cc @@ -29,7 +29,7 @@ template <> const Precision Xnrm2::precision_ = Precision::kComplexDoub // Constructor: forwards to base class constructor template -Xnrm2::Xnrm2(Queue &queue, Event &event, const std::string &name): +Xnrm2::Xnrm2(Queue &queue, EventPointer event, const std::string &name): Routine(queue, event, name, {"Xdot"}, precision_) { source_string_ = #include "../../kernels/level1/xnrm2.opencl" @@ -69,12 +69,16 @@ StatusCode Xnrm2::DoNrm2(const size_t n, kernel1.SetArgument(2, static_cast(x_offset)); kernel1.SetArgument(3, static_cast(x_inc)); kernel1.SetArgument(4, temp_buffer()); + // Event waiting list + auto eventWaitList = std::vector(); // Launches the main kernel auto global1 = std::vector{db_["WGS1"]*temp_size}; auto local1 = std::vector{db_["WGS1"]}; - status = RunKernel(kernel1, global1, local1); + auto kernelEvent = Event(); + status = RunKernel(kernel1, global1, local1, kernelEvent.pointer()); if (ErrorIn(status)) { return status; } + eventWaitList.push_back(kernelEvent); // Sets the arguments for the epilogue kernel kernel2.SetArgument(0, temp_buffer()); @@ -84,7 +88,7 @@ StatusCode Xnrm2::DoNrm2(const size_t n, // Launches the epilogue kernel auto global2 = std::vector{db_["WGS2"]}; auto local2 = std::vector{db_["WGS2"]}; - status = RunKernel(kernel2, global2, local2); + status = RunKernel(kernel2, global2, local2, event_, eventWaitList); if (ErrorIn(status)) { return status; } // Succesfully finished the computation diff --git a/src/routines/level1/xscal.cc b/src/routines/level1/xscal.cc index 57bbe9e8..e83e73fd 100644 --- a/src/routines/level1/xscal.cc +++ b/src/routines/level1/xscal.cc @@ -29,7 +29,7 @@ template <> const Precision Xscal::precision_ = Precision::kComplexDoub // Constructor: forwards to base class constructor template -Xscal::Xscal(Queue &queue, Event 
&event, const std::string &name): +Xscal::Xscal(Queue &queue, EventPointer event, const std::string &name): Routine(queue, event, name, {"Xaxpy"}, precision_) { source_string_ = #include "../../kernels/level1/level1.opencl" @@ -81,13 +81,13 @@ StatusCode Xscal::DoScal(const size_t n, const T alpha, if (use_fast_kernel) { auto global = std::vector{CeilDiv(n, db_["WPT"]*db_["VW"])}; auto local = std::vector{db_["WGS"]}; - status = RunKernel(kernel, global, local); + status = RunKernel(kernel, global, local, event_); } else { auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); auto global = std::vector{n_ceiled/db_["WPT"]}; auto local = std::vector{db_["WGS"]}; - status = RunKernel(kernel, global, local); + status = RunKernel(kernel, global, local, event_); } if (ErrorIn(status)) { return status; } diff --git a/src/routines/level1/xswap.cc b/src/routines/level1/xswap.cc index c986b3fb..bc425f40 100644 --- a/src/routines/level1/xswap.cc +++ b/src/routines/level1/xswap.cc @@ -29,7 +29,7 @@ template <> const Precision Xswap::precision_ = Precision::kComplexDoub // Constructor: forwards to base class constructor template -Xswap::Xswap(Queue &queue, Event &event, const std::string &name): +Xswap::Xswap(Queue &queue, EventPointer event, const std::string &name): Routine(queue, event, name, {"Xaxpy"}, precision_) { source_string_ = #include "../../kernels/level1/level1.opencl" @@ -87,13 +87,13 @@ StatusCode Xswap::DoSwap(const size_t n, if (use_fast_kernel) { auto global = std::vector{CeilDiv(n, db_["WPT"]*db_["VW"])}; auto local = std::vector{db_["WGS"]}; - status = RunKernel(kernel, global, local); + status = RunKernel(kernel, global, local, event_); } else { auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); auto global = std::vector{n_ceiled/db_["WPT"]}; auto local = std::vector{db_["WGS"]}; - status = RunKernel(kernel, global, local); + status = RunKernel(kernel, global, local, event_); } if (ErrorIn(status)) { return status; } diff --git a/src/routines/level2/xgbmv.cc 
b/src/routines/level2/xgbmv.cc index 14d391ca..f90e26b2 100644 --- a/src/routines/level2/xgbmv.cc +++ b/src/routines/level2/xgbmv.cc @@ -21,7 +21,7 @@ namespace clblast { // Constructor: forwards to base class constructor template -Xgbmv::Xgbmv(Queue &queue, Event &event, const std::string &name): +Xgbmv::Xgbmv(Queue &queue, EventPointer event, const std::string &name): Xgemv(queue, event, name) { } diff --git a/src/routines/level2/xgemv.cc b/src/routines/level2/xgemv.cc index bf7ae6fa..24e87db0 100644 --- a/src/routines/level2/xgemv.cc +++ b/src/routines/level2/xgemv.cc @@ -29,7 +29,7 @@ template <> const Precision Xgemv::precision_ = Precision::kComplexDoub // Constructor: forwards to base class constructor template -Xgemv::Xgemv(Queue &queue, Event &event, const std::string &name): +Xgemv::Xgemv(Queue &queue, EventPointer event, const std::string &name): Routine(queue, event, name, {"Pad", "Xgemv"}, precision_) { source_string_ = #include "../../kernels/level2/xgemv.opencl" @@ -162,7 +162,7 @@ StatusCode Xgemv::MatVec(const Layout layout, const Transpose a_transpose, // Launches the kernel auto global = std::vector{global_size}; auto local = std::vector{local_size}; - status = RunKernel(kernel, global, local); + status = RunKernel(kernel, global, local, event_); if (ErrorIn(status)) { return status; } // Succesfully finished the computation diff --git a/src/routines/level2/xger.cc b/src/routines/level2/xger.cc index 9ab21bfb..dda78232 100644 --- a/src/routines/level2/xger.cc +++ b/src/routines/level2/xger.cc @@ -29,7 +29,7 @@ template <> const Precision Xger::precision_ = Precision::kComplexDoubl // Constructor: forwards to base class constructor template -Xger::Xger(Queue &queue, Event &event, const std::string &name): +Xger::Xger(Queue &queue, EventPointer event, const std::string &name): Routine(queue, event, name, {"Xger"}, precision_) { source_string_ = #include "../../kernels/level2/level2.opencl" @@ -89,7 +89,7 @@ StatusCode Xger::DoGer(const Layout 
layout, auto a_two_ceiled = Ceil(CeilDiv(a_two, db_["WPT"]), db_["WGS2"]); auto global = std::vector{a_one_ceiled, a_two_ceiled}; auto local = std::vector{db_["WGS1"], db_["WGS2"]}; - status = RunKernel(kernel, global, local); + status = RunKernel(kernel, global, local, event_); if (ErrorIn(status)) { return status; } // Succesfully finished the computation diff --git a/src/routines/level2/xgerc.cc b/src/routines/level2/xgerc.cc index 09408898..73284b52 100644 --- a/src/routines/level2/xgerc.cc +++ b/src/routines/level2/xgerc.cc @@ -20,7 +20,7 @@ namespace clblast { // Constructor: forwards to base class constructor template -Xgerc::Xgerc(Queue &queue, Event &event, const std::string &name): +Xgerc::Xgerc(Queue &queue, EventPointer event, const std::string &name): Xger(queue, event, name) { } diff --git a/src/routines/level2/xgeru.cc b/src/routines/level2/xgeru.cc index 36fd9d0a..7730d6a5 100644 --- a/src/routines/level2/xgeru.cc +++ b/src/routines/level2/xgeru.cc @@ -20,7 +20,7 @@ namespace clblast { // Constructor: forwards to base class constructor template -Xgeru::Xgeru(Queue &queue, Event &event, const std::string &name): +Xgeru::Xgeru(Queue &queue, EventPointer event, const std::string &name): Xger(queue, event, name) { } diff --git a/src/routines/level2/xhbmv.cc b/src/routines/level2/xhbmv.cc index f59a7cb3..58591b50 100644 --- a/src/routines/level2/xhbmv.cc +++ b/src/routines/level2/xhbmv.cc @@ -21,7 +21,7 @@ namespace clblast { // Constructor: forwards to base class constructor template -Xhbmv::Xhbmv(Queue &queue, Event &event, const std::string &name): +Xhbmv::Xhbmv(Queue &queue, EventPointer event, const std::string &name): Xgemv(queue, event, name) { } diff --git a/src/routines/level2/xhemv.cc b/src/routines/level2/xhemv.cc index 5a58b28b..b4ef0fa4 100644 --- a/src/routines/level2/xhemv.cc +++ b/src/routines/level2/xhemv.cc @@ -21,7 +21,7 @@ namespace clblast { // Constructor: forwards to base class constructor template -Xhemv::Xhemv(Queue &queue, Event 
&event, const std::string &name): +Xhemv::Xhemv(Queue &queue, EventPointer event, const std::string &name): Xgemv(queue, event, name) { } diff --git a/src/routines/level2/xher.cc b/src/routines/level2/xher.cc index 1aefa240..aba665b0 100644 --- a/src/routines/level2/xher.cc +++ b/src/routines/level2/xher.cc @@ -28,7 +28,7 @@ template <> const Precision Xher::precision_ = Precision::kComp // Constructor: forwards to base class constructor template -Xher::Xher(Queue &queue, Event &event, const std::string &name): +Xher::Xher(Queue &queue, EventPointer event, const std::string &name): Routine(queue, event, name, {"Xger"}, precision_) { source_string_ = #include "../../kernels/level2/level2.opencl" @@ -99,7 +99,7 @@ StatusCode Xher::DoHer(const Layout layout, const Triangle triangle, auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]); auto global = std::vector{global_one, global_two}; auto local = std::vector{db_["WGS1"], db_["WGS2"]}; - status = RunKernel(kernel, global, local); + status = RunKernel(kernel, global, local, event_); if (ErrorIn(status)) { return status; } // Succesfully finished the computation diff --git a/src/routines/level2/xher2.cc b/src/routines/level2/xher2.cc index 364add12..bcd6488f 100644 --- a/src/routines/level2/xher2.cc +++ b/src/routines/level2/xher2.cc @@ -28,7 +28,7 @@ template <> const Precision Xher2::precision_ = Precision::kComplexDoub // Constructor: forwards to base class constructor template -Xher2::Xher2(Queue &queue, Event &event, const std::string &name): +Xher2::Xher2(Queue &queue, EventPointer event, const std::string &name): Routine(queue, event, name, {"Xger"}, precision_) { source_string_ = #include "../../kernels/level2/level2.opencl" @@ -91,7 +91,7 @@ StatusCode Xher2::DoHer2(const Layout layout, const Triangle triangle, auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]); auto global = std::vector{global_one, global_two}; auto local = std::vector{db_["WGS1"], db_["WGS2"]}; - status = RunKernel(kernel, 
global, local); + status = RunKernel(kernel, global, local, event_); if (ErrorIn(status)) { return status; } // Succesfully finished the computation diff --git a/src/routines/level2/xhpmv.cc b/src/routines/level2/xhpmv.cc index 2269255d..92686dbe 100644 --- a/src/routines/level2/xhpmv.cc +++ b/src/routines/level2/xhpmv.cc @@ -21,7 +21,7 @@ namespace clblast { // Constructor: forwards to base class constructor template -Xhpmv::Xhpmv(Queue &queue, Event &event, const std::string &name): +Xhpmv::Xhpmv(Queue &queue, EventPointer event, const std::string &name): Xgemv(queue, event, name) { } diff --git a/src/routines/level2/xhpr.cc b/src/routines/level2/xhpr.cc index b0cea72f..4b31ad09 100644 --- a/src/routines/level2/xhpr.cc +++ b/src/routines/level2/xhpr.cc @@ -20,7 +20,7 @@ namespace clblast { // Constructor: forwards to base class constructor template -Xhpr::Xhpr(Queue &queue, Event &event, const std::string &name): +Xhpr::Xhpr(Queue &queue, EventPointer event, const std::string &name): Xher(queue, event, name) { } diff --git a/src/routines/level2/xhpr2.cc b/src/routines/level2/xhpr2.cc index ded35e53..9be24f43 100644 --- a/src/routines/level2/xhpr2.cc +++ b/src/routines/level2/xhpr2.cc @@ -20,7 +20,7 @@ namespace clblast { // Constructor: forwards to base class constructor template -Xhpr2::Xhpr2(Queue &queue, Event &event, const std::string &name): +Xhpr2::Xhpr2(Queue &queue, EventPointer event, const std::string &name): Xher2(queue, event, name) { } diff --git a/src/routines/level2/xsbmv.cc b/src/routines/level2/xsbmv.cc index 457bd762..bc82c88d 100644 --- a/src/routines/level2/xsbmv.cc +++ b/src/routines/level2/xsbmv.cc @@ -21,7 +21,7 @@ namespace clblast { // Constructor: forwards to base class constructor template -Xsbmv::Xsbmv(Queue &queue, Event &event, const std::string &name): +Xsbmv::Xsbmv(Queue &queue, EventPointer event, const std::string &name): Xgemv(queue, event, name) { } diff --git a/src/routines/level2/xspmv.cc b/src/routines/level2/xspmv.cc index 
4f1a9c61..6e00dcfa 100644 --- a/src/routines/level2/xspmv.cc +++ b/src/routines/level2/xspmv.cc @@ -21,7 +21,7 @@ namespace clblast { // Constructor: forwards to base class constructor template -Xspmv::Xspmv(Queue &queue, Event &event, const std::string &name): +Xspmv::Xspmv(Queue &queue, EventPointer event, const std::string &name): Xgemv(queue, event, name) { } diff --git a/src/routines/level2/xspr.cc b/src/routines/level2/xspr.cc index 2d998e0b..55af2f29 100644 --- a/src/routines/level2/xspr.cc +++ b/src/routines/level2/xspr.cc @@ -20,7 +20,7 @@ namespace clblast { // Constructor: forwards to base class constructor template -Xspr::Xspr(Queue &queue, Event &event, const std::string &name): +Xspr::Xspr(Queue &queue, EventPointer event, const std::string &name): Xher(queue, event, name) { } diff --git a/src/routines/level2/xspr2.cc b/src/routines/level2/xspr2.cc index fd5232da..9a3f97ce 100644 --- a/src/routines/level2/xspr2.cc +++ b/src/routines/level2/xspr2.cc @@ -20,7 +20,7 @@ namespace clblast { // Constructor: forwards to base class constructor template -Xspr2::Xspr2(Queue &queue, Event &event, const std::string &name): +Xspr2::Xspr2(Queue &queue, EventPointer event, const std::string &name): Xher2(queue, event, name) { } diff --git a/src/routines/level2/xsymv.cc b/src/routines/level2/xsymv.cc index ec12324b..a9eb284f 100644 --- a/src/routines/level2/xsymv.cc +++ b/src/routines/level2/xsymv.cc @@ -21,7 +21,7 @@ namespace clblast { // Constructor: forwards to base class constructor template -Xsymv::Xsymv(Queue &queue, Event &event, const std::string &name): +Xsymv::Xsymv(Queue &queue, EventPointer event, const std::string &name): Xgemv(queue, event, name) { } diff --git a/src/routines/level2/xsyr.cc b/src/routines/level2/xsyr.cc index c01fa2d3..4b3928e5 100644 --- a/src/routines/level2/xsyr.cc +++ b/src/routines/level2/xsyr.cc @@ -20,7 +20,7 @@ namespace clblast { // Constructor: forwards to base class constructor template -Xsyr::Xsyr(Queue &queue, Event 
&event, const std::string &name): +Xsyr::Xsyr(Queue &queue, EventPointer event, const std::string &name): Xher(queue, event, name) { } diff --git a/src/routines/level2/xsyr2.cc b/src/routines/level2/xsyr2.cc index 6db55085..3ae389e0 100644 --- a/src/routines/level2/xsyr2.cc +++ b/src/routines/level2/xsyr2.cc @@ -20,7 +20,7 @@ namespace clblast { // Constructor: forwards to base class constructor template -Xsyr2::Xsyr2(Queue &queue, Event &event, const std::string &name): +Xsyr2::Xsyr2(Queue &queue, EventPointer event, const std::string &name): Xher2(queue, event, name) { } diff --git a/src/routines/level2/xtbmv.cc b/src/routines/level2/xtbmv.cc index 2e1aebff..47371c87 100644 --- a/src/routines/level2/xtbmv.cc +++ b/src/routines/level2/xtbmv.cc @@ -21,7 +21,7 @@ namespace clblast { // Constructor: forwards to base class constructor template -Xtbmv::Xtbmv(Queue &queue, Event &event, const std::string &name): +Xtbmv::Xtbmv(Queue &queue, EventPointer event, const std::string &name): Xgemv(queue, event, name) { } diff --git a/src/routines/level2/xtpmv.cc b/src/routines/level2/xtpmv.cc index aa0e099b..c63cb9b2 100644 --- a/src/routines/level2/xtpmv.cc +++ b/src/routines/level2/xtpmv.cc @@ -21,7 +21,7 @@ namespace clblast { // Constructor: forwards to base class constructor template -Xtpmv::Xtpmv(Queue &queue, Event &event, const std::string &name): +Xtpmv::Xtpmv(Queue &queue, EventPointer event, const std::string &name): Xgemv(queue, event, name) { } diff --git a/src/routines/level2/xtrmv.cc b/src/routines/level2/xtrmv.cc index 94424743..9111d41d 100644 --- a/src/routines/level2/xtrmv.cc +++ b/src/routines/level2/xtrmv.cc @@ -21,7 +21,7 @@ namespace clblast { // Constructor: forwards to base class constructor template -Xtrmv::Xtrmv(Queue &queue, Event &event, const std::string &name): +Xtrmv::Xtrmv(Queue &queue, EventPointer event, const std::string &name): Xgemv(queue, event, name) { } diff --git a/src/routines/level3/xgemm.cc b/src/routines/level3/xgemm.cc index 
5dc2ad7f..7557dcc3 100644 --- a/src/routines/level3/xgemm.cc +++ b/src/routines/level3/xgemm.cc @@ -29,7 +29,7 @@ template <> const Precision Xgemm::precision_ = Precision::kComplexDoub // Constructor: forwards to base class constructor template -Xgemm::Xgemm(Queue &queue, Event &event, const std::string &name): +Xgemm::Xgemm(Queue &queue, EventPointer event, const std::string &name): Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) { source_string_ = #include "../../kernels/level3/copy.opencl" @@ -122,30 +122,43 @@ StatusCode Xgemm::DoGemm(const Layout layout, auto b_temp = (b_no_temp) ? b_buffer : Buffer(context_, k_ceiled*n_ceiled); auto c_temp = (c_no_temp) ? c_buffer : Buffer(context_, m_ceiled*n_ceiled); + // Events of all kernels (including pre/post processing kernels) + auto eventWaitList = std::vector(); + auto emptyEventList = std::vector(); + // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In // case nothing has to be done, these kernels can be skipped. 
if (!a_no_temp) { - status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer, + auto eventProcessA = Event(); + status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList, + a_one, a_two, a_ld, a_offset, a_buffer, m_ceiled, k_ceiled, m_ceiled, 0, a_temp, program, true, a_do_transpose, a_conjugate); if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventProcessA); } // As above, but now for matrix B if (!b_no_temp) { - status = PadCopyTransposeMatrix(b_one, b_two, b_ld, b_offset, b_buffer, + auto eventProcessB = Event(); + status = PadCopyTransposeMatrix(eventProcessB.pointer(), emptyEventList, + b_one, b_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b_temp, program, true, b_do_transpose, b_conjugate); if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventProcessB); } // As above, but now for matrix C. This is only necessary if C is used both as input and output. if (!c_no_temp && beta != static_cast(0)) { - status = PadCopyTransposeMatrix(c_one, c_two, c_ld, c_offset, c_buffer, + auto eventProcessC = Event(); + status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList, + c_one, c_two, c_ld, c_offset, c_buffer, m_ceiled, n_ceiled, m_ceiled, 0, c_temp, program, true, c_do_transpose, false); if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventProcessC); } // Retrieves the Xgemm kernel from the compiled binary @@ -170,12 +183,15 @@ StatusCode Xgemm::DoGemm(const Layout layout, auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; // Launches the kernel - status = RunKernel(kernel, global, local); + auto eventKernel = Event(); + status = RunKernel(kernel, global, local, eventKernel.pointer(), eventWaitList); if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventKernel); // Runs the post-processing kernel if needed if (!c_no_temp) { - status = PadCopyTransposeMatrix(m_ceiled, n_ceiled, m_ceiled, 0, c_temp, + status = 
PadCopyTransposeMatrix(event_, eventWaitList, + m_ceiled, n_ceiled, m_ceiled, 0, c_temp, c_one, c_two, c_ld, c_offset, c_buffer, program, false, c_do_transpose, false); if (ErrorIn(status)) { return status; } diff --git a/src/routines/level3/xhemm.cc b/src/routines/level3/xhemm.cc index bcc60dee..c0a4306a 100644 --- a/src/routines/level3/xhemm.cc +++ b/src/routines/level3/xhemm.cc @@ -21,7 +21,7 @@ namespace clblast { // Constructor: forwards to base class constructor template -Xhemm::Xhemm(Queue &queue, Event &event, const std::string &name): +Xhemm::Xhemm(Queue &queue, EventPointer event, const std::string &name): Xgemm(queue, event, name) { } @@ -79,9 +79,13 @@ StatusCode Xhemm::DoHemm(const Layout layout, const Side side, const Triangle auto global = std::vector{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]), Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; auto local = std::vector{db_["PAD_DIMX"], db_["PAD_DIMY"]}; - status = RunKernel(kernel, global, local); + auto kernelEvent = Event(); + status = RunKernel(kernel, global, local, kernelEvent.pointer()); if (ErrorIn(status)) { return status; } + // Synchronize now: 'DoGemm' does not accept a list of events to wait for + kernelEvent.WaitForCompletion(); + // Runs the regular Xgemm code with either "C := AB+C" or ... 
if (side == Side::kLeft) { status = DoGemm(layout, Transpose::kNo, Transpose::kNo, diff --git a/src/routines/level3/xher2k.cc b/src/routines/level3/xher2k.cc index 1711905d..4d5a4d35 100644 --- a/src/routines/level3/xher2k.cc +++ b/src/routines/level3/xher2k.cc @@ -27,7 +27,7 @@ template <> const Precision Xher2k::precision_ = Precision::kCom // Constructor: forwards to base class constructor template -Xher2k::Xher2k(Queue &queue, Event &event, const std::string &name): +Xher2k::Xher2k(Queue &queue, EventPointer event, const std::string &name): Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) { source_string_ = #include "../../kernels/level3/copy.opencl" @@ -112,39 +112,58 @@ StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, co auto b2_temp = (b2_no_temp) ? b_buffer : Buffer(context_, k_ceiled*n_ceiled); auto c_temp = Buffer(context_, n_ceiled*n_ceiled); + // Events of all kernels (including pre/post processing kernels) + auto eventWaitList = std::vector(); + auto emptyEventList = std::vector(); + // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In // case nothing has to be done, these kernels can be skipped. 
if (!a1_no_temp) { - status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer, + auto eventProcessA1 = Event(); + status = PadCopyTransposeMatrix(eventProcessA1.pointer(), emptyEventList, + ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a1_temp, program, true, ab_rotated, ab_conjugate); + eventWaitList.push_back(eventProcessA1); if (ErrorIn(status)) { return status; } } if (!a2_no_temp) { - status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer, + auto eventProcessA2 = Event(); + status = PadCopyTransposeMatrix(eventProcessA2.pointer(), emptyEventList, + ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a2_temp, program, true, ab_rotated, !ab_conjugate); + eventWaitList.push_back(eventProcessA2); if (ErrorIn(status)) { return status; } } if (!b1_no_temp) { - status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer, + auto eventProcessB1 = Event(); + status = PadCopyTransposeMatrix(eventProcessB1.pointer(), emptyEventList, + ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b1_temp, program, true, ab_rotated, ab_conjugate); + eventWaitList.push_back(eventProcessB1); if (ErrorIn(status)) { return status; } } if (!b2_no_temp) { - status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer, + auto eventProcessB2 = Event(); + status = PadCopyTransposeMatrix(eventProcessB2.pointer(), emptyEventList, + ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b2_temp, program, true, ab_rotated, !ab_conjugate); + eventWaitList.push_back(eventProcessB2); if (ErrorIn(status)) { return status; } } // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to // modify the other triangle. 
- status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer, + auto eventProcessC = Event(); + status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList, + n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, program, true, c_rotated, false); + eventWaitList.push_back(eventProcessC); if (ErrorIn(status)) { return status; } // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary @@ -169,8 +188,10 @@ StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, co auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; // Launches the kernel - status = RunKernel(kernel, global, local); + auto eventKernel1 = Event(); + status = RunKernel(kernel, global, local, eventKernel1.pointer(), eventWaitList); if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventKernel1); // Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha auto conjugate_alpha = T{alpha.real(), -alpha.imag()}; @@ -181,13 +202,16 @@ StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, co kernel.SetArgument(5, a2_temp()); // Runs the kernel again - status = RunKernel(kernel, global, local); + auto eventKernel2 = Event(); + status = RunKernel(kernel, global, local, eventKernel2.pointer(), eventWaitList); if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventKernel2); // Runs the post-processing kernel auto upper = (triangle == Triangle::kUpper); auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, c_temp, + status = PadCopyTransposeMatrix(event_, eventWaitList, + n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, program, false, c_rotated, false, upper, lower, true); if (ErrorIn(status)) { return status; } diff --git a/src/routines/level3/xherk.cc b/src/routines/level3/xherk.cc index cbd0a188..574debe4 100644 --- a/src/routines/level3/xherk.cc +++ b/src/routines/level3/xherk.cc @@ -27,7 
+27,7 @@ template <> const Precision Xherk::precision_ = Precision::kComp // Constructor: forwards to base class constructor template -Xherk::Xherk(Queue &queue, Event &event, const std::string &name): +Xherk::Xherk(Queue &queue, EventPointer event, const std::string &name): Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) { source_string_ = #include "../../kernels/level3/copy.opencl" @@ -103,27 +103,40 @@ StatusCode Xherk::DoHerk(const Layout layout, const Triangle triangle, cons auto b_temp = (b_no_temp) ? a_buffer : Buffer(context_, k_ceiled*n_ceiled); auto c_temp = Buffer(context_, n_ceiled*n_ceiled); + // Events of all kernels (including pre/post processing kernels) + auto eventWaitList = std::vector(); + auto emptyEventList = std::vector(); + // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In // case nothing has to be done, these kernels can be skipped. Two copies are created. 
if (!a_no_temp) { - status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer, + auto eventProcessA = Event(); + status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList, + a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a_temp, program, true, a_rotated, a_conjugate); + eventWaitList.push_back(eventProcessA); if (ErrorIn(status)) { return status; } } if (!b_no_temp) { - status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer, + auto eventProcessB = Event(); + status = PadCopyTransposeMatrix(eventProcessB.pointer(), emptyEventList, + a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b_temp, program, true, a_rotated, b_conjugate); + eventWaitList.push_back(eventProcessB); if (ErrorIn(status)) { return status; } } // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to // modify the other triangle. - status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer, + auto eventProcessC = Event(); + status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList, + n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, program, true, c_rotated, false); + eventWaitList.push_back(eventProcessC); if (ErrorIn(status)) { return status; } // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary @@ -149,13 +162,16 @@ StatusCode Xherk::DoHerk(const Layout layout, const Triangle triangle, cons auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; // Launches the kernel - status = RunKernel(kernel, global, local); + auto eventKernel = Event(); + status = RunKernel(kernel, global, local, eventKernel.pointer(), eventWaitList); if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventKernel); // Runs the post-processing kernel auto upper = (triangle == Triangle::kUpper); auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, c_temp, + status = 
PadCopyTransposeMatrix(event_, eventWaitList, + n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, program, false, c_rotated, false, upper, lower, true); if (ErrorIn(status)) { return status; } diff --git a/src/routines/level3/xsymm.cc b/src/routines/level3/xsymm.cc index 583d5c7d..914a326a 100644 --- a/src/routines/level3/xsymm.cc +++ b/src/routines/level3/xsymm.cc @@ -21,7 +21,7 @@ namespace clblast { // Constructor: forwards to base class constructor template -Xsymm::Xsymm(Queue &queue, Event &event, const std::string &name): +Xsymm::Xsymm(Queue &queue, EventPointer event, const std::string &name): Xgemm(queue, event, name) { } @@ -79,9 +79,13 @@ StatusCode Xsymm::DoSymm(const Layout layout, const Side side, const Triangle auto global = std::vector{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]), Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; auto local = std::vector{db_["PAD_DIMX"], db_["PAD_DIMY"]}; - status = RunKernel(kernel, global, local); + auto kernelEvent = Event(); + status = RunKernel(kernel, global, local, kernelEvent.pointer()); if (ErrorIn(status)) { return status; } + // Synchronize now: 'DoGemm' does not accept a list of events to wait for + kernelEvent.WaitForCompletion(); + // Runs the regular Xgemm code with either "C := AB+C" or ... 
if (side == Side::kLeft) { status = DoGemm(layout, Transpose::kNo, Transpose::kNo, diff --git a/src/routines/level3/xsyr2k.cc b/src/routines/level3/xsyr2k.cc index 79090871..44d0024e 100644 --- a/src/routines/level3/xsyr2k.cc +++ b/src/routines/level3/xsyr2k.cc @@ -29,7 +29,7 @@ template <> const Precision Xsyr2k::precision_ = Precision::kComplexDou // Constructor: forwards to base class constructor template -Xsyr2k::Xsyr2k(Queue &queue, Event &event, const std::string &name): +Xsyr2k::Xsyr2k(Queue &queue, EventPointer event, const std::string &name): Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) { source_string_ = #include "../../kernels/level3/copy.opencl" @@ -104,28 +104,41 @@ StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, cons auto b_temp = (b_no_temp) ? b_buffer : Buffer(context_, k_ceiled*n_ceiled); auto c_temp = Buffer(context_, n_ceiled*n_ceiled); + // Events of all kernels (including pre/post processing kernels) + auto eventWaitList = std::vector(); + auto emptyEventList = std::vector(); + // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In // case nothing has to be done, these kernels can be skipped. 
if (!a_no_temp) { - status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer, + auto eventProcessA = Event(); + status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList, + ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a_temp, program, true, ab_rotated, false); if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventProcessA); } if (!b_no_temp) { - status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer, + auto eventProcessB = Event(); + status = PadCopyTransposeMatrix(eventProcessB.pointer(), emptyEventList, + ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b_temp, program, true, ab_rotated, false); if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventProcessB); } // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to // modify the other triangle. - status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer, + auto eventProcessC = Event(); + status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList, + n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, program, true, c_rotated, false); if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventProcessC); // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary try { @@ -148,8 +161,10 @@ StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, cons auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; // Launches the kernel - status = RunKernel(kernel, global, local); + auto eventKernel1 = Event(); + status = RunKernel(kernel, global, local, eventKernel1.pointer(), eventWaitList); if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventKernel1); // Swaps the arguments for matrices A and B, and sets 'beta' to 1 auto one = static_cast(1); @@ -158,13 +173,16 @@ StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, cons 
kernel.SetArgument(5, a_temp()); // Runs the kernel again - status = RunKernel(kernel, global, local); + auto eventKernel2 = Event(); + status = RunKernel(kernel, global, local, eventKernel2.pointer(), eventWaitList); if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventKernel2); // Runs the post-processing kernel auto upper = (triangle == Triangle::kUpper); auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, c_temp, + status = PadCopyTransposeMatrix(event_, eventWaitList, + n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, program, false, c_rotated, false, upper, lower, false); if (ErrorIn(status)) { return status; } diff --git a/src/routines/level3/xsyrk.cc b/src/routines/level3/xsyrk.cc index ca429bd7..44ed8d35 100644 --- a/src/routines/level3/xsyrk.cc +++ b/src/routines/level3/xsyrk.cc @@ -29,7 +29,7 @@ template <> const Precision Xsyrk::precision_ = Precision::kComplexDoub // Constructor: forwards to base class constructor template -Xsyrk::Xsyrk(Queue &queue, Event &event, const std::string &name): +Xsyrk::Xsyrk(Queue &queue, EventPointer event, const std::string &name): Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) { source_string_ = #include "../../kernels/level3/copy.opencl" @@ -97,22 +97,32 @@ StatusCode Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, k_ceiled*n_ceiled); auto c_temp = Buffer(context_, n_ceiled*n_ceiled); + // Events of all kernels (including pre/post processing kernels) + auto eventWaitList = std::vector(); + auto emptyEventList = std::vector(); + // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In // case nothing has to be done, these kernels can be skipped. 
if (!a_no_temp) { - status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer, + auto eventProcessA = Event(); + status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList, + a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a_temp, program, true, a_rotated, false); if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventProcessA); } // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to // modify the other triangle. - status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer, + auto eventProcessC = Event(); + status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList, + n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, program, true, c_rotated, false); if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventProcessC); // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary try { @@ -135,17 +145,21 @@ StatusCode Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; // Launches the kernel - status = RunKernel(kernel, global, local); + auto eventKernel = Event(); + status = RunKernel(kernel, global, local, eventKernel.pointer(), eventWaitList); if (ErrorIn(status)) { return status; } + eventWaitList.push_back(eventKernel); // Runs the post-processing kernel auto upper = (triangle == Triangle::kUpper); auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, c_temp, + status = PadCopyTransposeMatrix(event_, eventWaitList, + n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, program, false, c_rotated, false, upper, lower, false); if (ErrorIn(status)) { return status; } + // Successfully finished the computation return StatusCode::kSuccess; } catch (...) 
{ return StatusCode::kInvalidKernel; } diff --git a/src/routines/level3/xtrmm.cc b/src/routines/level3/xtrmm.cc index 1180c026..484cf040 100644 --- a/src/routines/level3/xtrmm.cc +++ b/src/routines/level3/xtrmm.cc @@ -21,7 +21,7 @@ namespace clblast { // Constructor: forwards to base class constructor template -Xtrmm::Xtrmm(Queue &queue, Event &event, const std::string &name): +Xtrmm::Xtrmm(Queue &queue, EventPointer event, const std::string &name): Xgemm(queue, event, name) { } @@ -82,9 +82,13 @@ StatusCode Xtrmm::DoTrmm(const Layout layout, const Side side, const Triangle auto global = std::vector{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]), Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; auto local = std::vector{db_["PAD_DIMX"], db_["PAD_DIMY"]}; - status = RunKernel(kernel, global, local); + auto kernelEvent = Event(); + status = RunKernel(kernel, global, local, kernelEvent.pointer()); if (ErrorIn(status)) { return status; } + // Synchronize now: 'DoGemm' does not accept a list of events to wait for + kernelEvent.WaitForCompletion(); + // Runs the regular Xgemm code with either "B := alpha*A*B" or ... 
if (side == Side::kLeft) { status = DoGemm(layout, a_transpose, Transpose::kNo, From a61724ece50ab895a67bc15ae3a132d0ecbe61bc Mon Sep 17 00:00:00 2001 From: cnugteren Date: Mon, 11 Apr 2016 22:27:44 -0600 Subject: [PATCH 25/60] Fixed the way the defaults are calculated in the database; added warning for non-matching tuner arguments --- include/internal/database/xaxpy.h | 4 ++-- include/internal/database/xgemm.h | 22 +++++++++++----------- scripts/database/database.py | 21 +++++++++++++++------ 3 files changed, 28 insertions(+), 19 deletions(-) diff --git a/include/internal/database/xaxpy.h b/include/internal/database/xaxpy.h index aa0c2d2f..71a4c7f2 100644 --- a/include/internal/database/xaxpy.h +++ b/include/internal/database/xaxpy.h @@ -171,12 +171,12 @@ const Database::DatabaseEntry Database::XaxpyDouble = { { "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } }, { "Tesla K20m", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, { "Tesla K40m", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, } }, } diff --git a/include/internal/database/xgemm.h b/include/internal/database/xgemm.h index 45efa397..c9fe03e4 100644 --- a/include/internal/database/xgemm.h +++ b/include/internal/database/xgemm.h @@ -60,12 +60,12 @@ const Database::DatabaseEntry Database::XgemmSingle = { { "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } }, { "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, { "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, 
{"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, } }, } @@ -79,7 +79,7 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } }, { "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } }, - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } }, } }, { // ARM GPUs @@ -100,7 +100,7 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = { kDeviceTypeGPU, "Intel", { { "Iris", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, 
{"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Iris Pro", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } }, - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, } }, { // Intel accelerators @@ -119,12 +119,12 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = { { "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } }, { "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, { "Tesla K40m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } }, + { "default", { 
{"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, } }, } @@ -171,12 +171,12 @@ const Database::DatabaseEntry Database::XgemmDouble = { { "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Tesla K40m", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } }, - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, } }, } @@ -190,7 +190,7 @@ const Database::DatabaseEntry Database::XgemmComplexDouble = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, { "Tahiti", { {"KWG",16}, {"KWI",2}, 
{"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, - { "default", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, } }, { // ARM GPUs @@ -222,12 +222,12 @@ const Database::DatabaseEntry Database::XgemmComplexDouble = { { "GeForce GTX TITAN X", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } 
}, } }, } diff --git a/scripts/database/database.py b/scripts/database/database.py index b3f919ef..7fd8c4d8 100644 --- a/scripts/database/database.py +++ b/scripts/database/database.py @@ -34,9 +34,9 @@ DEVICENAME_DEFAULT = "default" # Attributes DEVICETYPE_ATTRIBUTES = ["device_vendor", "device_type"] DEVICE_ATTRIBUTES = ["device", "device_core_clock", "device_compute_units"] -KERNEL_ATTRIBUTES = ["precision", "kernel_family", - "arg_m", "arg_n", "arg_k", "arg_alpha", "arg_beta"] -ATTRIBUTES = DEVICE_ATTRIBUTES + DEVICETYPE_ATTRIBUTES + KERNEL_ATTRIBUTES +KERNEL_ATTRIBUTES = ["precision", "kernel_family"] +ARGUMENT_ATTRIBUTES = ["arg_m", "arg_n", "arg_k", "arg_alpha", "arg_beta"] +ATTRIBUTES = DEVICE_ATTRIBUTES + DEVICETYPE_ATTRIBUTES + KERNEL_ATTRIBUTES + ARGUMENT_ATTRIBUTES # OpenCL vendor names and their short name VENDOR_NAMES = { "device_vendor": { @@ -98,6 +98,10 @@ def RemoveEntriesByDevice(df, devicename): def GetEntriesByField(df, field, value): return df[df[field] == value] +def UpdateDatabase(df, condition, field, value): + df.loc[condition, field] = value + return df + # Fixes the problem that some vendors use multiple different names def SanitizeVendorNames(df): df = df.replace(VENDOR_NAMES) @@ -120,7 +124,7 @@ def CalculateDefaults(df): dfdefault = pd.DataFrame() # Defaults per type/vendor - groups = df.groupby(DEVICETYPE_ATTRIBUTES+KERNEL_ATTRIBUTES+["kernel"]) + groups = df.groupby(DEVICETYPE_ATTRIBUTES+KERNEL_ATTRIBUTES+ARGUMENT_ATTRIBUTES+["kernel"]) for name, dfgroup in groups: default_values = dfgroup.min(axis=0) default_values["device"] = DEVICENAME_DEFAULT @@ -129,8 +133,14 @@ def CalculateDefaults(df): default_values["time"] = 0.0 dfdefault = dfdefault.append(default_values, ignore_index=True) + # Checks for mis-matched arguments + groups = dfdefault.groupby(DEVICETYPE_ATTRIBUTES+KERNEL_ATTRIBUTES+["kernel"]) + for name, dfgroup in groups: + if len(dfgroup) != 1: + print("[WARNING] Entries for a single kernel with multiple argument values") 
+ # Defaults in general - groups = df.groupby(KERNEL_ATTRIBUTES+["kernel"]) + groups = df.groupby(KERNEL_ATTRIBUTES+ARGUMENT_ATTRIBUTES+["kernel"]) for name, dfgroup in groups: default_values = dfgroup.min(axis=0) default_values["device_vendor"] = VENDOR_DEFAULT @@ -273,7 +283,6 @@ for file_json in glob.glob(glob_json): new_size = len(database.index) print("with "+str(new_size-old_size)+" new items") - # Stores the modified database back to disk if len(glob.glob(glob_json)) >= 1: print("## Storing the database to disk...") From e0497807e297e38884efae67a0109a160dc693b7 Mon Sep 17 00:00:00 2001 From: cnugteren Date: Wed, 13 Apr 2016 21:44:49 -0600 Subject: [PATCH 26/60] Added prototype for xASUM routines --- include/clblast.h | 7 +++ include/clblast_c.h | 18 ++++++ scripts/generator/generator.py | 1 + scripts/generator/routine.py | 2 +- src/clblast.cc | 25 +++++++++ src/clblast_c.cc | 42 ++++++++++++++ test/correctness/routines/level1/xasum.cc | 28 ++++++++++ test/performance/routines/level1/xasum.cc | 35 ++++++++++++ test/wrapper_cblas.h | 26 +++++++++ test/wrapper_clblas.h | 68 +++++++++++++++++++++++ 10 files changed, 251 insertions(+), 1 deletion(-) create mode 100644 test/correctness/routines/level1/xasum.cc create mode 100644 test/performance/routines/level1/xasum.cc diff --git a/include/clblast.h b/include/clblast.h index 431f2510..cf55a256 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -181,6 +181,13 @@ StatusCode Nrm2(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); +// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM +template +StatusCode Asum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event = nullptr); + // ================================================================================================= // BLAS level-2 
(matrix-vector) routines // ================================================================================================= diff --git a/include/clblast_c.h b/include/clblast_c.h index f72cff3a..190e6a46 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -278,6 +278,24 @@ StatusCode PUBLIC_API CLBlastDznrm2(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); +// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM +StatusCode PUBLIC_API CLBlastSasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastScasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDzasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 8cd35f95..e9e9276b 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -67,6 +67,7 @@ routines = [ Routine(True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors"), Routine(True, "1", "dotc", T, [C,Z], ["n"], [], 
["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated"), Routine(True, "1", "nrm2", T, [S,D,Sc,Dz],["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector"), + Routine(False, "1", "asum", T, [S,D,Sc,Dz],["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector"), ], [ # Level 2: matrix-vector Routine(True, "2a", "gemv", T, [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication"), diff --git a/scripts/generator/routine.py b/scripts/generator/routine.py index fffa19f6..7ddd7a12 100644 --- a/scripts/generator/routine.py +++ b/scripts/generator/routine.py @@ -72,7 +72,7 @@ class Routine(): # List of scalar buffers def ScalarBuffersFirst(self): - return ["dot","nrm2"] + return ["dot","nrm2","asum"] def ScalarBuffersSecond(self): return ["sa","sb","sc","ss","sd1","sd2","sx1","sy1","sparam"] diff --git a/src/clblast.cc b/src/clblast.cc index 4f4b6078..4888faed 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -396,6 +396,31 @@ template StatusCode PUBLIC_API Nrm2(const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM +template +StatusCode Asum(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Asum(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Asum(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Asum(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Asum(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const 
size_t, + cl_command_queue*, cl_event*); + // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= diff --git a/src/clblast_c.cc b/src/clblast_c.cc index 23e97bd5..c36edbca 100644 --- a/src/clblast_c.cc +++ b/src/clblast_c.cc @@ -433,6 +433,48 @@ StatusCode CLBlastDznrm2(const size_t n, return static_cast(status); } +// ASUM +StatusCode CLBlastSasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Asum(n, + asum_buffer, asum_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Asum(n, + asum_buffer, asum_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastScasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Asum(n, + asum_buffer, asum_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDzasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Asum(n, + asum_buffer, asum_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} + // ================================================================================================= // BLAS 
level-2 (matrix-vector) routines // ================================================================================================= diff --git a/test/correctness/routines/level1/xasum.cc b/test/correctness/routines/level1/xasum.cc new file mode 100644 index 00000000..5ec20596 --- /dev/null +++ b/test/correctness/routines/level1/xasum.cc @@ -0,0 +1,28 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level1/xasum.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "SASUM"); + clblast::RunTests, double, double>(argc, argv, true, "DASUM"); + clblast::RunTests, float2, float2>(argc, argv, true, "ScASUM"); + clblast::RunTests, double2, double2>(argc, argv, true, "DzASUM"); + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level1/xasum.cc b/test/performance/routines/level1/xasum.cc new file mode 100644 index 00000000..2680966e --- /dev/null +++ b/test/performance/routines/level1/xasum.cc @@ -0,0 +1,35 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level1/xasum.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/wrapper_cblas.h b/test/wrapper_cblas.h index dec272b0..af0eec9b 100644 --- a/test/wrapper_cblas.h +++ b/test/wrapper_cblas.h @@ -345,6 +345,32 @@ void cblasXnrm2(const size_t n, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); } +// Forwards the Netlib BLAS calls for SASUM/DASUM/ScASUM/DzASUM +void cblasXasum(const size_t n, + std::vector& asum_buffer, const size_t asum_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + asum_buffer[asum_offset] = cblas_sasum(n, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXasum(const size_t n, + std::vector& asum_buffer, const size_t asum_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + 
asum_buffer[asum_offset] = cblas_dasum(n, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXasum(const size_t n, + std::vector& asum_buffer, const size_t asum_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + asum_buffer[asum_offset] = cblas_scasum(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} +void cblasXasum(const size_t n, + std::vector& asum_buffer, const size_t asum_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + asum_buffer[asum_offset] = cblas_dzasum(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} + // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= diff --git a/test/wrapper_clblas.h b/test/wrapper_clblas.h index 89b708b8..09b3310b 100644 --- a/test/wrapper_clblas.h +++ b/test/wrapper_clblas.h @@ -558,6 +558,74 @@ clblasStatus clblasXnrm2(const size_t n, num_queues, queues, num_wait_events, wait_events, events); } +// Forwards the clBLAS calls for SASUM/DASUM/ScASUM/DzASUM +template +clblasStatus clblasXasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); +template <> +clblasStatus clblasXasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n); + return clblasSasum(n, + asum_buffer, asum_offset, + x_buffer, x_offset, 
static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n); + return clblasDasum(n, + asum_buffer, asum_offset, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n); + return clblasScasum(n, + asum_buffer, asum_offset, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n); + return clblasDzasum(n, + asum_buffer, asum_offset, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} + // ================================================================================================= // BLAS level-2 
(matrix-vector) routines // ================================================================================================= From 8be99de82d2ff0634c1289d9b4d1785364a68a44 Mon Sep 17 00:00:00 2001 From: cnugteren Date: Thu, 14 Apr 2016 19:58:26 -0600 Subject: [PATCH 27/60] Added support for the SASUM/DASUM/ScASUM/DzASUM routines --- CMakeLists.txt | 2 +- README.md | 2 +- include/internal/routines/level1/xasum.h | 56 +++++++++ include/internal/utilities.h | 2 + scripts/generator/generator.py | 4 +- src/clblast.cc | 17 ++- src/kernels/common.opencl | 7 ++ src/kernels/level1/xasum.opencl | 108 ++++++++++++++++++ src/kernels/level1/xnrm2.opencl | 10 +- src/routines/level1/xasum.cc | 109 ++++++++++++++++++ src/routines/level1/xnrm2.cc | 1 + test/correctness/testblas.h | 12 +- test/performance/client.cc | 5 +- test/routines/level1/xasum.h | 139 +++++++++++++++++++++++ 14 files changed, 455 insertions(+), 19 deletions(-) create mode 100644 include/internal/routines/level1/xasum.h create mode 100644 src/kernels/level1/xasum.opencl create mode 100644 src/routines/level1/xasum.cc create mode 100644 test/routines/level1/xasum.h diff --git a/CMakeLists.txt b/CMakeLists.txt index a4eb5b85..2d60dc88 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -121,7 +121,7 @@ include_directories(${clblast_SOURCE_DIR}/include ${OPENCL_INCLUDE_DIRS}) set(KERNELS copy pad transpose padtranspose xaxpy xdot xger xgemm xgemv) set(SAMPLE_PROGRAMS_CPP sgemm) set(SAMPLE_PROGRAMS_C sgemm) -set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2) +set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum) set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2) set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm) diff --git a/README.md b/README.md index 74d8c9cc..e786cc97 100644 --- a/README.md +++ b/README.md @@ -185,7 +185,7 @@ CLBlast is in active development but 
already supports almost all the BLAS routin | xDOTU | - | - | ✔ | ✔ | | | xDOTC | - | - | ✔ | ✔ | | | xNRM2 | ✔ | ✔ | ✔ | ✔ | | -| xASUM | | | | | | +| xASUM | ✔ | ✔ | ✔ | ✔ | | | IxAMAX | | | | | | | Level-2 | S | D | C | Z | Notes | diff --git a/include/internal/routines/level1/xasum.h b/include/internal/routines/level1/xasum.h new file mode 100644 index 00000000..b6e5d2cd --- /dev/null +++ b/include/internal/routines/level1/xasum.h @@ -0,0 +1,56 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xasum routine. The precision is implemented using a template argument. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XASUM_H_ +#define CLBLAST_ROUTINES_XASUM_H_ + +#include "internal/routine.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xasum: public Routine { + public: + + // Members and methods from the base class + using Routine::db_; + using Routine::source_string_; + using Routine::queue_; + using Routine::event_; + using Routine::context_; + using Routine::GetProgramFromCache; + using Routine::TestVectorX; + using Routine::TestVectorDot; + using Routine::RunKernel; + using Routine::ErrorIn; + + // Constructor + Xasum(Queue &queue, EventPointer event, const std::string &name = "ASUM"); + + // Templated-precision implementation of the routine + StatusCode DoAsum(const size_t n, + const Buffer &asum_buffer, const size_t asum_offset, + const Buffer &x_buffer, 
const size_t x_offset, const size_t x_inc); + + private: + // Static variable to get the precision + const static Precision precision_; +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XASUM_H_ +#endif diff --git a/include/internal/utilities.h b/include/internal/utilities.h index 6adc1d0a..8ffdc3af 100644 --- a/include/internal/utilities.h +++ b/include/internal/utilities.h @@ -65,6 +65,7 @@ constexpr auto kArgCOffset = "offc"; constexpr auto kArgAPOffset = "offap"; constexpr auto kArgDotOffset = "offdot"; constexpr auto kArgNrm2Offset = "offnrm2"; +constexpr auto kArgAsumOffset = "offasum"; constexpr auto kArgAlpha = "alpha"; constexpr auto kArgBeta = "beta"; @@ -119,6 +120,7 @@ struct Arguments { size_t ap_offset = 0; size_t dot_offset = 0; size_t nrm2_offset = 0; + size_t asum_offset = 0; T alpha = T{1.0}; T beta = T{1.0}; size_t x_size = 1; diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index e9e9276b..9b1b2aa1 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -67,7 +67,7 @@ routines = [ Routine(True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors"), Routine(True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated"), Routine(True, "1", "nrm2", T, [S,D,Sc,Dz],["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector"), - Routine(False, "1", "asum", T, [S,D,Sc,Dz],["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector"), + Routine(True, "1", "asum", T, [S,D,Sc,Dz],["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector"), ], [ # Level 2: matrix-vector Routine(True, "2a", "gemv", T, [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication"), @@ -289,7 +289,7 @@ files = [ 
path_clblast+"/test/wrapper_clblas.h", path_clblast+"/test/wrapper_cblas.h", ] -header_lines = [84, 65, 93, 22, 22, 38] +header_lines = [84, 66, 93, 22, 22, 38] footer_lines = [6, 3, 9, 2, 6, 6] # Checks whether the command-line arguments are valid; exists otherwise diff --git a/src/clblast.cc b/src/clblast.cc index 4888faed..7210ad1d 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -27,6 +27,7 @@ #include "internal/routines/level1/xdotu.h" #include "internal/routines/level1/xdotc.h" #include "internal/routines/level1/xnrm2.h" +#include "internal/routines/level1/xasum.h" // BLAS level-2 includes #include "internal/routines/level2/xgemv.h" @@ -398,11 +399,17 @@ template StatusCode PUBLIC_API Nrm2(const size_t, // Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM template -StatusCode Asum(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; +StatusCode Asum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xasum(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoAsum(n, + Buffer(asum_buffer), asum_offset, + Buffer(x_buffer), x_offset, x_inc); } template StatusCode PUBLIC_API Asum(const size_t, cl_mem, const size_t, diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl index f2a2e7a7..0a68defb 100644 --- a/src/kernels/common.opencl +++ b/src/kernels/common.opencl @@ -109,6 +109,13 @@ R"( #define SetToOne(a) a = ONE #endif +// The absolute value (component-wise) +#if PRECISION == 3232 || PRECISION == 6464 + #define AbsoluteValue(value) value.x = fabs(value.x); value.y = fabs(value.y) +#else + #define AbsoluteValue(value) value = fabs(value) +#endif + // Adds two complex variables #if 
PRECISION == 3232 || PRECISION == 6464 #define Add(c, a, b) c.x = a.x + b.x; c.y = a.y + b.y diff --git a/src/kernels/level1/xasum.opencl b/src/kernels/level1/xasum.opencl new file mode 100644 index 00000000..037dc57e --- /dev/null +++ b/src/kernels/level1/xasum.opencl @@ -0,0 +1,108 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains the Xasum kernel. It implements a absolute sum computation using reduction +// kernels. Reduction is split in two parts. In the first (main) kernel the X vector is loaded, +// followed by a per-thread and a per-workgroup reduction. The second (epilogue) kernel +// is executed with a single workgroup only, computing the final result. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// Parameters set by the tuner or by the database. Here they are given a basic default value in case +// this kernel file is used outside of the CLBlast library. 
+#ifndef WGS1 + #define WGS1 64 // The local work-group size of the main kernel +#endif +#ifndef WGS2 + #define WGS2 64 // The local work-group size of the epilogue kernel +#endif + +// ================================================================================================= + +// The main reduction kernel, performing the loading and the majority of the operation +__attribute__((reqd_work_group_size(WGS1, 1, 1))) +__kernel void Xasum(const int n, + const __global real* restrict xgm, const int x_offset, const int x_inc, + __global real* output) { + __local real lm[WGS1]; + const int lid = get_local_id(0); + const int wgid = get_group_id(0); + const int num_groups = get_num_groups(0); + + // Performs loading and the first steps of the reduction + real acc; + SetToZero(acc); + int id = wgid*WGS1 + lid; + while (id < n) { + real x = xgm[id*x_inc + x_offset]; + AbsoluteValue(x); + Add(acc, acc, x); + id += WGS1*num_groups; + } + lm[lid] = acc; + barrier(CLK_LOCAL_MEM_FENCE); + + // Performs reduction in local memory + #pragma unroll + for (int s=WGS1/2; s>0; s=s>>1) { + if (lid < s) { + Add(lm[lid], lm[lid], lm[lid + s]); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + // Stores the per-workgroup result + if (lid == 0) { + output[wgid] = lm[0]; + } +} + +// ================================================================================================= + +// The epilogue reduction kernel, performing the final bit of the operation. This kernel has to +// be launched with a single workgroup only. 
+__attribute__((reqd_work_group_size(WGS2, 1, 1))) +__kernel void XasumEpilogue(const __global real* restrict input, + __global real* asum, const int asum_offset) { + __local real lm[WGS2]; + const int lid = get_local_id(0); + + // Performs the first step of the reduction while loading the data + Add(lm[lid], input[lid], input[lid + WGS2]); + barrier(CLK_LOCAL_MEM_FENCE); + + // Performs reduction in local memory + #pragma unroll + for (int s=WGS2/2; s>0; s=s>>1) { + if (lid < s) { + Add(lm[lid], lm[lid], lm[lid + s]); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + // Computes the absolute value and stores the final result + if (lid == 0) { + #if PRECISION == 3232 || PRECISION == 6464 + asum[asum_offset].x = lm[0].x + lm[0].y; // the result is a non-complex number + #else + asum[asum_offset] = lm[0]; + #endif + } +} + +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/kernels/level1/xnrm2.opencl b/src/kernels/level1/xnrm2.opencl index cf579457..9803687a 100644 --- a/src/kernels/level1/xnrm2.opencl +++ b/src/kernels/level1/xnrm2.opencl @@ -7,9 +7,9 @@ // Author(s): // Cedric Nugteren // -// This file contains the Xnrm2 kernel. It implements a dot-product computation using reduction -// kernels. Reduction is split in two parts. In the first (main) kernel the X and Y vectors are -// multiplied, followed by a per-thread and a per-workgroup reduction. The second (epilogue) kernel +// This file contains the Xnrm2 kernel. It implements a squared norm computation using reduction +// kernels. Reduction is split in two parts. In the first (main) kernel the X vector is squared, +// followed by a per-thread and a per-workgroup reduction. The second (epilogue) kernel // is executed with a single workgroup only, computing the final result. 
// // ================================================================================================= @@ -29,7 +29,7 @@ R"( // ================================================================================================= -// The main reduction kernel, performing the multiplication and the majority of the sum operation +// The main reduction kernel, performing the multiplication and the majority of the operation __attribute__((reqd_work_group_size(WGS1, 1, 1))) __kernel void Xnrm2(const int n, const __global real* restrict xgm, const int x_offset, const int x_inc, @@ -70,7 +70,7 @@ __kernel void Xnrm2(const int n, // ================================================================================================= -// The epilogue reduction kernel, performing the final bit of the sum operation. This kernel has to +// The epilogue reduction kernel, performing the final bit of the operation. This kernel has to // be launched with a single workgroup only. __attribute__((reqd_work_group_size(WGS2, 1, 1))) __kernel void Xnrm2Epilogue(const __global real* restrict input, diff --git a/src/routines/level1/xasum.cc b/src/routines/level1/xasum.cc new file mode 100644 index 00000000..5799e25a --- /dev/null +++ b/src/routines/level1/xasum.cc @@ -0,0 +1,109 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xasum class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "internal/routines/level1/xasum.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Specific implementations to get the memory-type based on a template argument +template <> const Precision Xasum::precision_ = Precision::kSingle; +template <> const Precision Xasum::precision_ = Precision::kDouble; +template <> const Precision Xasum::precision_ = Precision::kComplexSingle; +template <> const Precision Xasum::precision_ = Precision::kComplexDouble; + +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xasum::Xasum(Queue &queue, EventPointer event, const std::string &name): + Routine(queue, event, name, {"Xdot"}, precision_) { + source_string_ = + #include "../../kernels/level1/xasum.opencl" + ; +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xasum::DoAsum(const size_t n, + const Buffer &asum_buffer, const size_t asum_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { + + // Makes sure all dimensions are larger than zero + if (n == 0) { return StatusCode::kInvalidDimension; } + + // Tests the vectors for validity + auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestVectorDot(1, asum_buffer, asum_offset, sizeof(T)); + if (ErrorIn(status)) { return status; } + + // Retrieves the Xasum kernels from the compiled binary + try { + auto& program = GetProgramFromCache(); + auto kernel1 = Kernel(program, "Xasum"); + auto kernel2 = Kernel(program, "XasumEpilogue"); + + // Creates the buffer for intermediate values + auto temp_size = 
2*db_["WGS2"]; + auto temp_buffer = Buffer(context_, temp_size); + + // Sets the kernel arguments + kernel1.SetArgument(0, static_cast(n)); + kernel1.SetArgument(1, x_buffer()); + kernel1.SetArgument(2, static_cast(x_offset)); + kernel1.SetArgument(3, static_cast(x_inc)); + kernel1.SetArgument(4, temp_buffer()); + + // Event waiting list + auto eventWaitList = std::vector(); + + // Launches the main kernel + auto global1 = std::vector{db_["WGS1"]*temp_size}; + auto local1 = std::vector{db_["WGS1"]}; + auto kernelEvent = Event(); + status = RunKernel(kernel1, global1, local1, kernelEvent.pointer()); + if (ErrorIn(status)) { return status; } + eventWaitList.push_back(kernelEvent); + + // Sets the arguments for the epilogue kernel + kernel2.SetArgument(0, temp_buffer()); + kernel2.SetArgument(1, asum_buffer()); + kernel2.SetArgument(2, static_cast(asum_offset)); + + // Launches the epilogue kernel + auto global2 = std::vector{db_["WGS2"]}; + auto local2 = std::vector{db_["WGS2"]}; + status = RunKernel(kernel2, global2, local2, event_, eventWaitList); + if (ErrorIn(status)) { return status; } + + // Succesfully finished the computation + return StatusCode::kSuccess; + } catch (...) 
{ return StatusCode::kInvalidKernel; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xasum; +template class Xasum; +template class Xasum; +template class Xasum; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level1/xnrm2.cc b/src/routines/level1/xnrm2.cc index 04e4137c..ceabe586 100644 --- a/src/routines/level1/xnrm2.cc +++ b/src/routines/level1/xnrm2.cc @@ -69,6 +69,7 @@ StatusCode Xnrm2::DoNrm2(const size_t n, kernel1.SetArgument(2, static_cast(x_offset)); kernel1.SetArgument(3, static_cast(x_inc)); kernel1.SetArgument(4, temp_buffer()); + // Event waiting list auto eventWaitList = std::vector(); diff --git a/test/correctness/testblas.h b/test/correctness/testblas.h index 8181aaf6..aa61c2ec 100644 --- a/test/correctness/testblas.h +++ b/test/correctness/testblas.h @@ -153,6 +153,7 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name auto ap_offsets = std::vector{args.ap_offset}; auto dot_offsets = std::vector{args.dot_offset}; auto nrm2_offsets = std::vector{args.nrm2_offset}; + auto asum_offsets = std::vector{args.asum_offset}; auto alphas = std::vector{args.alpha}; auto betas = std::vector{args.beta}; auto x_sizes = std::vector{args.x_size}; @@ -193,6 +194,7 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name if (option == kArgAPOffset) { ap_offsets = tester.kOffsets; } if (option == kArgDotOffset) { dot_offsets = tester.kOffsets; } if (option == kArgNrm2Offset) { nrm2_offsets = tester.kOffsets; } + if (option == kArgAsumOffset) { asum_offsets = tester.kOffsets; } if (option == kArgAlpha) { alphas = tester.kAlphaValues; } if (option == kArgBeta) { betas = tester.kBetaValues; } @@ -233,10 +235,12 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name for 
(auto &ap_offset: ap_offsets) { r_args.ap_offset = ap_offset; for (auto &dot_offset: dot_offsets) { r_args.dot_offset = dot_offset; for (auto &nrm2_offset: nrm2_offsets) { r_args.nrm2_offset = nrm2_offset; - for (auto &alpha: alphas) { r_args.alpha = alpha; - for (auto &beta: betas) { r_args.beta = beta; - C::SetSizes(r_args); - regular_test_vector.push_back(r_args); + for (auto &asum_offset: asum_offsets) { r_args.asum_offset = asum_offset; + for (auto &alpha: alphas) { r_args.alpha = alpha; + for (auto &beta: betas) { r_args.beta = beta; + C::SetSizes(r_args); + regular_test_vector.push_back(r_args); + } } } } diff --git a/test/performance/client.cc b/test/performance/client.cc index 56ab8c8d..f22c9666 100644 --- a/test/performance/client.cc +++ b/test/performance/client.cc @@ -80,8 +80,10 @@ Arguments Client::ParseArguments(int argc, char *argv[], const GetMetric if (o == kArgCOffset) { args.c_offset = GetArgument(argc, argv, help, kArgCOffset, size_t{0}); } if (o == kArgAPOffset) { args.ap_offset= GetArgument(argc, argv, help, kArgAPOffset, size_t{0}); } - // Dot arguments + // Scalar result arguments if (o == kArgDotOffset) { args.dot_offset = GetArgument(argc, argv, help, kArgDotOffset, size_t{0}); } + if (o == kArgNrm2Offset) { args.nrm2_offset = GetArgument(argc, argv, help, kArgNrm2Offset, size_t{0}); } + if (o == kArgAsumOffset) { args.asum_offset = GetArgument(argc, argv, help, kArgAsumOffset, size_t{0}); } // Scalar values if (o == kArgAlpha) { args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar()); } @@ -292,6 +294,7 @@ void Client::PrintTableRow(const Arguments& args, else if (o == kArgAPOffset) { integers.push_back(args.ap_offset); } else if (o == kArgDotOffset) {integers.push_back(args.dot_offset); } else if (o == kArgNrm2Offset){integers.push_back(args.nrm2_offset); } + else if (o == kArgAsumOffset){integers.push_back(args.asum_offset); } } auto strings = std::vector{}; for (auto &o: options_) { diff --git a/test/routines/level1/xasum.h 
b/test/routines/level1/xasum.h new file mode 100644 index 00000000..6eae3c83 --- /dev/null +++ b/test/routines/level1/xasum.h @@ -0,0 +1,139 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements a class with static methods to describe the Xasum routine. Examples of +// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. +// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XASUM_H_ +#define CLBLAST_TEST_ROUTINES_XASUM_H_ + +#include +#include + +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TestXasum { + public: + + // The BLAS level: 1, 2, or 3 + static size_t BLASLevel() { return 1; } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { + return {kArgN, + kArgXInc, + kArgXOffset, kArgAsumOffset}; + } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &args) { + return args.n * args.x_inc + args.x_offset; + } + static size_t GetSizeAsum(const Arguments &args) { + return 1 + args.asum_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments &args) { + args.x_size = GetSizeX(args); + args.scalar_size 
= GetSizeAsum(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine + + // Describes which transpose options are relevant for this routine + using Transposes = std::vector; + static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine + static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Asum(args.n, + buffers.scalar(), args.asum_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + &queue_plain, &event); + clWaitForEvents(1, &event); + return status; + } + + // Describes how to run the clBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXasum(args.n, + buffers.scalar(), args.asum_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector scalar_cpu(args.scalar_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + cblasXasum(args.n, + 
scalar_cpu, args.asum_offset, + x_vec_cpu, args.x_offset, args.x_inc); + buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); + return StatusCode::kSuccess; + } + #endif + + // Describes how to download the results of the computation (more importantly: which buffer) + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector result(args.scalar_size, static_cast(0)); + buffers.scalar.Read(queue, args.scalar_size, result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments &) { return 1; } // N/A for this routine + static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine + static size_t GetResultIndex(const Arguments &args, const size_t, const size_t) { + return args.asum_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments &args) { + return args.n; + } + static size_t GetBytes(const Arguments &args) { + return ((args.n) + 1) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XASUM_H_ +#endif From 5a4f8217be97575daf4f0f97d8ae7f8cf7bbbcd0 Mon Sep 17 00:00:00 2001 From: cnugteren Date: Thu, 14 Apr 2016 21:37:52 -0600 Subject: [PATCH 28/60] Updated the reduction-kernel tuner to also tune the epilogue --- CHANGELOG | 1 + src/tuning/xdot.cc | 68 +++++++++++++++++++++++++++------------------- 2 files changed, 41 insertions(+), 28 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index db14f037..9c954761 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -6,6 +6,7 @@ Development version (next release) - Fixed the use of events within the library - Added level-1 routines: * SNRM2/DNRM2/ScNRM2/DzNRM2 + * SASUM/DASUM/ScASUM/DzASUM Version 0.6.0 - Added support for MSVC (Visual Studio) 2015 diff --git a/src/tuning/xdot.cc b/src/tuning/xdot.cc index 
ff6bee16..48fa800b 100644 --- a/src/tuning/xdot.cc +++ b/src/tuning/xdot.cc @@ -22,13 +22,13 @@ namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class -template +template class TuneXdot { public: // The representative kernel and the source code - static std::string KernelFamily() { return "xdot"; } - static std::string KernelName() { return "Xdot"; } + static std::string KernelFamily() { return "xdot_"+std::to_string(V); } + static std::string KernelName() { return (V==1) ? "Xdot" : "XdotEpilogue"; } static std::string GetSources() { return #include "../src/kernels/common.opencl" @@ -44,7 +44,7 @@ class TuneXdot { // Sets the default values for the arguments static size_t DefaultM() { return 1; } // N/A for this kernel - static size_t DefaultN() { return 4096*1024; } + static size_t DefaultN() { return 64*1024*1024; } static size_t DefaultK() { return 1; } // N/A for this kernel static double DefaultFraction() { return 1.0; } // N/A for this kernel @@ -58,9 +58,7 @@ class TuneXdot { // Sets the tuning parameters and their possible values static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "WGS1", {32, 64, 128, 256, 512, 1024}); - tuner.AddParameter(id, "WGS2", {32, 64, 128, 256, 512, 1024}); - tuner.AddParameter(id, "VW", {1}); + tuner.AddParameter(id, "WGS"+std::to_string(V), {32, 64, 128, 256, 512, 1024}); } // Sets the constraints and local memory size @@ -68,16 +66,16 @@ class TuneXdot { static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments &) { } // Sets the base thread configuration - static std::vector GlobalSize(const Arguments &) { return {2}; } - static std::vector GlobalSizeRef(const Arguments &) { return {2*64*64}; } + static std::vector GlobalSize(const Arguments &) { return (V==1) ? 
std::vector{2*64} : std::vector{1}; } + static std::vector GlobalSizeRef(const Arguments &) { return (V==1) ? std::vector{2*64*64} : std::vector{64}; } static std::vector LocalSize() { return {1}; } static std::vector LocalSizeRef() { return {64}; } // Transforms the thread configuration based on the parameters using TransformVector = std::vector>; - static TransformVector MulLocal() { return {{"WGS1"}}; } + static TransformVector MulLocal() { return (V==1) ? TransformVector{{"WGS1"}} : TransformVector{{"WGS2"}}; } static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {{"WGS1"},{"WGS2"}}; } + static TransformVector MulGlobal() { return (V==1) ? TransformVector{{"WGS1"}} : TransformVector{{"WGS2"}}; } static TransformVector DivGlobal() { return {}; } // Sets the kernel's arguments @@ -85,22 +83,29 @@ class TuneXdot { std::vector &x_vec, std::vector &y_vec, std::vector &, std::vector &, std::vector &, std::vector &temp) { - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentInput(x_vec); - tuner.AddArgumentScalar(0); - tuner.AddArgumentScalar(1); - tuner.AddArgumentInput(y_vec); - tuner.AddArgumentScalar(0); - tuner.AddArgumentScalar(1); - tuner.AddArgumentInput(temp); // No output checking for the result - size varies - tuner.AddArgumentScalar(static_cast(false)); + if (V == 1) { + tuner.AddArgumentScalar(static_cast(args.n)); + tuner.AddArgumentInput(x_vec); + tuner.AddArgumentScalar(0); + tuner.AddArgumentScalar(1); + tuner.AddArgumentInput(y_vec); + tuner.AddArgumentScalar(0); + tuner.AddArgumentScalar(1); + tuner.AddArgumentInput(temp); // No output checking for the result - size varies + tuner.AddArgumentScalar(static_cast(false)); + } + else { + tuner.AddArgumentInput(temp); + tuner.AddArgumentInput(x_vec); // No output checking for the result - store somewhere + tuner.AddArgumentScalar(0); + } } // Describes how to compute the performance metrics static size_t GetMetric(const Arguments &args) { - return 
(2*args.n + 1) * GetBytes(args.precision); + return (V==1) ? (2*args.n + 1) * GetBytes(args.precision) : 1 * GetBytes(args.precision); } - static std::string PerformanceUnit() { return "GB/s"; } + static std::string PerformanceUnit() { return (V==1) ? "GB/s" : "N/A"; } }; // ================================================================================================= @@ -110,15 +115,22 @@ class TuneXdot { using float2 = clblast::float2; using double2 = clblast::double2; -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { +// Function to tune a specific variation V (not within the clblast namespace) +template +void StartVariation(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); - case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; - case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; + case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; + case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; } +} + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + StartVariation<1>(argc, argv); + StartVariation<2>(argc, argv); return 0; } From 894983fc3c7c57ffc48c21523641694cde318eca Mon Sep 17 00:00:00 2001 From: cnugteren Date: Wed, 20 Apr 2016 21:11:33 -0600 Subject: [PATCH 29/60] Added prototype for ixAMAX routines --- include/clblast.h | 7 +++ include/clblast_c.h | 18 ++++++ scripts/generator/generator.py | 7 ++- scripts/generator/routine.py | 2 +- 
src/clblast.cc | 25 +++++++++ src/clblast_c.cc | 42 ++++++++++++++ test/correctness/routines/level1/xamax.cc | 28 ++++++++++ test/performance/routines/level1/xamax.cc | 35 ++++++++++++ test/wrapper_cblas.h | 26 +++++++++ test/wrapper_clblas.h | 68 +++++++++++++++++++++++ 10 files changed, 256 insertions(+), 2 deletions(-) create mode 100644 test/correctness/routines/level1/xamax.cc create mode 100644 test/performance/routines/level1/xamax.cc diff --git a/include/clblast.h b/include/clblast.h index cf55a256..57948581 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -188,6 +188,13 @@ StatusCode Asum(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); +// Index of absolute maxium value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX +template +StatusCode Amax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event = nullptr); + // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= diff --git a/include/clblast_c.h b/include/clblast_c.h index 190e6a46..92f4afe5 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -296,6 +296,24 @@ StatusCode PUBLIC_API CLBlastDzasum(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); +// Index of absolute maxium value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX +StatusCode PUBLIC_API CLBlastiSamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastiDamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + 
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastiCamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastiZamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 9b1b2aa1..3a845686 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -40,6 +40,10 @@ Z = DataType("Z", "Z", DBL2, [DBL2, DBL2, D2CL, D2CL], DBL2) # double-complex (6 # Special cases Sc = DataType("C", "Sc", FLT2, [FLT2, FLT2, FLT2, FLT2], FLT2) # As C, but with real output Dz = DataType("Z", "Dz", DBL2, [DBL2, DBL2, DBL2, DBL2], DBL2) # As Z, but with real output +iS = DataType("S", "iS", FLT, [FLT, FLT, FLT, FLT], FLT ) # As S, but with integer output +iD = DataType("D", "iD", DBL, [DBL, DBL, DBL, DBL], DBL ) # As D, but with integer output +iC = DataType("C", "iC", FLT2, [FLT2, FLT2, F2CL, F2CL], FLT2) # As C, but with integer output +iZ = DataType("Z", "iZ", DBL2, [DBL2, DBL2, D2CL, D2CL], DBL2) # As Z, but with integer output Css = DataType("C", "C", FLT, [FLT, FLT, FLT, FLT], FLT2) # As C, but with constants from S Zdd = DataType("Z", "Z", DBL, [DBL, DBL, DBL, DBL], DBL2) # As Z, but with constants from D Ccs = DataType("C", "C", FLT2+","+FLT, [FLT2, FLT, F2CL, FLT], FLT2) # As C, but with one constant from S @@ -68,6 +72,7 @@ routines = [ Routine(True, "1", "dotc", T, [C,Z], 
["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated"), Routine(True, "1", "nrm2", T, [S,D,Sc,Dz],["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector"), Routine(True, "1", "asum", T, [S,D,Sc,Dz],["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector"), + Routine(False, "1", "amax", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "n", "Index of absolute maxium value in a vector"), ], [ # Level 2: matrix-vector Routine(True, "2a", "gemv", T, [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication"), @@ -369,7 +374,7 @@ for level in [1,2,3]: body += "using double2 = clblast::double2;\n\n" body += "// Main function (not within the clblast namespace)\n" body += "int main(int argc, char *argv[]) {\n" - default = PrecisionToFullName(routine.flavours[0].name) + default = PrecisionToFullName(routine.flavours[0].precision_name) body += " switch(clblast::GetPrecision(argc, argv, clblast::Precision::k"+default+")) {\n" for precision in ["H","S","D","C","Z"]: body += " case clblast::Precision::k"+PrecisionToFullName(precision)+":" diff --git a/scripts/generator/routine.py b/scripts/generator/routine.py index 7ddd7a12..9806d960 100644 --- a/scripts/generator/routine.py +++ b/scripts/generator/routine.py @@ -72,7 +72,7 @@ class Routine(): # List of scalar buffers def ScalarBuffersFirst(self): - return ["dot","nrm2","asum"] + return ["dot","nrm2","asum","imax"] def ScalarBuffersSecond(self): return ["sa","sb","sc","ss","sd1","sd2","sx1","sy1","sparam"] diff --git a/src/clblast.cc b/src/clblast.cc index 7210ad1d..bee63b53 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -428,6 +428,31 @@ template StatusCode PUBLIC_API Asum(const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +// Index of absolute maxium value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX +template +StatusCode Amax(const size_t, + cl_mem, const 
size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Amax(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Amax(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Amax(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Amax(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= diff --git a/src/clblast_c.cc b/src/clblast_c.cc index c36edbca..23c96feb 100644 --- a/src/clblast_c.cc +++ b/src/clblast_c.cc @@ -475,6 +475,48 @@ StatusCode CLBlastDzasum(const size_t n, return static_cast(status); } +// AMAX +StatusCode CLBlastiSamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Amax(n, + imax_buffer, imax_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastiDamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Amax(n, + imax_buffer, imax_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastiCamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const 
cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Amax(n, + imax_buffer, imax_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastiZamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Amax(n, + imax_buffer, imax_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} + // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= diff --git a/test/correctness/routines/level1/xamax.cc b/test/correctness/routines/level1/xamax.cc new file mode 100644 index 00000000..ade09e7a --- /dev/null +++ b/test/correctness/routines/level1/xamax.cc @@ -0,0 +1,28 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "correctness/testblas.h" +#include "routines/level1/xamax.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::RunTests, float, float>(argc, argv, false, "iSAMAX"); + clblast::RunTests, double, double>(argc, argv, true, "iDAMAX"); + clblast::RunTests, float2, float2>(argc, argv, true, "iCAMAX"); + clblast::RunTests, double2, double2>(argc, argv, true, "iZAMAX"); + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/level1/xamax.cc b/test/performance/routines/level1/xamax.cc new file mode 100644 index 00000000..85caa483 --- /dev/null +++ b/test/performance/routines/level1/xamax.cc @@ -0,0 +1,35 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "performance/client.h" +#include "routines/level1/xamax.h" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/wrapper_cblas.h b/test/wrapper_cblas.h index af0eec9b..994b48b1 100644 --- a/test/wrapper_cblas.h +++ b/test/wrapper_cblas.h @@ -371,6 +371,32 @@ void cblasXasum(const size_t n, reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); } +// Forwards the Netlib BLAS calls for iSAMAX/iDAMAX/iCAMAX/iZAMAX +void cblasXamax(const size_t n, + std::vector& imax_buffer, const size_t imax_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + imax_buffer[imax_offset] = cblas_isamax(n, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXamax(const size_t n, + std::vector& imax_buffer, const size_t imax_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + imax_buffer[imax_offset] = cblas_idamax(n, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXamax(const size_t n, + std::vector& imax_buffer, const 
size_t imax_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + imax_buffer[imax_offset] = cblas_icamax(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} +void cblasXamax(const size_t n, + std::vector& imax_buffer, const size_t imax_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + imax_buffer[imax_offset] = cblas_izamax(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} + // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= diff --git a/test/wrapper_clblas.h b/test/wrapper_clblas.h index 09b3310b..955dc3ad 100644 --- a/test/wrapper_clblas.h +++ b/test/wrapper_clblas.h @@ -626,6 +626,74 @@ clblasStatus clblasXasum(const size_t n, num_queues, queues, num_wait_events, wait_events, events); } +// Forwards the clBLAS calls for iSAMAX/iDAMAX/iCAMAX/iZAMAX +template +clblasStatus clblasXamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); +template <> +clblasStatus clblasXamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n); + return clblasiSamax(n, + imax_buffer, imax_offset, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXamax(const size_t n, + cl_mem 
imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n); + return clblasiDamax(n, + imax_buffer, imax_offset, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n); + return clblasiCamax(n, + imax_buffer, imax_offset, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} +template <> +clblasStatus clblasXamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto queue = Queue(queues[0]); + auto context = queue.GetContext(); + auto scratch_buffer = Buffer(context, n); + return clblasiZamax(n, + imax_buffer, imax_offset, + x_buffer, x_offset, static_cast(x_inc), + scratch_buffer(), + num_queues, queues, num_wait_events, wait_events, events); +} + // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= From 16a048f1ac8102ad4bcce9cf9fc320f791323e45 Mon Sep 
17 00:00:00 2001 From: cnugteren Date: Wed, 20 Apr 2016 22:12:51 -0600 Subject: [PATCH 30/60] Added support for the iSAMAX/iDAMAX/iCAMAX/iZAMAX routines --- CHANGELOG | 1 + CMakeLists.txt | 2 +- README.md | 2 +- include/internal/routines/level1/xamax.h | 56 +++++++++ include/internal/utilities.h | 2 + scripts/generator/generator.py | 4 +- src/clblast.cc | 17 ++- src/kernels/common.opencl | 9 ++ src/kernels/level1/xamax.opencl | 128 +++++++++++++++++++++ src/routines/level1/xamax.cc | 112 ++++++++++++++++++ test/correctness/testblas.h | 12 +- test/performance/client.cc | 2 + test/routines/level1/xamax.h | 139 +++++++++++++++++++++++ test/wrapper_clblas.h | 8 +- 14 files changed, 477 insertions(+), 17 deletions(-) create mode 100644 include/internal/routines/level1/xamax.h create mode 100644 src/kernels/level1/xamax.opencl create mode 100644 src/routines/level1/xamax.cc create mode 100644 test/routines/level1/xamax.h diff --git a/CHANGELOG b/CHANGELOG index 9c954761..c9770dc2 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -7,6 +7,7 @@ Development version (next release) - Added level-1 routines: * SNRM2/DNRM2/ScNRM2/DzNRM2 * SASUM/DASUM/ScASUM/DzASUM + * iSAMAX/iDAMAX/iCAMAX/iZAMAX Version 0.6.0 - Added support for MSVC (Visual Studio) 2015 diff --git a/CMakeLists.txt b/CMakeLists.txt index 2d60dc88..efdf6be0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -121,7 +121,7 @@ include_directories(${clblast_SOURCE_DIR}/include ${OPENCL_INCLUDE_DIRS}) set(KERNELS copy pad transpose padtranspose xaxpy xdot xger xgemm xgemv) set(SAMPLE_PROGRAMS_CPP sgemm) set(SAMPLE_PROGRAMS_C sgemm) -set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum) +set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax) set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2) set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm) diff --git a/README.md 
b/README.md index e786cc97..b4f0981f 100644 --- a/README.md +++ b/README.md @@ -186,7 +186,7 @@ CLBlast is in active development but already supports almost all the BLAS routin | xDOTC | - | - | ✔ | ✔ | | | xNRM2 | ✔ | ✔ | ✔ | ✔ | | | xASUM | ✔ | ✔ | ✔ | ✔ | | -| IxAMAX | | | | | | +| IxAMAX | ✔ | ✔ | ✔ | ✔ | | | Level-2 | S | D | C | Z | Notes | | ---------|---|---|---|---|---------| diff --git a/include/internal/routines/level1/xamax.h b/include/internal/routines/level1/xamax.h new file mode 100644 index 00000000..b815e8d2 --- /dev/null +++ b/include/internal/routines/level1/xamax.h @@ -0,0 +1,56 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xamax routine. The precision is implemented using a template argument. 
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XAMAX_H_ +#define CLBLAST_ROUTINES_XAMAX_H_ + +#include "internal/routine.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xamax: public Routine { + public: + + // Members and methods from the base class + using Routine::db_; + using Routine::source_string_; + using Routine::queue_; + using Routine::event_; + using Routine::context_; + using Routine::GetProgramFromCache; + using Routine::TestVectorX; + using Routine::TestVectorDot; + using Routine::RunKernel; + using Routine::ErrorIn; + + // Constructor + Xamax(Queue &queue, EventPointer event, const std::string &name = "AMAX"); + + // Templated-precision implementation of the routine + StatusCode DoAmax(const size_t n, + const Buffer &imax_buffer, const size_t imax_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc); + + private: + // Static variable to get the precision + const static Precision precision_; +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XAMAX_H_ +#endif diff --git a/include/internal/utilities.h b/include/internal/utilities.h index 8ffdc3af..75b3d27d 100644 --- a/include/internal/utilities.h +++ b/include/internal/utilities.h @@ -66,6 +66,7 @@ constexpr auto kArgAPOffset = "offap"; constexpr auto kArgDotOffset = "offdot"; constexpr auto kArgNrm2Offset = "offnrm2"; constexpr auto kArgAsumOffset = "offasum"; +constexpr auto kArgImaxOffset = "offimax"; constexpr auto kArgAlpha = "alpha"; constexpr auto kArgBeta = "beta"; @@ -121,6 +122,7 @@ struct Arguments { size_t dot_offset = 0; size_t nrm2_offset = 0; size_t asum_offset = 0; + size_t imax_offset = 0; T alpha = T{1.0}; T beta = 
T{1.0}; size_t x_size = 1; diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 3a845686..d8bd4e2c 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -72,7 +72,7 @@ routines = [ Routine(True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated"), Routine(True, "1", "nrm2", T, [S,D,Sc,Dz],["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector"), Routine(True, "1", "asum", T, [S,D,Sc,Dz],["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector"), - Routine(False, "1", "amax", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "n", "Index of absolute maxium value in a vector"), + Routine(True, "1", "amax", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maxium value in a vector"), ], [ # Level 2: matrix-vector Routine(True, "2a", "gemv", T, [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication"), @@ -294,7 +294,7 @@ files = [ path_clblast+"/test/wrapper_clblas.h", path_clblast+"/test/wrapper_cblas.h", ] -header_lines = [84, 66, 93, 22, 22, 38] +header_lines = [84, 67, 93, 22, 22, 38] footer_lines = [6, 3, 9, 2, 6, 6] # Checks whether the command-line arguments are valid; exists otherwise diff --git a/src/clblast.cc b/src/clblast.cc index bee63b53..145b6bf6 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -28,6 +28,7 @@ #include "internal/routines/level1/xdotc.h" #include "internal/routines/level1/xnrm2.h" #include "internal/routines/level1/xasum.h" +#include "internal/routines/level1/xamax.h" // BLAS level-2 includes #include "internal/routines/level2/xgemv.h" @@ -430,11 +431,17 @@ template StatusCode PUBLIC_API Asum(const size_t, // Index of absolute maxium value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX template -StatusCode Amax(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - 
cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; +StatusCode Amax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xamax(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoAmax(n, + Buffer(imax_buffer), imax_offset, + Buffer(x_buffer), x_offset, x_inc); } template StatusCode PUBLIC_API Amax(const size_t, cl_mem, const size_t, diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl index 0a68defb..57d75ee0 100644 --- a/src/kernels/common.opencl +++ b/src/kernels/common.opencl @@ -80,6 +80,15 @@ R"( #define ONE 1.0 #endif +// Single-element version of a complex number +#if PRECISION == 3232 + typedef float singlereal; +#elif PRECISION == 6464 + typedef double singlereal; +#else + typedef real singlereal; +#endif + // ================================================================================================= // Don't use the non-IEEE754 compliant OpenCL built-in mad() instruction per default. For specific diff --git a/src/kernels/level1/xamax.opencl b/src/kernels/level1/xamax.opencl new file mode 100644 index 00000000..03dd05e5 --- /dev/null +++ b/src/kernels/level1/xamax.opencl @@ -0,0 +1,128 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains the Xamax kernel. It implements an index of absolute max computation using +// reduction kernels. Reduction is split in two parts. 
In the first (main) kernel the X vector is +// loaded, followed by a per-thread and a per-workgroup reduction. The second (epilogue) kernel +// is executed with a single workgroup only, computing the final result. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// Parameters set by the tuner or by the database. Here they are given a basic default value in case +// this kernel file is used outside of the CLBlast library. +#ifndef WGS1 + #define WGS1 64 // The local work-group size of the main kernel +#endif +#ifndef WGS2 + #define WGS2 64 // The local work-group size of the epilogue kernel +#endif + +// ================================================================================================= + +// The main reduction kernel, performing the loading and the majority of the operation +__attribute__((reqd_work_group_size(WGS1, 1, 1))) +__kernel void Xamax(const int n, + const __global real* restrict xgm, const int x_offset, const int x_inc, + __global singlereal* maxgm, __global unsigned int* imaxgm) { + __local singlereal maxlm[WGS1]; + __local unsigned int imaxlm[WGS1]; + const int lid = get_local_id(0); + const int wgid = get_group_id(0); + const int num_groups = get_num_groups(0); + + // Performs loading and the first steps of the reduction + singlereal max = ZERO; + unsigned int imax = 0; + int id = wgid*WGS1 + lid; + while (id < n) { + #if PRECISION == 3232 || PRECISION == 6464 + singlereal x = fabs(xgm[id*x_inc + x_offset].x); + #else + singlereal x = fabs(xgm[id*x_inc + x_offset]); + #endif + if (x >= max) { + max = x; + imax = id*x_inc + x_offset; + } + id += WGS1*num_groups; + } + maxlm[lid] = max; + imaxlm[lid] = imax; + barrier(CLK_LOCAL_MEM_FENCE); + + // Performs reduction in local memory + #pragma unroll + 
for (int s=WGS1/2; s>0; s=s>>1) { + if (lid < s) { + if (maxlm[lid + s] >= maxlm[lid]) { + maxlm[lid] = maxlm[lid + s]; + imaxlm[lid] = imaxlm[lid + s]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + // Stores the per-workgroup result + if (lid == 0) { + maxgm[wgid] = maxlm[0]; + imaxgm[wgid] = imaxlm[0]; + } +} + +// ================================================================================================= + +// The epilogue reduction kernel, performing the final bit of the operation. This kernel has to +// be launched with a single workgroup only. +__attribute__((reqd_work_group_size(WGS2, 1, 1))) +__kernel void XamaxEpilogue(const __global singlereal* restrict maxgm, + const __global unsigned int* restrict imaxgm, + __global unsigned int* imax, const int imax_offset) { + __local singlereal maxlm[WGS2]; + __local unsigned int imaxlm[WGS2]; + const int lid = get_local_id(0); + + // Performs the first step of the reduction while loading the data + if (maxgm[lid + WGS2] >= maxgm[lid]) { + maxlm[lid] = maxgm[lid + WGS2]; + imaxlm[lid] = imaxgm[lid + WGS2]; + } + else { + maxlm[lid] = maxgm[lid]; + imaxlm[lid] = imaxgm[lid]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + // Performs reduction in local memory + #pragma unroll + for (int s=WGS2/2; s>0; s=s>>1) { + if (lid < s) { + if (maxlm[lid + s] >= maxlm[lid]) { + maxlm[lid] = maxlm[lid + s]; + imaxlm[lid] = imaxlm[lid + s]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + // Stores the final result + if (lid == 0) { + imax[imax_offset] = imaxlm[0]; + } +} + +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/routines/level1/xamax.cc b/src/routines/level1/xamax.cc new file mode 100644 index 00000000..ffdfa496 --- /dev/null +++ b/src/routines/level1/xamax.cc @@ -0,0 +1,112 @@ + +// 
================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xamax class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/level1/xamax.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Specific implementations to get the memory-type based on a template argument +template <> const Precision Xamax::precision_ = Precision::kSingle; +template <> const Precision Xamax::precision_ = Precision::kDouble; +template <> const Precision Xamax::precision_ = Precision::kComplexSingle; +template <> const Precision Xamax::precision_ = Precision::kComplexDouble; + +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xamax::Xamax(Queue &queue, EventPointer event, const std::string &name): + Routine(queue, event, name, {"Xdot"}, precision_) { + source_string_ = + #include "../../kernels/level1/xamax.opencl" + ; +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xamax::DoAmax(const size_t n, + const Buffer &imax_buffer, const size_t imax_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { + + // Makes sure all dimensions are larger than zero + if (n == 0) { return StatusCode::kInvalidDimension; } + + // Tests the vectors for validity + auto status = TestVectorX(n, 
x_buffer, x_offset, x_inc, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestVectorDot(1, imax_buffer, imax_offset, sizeof(T)); + if (ErrorIn(status)) { return status; } + + // Retrieves the Xamax kernels from the compiled binary + try { + auto& program = GetProgramFromCache(); + auto kernel1 = Kernel(program, "Xamax"); + auto kernel2 = Kernel(program, "XamaxEpilogue"); + + // Creates the buffer for intermediate values + auto temp_size = 2*db_["WGS2"]; + auto temp_buffer1 = Buffer(context_, temp_size); + auto temp_buffer2 = Buffer(context_, temp_size); + + // Sets the kernel arguments + kernel1.SetArgument(0, static_cast(n)); + kernel1.SetArgument(1, x_buffer()); + kernel1.SetArgument(2, static_cast(x_offset)); + kernel1.SetArgument(3, static_cast(x_inc)); + kernel1.SetArgument(4, temp_buffer1()); + kernel1.SetArgument(5, temp_buffer2()); + + // Event waiting list + auto eventWaitList = std::vector(); + + // Launches the main kernel + auto global1 = std::vector{db_["WGS1"]*temp_size}; + auto local1 = std::vector{db_["WGS1"]}; + auto kernelEvent = Event(); + status = RunKernel(kernel1, global1, local1, kernelEvent.pointer()); + if (ErrorIn(status)) { return status; } + eventWaitList.push_back(kernelEvent); + + // Sets the arguments for the epilogue kernel + kernel2.SetArgument(0, temp_buffer1()); + kernel2.SetArgument(1, temp_buffer2()); + kernel2.SetArgument(2, imax_buffer()); + kernel2.SetArgument(3, static_cast(imax_offset)); + + // Launches the epilogue kernel + auto global2 = std::vector{db_["WGS2"]}; + auto local2 = std::vector{db_["WGS2"]}; + status = RunKernel(kernel2, global2, local2, event_, eventWaitList); + if (ErrorIn(status)) { return status; } + + // Succesfully finished the computation + return StatusCode::kSuccess; + } catch (...) 
{ return StatusCode::kInvalidKernel; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xamax; +template class Xamax; +template class Xamax; +template class Xamax; + +// ================================================================================================= +} // namespace clblast diff --git a/test/correctness/testblas.h b/test/correctness/testblas.h index aa61c2ec..13be921a 100644 --- a/test/correctness/testblas.h +++ b/test/correctness/testblas.h @@ -154,6 +154,7 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name auto dot_offsets = std::vector{args.dot_offset}; auto nrm2_offsets = std::vector{args.nrm2_offset}; auto asum_offsets = std::vector{args.asum_offset}; + auto imax_offsets = std::vector{args.imax_offset}; auto alphas = std::vector{args.alpha}; auto betas = std::vector{args.beta}; auto x_sizes = std::vector{args.x_size}; @@ -195,6 +196,7 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name if (option == kArgDotOffset) { dot_offsets = tester.kOffsets; } if (option == kArgNrm2Offset) { nrm2_offsets = tester.kOffsets; } if (option == kArgAsumOffset) { asum_offsets = tester.kOffsets; } + if (option == kArgImaxOffset) { imax_offsets = tester.kOffsets; } if (option == kArgAlpha) { alphas = tester.kAlphaValues; } if (option == kArgBeta) { betas = tester.kBetaValues; } @@ -236,10 +238,12 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name for (auto &dot_offset: dot_offsets) { r_args.dot_offset = dot_offset; for (auto &nrm2_offset: nrm2_offsets) { r_args.nrm2_offset = nrm2_offset; for (auto &asum_offset: asum_offsets) { r_args.asum_offset = asum_offset; - for (auto &alpha: alphas) { r_args.alpha = alpha; - for (auto &beta: betas) { r_args.beta = beta; - C::SetSizes(r_args); - regular_test_vector.push_back(r_args); + for (auto &imax_offset: imax_offsets) 
{ r_args.imax_offset = imax_offset; + for (auto &alpha: alphas) { r_args.alpha = alpha; + for (auto &beta: betas) { r_args.beta = beta; + C::SetSizes(r_args); + regular_test_vector.push_back(r_args); + } } } } diff --git a/test/performance/client.cc b/test/performance/client.cc index f22c9666..9aaf1e4e 100644 --- a/test/performance/client.cc +++ b/test/performance/client.cc @@ -84,6 +84,7 @@ Arguments Client::ParseArguments(int argc, char *argv[], const GetMetric if (o == kArgDotOffset) { args.dot_offset = GetArgument(argc, argv, help, kArgDotOffset, size_t{0}); } if (o == kArgNrm2Offset) { args.nrm2_offset = GetArgument(argc, argv, help, kArgNrm2Offset, size_t{0}); } if (o == kArgAsumOffset) { args.asum_offset = GetArgument(argc, argv, help, kArgAsumOffset, size_t{0}); } + if (o == kArgImaxOffset) { args.imax_offset = GetArgument(argc, argv, help, kArgImaxOffset, size_t{0}); } // Scalar values if (o == kArgAlpha) { args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar()); } @@ -295,6 +296,7 @@ void Client::PrintTableRow(const Arguments& args, else if (o == kArgDotOffset) {integers.push_back(args.dot_offset); } else if (o == kArgNrm2Offset){integers.push_back(args.nrm2_offset); } else if (o == kArgAsumOffset){integers.push_back(args.asum_offset); } + else if (o == kArgImaxOffset){integers.push_back(args.imax_offset); } } auto strings = std::vector{}; for (auto &o: options_) { diff --git a/test/routines/level1/xamax.h b/test/routines/level1/xamax.h new file mode 100644 index 00000000..7b404dc3 --- /dev/null +++ b/test/routines/level1/xamax.h @@ -0,0 +1,139 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// This file implements a class with static methods to describe the Xamax routine. Examples of +// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. +// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XAMAX_H_ +#define CLBLAST_TEST_ROUTINES_XAMAX_H_ + +#include +#include + +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TestXamax { + public: + + // The BLAS level: 1, 2, or 3 + static size_t BLASLevel() { return 1; } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { + return {kArgN, + kArgXInc, + kArgXOffset, kArgImaxOffset}; + } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &args) { + return args.n * args.x_inc + args.x_offset; + } + static size_t GetSizeImax(const Arguments &args) { + return 1 + args.imax_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments &args) { + args.x_size = GetSizeX(args); + args.scalar_size = GetSizeImax(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine + + // Describes which transpose options are relevant for this routine + using Transposes = std::vector; + static 
Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine + static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Amax(args.n, + buffers.scalar(), args.imax_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + &queue_plain, &event); + clWaitForEvents(1, &event); + return status; + } + + // Describes how to run the clBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXamax(args.n, + buffers.scalar(), args.imax_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector scalar_cpu(args.scalar_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + cblasXamax(args.n, + scalar_cpu, args.imax_offset, + x_vec_cpu, args.x_offset, args.x_inc); + buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); + return StatusCode::kSuccess; + } + #endif + + // Describes how to download the results of the computation (more importantly: which buffer) + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector result(args.scalar_size, static_cast(0)); + buffers.scalar.Read(queue, args.scalar_size, result); + 
return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments &) { return 1; } // N/A for this routine + static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine + static size_t GetResultIndex(const Arguments &args, const size_t, const size_t) { + return args.imax_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments &args) { + return args.n; + } + static size_t GetBytes(const Arguments &args) { + return ((args.n) + 1) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XAMAX_H_ +#endif diff --git a/test/wrapper_clblas.h b/test/wrapper_clblas.h index 955dc3ad..a44466c6 100644 --- a/test/wrapper_clblas.h +++ b/test/wrapper_clblas.h @@ -641,7 +641,7 @@ clblasStatus clblasXamax(const size_t n, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer(context, n); + auto scratch_buffer = Buffer(context, 2*n); return clblasiSamax(n, imax_buffer, imax_offset, x_buffer, x_offset, static_cast(x_inc), @@ -656,7 +656,7 @@ clblasStatus clblasXamax(const size_t n, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer(context, n); + auto scratch_buffer = Buffer(context, 2*n); return clblasiDamax(n, imax_buffer, imax_offset, x_buffer, x_offset, static_cast(x_inc), @@ -671,7 +671,7 @@ clblasStatus clblasXamax(const size_t n, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer(context, n); + auto scratch_buffer = Buffer(context, 2*n); return clblasiCamax(n, imax_buffer, 
imax_offset, x_buffer, x_offset, static_cast(x_inc), @@ -686,7 +686,7 @@ clblasStatus clblasXamax(const size_t n, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); - auto scratch_buffer = Buffer(context, n); + auto scratch_buffer = Buffer(context, 2*n); return clblasiZamax(n, imax_buffer, imax_offset, x_buffer, x_offset, static_cast(x_inc), From 3555cd043654ec24ff325bd6205281af790e50d2 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 27 Apr 2016 11:37:55 +0200 Subject: [PATCH 31/60] All CLBlast enum constants now have the same raw values as in the cblas standard --- include/clblast.h | 10 +++++----- include/clblast_c.h | 10 +++++----- scripts/generator/generator.py | 2 +- test/routines/level2/xgbmv.h | 4 ++-- test/routines/level2/xgemv.h | 4 ++-- test/routines/level2/xger.h | 2 +- test/routines/level2/xgerc.h | 2 +- test/routines/level2/xgeru.h | 2 +- test/routines/level2/xhbmv.h | 4 ++-- test/routines/level2/xhemv.h | 4 ++-- test/routines/level2/xher.h | 4 ++-- test/routines/level2/xher2.h | 4 ++-- test/routines/level2/xhpmv.h | 4 ++-- test/routines/level2/xhpr.h | 4 ++-- test/routines/level2/xhpr2.h | 4 ++-- test/routines/level2/xsbmv.h | 4 ++-- test/routines/level2/xspmv.h | 4 ++-- test/routines/level2/xspr.h | 4 ++-- test/routines/level2/xspr2.h | 4 ++-- test/routines/level2/xsymv.h | 4 ++-- test/routines/level2/xsyr.h | 4 ++-- test/routines/level2/xsyr2.h | 4 ++-- test/routines/level2/xtbmv.h | 8 ++++---- test/routines/level2/xtpmv.h | 8 ++++---- test/routines/level2/xtrmv.h | 8 ++++---- test/routines/level3/xgemm.h | 6 +++--- test/routines/level3/xhemm.h | 6 +++--- test/routines/level3/xher2k.h | 6 +++--- test/routines/level3/xherk.h | 6 +++--- test/routines/level3/xsymm.h | 6 +++--- test/routines/level3/xsyr2k.h | 6 +++--- test/routines/level3/xsyrk.h | 6 +++--- test/routines/level3/xtrmm.h | 10 +++++----- test/wrapper_clblas.h | 7 +++++++ 34 files changed, 91 
insertions(+), 84 deletions(-) diff --git a/include/clblast.h b/include/clblast.h index 57948581..f73acb57 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -73,11 +73,11 @@ enum class StatusCode { }; // Matrix layout and transpose types -enum class Layout { kRowMajor, kColMajor }; -enum class Transpose { kNo, kYes, kConjugate }; -enum class Side { kLeft, kRight }; -enum class Triangle { kUpper, kLower }; -enum class Diagonal { kUnit, kNonUnit }; +enum class Layout { kRowMajor = 101, kColMajor = 102 }; +enum class Transpose { kNo = 111, kYes = 112, kConjugate = 113 }; +enum class Triangle { kUpper = 121, kLower = 122 }; +enum class Diagonal { kNonUnit = 131, kUnit = 132 }; +enum class Side { kLeft = 141, kRight = 142 }; // Precision scoped enum (values in bits) enum class Precision { kHalf = 16, kSingle = 32, kDouble = 64, diff --git a/include/clblast_c.h b/include/clblast_c.h index 92f4afe5..8c0a0792 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -82,11 +82,11 @@ typedef enum StatusCode_ { } StatusCode; // Matrix layout and transpose types -typedef enum Layout_ { kRowMajor, kColMajor } Layout; -typedef enum Transpose_ { kNo, kYes, kConjugate } Transpose; -typedef enum Side_ { kLeft, kRight } Side; -typedef enum Triangle_ { kUpper, kLower } Triangle; -typedef enum Diagonal_ { kUnit, kNonUnit } Diagonal; +typedef enum Layout_ { kRowMajor = 101, kColMajor = 102 } Layout; +typedef enum Transpose_ { kNo = 111, kYes = 112, kConjugate = 113 } Transpose; +typedef enum Triangle_ { kUpper = 121, kLower = 122 } Triangle; +typedef enum Diagonal_ { kNonUnit = 131, kUnit = 132 } Diagonal; +typedef enum Side_ { kLeft = 141, kRight = 142 } Side; // Precision scoped enum (values in bits) typedef enum Precision_ { kHalf = 16, kSingle = 32, kDouble = 64, diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index d8bd4e2c..9a520fac 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -294,7 +294,7 @@ files 
= [ path_clblast+"/test/wrapper_clblas.h", path_clblast+"/test/wrapper_cblas.h", ] -header_lines = [84, 67, 93, 22, 22, 38] +header_lines = [84, 67, 93, 22, 29, 38] footer_lines = [6, 3, 9, 2, 6, 6] # Checks whether the command-line arguments are valid; exists otherwise diff --git a/test/routines/level2/xgbmv.h b/test/routines/level2/xgbmv.h index b875075d..c88cdf2a 100644 --- a/test/routines/level2/xgbmv.h +++ b/test/routines/level2/xgbmv.h @@ -99,8 +99,8 @@ class TestXgbmv { static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; - auto status = clblasXgbmv(static_cast(args.layout), - static_cast(args.a_transpose), + auto status = clblasXgbmv(convertToCLBLAS(args.layout), + convertToCLBLAS(args.a_transpose), args.m, args.n, args.kl, args.ku, args.alpha, buffers.a_mat(), args.a_offset, args.a_ld, buffers.x_vec(), args.x_offset, args.x_inc, args.beta, diff --git a/test/routines/level2/xgemv.h b/test/routines/level2/xgemv.h index a70ccd34..cf63d55f 100644 --- a/test/routines/level2/xgemv.h +++ b/test/routines/level2/xgemv.h @@ -99,8 +99,8 @@ class TestXgemv { static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; - auto status = clblasXgemv(static_cast(args.layout), - static_cast(args.a_transpose), + auto status = clblasXgemv(convertToCLBLAS(args.layout), + convertToCLBLAS(args.a_transpose), args.m, args.n, args.alpha, buffers.a_mat(), args.a_offset, args.a_ld, buffers.x_vec(), args.x_offset, args.x_inc, args.beta, diff --git a/test/routines/level2/xger.h b/test/routines/level2/xger.h index 32c2a505..ae142e2e 100644 --- a/test/routines/level2/xger.h +++ b/test/routines/level2/xger.h @@ -95,7 +95,7 @@ class TestXger { static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; - auto status = 
clblasXger(static_cast(args.layout), + auto status = clblasXger(convertToCLBLAS(args.layout), args.m, args.n, args.alpha, buffers.x_vec(), args.x_offset, args.x_inc, buffers.y_vec(), args.y_offset, args.y_inc, diff --git a/test/routines/level2/xgerc.h b/test/routines/level2/xgerc.h index 4b6954f6..b236aef6 100644 --- a/test/routines/level2/xgerc.h +++ b/test/routines/level2/xgerc.h @@ -95,7 +95,7 @@ class TestXgerc { static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; - auto status = clblasXgerc(static_cast(args.layout), + auto status = clblasXgerc(convertToCLBLAS(args.layout), args.m, args.n, args.alpha, buffers.x_vec(), args.x_offset, args.x_inc, buffers.y_vec(), args.y_offset, args.y_inc, diff --git a/test/routines/level2/xgeru.h b/test/routines/level2/xgeru.h index 295e69e5..3d3fa439 100644 --- a/test/routines/level2/xgeru.h +++ b/test/routines/level2/xgeru.h @@ -95,7 +95,7 @@ class TestXgeru { static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; - auto status = clblasXgeru(static_cast(args.layout), + auto status = clblasXgeru(convertToCLBLAS(args.layout), args.m, args.n, args.alpha, buffers.x_vec(), args.x_offset, args.x_inc, buffers.y_vec(), args.y_offset, args.y_inc, diff --git a/test/routines/level2/xhbmv.h b/test/routines/level2/xhbmv.h index e0bdc4da..4098639a 100644 --- a/test/routines/level2/xhbmv.h +++ b/test/routines/level2/xhbmv.h @@ -93,8 +93,8 @@ class TestXhbmv { static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; - auto status = clblasXhbmv(static_cast(args.layout), - static_cast(args.triangle), + auto status = clblasXhbmv(convertToCLBLAS(args.layout), + convertToCLBLAS(args.triangle), args.n, args.kl, args.alpha, buffers.a_mat(), args.a_offset, args.a_ld, buffers.x_vec(), 
args.x_offset, args.x_inc, args.beta, diff --git a/test/routines/level2/xhemv.h b/test/routines/level2/xhemv.h index fa242961..5652872d 100644 --- a/test/routines/level2/xhemv.h +++ b/test/routines/level2/xhemv.h @@ -93,8 +93,8 @@ class TestXhemv { static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; - auto status = clblasXhemv(static_cast(args.layout), - static_cast(args.triangle), + auto status = clblasXhemv(convertToCLBLAS(args.layout), + convertToCLBLAS(args.triangle), args.n, args.alpha, buffers.a_mat(), args.a_offset, args.a_ld, buffers.x_vec(), args.x_offset, args.x_inc, args.beta, diff --git a/test/routines/level2/xher.h b/test/routines/level2/xher.h index 7d0e8cc3..3bbf0887 100644 --- a/test/routines/level2/xher.h +++ b/test/routines/level2/xher.h @@ -88,8 +88,8 @@ class TestXher { static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; - auto status = clblasXher(static_cast(args.layout), - static_cast(args.triangle), + auto status = clblasXher(convertToCLBLAS(args.layout), + convertToCLBLAS(args.triangle), args.n, args.alpha, buffers.x_vec(), args.x_offset, args.x_inc, buffers.a_mat(), args.a_offset, args.a_ld, diff --git a/test/routines/level2/xher2.h b/test/routines/level2/xher2.h index 445bba74..dc7fbe73 100644 --- a/test/routines/level2/xher2.h +++ b/test/routines/level2/xher2.h @@ -93,8 +93,8 @@ class TestXher2 { static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; - auto status = clblasXher2(static_cast(args.layout), - static_cast(args.triangle), + auto status = clblasXher2(convertToCLBLAS(args.layout), + convertToCLBLAS(args.triangle), args.n, args.alpha, buffers.x_vec(), args.x_offset, args.x_inc, buffers.y_vec(), args.y_offset, args.y_inc, diff --git a/test/routines/level2/xhpmv.h 
b/test/routines/level2/xhpmv.h index 406e564f..df5a90ee 100644 --- a/test/routines/level2/xhpmv.h +++ b/test/routines/level2/xhpmv.h @@ -93,8 +93,8 @@ class TestXhpmv { static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; - auto status = clblasXhpmv(static_cast(args.layout), - static_cast(args.triangle), + auto status = clblasXhpmv(convertToCLBLAS(args.layout), + convertToCLBLAS(args.triangle), args.n, args.alpha, buffers.ap_mat(), args.ap_offset, buffers.x_vec(), args.x_offset, args.x_inc, args.beta, diff --git a/test/routines/level2/xhpr.h b/test/routines/level2/xhpr.h index 6f56d3f3..0db11db0 100644 --- a/test/routines/level2/xhpr.h +++ b/test/routines/level2/xhpr.h @@ -88,8 +88,8 @@ class TestXhpr { static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; - auto status = clblasXhpr(static_cast(args.layout), - static_cast(args.triangle), + auto status = clblasXhpr(convertToCLBLAS(args.layout), + convertToCLBLAS(args.triangle), args.n, args.alpha, buffers.x_vec(), args.x_offset, args.x_inc, buffers.ap_mat(), args.ap_offset, diff --git a/test/routines/level2/xhpr2.h b/test/routines/level2/xhpr2.h index 43889cb9..e1e5b4c5 100644 --- a/test/routines/level2/xhpr2.h +++ b/test/routines/level2/xhpr2.h @@ -93,8 +93,8 @@ class TestXhpr2 { static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; - auto status = clblasXhpr2(static_cast(args.layout), - static_cast(args.triangle), + auto status = clblasXhpr2(convertToCLBLAS(args.layout), + convertToCLBLAS(args.triangle), args.n, args.alpha, buffers.x_vec(), args.x_offset, args.x_inc, buffers.y_vec(), args.y_offset, args.y_inc, diff --git a/test/routines/level2/xsbmv.h b/test/routines/level2/xsbmv.h index 9a5c5140..fce88f4c 100644 --- a/test/routines/level2/xsbmv.h +++ 
b/test/routines/level2/xsbmv.h @@ -93,8 +93,8 @@ class TestXsbmv { static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; - auto status = clblasXsbmv(static_cast(args.layout), - static_cast(args.triangle), + auto status = clblasXsbmv(convertToCLBLAS(args.layout), + convertToCLBLAS(args.triangle), args.n, args.kl, args.alpha, buffers.a_mat(), args.a_offset, args.a_ld, buffers.x_vec(), args.x_offset, args.x_inc, args.beta, diff --git a/test/routines/level2/xspmv.h b/test/routines/level2/xspmv.h index 913af0cd..2fdba77a 100644 --- a/test/routines/level2/xspmv.h +++ b/test/routines/level2/xspmv.h @@ -93,8 +93,8 @@ class TestXspmv { static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; - auto status = clblasXspmv(static_cast(args.layout), - static_cast(args.triangle), + auto status = clblasXspmv(convertToCLBLAS(args.layout), + convertToCLBLAS(args.triangle), args.n, args.alpha, buffers.ap_mat(), args.ap_offset, buffers.x_vec(), args.x_offset, args.x_inc, args.beta, diff --git a/test/routines/level2/xspr.h b/test/routines/level2/xspr.h index bab5c541..dcacc5de 100644 --- a/test/routines/level2/xspr.h +++ b/test/routines/level2/xspr.h @@ -88,8 +88,8 @@ class TestXspr { static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; - auto status = clblasXspr(static_cast(args.layout), - static_cast(args.triangle), + auto status = clblasXspr(convertToCLBLAS(args.layout), + convertToCLBLAS(args.triangle), args.n, args.alpha, buffers.x_vec(), args.x_offset, args.x_inc, buffers.ap_mat(), args.ap_offset, diff --git a/test/routines/level2/xspr2.h b/test/routines/level2/xspr2.h index 41a04cc0..69fda2fb 100644 --- a/test/routines/level2/xspr2.h +++ b/test/routines/level2/xspr2.h @@ -93,8 +93,8 @@ class TestXspr2 { static StatusCode 
RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; - auto status = clblasXspr2(static_cast(args.layout), - static_cast(args.triangle), + auto status = clblasXspr2(convertToCLBLAS(args.layout), + convertToCLBLAS(args.triangle), args.n, args.alpha, buffers.x_vec(), args.x_offset, args.x_inc, buffers.y_vec(), args.y_offset, args.y_inc, diff --git a/test/routines/level2/xsymv.h b/test/routines/level2/xsymv.h index 0576bc1f..16f94d6f 100644 --- a/test/routines/level2/xsymv.h +++ b/test/routines/level2/xsymv.h @@ -93,8 +93,8 @@ class TestXsymv { static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; - auto status = clblasXsymv(static_cast(args.layout), - static_cast(args.triangle), + auto status = clblasXsymv(convertToCLBLAS(args.layout), + convertToCLBLAS(args.triangle), args.n, args.alpha, buffers.a_mat(), args.a_offset, args.a_ld, buffers.x_vec(), args.x_offset, args.x_inc, args.beta, diff --git a/test/routines/level2/xsyr.h b/test/routines/level2/xsyr.h index 062eea5a..a66dd271 100644 --- a/test/routines/level2/xsyr.h +++ b/test/routines/level2/xsyr.h @@ -88,8 +88,8 @@ class TestXsyr { static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; - auto status = clblasXsyr(static_cast(args.layout), - static_cast(args.triangle), + auto status = clblasXsyr(convertToCLBLAS(args.layout), + convertToCLBLAS(args.triangle), args.n, args.alpha, buffers.x_vec(), args.x_offset, args.x_inc, buffers.a_mat(), args.a_offset, args.a_ld, diff --git a/test/routines/level2/xsyr2.h b/test/routines/level2/xsyr2.h index 50bc3cea..a36815e5 100644 --- a/test/routines/level2/xsyr2.h +++ b/test/routines/level2/xsyr2.h @@ -93,8 +93,8 @@ class TestXsyr2 { static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto 
queue_plain = queue(); auto event = cl_event{}; - auto status = clblasXsyr2(static_cast(args.layout), - static_cast(args.triangle), + auto status = clblasXsyr2(convertToCLBLAS(args.layout), + convertToCLBLAS(args.triangle), args.n, args.alpha, buffers.x_vec(), args.x_offset, args.x_inc, buffers.y_vec(), args.y_offset, args.y_inc, diff --git a/test/routines/level2/xtbmv.h b/test/routines/level2/xtbmv.h index 600b4131..1425b60b 100644 --- a/test/routines/level2/xtbmv.h +++ b/test/routines/level2/xtbmv.h @@ -87,10 +87,10 @@ class TestXtbmv { static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; - auto status = clblasXtbmv(static_cast(args.layout), - static_cast(args.triangle), - static_cast(args.a_transpose), - static_cast(args.diagonal), + auto status = clblasXtbmv(convertToCLBLAS(args.layout), + convertToCLBLAS(args.triangle), + convertToCLBLAS(args.a_transpose), + convertToCLBLAS(args.diagonal), args.n, args.kl, buffers.a_mat(), args.a_offset, args.a_ld, buffers.x_vec(), args.x_offset, args.x_inc, diff --git a/test/routines/level2/xtpmv.h b/test/routines/level2/xtpmv.h index fc0cf393..a834b437 100644 --- a/test/routines/level2/xtpmv.h +++ b/test/routines/level2/xtpmv.h @@ -87,10 +87,10 @@ class TestXtpmv { static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; - auto status = clblasXtpmv(static_cast(args.layout), - static_cast(args.triangle), - static_cast(args.a_transpose), - static_cast(args.diagonal), + auto status = clblasXtpmv(convertToCLBLAS(args.layout), + convertToCLBLAS(args.triangle), + convertToCLBLAS(args.a_transpose), + convertToCLBLAS(args.diagonal), args.n, buffers.ap_mat(), args.ap_offset, buffers.x_vec(), args.x_offset, args.x_inc, diff --git a/test/routines/level2/xtrmv.h b/test/routines/level2/xtrmv.h index fec72124..cd502d5d 100644 --- a/test/routines/level2/xtrmv.h +++ 
b/test/routines/level2/xtrmv.h @@ -87,10 +87,10 @@ class TestXtrmv { static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; - auto status = clblasXtrmv(static_cast(args.layout), - static_cast(args.triangle), - static_cast(args.a_transpose), - static_cast(args.diagonal), + auto status = clblasXtrmv(convertToCLBLAS(args.layout), + convertToCLBLAS(args.triangle), + convertToCLBLAS(args.a_transpose), + convertToCLBLAS(args.diagonal), args.n, buffers.a_mat(), args.a_offset, args.a_ld, buffers.x_vec(), args.x_offset, args.x_inc, diff --git a/test/routines/level3/xgemm.h b/test/routines/level3/xgemm.h index 49a92936..10fc2803 100644 --- a/test/routines/level3/xgemm.h +++ b/test/routines/level3/xgemm.h @@ -101,9 +101,9 @@ class TestXgemm { static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; - auto status = clblasXgemm(static_cast(args.layout), - static_cast(args.a_transpose), - static_cast(args.b_transpose), + auto status = clblasXgemm(convertToCLBLAS(args.layout), + convertToCLBLAS(args.a_transpose), + convertToCLBLAS(args.b_transpose), args.m, args.n, args.k, args.alpha, buffers.a_mat(), args.a_offset, args.a_ld, buffers.b_mat(), args.b_offset, args.b_ld, args.beta, diff --git a/test/routines/level3/xhemm.h b/test/routines/level3/xhemm.h index 40538417..edc71024 100644 --- a/test/routines/level3/xhemm.h +++ b/test/routines/level3/xhemm.h @@ -101,9 +101,9 @@ class TestXhemm { static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; - auto status = clblasXhemm(static_cast(args.layout), - static_cast(args.side), - static_cast(args.triangle), + auto status = clblasXhemm(convertToCLBLAS(args.layout), + convertToCLBLAS(args.side), + convertToCLBLAS(args.triangle), args.m, args.n, args.alpha, buffers.a_mat(), 
args.a_offset, args.a_ld, buffers.b_mat(), args.b_offset, args.b_ld, args.beta, diff --git a/test/routines/level3/xher2k.h b/test/routines/level3/xher2k.h index 1ea2ad36..a78e1293 100644 --- a/test/routines/level3/xher2k.h +++ b/test/routines/level3/xher2k.h @@ -101,9 +101,9 @@ class TestXher2k { auto queue_plain = queue(); auto event = cl_event{}; auto alpha2 = T{args.alpha, args.alpha}; - auto status = clblasXher2k(static_cast(args.layout), - static_cast(args.triangle), - static_cast(args.a_transpose), + auto status = clblasXher2k(convertToCLBLAS(args.layout), + convertToCLBLAS(args.triangle), + convertToCLBLAS(args.a_transpose), args.n, args.k, alpha2, buffers.a_mat(), args.a_offset, args.a_ld, buffers.b_mat(), args.b_offset, args.b_ld, args.beta, diff --git a/test/routines/level3/xherk.h b/test/routines/level3/xherk.h index 75a7c405..245293d6 100644 --- a/test/routines/level3/xherk.h +++ b/test/routines/level3/xherk.h @@ -91,9 +91,9 @@ class TestXherk { static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; - auto status = clblasXherk(static_cast(args.layout), - static_cast(args.triangle), - static_cast(args.a_transpose), + auto status = clblasXherk(convertToCLBLAS(args.layout), + convertToCLBLAS(args.triangle), + convertToCLBLAS(args.a_transpose), args.n, args.k, args.alpha, buffers.a_mat(), args.a_offset, args.a_ld, args.beta, buffers.c_mat(), args.c_offset, args.c_ld, diff --git a/test/routines/level3/xsymm.h b/test/routines/level3/xsymm.h index f867c238..e638b735 100644 --- a/test/routines/level3/xsymm.h +++ b/test/routines/level3/xsymm.h @@ -101,9 +101,9 @@ class TestXsymm { static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; - auto status = clblasXsymm(static_cast(args.layout), - static_cast(args.side), - static_cast(args.triangle), + auto status = 
clblasXsymm(convertToCLBLAS(args.layout), + convertToCLBLAS(args.side), + convertToCLBLAS(args.triangle), args.m, args.n, args.alpha, buffers.a_mat(), args.a_offset, args.a_ld, buffers.b_mat(), args.b_offset, args.b_ld, args.beta, diff --git a/test/routines/level3/xsyr2k.h b/test/routines/level3/xsyr2k.h index be4e1851..abac20f4 100644 --- a/test/routines/level3/xsyr2k.h +++ b/test/routines/level3/xsyr2k.h @@ -99,9 +99,9 @@ class TestXsyr2k { static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; - auto status = clblasXsyr2k(static_cast(args.layout), - static_cast(args.triangle), - static_cast(args.a_transpose), + auto status = clblasXsyr2k(convertToCLBLAS(args.layout), + convertToCLBLAS(args.triangle), + convertToCLBLAS(args.a_transpose), args.n, args.k, args.alpha, buffers.a_mat(), args.a_offset, args.a_ld, buffers.b_mat(), args.b_offset, args.b_ld, args.beta, diff --git a/test/routines/level3/xsyrk.h b/test/routines/level3/xsyrk.h index 7675e2aa..8a5fcb5f 100644 --- a/test/routines/level3/xsyrk.h +++ b/test/routines/level3/xsyrk.h @@ -91,9 +91,9 @@ class TestXsyrk { static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; - auto status = clblasXsyrk(static_cast(args.layout), - static_cast(args.triangle), - static_cast(args.a_transpose), + auto status = clblasXsyrk(convertToCLBLAS(args.layout), + convertToCLBLAS(args.triangle), + convertToCLBLAS(args.a_transpose), args.n, args.k, args.alpha, buffers.a_mat(), args.a_offset, args.a_ld, args.beta, buffers.c_mat(), args.c_offset, args.c_ld, diff --git a/test/routines/level3/xtrmm.h b/test/routines/level3/xtrmm.h index a085cb15..7c9c21bc 100644 --- a/test/routines/level3/xtrmm.h +++ b/test/routines/level3/xtrmm.h @@ -91,11 +91,11 @@ class TestXtrmm { static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { auto 
queue_plain = queue(); auto event = cl_event{}; - auto status = clblasXtrmm(static_cast(args.layout), - static_cast(args.side), - static_cast(args.triangle), - static_cast(args.a_transpose), - static_cast(args.diagonal), + auto status = clblasXtrmm(convertToCLBLAS(args.layout), + convertToCLBLAS(args.side), + convertToCLBLAS(args.triangle), + convertToCLBLAS(args.a_transpose), + convertToCLBLAS(args.diagonal), args.m, args.n, args.alpha, buffers.a_mat(), args.a_offset, args.a_ld, buffers.b_mat(), args.b_offset, args.b_ld, diff --git a/test/wrapper_clblas.h b/test/wrapper_clblas.h index a44466c6..23c55373 100644 --- a/test/wrapper_clblas.h +++ b/test/wrapper_clblas.h @@ -21,6 +21,13 @@ namespace clblast { +// Conversions from CLBlast types +clblasOrder convertToCLBLAS(const Layout v) { return (v == Layout::kRowMajor) ? clblasRowMajor : clblasColumnMajor; } +clblasTranspose convertToCLBLAS(const Transpose v) { return (v == Transpose::kNo) ? clblasNoTrans : (v == Transpose::kYes) ? clblasTrans : clblasConjTrans; } +clblasUplo convertToCLBLAS(const Triangle v) { return (v == Triangle::kUpper) ? clblasUpper : clblasLower; } +clblasDiag convertToCLBLAS(const Diagonal v) { return (v == Diagonal::kUnit) ? clblasUnit : clblasNonUnit; } +clblasSide convertToCLBLAS(const Side v) { return (v == Side::kLeft) ? 
clblasLeft : clblasRight; } + // ================================================================================================= // BLAS level-1 (vector-vector) routines // ================================================================================================= From 226e834d0a6569f8142ab0cde14e6e273486a277 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 27 Apr 2016 14:38:30 +0200 Subject: [PATCH 32/60] Added a '-verbose' option to the test binaries to report errors in more detail if needed --- include/internal/utilities.h | 3 +- test/correctness/testblas.cc | 9 ++++ test/correctness/testblas.h | 1 + test/correctness/tester.cc | 82 +++++++++++++++++++++--------------- test/correctness/tester.h | 6 +++ 5 files changed, 64 insertions(+), 37 deletions(-) diff --git a/include/internal/utilities.h b/include/internal/utilities.h index 75b3d27d..82cd7f44 100644 --- a/include/internal/utilities.h +++ b/include/internal/utilities.h @@ -82,6 +82,7 @@ constexpr auto kArgNumRuns = "runs"; // The client-specific arguments in string form constexpr auto kArgFullTest = "full_test"; +constexpr auto kArgVerbose = "verbose"; // The common arguments in string form constexpr auto kArgPlatform = "platform"; @@ -140,8 +141,6 @@ struct Arguments { size_t step = 1; size_t num_steps = 0; size_t num_runs = 10; - // Tester-specific arguments - bool full_test = false; // Common arguments size_t platform_id = 0; size_t device_id = 0; diff --git a/test/correctness/testblas.cc b/test/correctness/testblas.cc index cc9a5adb..a5ccefe0 100644 --- a/test/correctness/testblas.cc +++ b/test/correctness/testblas.cc @@ -12,6 +12,7 @@ // ================================================================================================= #include +#include #include "correctness/testblas.h" @@ -141,9 +142,17 @@ void TestBlas::TestRegular(std::vector> &test_vector, const st auto index = get_index_(args, id1, id2); if (!TestSimilarity(result1[index], result2[index])) { errors++; + if 
(verbose_) { + fprintf(stdout, "\n Incorrect value found: "); + std::cout << result1[index]; + fprintf(stdout, " (reference) versus "); + std::cout << result2[index]; + fprintf(stdout, " (CLBlast)"); + } } } } + if (verbose_ && errors > 0) { fprintf(stdout, "\n "); } // Tests the error count (should be zero) TestErrorCount(errors, get_id1_(args)*get_id2_(args), args); diff --git a/test/correctness/testblas.h b/test/correctness/testblas.h index 13be921a..8fd1b1e2 100644 --- a/test/correctness/testblas.h +++ b/test/correctness/testblas.h @@ -35,6 +35,7 @@ class TestBlas: public Tester { using Tester::context_; using Tester::queue_; using Tester::full_test_; + using Tester::verbose_; using Tester::device_; // Uses several helper functions from the Tester class diff --git a/test/correctness/tester.cc b/test/correctness/tester.cc index 872a131a..6c504b4e 100644 --- a/test/correctness/tester.cc +++ b/test/correctness/tester.cc @@ -32,6 +32,7 @@ Tester::Tester(int argc, char *argv[], const bool silent, context_(Context(device_)), queue_(Queue(context_, device_)), full_test_(CheckArgument(argc, argv, help_, kArgFullTest)), + verbose_(CheckArgument(argc, argv, help_, kArgVerbose)), error_log_{}, num_passed_{0}, num_skipped_{0}, @@ -126,41 +127,8 @@ void Tester::TestEnd() { tests_failed_ += num_skipped_; tests_failed_ += num_failed_; - // Prints details of all error occurences for these tests - for (auto &entry: error_log_) { - if (entry.error_percentage != kStatusError) { - fprintf(stdout, " Error rate %.1lf%%: ", entry.error_percentage); - } - else { - fprintf(stdout, " Status code %d (expected %d): ", entry.status_found, entry.status_expect); - } - for (auto &o: options_) { - if (o == kArgM) { fprintf(stdout, "%s=%zu ", kArgM, entry.args.m); } - if (o == kArgN) { fprintf(stdout, "%s=%zu ", kArgN, entry.args.n); } - if (o == kArgK) { fprintf(stdout, "%s=%zu ", kArgK, entry.args.k); } - if (o == kArgKU) { fprintf(stdout, "%s=%zu ", kArgKU, entry.args.ku); } - if (o == 
kArgKL) { fprintf(stdout, "%s=%zu ", kArgKL, entry.args.kl); } - if (o == kArgLayout) { fprintf(stdout, "%s=%d ", kArgLayout, entry.args.layout);} - if (o == kArgATransp) { fprintf(stdout, "%s=%d ", kArgATransp, entry.args.a_transpose);} - if (o == kArgBTransp) { fprintf(stdout, "%s=%d ", kArgBTransp, entry.args.b_transpose);} - if (o == kArgSide) { fprintf(stdout, "%s=%d ", kArgSide, entry.args.side);} - if (o == kArgTriangle) { fprintf(stdout, "%s=%d ", kArgTriangle, entry.args.triangle);} - if (o == kArgDiagonal) { fprintf(stdout, "%s=%d ", kArgDiagonal, entry.args.diagonal);} - if (o == kArgXInc) { fprintf(stdout, "%s=%zu ", kArgXInc, entry.args.x_inc);} - if (o == kArgYInc) { fprintf(stdout, "%s=%zu ", kArgYInc, entry.args.y_inc);} - if (o == kArgXOffset) { fprintf(stdout, "%s=%zu ", kArgXOffset, entry.args.x_offset);} - if (o == kArgYOffset) { fprintf(stdout, "%s=%zu ", kArgYOffset, entry.args.y_offset);} - if (o == kArgALeadDim) { fprintf(stdout, "%s=%zu ", kArgALeadDim, entry.args.a_ld);} - if (o == kArgBLeadDim) { fprintf(stdout, "%s=%zu ", kArgBLeadDim, entry.args.b_ld);} - if (o == kArgCLeadDim) { fprintf(stdout, "%s=%zu ", kArgCLeadDim, entry.args.c_ld);} - if (o == kArgAOffset) { fprintf(stdout, "%s=%zu ", kArgAOffset, entry.args.a_offset);} - if (o == kArgBOffset) { fprintf(stdout, "%s=%zu ", kArgBOffset, entry.args.b_offset);} - if (o == kArgCOffset) { fprintf(stdout, "%s=%zu ", kArgCOffset, entry.args.c_offset);} - if (o == kArgAPOffset) { fprintf(stdout, "%s=%zu ", kArgAPOffset, entry.args.ap_offset);} - if (o == kArgDotOffset){ fprintf(stdout, "%s=%zu ", kArgDotOffset, entry.args.dot_offset);} - } - fprintf(stdout, "\n"); - } + // Prints the errors + PrintErrorLog(error_log_); // Prints a test summary auto pass_rate = 100*num_passed_ / static_cast(num_passed_ + num_skipped_ + num_failed_); @@ -230,6 +198,11 @@ void Tester::TestErrorCodes(const StatusCode clblas_status, const StatusCod else { PrintTestResult(kErrorStatus); 
ReportError({clblas_status, clblast_status, kStatusError, args}); + if (verbose_) { + fprintf(stdout, "\n"); + PrintErrorLog({{clblas_status, clblast_status, kStatusError, args}}); + fprintf(stdout, " "); + } } } @@ -274,6 +247,45 @@ void Tester::PrintTestResult(const std::string &message) { print_count_++; } +// Prints details of errors occurred in a given error log +template +void Tester::PrintErrorLog(const std::vector &error_log) { + for (auto &entry: error_log) { + if (entry.error_percentage != kStatusError) { + fprintf(stdout, " Error rate %.1lf%%: ", entry.error_percentage); + } + else { + fprintf(stdout, " Status code %d (expected %d): ", entry.status_found, entry.status_expect); + } + for (auto &o: options_) { + if (o == kArgM) { fprintf(stdout, "%s=%zu ", kArgM, entry.args.m); } + if (o == kArgN) { fprintf(stdout, "%s=%zu ", kArgN, entry.args.n); } + if (o == kArgK) { fprintf(stdout, "%s=%zu ", kArgK, entry.args.k); } + if (o == kArgKU) { fprintf(stdout, "%s=%zu ", kArgKU, entry.args.ku); } + if (o == kArgKL) { fprintf(stdout, "%s=%zu ", kArgKL, entry.args.kl); } + if (o == kArgLayout) { fprintf(stdout, "%s=%d ", kArgLayout, entry.args.layout);} + if (o == kArgATransp) { fprintf(stdout, "%s=%d ", kArgATransp, entry.args.a_transpose);} + if (o == kArgBTransp) { fprintf(stdout, "%s=%d ", kArgBTransp, entry.args.b_transpose);} + if (o == kArgSide) { fprintf(stdout, "%s=%d ", kArgSide, entry.args.side);} + if (o == kArgTriangle) { fprintf(stdout, "%s=%d ", kArgTriangle, entry.args.triangle);} + if (o == kArgDiagonal) { fprintf(stdout, "%s=%d ", kArgDiagonal, entry.args.diagonal);} + if (o == kArgXInc) { fprintf(stdout, "%s=%zu ", kArgXInc, entry.args.x_inc);} + if (o == kArgYInc) { fprintf(stdout, "%s=%zu ", kArgYInc, entry.args.y_inc);} + if (o == kArgXOffset) { fprintf(stdout, "%s=%zu ", kArgXOffset, entry.args.x_offset);} + if (o == kArgYOffset) { fprintf(stdout, "%s=%zu ", kArgYOffset, entry.args.y_offset);} + if (o == kArgALeadDim) { fprintf(stdout, 
"%s=%zu ", kArgALeadDim, entry.args.a_ld);} + if (o == kArgBLeadDim) { fprintf(stdout, "%s=%zu ", kArgBLeadDim, entry.args.b_ld);} + if (o == kArgCLeadDim) { fprintf(stdout, "%s=%zu ", kArgCLeadDim, entry.args.c_ld);} + if (o == kArgAOffset) { fprintf(stdout, "%s=%zu ", kArgAOffset, entry.args.a_offset);} + if (o == kArgBOffset) { fprintf(stdout, "%s=%zu ", kArgBOffset, entry.args.b_offset);} + if (o == kArgCOffset) { fprintf(stdout, "%s=%zu ", kArgCOffset, entry.args.c_offset);} + if (o == kArgAPOffset) { fprintf(stdout, "%s=%zu ", kArgAPOffset, entry.args.ap_offset);} + if (o == kArgDotOffset){ fprintf(stdout, "%s=%zu ", kArgDotOffset, entry.args.dot_offset);} + } + fprintf(stdout, "\n"); + } +} + // ================================================================================================= // Below are the non-member functions (separated because of otherwise required partial class // template specialization) diff --git a/test/correctness/tester.h b/test/correctness/tester.h index d489f829..3534dffb 100644 --- a/test/correctness/tester.h +++ b/test/correctness/tester.h @@ -96,6 +96,9 @@ class Tester { // Whether or not to run the full test-suite or just a smoke test const bool full_test_; + // Whether or not to print extra information when testing + const bool verbose_; + // Retrieves the offset values to test with const std::vector GetOffsets() const; @@ -109,6 +112,9 @@ class Tester { // Prints the error or success symbol to screen void PrintTestResult(const std::string &message); + // Prints an error log + void PrintErrorLog(const std::vector &error_log); + // Logging and counting occurrences of errors std::vector error_log_; size_t num_passed_; From 44bdb60e834ef015ee4cb25a6f0eba2a092291f0 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 27 Apr 2016 14:42:30 +0200 Subject: [PATCH 33/60] Relaxed the absolute error margin for floating-point value comparisons to 1e-4 --- test/correctness/tester.cc | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/test/correctness/tester.cc b/test/correctness/tester.cc index 6c504b4e..51d83362 100644 --- a/test/correctness/tester.cc +++ b/test/correctness/tester.cc @@ -299,7 +299,7 @@ bool TestSimilarity(const T val1, const T val2) { // Set the allowed error margin for floating-point comparisons constexpr auto kErrorMarginRelative = T{0.025}; - constexpr auto kErrorMarginAbsolute = T{1.0e-6}; + constexpr auto kErrorMarginAbsolute = T{1.0e-4}; // Shortcut, handles infinities if (val1 == val2) { From 82be8f211cbd50d2d75fe78d8af4a1da04a0582b Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 27 Apr 2016 16:02:13 +0200 Subject: [PATCH 34/60] Moved all cache-related functions to a separate file; added a ClearCompiledProgramCache function to clear the cache --- CHANGELOG | 2 + CMakeLists.txt | 3 +- include/clblast.h | 7 ++++ include/clblast_c.h | 6 +++ include/internal/cache.h | 72 +++++++++++++++++++++++++++++++++ include/internal/routine.h | 37 ++++++----------- scripts/generator/generator.py | 4 +- src/cache.cc | 73 ++++++++++++++++++++++++++++++++++ src/clblast.cc | 6 +++ src/clblast_c.cc | 7 ++++ src/routine.cc | 40 +------------------ 11 files changed, 191 insertions(+), 66 deletions(-) create mode 100644 include/internal/cache.h create mode 100644 src/cache.cc diff --git a/CHANGELOG b/CHANGELOG index c9770dc2..4c6a9be5 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,6 +4,8 @@ Development version (next release) - Made the library thread-safe - Performance and correctness tests can now (on top of clBLAS) be performed against CPU BLAS libraries - Fixed the use of events within the library +- Changed the enum parameters to match the raw values of the cblas standard +- Added a function to clear the cache of previously compiled programs - Added level-1 routines: * SNRM2/DNRM2/ScNRM2/DzNRM2 * SASUM/DASUM/ScASUM/DzASUM diff --git a/CMakeLists.txt b/CMakeLists.txt index efdf6be0..6abfc09f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -131,7 +131,8 
@@ set(PRECISIONS 32 64 3232 6464) # ================================================================================================== # Gathers all source-files -set(SOURCES src/clblast.cc src/database.cc src/routine.cc src/utilities.cc src/clblast_c.cc) +set(SOURCES src/clblast.cc src/database.cc src/routine.cc src/cache.cc + src/utilities.cc src/clblast_c.cc) foreach(ROUTINE ${LEVEL1_ROUTINES}) set(SOURCES ${SOURCES} src/routines/level1/${ROUTINE}.cc) endforeach() diff --git a/include/clblast.h b/include/clblast.h index f73acb57..4a3ec9b6 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -539,6 +539,13 @@ StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, c cl_command_queue* queue, cl_event* event = nullptr); // ================================================================================================= + +// CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on +// for the same device. This cache can be cleared to free up system memory or in case of debugging. +StatusCode ClearCompiledProgramCache(); + +// ================================================================================================= + } // namespace clblast // CLBLAST_CLBLAST_H_ diff --git a/include/clblast_c.h b/include/clblast_c.h index 8c0a0792..1ca300ca 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -1036,6 +1036,12 @@ StatusCode PUBLIC_API CLBlastZtrsm(const Layout layout, const Side side, const T // ================================================================================================= +// CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on +// for the same device. This cache can be cleared to free up system memory or in case of debugging. 
+StatusCode PUBLIC_API CLBlastClearCompiledProgramCache(); + +// ================================================================================================= + #ifdef __cplusplus } // extern "C" #endif diff --git a/include/internal/cache.h b/include/internal/cache.h new file mode 100644 index 00000000..44fad68d --- /dev/null +++ b/include/internal/cache.h @@ -0,0 +1,72 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the caching functionality of compiled binaries. +// +// ================================================================================================= + +#ifndef CLBLAST_CACHE_H_ +#define CLBLAST_CACHE_H_ + +#include +#include +#include + +#include "internal/utilities.h" + +namespace clblast { +namespace cache { +// ================================================================================================= + +// The cache of compiled OpenCL programs, along with some meta-data +struct ProgramCache { + Program program; + std::string device_name; + Precision precision; + std::string routine_name_; + + // Finds out whether the properties match + bool MatchInCache(const std::string &ref_device, const Precision &ref_precision, + const std::string &ref_routine) { + return (device_name == ref_device && + precision == ref_precision && + routine_name_ == ref_routine); + } +}; + +// The actual cache, implemented as a vector of the above data-type, and its mutex +static std::vector program_cache_; +static std::mutex program_cache_mutex_; + +// ================================================================================================= + +// Stores the compiled program in the cache +void 
StoreProgramToCache(const Program& program, const std::string &device_name, + const Precision &precision, const std::string &routine_name); + +// Queries the cache and retrieves a matching program. Assumes that the match is available, throws +// otherwise. +const Program& GetProgramFromCache(const std::string &device_name, const Precision &precision, + const std::string &routine_name); + +// Queries the cache to see whether or not the compiled kernel is already there +bool ProgramIsInCache(const std::string &device_name, const Precision &precision, + const std::string &routine_name); + +// ================================================================================================= + +// Clears the cache of stored program binaries +StatusCode ClearCompiledProgramCache(); + +// ================================================================================================= +} // namespace cache +} // namespace clblast + +// CLBLAST_CACHE_H_ +#endif diff --git a/include/internal/routine.h b/include/internal/routine.h index b2b6f622..013769d8 100644 --- a/include/internal/routine.h +++ b/include/internal/routine.h @@ -18,8 +18,8 @@ #include #include -#include +#include "internal/cache.h" #include "internal/utilities.h" #include "internal/database.h" @@ -31,26 +31,6 @@ template class Routine { public: - // The cache of compiled OpenCL programs, along with some meta-data - struct ProgramCache { - Program program; - std::string device_name; - Precision precision; - std::string routine_name_; - - // Finds out whether the properties match - bool MatchInCache(const std::string &ref_device, const Precision &ref_precision, - const std::string &ref_routine) { - return (device_name == ref_device && - precision == ref_precision && - routine_name_ == ref_routine); - } - }; - - // The actual cache, implemented as a vector of the above data-type, and its mutex - static std::vector program_cache_; - static std::mutex program_cache_mutex_; - // Helper functions which check for 
errors in the status code static constexpr bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); } @@ -103,12 +83,21 @@ class Routine { const bool do_transpose, const bool do_conjugate, const bool upper = false, const bool lower = false, const bool diagonal_imag_zero = false); - + + // Stores a newly compiled program into the cache + void StoreProgramToCache(const Program& program) const { + return cache::StoreProgramToCache(program, device_name_, precision_, routine_name_); + } + // Queries the cache and retrieve either a matching program or a boolean whether a match exists. // The first assumes that the program is available in the cache and will throw an exception // otherwise. - const Program& GetProgramFromCache() const; - bool ProgramIsInCache() const; + const Program& GetProgramFromCache() const { + return cache::GetProgramFromCache(device_name_, precision_, routine_name_); + } + bool ProgramIsInCache() const { + return cache::ProgramIsInCache(device_name_, precision_, routine_name_); + } // Non-static variable for the precision. Note that the same variable (but static) might exist in // a derived class. diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 9a520fac..9de03567 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -294,8 +294,8 @@ files = [ path_clblast+"/test/wrapper_clblas.h", path_clblast+"/test/wrapper_cblas.h", ] -header_lines = [84, 67, 93, 22, 29, 38] -footer_lines = [6, 3, 9, 2, 6, 6] +header_lines = [84, 68, 93, 22, 29, 38] +footer_lines = [13, 8, 15, 9, 6, 6] # Checks whether the command-line arguments are valid; exists otherwise for f in files: diff --git a/src/cache.cc b/src/cache.cc new file mode 100644 index 00000000..beeb1b35 --- /dev/null +++ b/src/cache.cc @@ -0,0 +1,73 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. 
The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the caching functionality of compiled binaries. +// +// ================================================================================================= + +#include +#include +#include + +#include "internal/cache.h" + +namespace clblast { +namespace cache { +// ================================================================================================= + +// Stores the compiled program in the cache +void StoreProgramToCache(const Program& program, const std::string &device_name, + const Precision &precision, const std::string &routine_name) { + program_cache_mutex_.lock(); + program_cache_.push_back({program, device_name, precision, routine_name}); + program_cache_mutex_.unlock(); +} + +// Queries the cache and retrieves a matching program. Assumes that the match is available, throws +// otherwise. 
+const Program& GetProgramFromCache(const std::string &device_name, const Precision &precision, + const std::string &routine_name) { + program_cache_mutex_.lock(); + for (auto &cached_program: program_cache_) { + if (cached_program.MatchInCache(device_name, precision, routine_name)) { + program_cache_mutex_.unlock(); + return cached_program.program; + } + } + program_cache_mutex_.unlock(); + throw std::runtime_error("Internal CLBlast error: Expected program in cache, but found none."); +} + +// Queries the cache to see whether or not the compiled kernel is already there +bool ProgramIsInCache(const std::string &device_name, const Precision &precision, + const std::string &routine_name) { + program_cache_mutex_.lock(); + for (auto &cached_program: program_cache_) { + if (cached_program.MatchInCache(device_name, precision, routine_name)) { + program_cache_mutex_.unlock(); + return true; + } + } + program_cache_mutex_.unlock(); + return false; +} + +// ================================================================================================= + +// Clears the cache of stored program binaries +StatusCode ClearCompiledProgramCache() { + program_cache_mutex_.lock(); + program_cache_.clear(); + program_cache_mutex_.unlock(); + return StatusCode::kSuccess; +} + +// ================================================================================================= +} // namespace cache +} // namespace clblast diff --git a/src/clblast.cc b/src/clblast.cc index 145b6bf6..b6efd185 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -17,6 +17,7 @@ #include "clblast.h" #include "internal/public_api.h" +#include "internal/cache.h" // BLAS level-1 includes #include "internal/routines/level1/xswap.h" @@ -1787,5 +1788,10 @@ template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Tri cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +// ================================================================================================= + +// Clears the 
cache of stored program binaries +StatusCode ClearCompiledProgramCache() { return cache::ClearCompiledProgramCache(); } + // ================================================================================================= } // namespace clblast diff --git a/src/clblast_c.cc b/src/clblast_c.cc index 23c96feb..6e238b77 100644 --- a/src/clblast_c.cc +++ b/src/clblast_c.cc @@ -2258,3 +2258,10 @@ StatusCode CLBlastZtrsm(const Layout layout, const Side side, const Triangle tri } // ================================================================================================= + +// Clears the cache of stored program binaries +StatusCode CLBlastClearCompiledProgramCache() { + return static_cast(clblast::ClearCompiledProgramCache()); +} + +// ================================================================================================= diff --git a/src/routine.cc b/src/routine.cc index b5ba63eb..e0a75e41 100644 --- a/src/routine.cc +++ b/src/routine.cc @@ -13,17 +13,12 @@ #include #include -#include #include "internal/routine.h" namespace clblast { // ================================================================================================= -// The cache of compiled OpenCL programs and its mutex for thread safety -template std::vector::ProgramCache> Routine::program_cache_; -template std::mutex Routine::program_cache_mutex_; - // Constructor: not much here, because no status codes can be returned template Routine::Routine(Queue &queue, EventPointer event, const std::string &name, @@ -102,9 +97,7 @@ StatusCode Routine::SetUp() { if (build_status == BuildStatus::kInvalid) { return StatusCode::kInvalidBinary; } // Store the compiled program in the cache (atomic for thread-safety) - program_cache_mutex_.lock(); - program_cache_.push_back({program, device_name_, precision_, routine_name_}); - program_cache_mutex_.unlock(); + StoreProgramToCache(program); } catch (...) 
{ return StatusCode::kBuildProgramFailure; } } @@ -374,37 +367,6 @@ StatusCode Routine::PadCopyTransposeMatrix(EventPointer event, std::vector -const Program& Routine::GetProgramFromCache() const { - program_cache_mutex_.lock(); - for (auto &cached_program: program_cache_) { - if (cached_program.MatchInCache(device_name_, precision_, routine_name_)) { - program_cache_mutex_.unlock(); - return cached_program.program; - } - } - program_cache_mutex_.unlock(); - throw std::runtime_error("Internal CLBlast error: Expected program in cache, but found none."); -} - -// Queries the cache to see whether or not the compiled kernel is already there -template -bool Routine::ProgramIsInCache() const { - program_cache_mutex_.lock(); - for (auto &cached_program: program_cache_) { - if (cached_program.MatchInCache(device_name_, precision_, routine_name_)) { - program_cache_mutex_.unlock(); - return true; - } - } - program_cache_mutex_.unlock(); - return false; -} - -// ================================================================================================= - // Compiles the templated class template class Routine; template class Routine; From 8075934ca7696f92e779b6751980ace526a37bbe Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 27 Apr 2016 17:06:19 +0200 Subject: [PATCH 35/60] Added prototypes for non-BLAS routines: xSUM and IxMAX (non-absolute counterparts of xASUM and IxAMAX) --- include/clblast.h | 16 +- include/clblast_c.h | 38 ++++- scripts/generator/generator.py | 282 +++++++++++++++++---------------- scripts/generator/routine.py | 3 +- src/clblast.cc | 52 +++++- src/clblast_c.cc | 84 ++++++++++ 6 files changed, 333 insertions(+), 142 deletions(-) diff --git a/include/clblast.h b/include/clblast.h index 4a3ec9b6..f3b74f6e 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -188,13 +188,27 @@ StatusCode Asum(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Index 
of absolute maxium value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX +// Sum of values in a vector: SSUM/DSUM/ScSUM/DzSUM +template +StatusCode Sum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX template StatusCode Amax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); +// Index of maximum value in a vector: iSMAX/iDMAX/iCMAX/iZMAX +template +StatusCode Max(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event = nullptr); + // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= diff --git a/include/clblast_c.h b/include/clblast_c.h index 1ca300ca..2f692b66 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -296,7 +296,25 @@ StatusCode PUBLIC_API CLBlastDzasum(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); -// Index of absolute maxium value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX +// Sum of values in a vector: SSUM/DSUM/ScSUM/DzSUM +StatusCode PUBLIC_API CLBlastSsum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDsum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, 
cl_event* event); +StatusCode PUBLIC_API CLBlastScsum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastDzsum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + +// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX StatusCode PUBLIC_API CLBlastiSamax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, @@ -314,6 +332,24 @@ StatusCode PUBLIC_API CLBlastiZamax(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); +// Index of maximum value in a vector: iSMAX/iDMAX/iCMAX/iZMAX +StatusCode PUBLIC_API CLBlastiSmax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastiDmax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastiCmax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastiZmax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + // ================================================================================================= // BLAS level-2 (matrix-vector) routines // 
================================================================================================= diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 9de03567..cad9a82d 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -59,59 +59,61 @@ TU = DataType("TU", "typename T, typename U", "T,U", ["T", "U", "T", "U"], "T") # Populates a list of routines routines = [ [ # Level 1: vector-vector - Routine(False, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation"), - Routine(False, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], [], "", "Generate modified givens plane rotation"), - Routine(False, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation"), - Routine(False, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [], "", "Apply modified givens plane rotation"), - Routine(True, "1", "swap", T, [S,D,C,Z], ["n"], [], [], ["x","y"], [], "", "Swap two vectors"), - Routine(True, "1", "scal", T, [S,D,C,Z], ["n"], [], [], ["x"], ["alpha"], "", "Vector scaling"), - Routine(True, "1", "copy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], [], "", "Vector copy"), - Routine(True, "1", "axpy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], ["alpha"], "", "Vector-times-constant plus vector"), - Routine(True, "1", "dot", T, [S,D], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two vectors"), - Routine(True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors"), - Routine(True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated"), - Routine(True, "1", "nrm2", T, [S,D,Sc,Dz],["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector"), - Routine(True, "1", "asum", T, [S,D,Sc,Dz],["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector"), - Routine(True, "1", "amax", T, 
[iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maxium value in a vector"), + Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation"), + Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], [], "", "Generate modified givens plane rotation"), + Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation"), + Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [], "", "Apply modified givens plane rotation"), + Routine(True, True, "1", "swap", T, [S,D,C,Z], ["n"], [], [], ["x","y"], [], "", "Swap two vectors"), + Routine(True, True, "1", "scal", T, [S,D,C,Z], ["n"], [], [], ["x"], ["alpha"], "", "Vector scaling"), + Routine(True, True, "1", "copy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], [], "", "Vector copy"), + Routine(True, True, "1", "axpy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], ["alpha"], "", "Vector-times-constant plus vector"), + Routine(True, True, "1", "dot", T, [S,D], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two vectors"), + Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors"), + Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated"), + Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz],["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector"), + Routine(True, True, "1", "asum", T, [S,D,Sc,Dz],["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector"), + Routine(False, False, "1", "sum", T, [S,D,Sc,Dz],["n"], [], ["x"], ["asum"], [], "n", "Sum of values in a vector"), + Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maximum value in a vector"), + Routine(False, False, "1", "max", T, [iS,iD,iC,iZ],["n"], 
[], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector"), ], [ # Level 2: matrix-vector - Routine(True, "2a", "gemv", T, [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication"), - Routine(True, "2a", "gbmv", T, [S,D,C,Z], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General banded matrix-vector multiplication"), - Routine(True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian matrix-vector multiplication"), - Routine(True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication"), - Routine(True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication"), - Routine(True, "2a", "symv", T, [S,D], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric matrix-vector multiplication"), - Routine(True, "2a", "sbmv", T, [S,D], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication"), - Routine(True, "2a", "spmv", T, [S,D], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication"), - Routine(True, "2a", "trmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular matrix-vector multiplication"), - Routine(True, "2a", "tbmv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular banded matrix-vector multiplication"), - Routine(True, "2a", "tpmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "n", "Triangular packed matrix-vector multiplication"), - Routine(False, "2a", "trsv", T, [S,D,C,Z], ["n"], 
["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a triangular system of equations"), - Routine(False, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a banded triangular system of equations"), - Routine(False, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "", "Solves a packed triangular system of equations"), + Routine(True, True, "2a", "gemv", T, [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication"), + Routine(True, True, "2a", "gbmv", T, [S,D,C,Z], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General banded matrix-vector multiplication"), + Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian matrix-vector multiplication"), + Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication"), + Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication"), + Routine(True, True, "2a", "symv", T, [S,D], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric matrix-vector multiplication"), + Routine(True, True, "2a", "sbmv", T, [S,D], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication"), + Routine(True, True, "2a", "spmv", T, [S,D], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication"), + Routine(True, True, "2a", "trmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular matrix-vector multiplication"), + Routine(True, 
True, "2a", "tbmv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular banded matrix-vector multiplication"), + Routine(True, True, "2a", "tpmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "n", "Triangular packed matrix-vector multiplication"), + Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a triangular system of equations"), + Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a banded triangular system of equations"), + Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "", "Solves a packed triangular system of equations"), # Level 2: matrix update - Routine(True, "2b", "ger", T, [S,D], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 matrix update"), - Routine(True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex matrix update"), - Routine(True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex conjugated matrix update"), - Routine(True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Hermitian rank-1 matrix update"), - Routine(True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Hermitian packed rank-1 matrix update"), - Routine(True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Hermitian rank-2 matrix update"), - Routine(True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Hermitian packed rank-2 matrix update"), - Routine(True, "2b", "syr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Symmetric 
rank-1 matrix update"), - Routine(True, "2b", "spr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Symmetric packed rank-1 matrix update"), - Routine(True, "2b", "syr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Symmetric rank-2 matrix update"), - Routine(True, "2b", "spr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Symmetric packed rank-2 matrix update"), + Routine(True, True, "2b", "ger", T, [S,D], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 matrix update"), + Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex matrix update"), + Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex conjugated matrix update"), + Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Hermitian rank-1 matrix update"), + Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Hermitian packed rank-1 matrix update"), + Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Hermitian rank-2 matrix update"), + Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Hermitian packed rank-2 matrix update"), + Routine(True, True, "2b", "syr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Symmetric rank-1 matrix update"), + Routine(True, True, "2b", "spr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Symmetric packed rank-1 matrix update"), + Routine(True, True, "2b", "syr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Symmetric rank-2 matrix update"), + Routine(True, True, "2b", "spr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["ap"], 
["alpha"], "", "Symmetric packed rank-2 matrix update"), ], [ # Level 3: matrix-matrix - Routine(True, "3", "gemm", T, [S,D,C,Z], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "General matrix-matrix multiplication"), - Routine(True, "3", "symm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication"), - Routine(True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication"), - Routine(True, "3", "syrk", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a symmetric matrix"), - Routine(True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a hermitian matrix"), - Routine(True, "3", "syr2k", T, [S,D,C,Z], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix"), - Routine(True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix"), - Routine(True, "3", "trmm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Triangular matrix-matrix multiplication"), - Routine(False, "3", "trsm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Solves a triangular system of equations"), + Routine(True, True, "3", "gemm", T, [S,D,C,Z], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "General matrix-matrix multiplication"), + Routine(True, True, "3", "symm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Symmetric matrix-matrix 
multiplication"), + Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication"), + Routine(True, True, "3", "syrk", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a symmetric matrix"), + Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a hermitian matrix"), + Routine(True, True, "3", "syr2k", T, [S,D,C,Z], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix"), + Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix"), + Routine(True, True, "3", "trmm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Triangular matrix-matrix multiplication"), + Routine(False, True, "3", "trsm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Solves a triangular system of equations"), ]] # ================================================================================================== @@ -224,57 +226,59 @@ def clblast_c_cc(routines): def wrapper_clblas(routines): result = "" for routine in routines: - result += "\n// Forwards the clBLAS calls for %s\n" % (routine.ShortNames()) - if routine.NoScalars(): - result += routine.RoutineHeaderWrapperCL(routine.template, True, 21)+";\n" - for flavour in routine.flavours: - indent = " "*(17 + routine.Length()) - result += routine.RoutineHeaderWrapperCL(flavour, False, 21)+" {\n" - arguments = routine.ArgumentsWrapperCL(flavour) - if routine.scratch: - result += " auto queue = Queue(queues[0]);\n" - result += " auto context = queue.GetContext();\n" - result 
+= " auto scratch_buffer = Buffer<"+flavour.template+">(context, "+routine.scratch+");\n" - arguments += ["scratch_buffer()"] - result += " return clblas"+flavour.name+routine.name+"(" - result += (",\n"+indent).join([a for a in arguments]) - result += ",\n"+indent+"num_queues, queues, num_wait_events, wait_events, events);" - result += "\n}\n" + if routine.has_tests: + result += "\n// Forwards the clBLAS calls for %s\n" % (routine.ShortNames()) + if routine.NoScalars(): + result += routine.RoutineHeaderWrapperCL(routine.template, True, 21)+";\n" + for flavour in routine.flavours: + indent = " "*(17 + routine.Length()) + result += routine.RoutineHeaderWrapperCL(flavour, False, 21)+" {\n" + arguments = routine.ArgumentsWrapperCL(flavour) + if routine.scratch: + result += " auto queue = Queue(queues[0]);\n" + result += " auto context = queue.GetContext();\n" + result += " auto scratch_buffer = Buffer<"+flavour.template+">(context, "+routine.scratch+");\n" + arguments += ["scratch_buffer()"] + result += " return clblas"+flavour.name+routine.name+"(" + result += (",\n"+indent).join([a for a in arguments]) + result += ",\n"+indent+"num_queues, queues, num_wait_events, wait_events, events);" + result += "\n}\n" return result # The wrapper to the reference CBLAS routines (for performance/correctness testing) def wrapper_cblas(routines): result = "" for routine in routines: - result += "\n// Forwards the Netlib BLAS calls for %s\n" % (routine.ShortNames()) - for flavour in routine.flavours: - indent = " "*(10 + routine.Length()) - result += routine.RoutineHeaderWrapperC(flavour, False, 12)+" {\n" - arguments = routine.ArgumentsWrapperC(flavour) + if routine.has_tests: + result += "\n// Forwards the Netlib BLAS calls for %s\n" % (routine.ShortNames()) + for flavour in routine.flavours: + indent = " "*(10 + routine.Length()) + result += routine.RoutineHeaderWrapperC(flavour, False, 12)+" {\n" + arguments = routine.ArgumentsWrapperC(flavour) - # Double-precision scalars - for 
scalar in routine.scalars: - if flavour.IsComplex(scalar): - result += " const auto "+scalar+"_array = std::vector<"+flavour.buffertype[:-1]+">{"+scalar+".real(), "+scalar+".imag()};\n" + # Double-precision scalars + for scalar in routine.scalars: + if flavour.IsComplex(scalar): + result += " const auto "+scalar+"_array = std::vector<"+flavour.buffertype[:-1]+">{"+scalar+".real(), "+scalar+".imag()};\n" - # Special case for scalar outputs - assignment = "" - postfix = "" - extra_argument = "" - for output_buffer in routine.outputs: - if output_buffer in routine.ScalarBuffersFirst(): - if flavour in [C,Z]: - postfix += "_sub" - indent += " " - extra_argument += ",\n"+indent+"reinterpret_cast(&"+output_buffer+"_buffer["+output_buffer+"_offset])" - else: - assignment = output_buffer+"_buffer["+output_buffer+"_offset] = " - indent += " "*len(assignment) + # Special case for scalar outputs + assignment = "" + postfix = "" + extra_argument = "" + for output_buffer in routine.outputs: + if output_buffer in routine.ScalarBuffersFirst(): + if flavour in [C,Z]: + postfix += "_sub" + indent += " " + extra_argument += ",\n"+indent+"reinterpret_cast(&"+output_buffer+"_buffer["+output_buffer+"_offset])" + else: + assignment = output_buffer+"_buffer["+output_buffer+"_offset] = " + indent += " "*len(assignment) - result += " "+assignment+"cblas_"+flavour.name.lower()+routine.name+postfix+"(" - result += (",\n"+indent).join([a for a in arguments]) - result += extra_argument+");" - result += "\n}\n" + result += " "+assignment+"cblas_"+flavour.name.lower()+routine.name+postfix+"(" + result += (",\n"+indent).join([a for a in arguments]) + result += extra_argument+");" + result += "\n}\n" return result # ================================================================================================== @@ -340,57 +344,59 @@ for i in xrange(0,len(files)): # Outputs all the correctness-test implementations for level in [1,2,3]: for routine in routines[level-1]: - filename = 
path_clblast+"/test/correctness/routines/level"+str(level)+"/x"+routine.name+".cc" - with open(filename, "w") as f: - body = "" - body += "#include \"correctness/testblas.h\"\n" - body += "#include \"routines/level"+str(level)+"/x"+routine.name+".h\"\n\n" - body += "// Shortcuts to the clblast namespace\n" - body += "using float2 = clblast::float2;\n" - body += "using double2 = clblast::double2;\n\n" - body += "// Main function (not within the clblast namespace)\n" - body += "int main(int argc, char *argv[]) {\n" - not_first = "false" - for flavour in routine.flavours: - body += " clblast::RunTests(const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -// Index of absolute maxium value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX +// Sum of values in a vector: SSUM/DSUM/ScSUM/DzSUM +template +StatusCode Sum(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Sum(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Sum(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Sum(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Sum(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX template StatusCode Amax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, @@ -461,6 +486,31 @@ template StatusCode PUBLIC_API Amax(const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +// Index of maximum value in a vector: iSMAX/iDMAX/iCMAX/iZMAX +template 
+StatusCode Max(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Max(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Max(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Max(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Max(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= diff --git a/src/clblast_c.cc b/src/clblast_c.cc index 6e238b77..e6270d57 100644 --- a/src/clblast_c.cc +++ b/src/clblast_c.cc @@ -475,6 +475,48 @@ StatusCode CLBlastDzasum(const size_t n, return static_cast(status); } +// SUM +StatusCode CLBlastSsum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Sum(n, + asum_buffer, asum_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDsum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Sum(n, + asum_buffer, asum_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastScsum(const size_t n, + cl_mem asum_buffer, const 
size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Sum(n, + asum_buffer, asum_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastDzsum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Sum(n, + asum_buffer, asum_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} + // AMAX StatusCode CLBlastiSamax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, @@ -517,6 +559,48 @@ StatusCode CLBlastiZamax(const size_t n, return static_cast(status); } +// MAX +StatusCode CLBlastiSmax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Max(n, + imax_buffer, imax_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastiDmax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Max(n, + imax_buffer, imax_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastiCmax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Max(n, + imax_buffer, imax_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastiZmax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + 
cl_command_queue* queue, cl_event* event) { + auto status = clblast::Max(n, + imax_buffer, imax_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} + // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= From 13eed1a0f973ff2090062a1ad4485896b22949b0 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 27 Apr 2016 17:59:28 +0200 Subject: [PATCH 36/60] Added missing namespace to the SGEMM example --- samples/sgemm.cc | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/samples/sgemm.cc b/samples/sgemm.cc index 78f2dee8..2659d36c 100644 --- a/samples/sgemm.cc +++ b/samples/sgemm.cc @@ -84,15 +84,15 @@ int main() { // Call the SGEMM routine. Note that the type of alpha and beta (float) determine the precision. auto queue_plain = queue(); - auto status = Gemm(clblast::Layout::kRowMajor, - clblast::Transpose::kNo, clblast::Transpose::kNo, - m, n, k, - alpha, - device_a(), 0, a_ld, - device_b(), 0, b_ld, - beta, - device_c(), 0, c_ld, - &queue_plain, &event); + auto status = clblast::Gemm(clblast::Layout::kRowMajor, + clblast::Transpose::kNo, clblast::Transpose::kNo, + m, n, k, + alpha, + device_a(), 0, a_ld, + device_b(), 0, b_ld, + beta, + device_c(), 0, c_ld, + &queue_plain, &event); // Record the execution time clWaitForEvents(1, &event); From d7ddbdeb1f416f56bc469d16c051551207274703 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 27 Apr 2016 18:07:30 +0200 Subject: [PATCH 37/60] Added non-absolute counter-parts xSUM and IxMAX of the BLAS routines xASUM and IxAMAX --- CHANGELOG | 2 + README.md | 122 +++++++++++++----------- include/clblast.h | 6 +- include/clblast_c.h | 12 +-- include/internal/routines/level1/xmax.h | 49 ++++++++++ include/internal/routines/level1/xsum.h | 49 ++++++++++ 
scripts/generator/generator.py | 6 +- scripts/generator/routine.py | 2 +- src/clblast.cc | 38 +++++--- src/clblast_c.cc | 16 ++-- src/kernels/common.opencl | 4 + src/kernels/level1/xamax.opencl | 15 ++- src/kernels/level1/xasum.opencl | 5 +- 13 files changed, 234 insertions(+), 92 deletions(-) create mode 100644 include/internal/routines/level1/xmax.h create mode 100644 include/internal/routines/level1/xsum.h diff --git a/CHANGELOG b/CHANGELOG index 4c6a9be5..787793f0 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -9,7 +9,9 @@ Development version (next release) - Added level-1 routines: * SNRM2/DNRM2/ScNRM2/DzNRM2 * SASUM/DASUM/ScASUM/DzASUM + * SSUM/DSUM/ScSUM/DzSUM (non-absolute version of the above xASUM BLAS routines) * iSAMAX/iDAMAX/iCAMAX/iZAMAX + * iSMAX/iDMAX/iCMAX/iZMAX (non-absolute version of the above ixAMAX BLAS routines) Version 0.6.0 - Added support for MSVC (Visual Studio) 2015 diff --git a/README.md b/README.md index b4f0981f..f2a85efc 100644 --- a/README.md +++ b/README.md @@ -169,64 +169,76 @@ These graphs can be generated automatically on your own device. First, compile C Supported routines ------------- -CLBlast is in active development but already supports almost all the BLAS routines. The currently supported routines are marked with '✔' in the following tables. Empty boxes represent routines that still need to be implemented in a future release, whereas routines marked with '-' are not part of BLAS at all. +CLBlast is in active development but already supports almost all the BLAS routines. The supported routines are marked with '✔' in the following tables. Routines marked with '-' do not exist: they are not part of BLAS at all. 
-| Level-1 | S | D | C | Z | Notes | -| ---------|---|---|---|---|---------| -| xROTG | | | - | - | | -| xROTMG | | | - | - | | -| xROT | | | - | - | | -| xROTM | | | - | - | | -| xSWAP | ✔ | ✔ | ✔ | ✔ | | -| xSCAL | ✔ | ✔ | ✔ | ✔ | | -| xCOPY | ✔ | ✔ | ✔ | ✔ | | -| xAXPY | ✔ | ✔ | ✔ | ✔ | | -| xDOT | ✔ | ✔ | - | - | | -| xDOTU | - | - | ✔ | ✔ | | -| xDOTC | - | - | ✔ | ✔ | | -| xNRM2 | ✔ | ✔ | ✔ | ✔ | | -| xASUM | ✔ | ✔ | ✔ | ✔ | | -| IxAMAX | ✔ | ✔ | ✔ | ✔ | | +| Level-1 | S | D | C | Z | +| ---------|---|---|---|---| +| xSWAP | ✔ | ✔ | ✔ | ✔ | +| xSCAL | ✔ | ✔ | ✔ | ✔ | +| xCOPY | ✔ | ✔ | ✔ | ✔ | +| xAXPY | ✔ | ✔ | ✔ | ✔ | +| xDOT | ✔ | ✔ | - | - | +| xDOTU | - | - | ✔ | ✔ | +| xDOTC | - | - | ✔ | ✔ | +| xNRM2 | ✔ | ✔ | ✔ | ✔ | +| xASUM | ✔ | ✔ | ✔ | ✔ | +| IxAMAX | ✔ | ✔ | ✔ | ✔ | -| Level-2 | S | D | C | Z | Notes | -| ---------|---|---|---|---|---------| -| xGEMV | ✔ | ✔ | ✔ | ✔ | | -| xGBMV | ✔ | ✔ | ✔ | ✔ | | -| xHEMV | - | - | ✔ | ✔ | | -| xHBMV | - | - | ✔ | ✔ | | -| xHPMV | - | - | ✔ | ✔ | | -| xSYMV | ✔ | ✔ | - | - | | -| xSBMV | ✔ | ✔ | - | - | | -| xSPMV | ✔ | ✔ | - | - | | -| xTRMV | ✔ | ✔ | ✔ | ✔ | | -| xTBMV | ✔ | ✔ | ✔ | ✔ | | -| xTPMV | ✔ | ✔ | ✔ | ✔ | | -| xTRSV | | | | | | -| xTBSV | | | | | | -| xTPSV | | | | | | -| xGER | ✔ | ✔ | - | - | | -| xGERU | - | - | ✔ | ✔ | | -| xGERC | - | - | ✔ | ✔ | | -| xHER | - | - | ✔ | ✔ | | -| xHPR | - | - | ✔ | ✔ | | -| xHER2 | - | - | ✔ | ✔ | | -| xHPR2 | - | - | ✔ | ✔ | | -| xSYR | ✔ | ✔ | - | - | | -| xSPR | ✔ | ✔ | - | - | | -| xSYR2 | ✔ | ✔ | - | - | | -| xSPR2 | ✔ | ✔ | - | - | | +| Level-2 | S | D | C | Z | +| ---------|---|---|---|---| +| xGEMV | ✔ | ✔ | ✔ | ✔ | +| xGBMV | ✔ | ✔ | ✔ | ✔ | +| xHEMV | - | - | ✔ | ✔ | +| xHBMV | - | - | ✔ | ✔ | +| xHPMV | - | - | ✔ | ✔ | +| xSYMV | ✔ | ✔ | - | - | +| xSBMV | ✔ | ✔ | - | - | +| xSPMV | ✔ | ✔ | - | - | +| xTRMV | ✔ | ✔ | ✔ | ✔ | +| xTBMV | ✔ | ✔ | ✔ | ✔ | +| xTPMV | ✔ | ✔ | ✔ | ✔ | +| xGER | ✔ | ✔ | - | - | +| xGERU | - | - | ✔ | ✔ | +| xGERC | - | - | ✔ 
| ✔ | +| xHER | - | - | ✔ | ✔ | +| xHPR | - | - | ✔ | ✔ | +| xHER2 | - | - | ✔ | ✔ | +| xHPR2 | - | - | ✔ | ✔ | +| xSYR | ✔ | ✔ | - | - | +| xSPR | ✔ | ✔ | - | - | +| xSYR2 | ✔ | ✔ | - | - | +| xSPR2 | ✔ | ✔ | - | - | -| Level-3 | S | D | C | Z | Notes | -| ---------|---|---|---|---|---------| -| xGEMM | ✔ | ✔ | ✔ | ✔ | | -| xSYMM | ✔ | ✔ | ✔ | ✔ | | -| xHEMM | - | - | ✔ | ✔ | | -| xSYRK | ✔ | ✔ | ✔ | ✔ | | -| xHERK | - | - | ✔ | ✔ | | -| xSYR2K | ✔ | ✔ | ✔ | ✔ | | -| xHER2K | - | - | ✔ | ✔ | | -| xTRMM | ✔ | ✔ | ✔ | ✔ | | -| xTRSM | | | | | | +| Level-3 | S | D | C | Z | +| ---------|---|---|---|---| +| xGEMM | ✔ | ✔ | ✔ | ✔ | +| xSYMM | ✔ | ✔ | ✔ | ✔ | +| xHEMM | - | - | ✔ | ✔ | +| xSYRK | ✔ | ✔ | ✔ | ✔ | +| xHERK | - | - | ✔ | ✔ | +| xSYR2K | ✔ | ✔ | ✔ | ✔ | +| xHER2K | - | - | ✔ | ✔ | +| xTRMM | ✔ | ✔ | ✔ | ✔ | + +In addition, some non-BLAS routines are also supported by CLBlast. They are experimental and should be used with care: + +| Additional | S | D | C | Z | +| -----------|---|---|---|---| +| xSUM | ✔ | ✔ | ✔ | ✔ | +| IxMAX | ✔ | ✔ | ✔ | ✔ | + +Some BLAS routines are not supported yet by CLBlast. 
They are shown in the following table: + +| Unsupported | S | D | C | Z | +| ------------|---|---|---|---| +| xROTG | | | - | - | +| xROTMG | | | - | - | +| xROT | | | - | - | +| xROTM | | | - | - | +| xTRSV | | | | | +| xTBSV | | | | | +| xTPSV | | | | | +| xTRSM | | | | | Contributing diff --git a/include/clblast.h b/include/clblast.h index f3b74f6e..57fca119 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -188,10 +188,10 @@ StatusCode Asum(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Sum of values in a vector: SSUM/DSUM/ScSUM/DzSUM +// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM template StatusCode Sum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, + cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); @@ -202,7 +202,7 @@ StatusCode Amax(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Index of maximum value in a vector: iSMAX/iDMAX/iCMAX/iZMAX +// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX template StatusCode Max(const size_t n, cl_mem imax_buffer, const size_t imax_offset, diff --git a/include/clblast_c.h b/include/clblast_c.h index 2f692b66..e23f0305 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -296,21 +296,21 @@ StatusCode PUBLIC_API CLBlastDzasum(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); -// Sum of values in a vector: SSUM/DSUM/ScSUM/DzSUM +// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM StatusCode PUBLIC_API CLBlastSsum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, + cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const 
size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); StatusCode PUBLIC_API CLBlastDsum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, + cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); StatusCode PUBLIC_API CLBlastScsum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, + cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); StatusCode PUBLIC_API CLBlastDzsum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, + cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); @@ -332,7 +332,7 @@ StatusCode PUBLIC_API CLBlastiZamax(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); -// Index of maximum value in a vector: iSMAX/iDMAX/iCMAX/iZMAX +// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX StatusCode PUBLIC_API CLBlastiSmax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, diff --git a/include/internal/routines/level1/xmax.h b/include/internal/routines/level1/xmax.h new file mode 100644 index 00000000..860a043b --- /dev/null +++ b/include/internal/routines/level1/xmax.h @@ -0,0 +1,49 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xmax routine. 
The precision is implemented using a template argument. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XMAX_H_ +#define CLBLAST_ROUTINES_XMAX_H_ + +#include "internal/routine.h" +#include "internal/routines/level1/xamax.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xmax: public Xamax { + public: + + // Members and methods from the base class + using Xamax::DoAmax; + + // Constructor + Xmax(Queue &queue, EventPointer event, const std::string &name = "MAX"): + Xamax(queue, event, name) { + } + + // Forwards to the regular absolute version. The implementation difference is realised in the + // kernel through a pre-processor macro based on the name of the routine. + StatusCode DoMax(const size_t n, + const Buffer &imax_buffer, const size_t imax_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { + return DoAmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XMAX_H_ +#endif diff --git a/include/internal/routines/level1/xsum.h b/include/internal/routines/level1/xsum.h new file mode 100644 index 00000000..2f633b52 --- /dev/null +++ b/include/internal/routines/level1/xsum.h @@ -0,0 +1,49 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsum routine. 
The precision is implemented using a template argument. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XSUM_H_ +#define CLBLAST_ROUTINES_XSUM_H_ + +#include "internal/routine.h" +#include "internal/routines/level1/xasum.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xsum: public Xasum { + public: + + // Members and methods from the base class + using Xasum::DoAsum; + + // Constructor + Xsum(Queue &queue, EventPointer event, const std::string &name = "SUM"): + Xasum(queue, event, name) { + } + + // Forwards to the regular absolute version. The implementation difference is realised in the + // kernel through a pre-processor macro based on the name of the routine. + StatusCode DoSum(const size_t n, + const Buffer &sum_buffer, const size_t sum_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { + return DoAsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XSUM_H_ +#endif diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index cad9a82d..04f3c30e 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -72,9 +72,9 @@ routines = [ Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated"), Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz],["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector"), Routine(True, True, "1", "asum", T, [S,D,Sc,Dz],["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector"), - Routine(False, False, "1", "sum", T, [S,D,Sc,Dz],["n"], [], ["x"], ["asum"], 
[], "n", "Sum of values in a vector"), + Routine(True, False, "1", "sum", T, [S,D,Sc,Dz],["n"], [], ["x"], ["sum"], [], "n", "Sum of values in a vector (non-BLAS function)"), Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maximum value in a vector"), - Routine(False, False, "1", "max", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector"), + Routine(True, False, "1", "max", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)"), ], [ # Level 2: matrix-vector Routine(True, True, "2a", "gemv", T, [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication"), @@ -298,7 +298,7 @@ files = [ path_clblast+"/test/wrapper_clblas.h", path_clblast+"/test/wrapper_cblas.h", ] -header_lines = [84, 68, 93, 22, 29, 38] +header_lines = [84, 70, 93, 22, 29, 38] footer_lines = [13, 8, 15, 9, 6, 6] # Checks whether the command-line arguments are valid; exists otherwise diff --git a/scripts/generator/routine.py b/scripts/generator/routine.py index b46c3716..2fd26e79 100644 --- a/scripts/generator/routine.py +++ b/scripts/generator/routine.py @@ -73,7 +73,7 @@ class Routine(): # List of scalar buffers def ScalarBuffersFirst(self): - return ["dot","nrm2","asum","imax"] + return ["dot","nrm2","asum","sum","imax"] def ScalarBuffersSecond(self): return ["sa","sb","sc","ss","sd1","sd2","sx1","sy1","sparam"] diff --git a/src/clblast.cc b/src/clblast.cc index 4f2e6fb5..fac5a539 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -29,7 +29,9 @@ #include "internal/routines/level1/xdotc.h" #include "internal/routines/level1/xnrm2.h" #include "internal/routines/level1/xasum.h" +#include "internal/routines/level1/xsum.h" // non-BLAS function #include "internal/routines/level1/xamax.h" +#include "internal/routines/level1/xmax.h" // non-BLAS function // BLAS level-2 includes 
#include "internal/routines/level2/xgemv.h" @@ -430,13 +432,19 @@ template StatusCode PUBLIC_API Asum(const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -// Sum of values in a vector: SSUM/DSUM/ScSUM/DzSUM +// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM template -StatusCode Sum(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; +StatusCode Sum(const size_t n, + cl_mem sum_buffer, const size_t sum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xsum(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoSum(n, + Buffer(sum_buffer), sum_offset, + Buffer(x_buffer), x_offset, x_inc); } template StatusCode PUBLIC_API Sum(const size_t, cl_mem, const size_t, @@ -486,13 +494,19 @@ template StatusCode PUBLIC_API Amax(const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -// Index of maximum value in a vector: iSMAX/iDMAX/iCMAX/iZMAX +// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX template -StatusCode Max(const size_t, - cl_mem, const size_t, - const cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; +StatusCode Max(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xmax(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoMax(n, + Buffer(imax_buffer), imax_offset, + Buffer(x_buffer), x_offset, x_inc); } template StatusCode PUBLIC_API Max(const 
size_t, cl_mem, const size_t, diff --git a/src/clblast_c.cc b/src/clblast_c.cc index e6270d57..72d93c4b 100644 --- a/src/clblast_c.cc +++ b/src/clblast_c.cc @@ -477,41 +477,41 @@ StatusCode CLBlastDzasum(const size_t n, // SUM StatusCode CLBlastSsum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, + cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { auto status = clblast::Sum(n, - asum_buffer, asum_offset, + sum_buffer, sum_offset, x_buffer, x_offset, x_inc, queue, event); return static_cast(status); } StatusCode CLBlastDsum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, + cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { auto status = clblast::Sum(n, - asum_buffer, asum_offset, + sum_buffer, sum_offset, x_buffer, x_offset, x_inc, queue, event); return static_cast(status); } StatusCode CLBlastScsum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, + cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { auto status = clblast::Sum(n, - asum_buffer, asum_offset, + sum_buffer, sum_offset, x_buffer, x_offset, x_inc, queue, event); return static_cast(status); } StatusCode CLBlastDzsum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, + cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { auto status = clblast::Sum(n, - asum_buffer, asum_offset, + sum_buffer, sum_offset, x_buffer, x_offset, x_inc, queue, event); return static_cast(status); diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl index 57d75ee0..d401744d 100644 --- a/src/kernels/common.opencl +++ b/src/kernels/common.opencl @@ -40,6 
+40,7 @@ R"( typedef float16 real16; #define ZERO 0.0f #define ONE 1.0f + #define SMALLEST -1.0e37f // Double-precision #elif PRECISION == 64 @@ -50,6 +51,7 @@ R"( typedef double16 real16; #define ZERO 0.0 #define ONE 1.0 + #define SMALLEST -1.0e37 // Complex single-precision #elif PRECISION == 3232 @@ -64,6 +66,7 @@ R"( real sC; real sD; real sE; real sF;} real16; #define ZERO 0.0f #define ONE 1.0f + #define SMALLEST -1.0e37f // Complex Double-precision #elif PRECISION == 6464 @@ -78,6 +81,7 @@ R"( real sC; real sD; real sE; real sF;} real16; #define ZERO 0.0 #define ONE 1.0 + #define SMALLEST -1.0e37 #endif // Single-element version of a complex number diff --git a/src/kernels/level1/xamax.opencl b/src/kernels/level1/xamax.opencl index 03dd05e5..58b75ce2 100644 --- a/src/kernels/level1/xamax.opencl +++ b/src/kernels/level1/xamax.opencl @@ -41,14 +41,23 @@ __kernel void Xamax(const int n, const int num_groups = get_num_groups(0); // Performs loading and the first steps of the reduction - singlereal max = ZERO; + #if defined(ROUTINE_MAX) // non-absolute version + singlereal max = SMALLEST; + #else + singlereal max = ZERO; + #endif unsigned int imax = 0; int id = wgid*WGS1 + lid; while (id < n) { + const int x_index = id*x_inc + x_offset; #if PRECISION == 3232 || PRECISION == 6464 - singlereal x = fabs(xgm[id*x_inc + x_offset].x); + singlereal x = xgm[x_index].x; #else - singlereal x = fabs(xgm[id*x_inc + x_offset]); + singlereal x = xgm[x_index]; + #endif + #if defined(ROUTINE_MAX) // non-absolute version + #else + x = fabs(x); #endif if (x >= max) { max = x; diff --git a/src/kernels/level1/xasum.opencl b/src/kernels/level1/xasum.opencl index 037dc57e..58d0f11b 100644 --- a/src/kernels/level1/xasum.opencl +++ b/src/kernels/level1/xasum.opencl @@ -45,7 +45,10 @@ __kernel void Xasum(const int n, int id = wgid*WGS1 + lid; while (id < n) { real x = xgm[id*x_inc + x_offset]; - AbsoluteValue(x); + #if defined(ROUTINE_SUM) // non-absolute version + #else + 
AbsoluteValue(x); + #endif Add(acc, acc, x); id += WGS1*num_groups; } From d9b21d7f4920b115d3fe7f2e3cce1f89eb762c10 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Thu, 28 Apr 2016 21:14:17 +0200 Subject: [PATCH 38/60] Fixed the cache to store binaries instead of OpenCL programs --- CHANGELOG | 2 +- include/clblast.h | 2 +- include/clblast_c.h | 2 +- include/internal/cache.h | 30 +++++++++---------- include/internal/clpp11.h | 25 +++++++++++++--- include/internal/routine.h | 16 ++++++---- src/cache.cc | 56 +++++++++++++++++------------------ src/clblast.cc | 4 +-- src/clblast_c.cc | 6 ++-- src/routine.cc | 5 ++-- src/routines/level1/xamax.cc | 2 +- src/routines/level1/xasum.cc | 2 +- src/routines/level1/xaxpy.cc | 2 +- src/routines/level1/xcopy.cc | 2 +- src/routines/level1/xdot.cc | 2 +- src/routines/level1/xnrm2.cc | 2 +- src/routines/level1/xscal.cc | 2 +- src/routines/level1/xswap.cc | 2 +- src/routines/level2/xgemv.cc | 2 +- src/routines/level2/xger.cc | 2 +- src/routines/level2/xher.cc | 2 +- src/routines/level2/xher2.cc | 2 +- src/routines/level3/xgemm.cc | 2 +- src/routines/level3/xhemm.cc | 2 +- src/routines/level3/xher2k.cc | 2 +- src/routines/level3/xherk.cc | 2 +- src/routines/level3/xsymm.cc | 2 +- src/routines/level3/xsyr2k.cc | 2 +- src/routines/level3/xsyrk.cc | 2 +- src/routines/level3/xtrmm.cc | 2 +- 30 files changed, 105 insertions(+), 83 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 787793f0..6dc1ed49 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -5,7 +5,7 @@ Development version (next release) - Performance and correctness tests can now (on top of clBLAS) be performed against CPU BLAS libraries - Fixed the use of events within the library - Changed the enum parameters to match the raw values of the cblas standard -- Added a function to clear the cache of previously compiled programs +- Fixed the cache of previously compiled binaries and added a function to clear it - Added level-1 routines: * SNRM2/DNRM2/ScNRM2/DzNRM2 * 
SASUM/DASUM/ScASUM/DzASUM diff --git a/include/clblast.h b/include/clblast.h index 57fca119..e473adbe 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -556,7 +556,7 @@ StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, c // CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on // for the same device. This cache can be cleared to free up system memory or in case of debugging. -StatusCode ClearCompiledProgramCache(); +StatusCode ClearCache(); // ================================================================================================= diff --git a/include/clblast_c.h b/include/clblast_c.h index e23f0305..45e50cff 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -1074,7 +1074,7 @@ StatusCode PUBLIC_API CLBlastZtrsm(const Layout layout, const Side side, const T // CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on // for the same device. This cache can be cleared to free up system memory or in case of debugging. 
-StatusCode PUBLIC_API CLBlastClearCompiledProgramCache(); +StatusCode PUBLIC_API CLBlastClearCache(); // ================================================================================================= diff --git a/include/internal/cache.h b/include/internal/cache.h index 44fad68d..fa33b78f 100644 --- a/include/internal/cache.h +++ b/include/internal/cache.h @@ -24,9 +24,9 @@ namespace clblast { namespace cache { // ================================================================================================= -// The cache of compiled OpenCL programs, along with some meta-data -struct ProgramCache { - Program program; +// The cache of compiled OpenCL binaries, along with some meta-data +struct BinaryCache { + std::string binary; std::string device_name; Precision precision; std::string routine_name_; @@ -41,28 +41,28 @@ struct ProgramCache { }; // The actual cache, implemented as a vector of the above data-type, and its mutex -static std::vector program_cache_; -static std::mutex program_cache_mutex_; +static std::vector binary_cache_; +static std::mutex binary_cache_mutex_; // ================================================================================================= -// Stores the compiled program in the cache -void StoreProgramToCache(const Program& program, const std::string &device_name, - const Precision &precision, const std::string &routine_name); +// Stores the compiled binary in the cache +void StoreBinaryToCache(const std::string& binary, const std::string &device_name, + const Precision &precision, const std::string &routine_name); -// Queries the cache and retrieves a matching program. Assumes that the match is available, throws +// Queries the cache and retrieves a matching binary. Assumes that the match is available, throws // otherwise. 
-const Program& GetProgramFromCache(const std::string &device_name, const Precision &precision, - const std::string &routine_name); +const std::string& GetBinaryFromCache(const std::string &device_name, const Precision &precision, + const std::string &routine_name); // Queries the cache to see whether or not the compiled kernel is already there -bool ProgramIsInCache(const std::string &device_name, const Precision &precision, - const std::string &routine_name); +bool BinaryIsInCache(const std::string &device_name, const Precision &precision, + const std::string &routine_name); // ================================================================================================= -// Clears the cache of stored program binaries -StatusCode ClearCompiledProgramCache(); +// Clears the cache of stored binaries +StatusCode ClearCache(); // ================================================================================================= } // namespace cache diff --git a/include/internal/clpp11.h b/include/internal/clpp11.h index 543d423a..b865ab1e 100644 --- a/include/internal/clpp11.h +++ b/include/internal/clpp11.h @@ -283,7 +283,7 @@ class Program { public: // Note that there is no constructor based on the regular OpenCL data-type because of extra state - // Regular constructor with memory management + // Source-based constructor with memory management explicit Program(const Context &context, std::string source): program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }), length_(source.length()), @@ -294,6 +294,22 @@ class Program { CheckError(status); } + // Binary-based constructor with memory management + explicit Program(const Device &device, const Context &context, const std::string& binary): + program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }), + length_(binary.length()), + source_(binary), + source_ptr_(&source_[0]) { + auto status1 = CL_SUCCESS; + auto status2 = CL_SUCCESS; + const 
cl_device_id dev = device(); + *program_ = clCreateProgramWithBinary(context(), 1, &dev, &length_, + reinterpret_cast(&source_ptr_), + &status1, &status2); + CheckError(status1); + CheckError(status2); + } + // Compiles the device program and returns whether or not there where any warnings/errors BuildStatus Build(const Device &device, std::vector &options) { auto options_string = std::accumulate(options.begin(), options.end(), std::string{" "}); @@ -322,7 +338,7 @@ class Program { return result; } - // Retrieves an intermediate representation of the compiled program + // Retrieves a binary or an intermediate representation of the compiled program std::string GetIR() const { auto bytes = size_t{0}; CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bytes, nullptr)); @@ -338,7 +354,7 @@ class Program { private: std::shared_ptr program_; size_t length_; - std::string source_; + std::string source_; // Note: the source can also be a binary or IR const char* source_ptr_; }; @@ -633,7 +649,8 @@ class Kernel { // Launches the kernel while waiting for other events CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast(global.size()), nullptr, global.data(), local.data(), - waitForEventsPlain.size(), waitForEventsPlain.data(), + static_cast(waitForEventsPlain.size()), + waitForEventsPlain.data(), event)); } diff --git a/include/internal/routine.h b/include/internal/routine.h index 013769d8..32be6012 100644 --- a/include/internal/routine.h +++ b/include/internal/routine.h @@ -84,19 +84,23 @@ class Routine { const bool upper = false, const bool lower = false, const bool diagonal_imag_zero = false); - // Stores a newly compiled program into the cache - void StoreProgramToCache(const Program& program) const { - return cache::StoreProgramToCache(program, device_name_, precision_, routine_name_); + // Stores a newly compiled binary into the cache + void StoreBinaryToCache(const std::string& binary) const { + return 
cache::StoreBinaryToCache(binary, device_name_, precision_, routine_name_); } // Queries the cache and retrieve either a matching program or a boolean whether a match exists. // The first assumes that the program is available in the cache and will throw an exception // otherwise. - const Program& GetProgramFromCache() const { - return cache::GetProgramFromCache(device_name_, precision_, routine_name_); + Program GetProgramFromCache() const { + auto& binary = cache::GetBinaryFromCache(device_name_, precision_, routine_name_); + auto program = Program(device_, context_, binary); + auto options = std::vector(); + program.Build(device_, options); + return program; } bool ProgramIsInCache() const { - return cache::ProgramIsInCache(device_name_, precision_, routine_name_); + return cache::BinaryIsInCache(device_name_, precision_, routine_name_); } // Non-static variable for the precision. Note that the same variable (but static) might exist in diff --git a/src/cache.cc b/src/cache.cc index beeb1b35..18731a51 100644 --- a/src/cache.cc +++ b/src/cache.cc @@ -21,50 +21,50 @@ namespace clblast { namespace cache { // ================================================================================================= -// Stores the compiled program in the cache -void StoreProgramToCache(const Program& program, const std::string &device_name, - const Precision &precision, const std::string &routine_name) { - program_cache_mutex_.lock(); - program_cache_.push_back({program, device_name, precision, routine_name}); - program_cache_mutex_.unlock(); +// Stores the compiled binary or IR in the cache +void StoreBinaryToCache(const std::string& binary, const std::string &device_name, + const Precision &precision, const std::string &routine_name) { + binary_cache_mutex_.lock(); + binary_cache_.push_back({binary, device_name, precision, routine_name}); + binary_cache_mutex_.unlock(); } -// Queries the cache and retrieves a matching program. 
Assumes that the match is available, throws +// Queries the cache and retrieves a matching binary. Assumes that the match is available, throws // otherwise. -const Program& GetProgramFromCache(const std::string &device_name, const Precision &precision, - const std::string &routine_name) { - program_cache_mutex_.lock(); - for (auto &cached_program: program_cache_) { - if (cached_program.MatchInCache(device_name, precision, routine_name)) { - program_cache_mutex_.unlock(); - return cached_program.program; +const std::string& GetBinaryFromCache(const std::string &device_name, const Precision &precision, + const std::string &routine_name) { + binary_cache_mutex_.lock(); + for (auto &cached_binary: binary_cache_) { + if (cached_binary.MatchInCache(device_name, precision, routine_name)) { + binary_cache_mutex_.unlock(); + return cached_binary.binary; } } - program_cache_mutex_.unlock(); - throw std::runtime_error("Internal CLBlast error: Expected program in cache, but found none."); + binary_cache_mutex_.unlock(); + throw std::runtime_error("Internal CLBlast error: Expected binary in cache, but found none."); } // Queries the cache to see whether or not the compiled kernel is already there -bool ProgramIsInCache(const std::string &device_name, const Precision &precision, - const std::string &routine_name) { - program_cache_mutex_.lock(); - for (auto &cached_program: program_cache_) { - if (cached_program.MatchInCache(device_name, precision, routine_name)) { - program_cache_mutex_.unlock(); +bool BinaryIsInCache(const std::string &device_name, const Precision &precision, + const std::string &routine_name) { + binary_cache_mutex_.lock(); + for (auto &cached_binary: binary_cache_) { + if (cached_binary.MatchInCache(device_name, precision, routine_name)) { + binary_cache_mutex_.unlock(); return true; } } - program_cache_mutex_.unlock(); + binary_cache_mutex_.unlock(); return false; } // 
================================================================================================= -// Clears the cache of stored program binaries -StatusCode ClearCompiledProgramCache() { - program_cache_mutex_.lock(); - program_cache_.clear(); - program_cache_mutex_.unlock(); +// Clears the cache of stored binaries +StatusCode ClearCache() { + binary_cache_mutex_.lock(); + binary_cache_.clear(); + binary_cache_mutex_.unlock(); return StatusCode::kSuccess; } diff --git a/src/clblast.cc b/src/clblast.cc index fac5a539..fe79d7c1 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -1854,8 +1854,8 @@ template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Tri // ================================================================================================= -// Clears the cache of stored program binaries -StatusCode ClearCompiledProgramCache() { return cache::ClearCompiledProgramCache(); } +// Clears the cache of stored binaries +StatusCode ClearCache() { return cache::ClearCache(); } // ================================================================================================= } // namespace clblast diff --git a/src/clblast_c.cc b/src/clblast_c.cc index 72d93c4b..172bce64 100644 --- a/src/clblast_c.cc +++ b/src/clblast_c.cc @@ -2343,9 +2343,9 @@ StatusCode CLBlastZtrsm(const Layout layout, const Side side, const Triangle tri // ================================================================================================= -// Clears the cache of stored program binaries -StatusCode CLBlastClearCompiledProgramCache() { - return static_cast(clblast::ClearCompiledProgramCache()); +// Clears the cache of stored binaries +StatusCode CLBlastClearCache() { + return static_cast(clblast::ClearCache()); } // ================================================================================================= diff --git a/src/routine.cc b/src/routine.cc index e0a75e41..cd4d82fb 100644 --- a/src/routine.cc +++ b/src/routine.cc @@ -96,8 +96,9 @@ StatusCode 
Routine::SetUp() { } if (build_status == BuildStatus::kInvalid) { return StatusCode::kInvalidBinary; } - // Store the compiled program in the cache (atomic for thread-safety) - StoreProgramToCache(program); + // Store the compiled kernel in the cache + auto binary = program.GetIR(); + StoreBinaryToCache(binary); } catch (...) { return StatusCode::kBuildProgramFailure; } } diff --git a/src/routines/level1/xamax.cc b/src/routines/level1/xamax.cc index ffdfa496..33bd72a6 100644 --- a/src/routines/level1/xamax.cc +++ b/src/routines/level1/xamax.cc @@ -55,7 +55,7 @@ StatusCode Xamax::DoAmax(const size_t n, // Retrieves the Xamax kernels from the compiled binary try { - auto& program = GetProgramFromCache(); + const auto program = GetProgramFromCache(); auto kernel1 = Kernel(program, "Xamax"); auto kernel2 = Kernel(program, "XamaxEpilogue"); diff --git a/src/routines/level1/xasum.cc b/src/routines/level1/xasum.cc index 5799e25a..ea33d7e1 100644 --- a/src/routines/level1/xasum.cc +++ b/src/routines/level1/xasum.cc @@ -55,7 +55,7 @@ StatusCode Xasum::DoAsum(const size_t n, // Retrieves the Xasum kernels from the compiled binary try { - auto& program = GetProgramFromCache(); + const auto program = GetProgramFromCache(); auto kernel1 = Kernel(program, "Xasum"); auto kernel2 = Kernel(program, "XasumEpilogue"); diff --git a/src/routines/level1/xaxpy.cc b/src/routines/level1/xaxpy.cc index 37d23543..96809a57 100644 --- a/src/routines/level1/xaxpy.cc +++ b/src/routines/level1/xaxpy.cc @@ -64,7 +64,7 @@ StatusCode Xaxpy::DoAxpy(const size_t n, const T alpha, // Retrieves the Xaxpy kernel from the compiled binary try { - auto& program = GetProgramFromCache(); + const auto program = GetProgramFromCache(); auto kernel = Kernel(program, kernel_name); // Sets the kernel arguments diff --git a/src/routines/level1/xcopy.cc b/src/routines/level1/xcopy.cc index 04508383..d34482ce 100644 --- a/src/routines/level1/xcopy.cc +++ b/src/routines/level1/xcopy.cc @@ -64,7 +64,7 @@ StatusCode 
Xcopy::DoCopy(const size_t n, // Retrieves the Xcopy kernel from the compiled binary try { - auto& program = GetProgramFromCache(); + const auto program = GetProgramFromCache(); auto kernel = Kernel(program, kernel_name); // Sets the kernel arguments diff --git a/src/routines/level1/xdot.cc b/src/routines/level1/xdot.cc index 4813a004..b2513485 100644 --- a/src/routines/level1/xdot.cc +++ b/src/routines/level1/xdot.cc @@ -59,7 +59,7 @@ StatusCode Xdot::DoDot(const size_t n, // Retrieves the Xdot kernels from the compiled binary try { - auto& program = GetProgramFromCache(); + const auto program = GetProgramFromCache(); auto kernel1 = Kernel(program, "Xdot"); auto kernel2 = Kernel(program, "XdotEpilogue"); diff --git a/src/routines/level1/xnrm2.cc b/src/routines/level1/xnrm2.cc index ceabe586..86166a0c 100644 --- a/src/routines/level1/xnrm2.cc +++ b/src/routines/level1/xnrm2.cc @@ -55,7 +55,7 @@ StatusCode Xnrm2::DoNrm2(const size_t n, // Retrieves the Xnrm2 kernels from the compiled binary try { - auto& program = GetProgramFromCache(); + const auto program = GetProgramFromCache(); auto kernel1 = Kernel(program, "Xnrm2"); auto kernel2 = Kernel(program, "Xnrm2Epilogue"); diff --git a/src/routines/level1/xscal.cc b/src/routines/level1/xscal.cc index e83e73fd..b92e2cdf 100644 --- a/src/routines/level1/xscal.cc +++ b/src/routines/level1/xscal.cc @@ -60,7 +60,7 @@ StatusCode Xscal::DoScal(const size_t n, const T alpha, // Retrieves the Xscal kernel from the compiled binary try { - auto& program = GetProgramFromCache(); + const auto program = GetProgramFromCache(); auto kernel = Kernel(program, kernel_name); // Sets the kernel arguments diff --git a/src/routines/level1/xswap.cc b/src/routines/level1/xswap.cc index bc425f40..bfc4a739 100644 --- a/src/routines/level1/xswap.cc +++ b/src/routines/level1/xswap.cc @@ -64,7 +64,7 @@ StatusCode Xswap::DoSwap(const size_t n, // Retrieves the Xswap kernel from the compiled binary try { - auto& program = GetProgramFromCache(); + 
const auto program = GetProgramFromCache(); auto kernel = Kernel(program, kernel_name); // Sets the kernel arguments diff --git a/src/routines/level2/xgemv.cc b/src/routines/level2/xgemv.cc index 24e87db0..f8985038 100644 --- a/src/routines/level2/xgemv.cc +++ b/src/routines/level2/xgemv.cc @@ -136,7 +136,7 @@ StatusCode Xgemv::MatVec(const Layout layout, const Transpose a_transpose, // Retrieves the Xgemv kernel from the compiled binary try { - auto& program = GetProgramFromCache(); + const auto program = GetProgramFromCache(); auto kernel = Kernel(program, kernel_name); // Sets the kernel arguments diff --git a/src/routines/level2/xger.cc b/src/routines/level2/xger.cc index dda78232..686c7e60 100644 --- a/src/routines/level2/xger.cc +++ b/src/routines/level2/xger.cc @@ -66,7 +66,7 @@ StatusCode Xger::DoGer(const Layout layout, // Retrieves the Xgemv kernel from the compiled binary try { - auto& program = GetProgramFromCache(); + const auto program = GetProgramFromCache(); auto kernel = Kernel(program, "Xger"); // Sets the kernel arguments diff --git a/src/routines/level2/xher.cc b/src/routines/level2/xher.cc index aba665b0..a7116213 100644 --- a/src/routines/level2/xher.cc +++ b/src/routines/level2/xher.cc @@ -79,7 +79,7 @@ StatusCode Xher::DoHer(const Layout layout, const Triangle triangle, // Retrieves the Xgemv kernel from the compiled binary try { - auto& program = GetProgramFromCache(); + const auto program = GetProgramFromCache(); auto kernel = Kernel(program, "Xher"); // Sets the kernel arguments diff --git a/src/routines/level2/xher2.cc b/src/routines/level2/xher2.cc index bcd6488f..3fd1a961 100644 --- a/src/routines/level2/xher2.cc +++ b/src/routines/level2/xher2.cc @@ -68,7 +68,7 @@ StatusCode Xher2::DoHer2(const Layout layout, const Triangle triangle, // Retrieves the Xgemv kernel from the compiled binary try { - auto& program = GetProgramFromCache(); + const auto program = GetProgramFromCache(); auto kernel = Kernel(program, "Xher2"); // Sets the 
kernel arguments diff --git a/src/routines/level3/xgemm.cc b/src/routines/level3/xgemm.cc index 7557dcc3..aa081e81 100644 --- a/src/routines/level3/xgemm.cc +++ b/src/routines/level3/xgemm.cc @@ -107,7 +107,7 @@ StatusCode Xgemm::DoGemm(const Layout layout, try { // Loads the program from the database - auto& program = GetProgramFromCache(); + const auto program = GetProgramFromCache(); // Determines whether or not temporary matrices are needed auto a_no_temp = a_one == m_ceiled && a_two == k_ceiled && a_ld == m_ceiled && a_offset == 0 && diff --git a/src/routines/level3/xhemm.cc b/src/routines/level3/xhemm.cc index c0a4306a..d2fbf36e 100644 --- a/src/routines/level3/xhemm.cc +++ b/src/routines/level3/xhemm.cc @@ -61,7 +61,7 @@ StatusCode Xhemm::DoHemm(const Layout layout, const Side side, const Triangle // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm // routine afterwards try { - auto& program = GetProgramFromCache(); + const auto program = GetProgramFromCache(); auto kernel = Kernel(program, kernel_name); // Sets the arguments for the hermitian-to-squared kernel diff --git a/src/routines/level3/xher2k.cc b/src/routines/level3/xher2k.cc index 4d5a4d35..2c2c815d 100644 --- a/src/routines/level3/xher2k.cc +++ b/src/routines/level3/xher2k.cc @@ -93,7 +93,7 @@ StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, co try { // Loads the program from the database - auto& program = GetProgramFromCache(); + const auto program = GetProgramFromCache(); // Determines whether or not temporary matrices are needed auto a1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && diff --git a/src/routines/level3/xherk.cc b/src/routines/level3/xherk.cc index 574debe4..414c4760 100644 --- a/src/routines/level3/xherk.cc +++ b/src/routines/level3/xherk.cc @@ -90,7 +90,7 @@ StatusCode Xherk::DoHerk(const Layout layout, const Triangle triangle, cons try { // Loads the program from the database 
- auto& program = GetProgramFromCache(); + const auto program = GetProgramFromCache(); // Determines whether or not temporary matrices are needed auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && diff --git a/src/routines/level3/xsymm.cc b/src/routines/level3/xsymm.cc index 914a326a..a39026f1 100644 --- a/src/routines/level3/xsymm.cc +++ b/src/routines/level3/xsymm.cc @@ -61,7 +61,7 @@ StatusCode Xsymm::DoSymm(const Layout layout, const Side side, const Triangle // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm // routine afterwards try { - auto& program = GetProgramFromCache(); + const auto program = GetProgramFromCache(); auto kernel = Kernel(program, kernel_name); // Sets the arguments for the symmetric-to-squared kernel diff --git a/src/routines/level3/xsyr2k.cc b/src/routines/level3/xsyr2k.cc index 44d0024e..3206c669 100644 --- a/src/routines/level3/xsyr2k.cc +++ b/src/routines/level3/xsyr2k.cc @@ -91,7 +91,7 @@ StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, cons try { // Loads the program from the database - auto& program = GetProgramFromCache(); + const auto program = GetProgramFromCache(); // Determines whether or not temporary matrices are needed auto a_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && diff --git a/src/routines/level3/xsyrk.cc b/src/routines/level3/xsyrk.cc index 44ed8d35..741ad064 100644 --- a/src/routines/level3/xsyrk.cc +++ b/src/routines/level3/xsyrk.cc @@ -87,7 +87,7 @@ StatusCode Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const try { // Loads the program from the database - auto& program = GetProgramFromCache(); + const auto program = GetProgramFromCache(); // Determines whether or not temporary matrices are needed auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && diff --git a/src/routines/level3/xtrmm.cc 
b/src/routines/level3/xtrmm.cc index 484cf040..9e3b27b4 100644 --- a/src/routines/level3/xtrmm.cc +++ b/src/routines/level3/xtrmm.cc @@ -63,7 +63,7 @@ StatusCode Xtrmm::DoTrmm(const Layout layout, const Side side, const Triangle // Creates a general matrix from the triangular matrix to be able to run the regular Xgemm // routine afterwards try { - auto& program = GetProgramFromCache(); + const auto program = GetProgramFromCache(); auto kernel = Kernel(program, kernel_name); // Sets the arguments for the triangular-to-squared kernel From 4f528b1730df1ffda9d396030cfc4c4ddf0203fb Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Fri, 29 Apr 2016 20:33:19 +0200 Subject: [PATCH 39/60] Added sample C programs for the SASUM and DGEMV routines --- CMakeLists.txt | 2 +- samples/dgemv.c | 106 +++++++++++++++++++++++++++++++++++++++++++++++ samples/sasum.c | 96 ++++++++++++++++++++++++++++++++++++++++++ samples/sgemm.c | 8 ++-- samples/sgemm.cc | 6 +-- 5 files changed, 210 insertions(+), 8 deletions(-) create mode 100644 samples/dgemv.c create mode 100644 samples/sasum.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 6abfc09f..39274ab9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -120,7 +120,7 @@ include_directories(${clblast_SOURCE_DIR}/include ${OPENCL_INCLUDE_DIRS}) # Sets the supported routines and the used kernels. New routines and kernels should be added here. 
set(KERNELS copy pad transpose padtranspose xaxpy xdot xger xgemm xgemv) set(SAMPLE_PROGRAMS_CPP sgemm) -set(SAMPLE_PROGRAMS_C sgemm) +set(SAMPLE_PROGRAMS_C sasum dgemv sgemm) set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax) set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2) diff --git a/samples/dgemv.c b/samples/dgemv.c new file mode 100644 index 00000000..6ea0deb0 --- /dev/null +++ b/samples/dgemv.c @@ -0,0 +1,106 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file demonstrates the use of the DGEMV routine. It is pure C99 and demonstrates the use of +// the C API to the CLBlast library. +// +// Note that this example is meant for illustration purposes only. CLBlast provides other programs +// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx'). 
+// +// ================================================================================================= + +#include +#include +#include + +// Includes the CLBlast library (C interface) +#include + +// ================================================================================================= + +// Example use of the double-precision routine DGEMV +int main(void) { + + // OpenCL platform/device settings + const size_t platform_id = 0; + const size_t device_id = 0; + + // Example DGEMV arguments + const size_t m = 128; + const size_t n = 289; + const double alpha = 0.7; + const double beta = 0.0; + const size_t a_ld = n; + + // Initializes the OpenCL platform + cl_uint num_platforms; + clGetPlatformIDs(0, NULL, &num_platforms); + cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id)); + clGetPlatformIDs(num_platforms, platforms, NULL); + cl_platform_id platform = platforms[platform_id]; + + // Initializes the OpenCL device + cl_uint num_devices; + clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices); + cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id)); + clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL); + cl_device_id device = devices[device_id]; + + // Creates the OpenCL context, queue, and an event + cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL); + cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL); + cl_event event = NULL; + + // Populate host data structures with some example data + double* host_a = (double*)malloc(sizeof(double)*m*n); + double* host_x = (double*)malloc(sizeof(double)*n); + double* host_y = (double*)malloc(sizeof(double)*m); + for (size_t i=0; i success). 
+ printf("Completed DGEMV with status %d\n", status); + + // Clean-up + free(platforms); + free(devices); + free(host_a); + free(host_x); + free(host_y); + clReleaseMemObject(device_a); + clReleaseMemObject(device_x); + clReleaseMemObject(device_y); + clReleaseCommandQueue(queue); + clReleaseContext(context); + return 0; +} + +// ================================================================================================= diff --git a/samples/sasum.c b/samples/sasum.c new file mode 100644 index 00000000..3b20d301 --- /dev/null +++ b/samples/sasum.c @@ -0,0 +1,96 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file demonstrates the use of the SASUM routine. It is pure C99 and demonstrates the use of +// the C API to the CLBlast library. +// +// Note that this example is meant for illustration purposes only. CLBlast provides other programs +// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx'). 
+// +// ================================================================================================= + +#include +#include +#include + +// Includes the CLBlast library (C interface) +#include + +// ================================================================================================= + +// Example use of the single-precision routine SASUM +int main(void) { + + // OpenCL platform/device settings + const size_t platform_id = 0; + const size_t device_id = 0; + + // Example SASUM arguments + const size_t n = 1000; + const float input_value = -1.5f; + + // Initializes the OpenCL platform + cl_uint num_platforms; + clGetPlatformIDs(0, NULL, &num_platforms); + cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id)); + clGetPlatformIDs(num_platforms, platforms, NULL); + cl_platform_id platform = platforms[platform_id]; + + // Initializes the OpenCL device + cl_uint num_devices; + clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices); + cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id)); + clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL); + cl_device_id device = devices[device_id]; + + // Creates the OpenCL context, queue, and an event + cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL); + cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL); + cl_event event = NULL; + + // Populate host data structures with some example data + float* host_input = (float*)malloc(sizeof(float)*n); + float* host_output = (float*)malloc(sizeof(float)*1); + for (size_t i=0; i success). 
+ printf("Completed SASUM with status %d: %d * |%.1lf| = %.1lf\n", status, n, input_value, host_output[0]); + + // Clean-up + free(platforms); + free(devices); + free(host_input); + free(host_output); + clReleaseMemObject(device_input); + clReleaseMemObject(device_output); + clReleaseCommandQueue(queue); + clReleaseContext(context); + return 0; +} + +// ================================================================================================= diff --git a/samples/sgemm.c b/samples/sgemm.c index d528db0a..79f30c83 100644 --- a/samples/sgemm.c +++ b/samples/sgemm.c @@ -48,11 +48,11 @@ int main(void) { clGetPlatformIDs(num_platforms, platforms, NULL); cl_platform_id platform = platforms[platform_id]; - // Initializes the OpenCL device (note: example for GPU devices only) + // Initializes the OpenCL device cl_uint num_devices; - clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices); + clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices); cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id)); - clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, num_devices, devices, NULL); + clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL); cl_device_id device = devices[device_id]; // Creates the OpenCL context, queue, and an event @@ -90,7 +90,7 @@ int main(void) { clWaitForEvents(1, &event); // Example completed. See "clblast_c.h" for status codes (0 -> success). 
- printf("Completed with status %d\n", status); + printf("Completed SGEMM with status %d\n", status); // Clean-up free(platforms); diff --git a/samples/sgemm.cc b/samples/sgemm.cc index 2659d36c..5fe7490a 100644 --- a/samples/sgemm.cc +++ b/samples/sgemm.cc @@ -52,9 +52,9 @@ int main() { if (platforms.size() == 0 || platform_id >= platforms.size()) { return 1; } auto platform = platforms[platform_id]; - // Initializes the OpenCL device (note: example for GPU devices only) + // Initializes the OpenCL device auto devices = std::vector(); - platform.getDevices(CL_DEVICE_TYPE_GPU, &devices); + platform.getDevices(CL_DEVICE_TYPE_ALL, &devices); if (devices.size() == 0 || device_id >= devices.size()) { return 1; } auto device = devices[device_id]; @@ -100,7 +100,7 @@ int main() { auto time_ms = std::chrono::duration(elapsed_time).count(); // Example completed. See "clblast.h" for status codes (0 -> success). - printf("Completed in %.3lf ms with status %d\n", time_ms, status); + printf("Completed SGEMM in %.3lf ms with status %d\n", time_ms, status); return 0; } From 877aad693f5bf57f8c37ded9e2acab014a4f039b Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Fri, 29 Apr 2016 23:33:12 +0200 Subject: [PATCH 40/60] Added FillCache: a function to pre-compile all kernels for a specific device --- include/clblast.h | 4 +++ include/clblast_c.h | 4 +++ scripts/generator/generator.py | 2 +- src/clblast.cc | 62 ++++++++++++++++++++++++++++++++++ src/clblast_c.cc | 5 +++ 5 files changed, 76 insertions(+), 1 deletion(-) diff --git a/include/clblast.h b/include/clblast.h index e473adbe..075ca93e 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -558,6 +558,10 @@ StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, c // for the same device. This cache can be cleared to free up system memory or in case of debugging. StatusCode ClearCache(); +// The cache can also be pre-initialized for a specific device with all possible CLBLast kernels. 
+// Further CLBlast routine calls will then run at maximum speed. +StatusCode FillCache(const cl_device_id device); + // ================================================================================================= } // namespace clblast diff --git a/include/clblast_c.h b/include/clblast_c.h index 45e50cff..dd9b0f67 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -1076,6 +1076,10 @@ StatusCode PUBLIC_API CLBlastZtrsm(const Layout layout, const Side side, const T // for the same device. This cache can be cleared to free up system memory or in case of debugging. StatusCode PUBLIC_API CLBlastClearCache(); +// The cache can also be pre-initialized for a specific device with all possible CLBLast kernels. +// Further CLBlast routine calls will then run at maximum speed. +StatusCode PUBLIC_API CLBlastFillCache(const cl_device_id device); + // ================================================================================================= #ifdef __cplusplus diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 04f3c30e..a9419f13 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -299,7 +299,7 @@ files = [ path_clblast+"/test/wrapper_cblas.h", ] header_lines = [84, 70, 93, 22, 29, 38] -footer_lines = [13, 8, 15, 9, 6, 6] +footer_lines = [17, 70, 19, 14, 6, 6] # Checks whether the command-line arguments are valid; exists otherwise for f in files: diff --git a/src/clblast.cc b/src/clblast.cc index fe79d7c1..a5bb6b67 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -1857,5 +1857,67 @@ template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Tri // Clears the cache of stored binaries StatusCode ClearCache() { return cache::ClearCache(); } +// Fills the cache with all binaries for a specific device +StatusCode FillCache(const cl_device_id device) { + try { + + // Creates a sample context and queue to match the normal routine calling conventions + auto device_cpp = Device(device); + 
auto context = Context(device_cpp); + auto queue = Queue(context, device_cpp); + + // Runs all the level 1 set-up functions + Xswap(queue, nullptr).SetUp(); Xswap(queue, nullptr).SetUp(); Xswap(queue, nullptr).SetUp(); Xswap(queue, nullptr).SetUp(); + Xswap(queue, nullptr).SetUp(); Xswap(queue, nullptr).SetUp(); Xswap(queue, nullptr).SetUp(); Xswap(queue, nullptr).SetUp(); + Xscal(queue, nullptr).SetUp(); Xscal(queue, nullptr).SetUp(); Xscal(queue, nullptr).SetUp(); Xscal(queue, nullptr).SetUp(); + Xcopy(queue, nullptr).SetUp(); Xcopy(queue, nullptr).SetUp(); Xcopy(queue, nullptr).SetUp(); Xcopy(queue, nullptr).SetUp(); + Xaxpy(queue, nullptr).SetUp(); Xaxpy(queue, nullptr).SetUp(); Xaxpy(queue, nullptr).SetUp(); Xaxpy(queue, nullptr).SetUp(); + Xdot(queue, nullptr).SetUp(); Xdot(queue, nullptr).SetUp(); + Xdotu(queue, nullptr).SetUp(); Xdotu(queue, nullptr).SetUp(); + Xdotc(queue, nullptr).SetUp(); Xdotc(queue, nullptr).SetUp(); + Xnrm2(queue, nullptr).SetUp(); Xnrm2(queue, nullptr).SetUp(); Xnrm2(queue, nullptr).SetUp(); Xnrm2(queue, nullptr).SetUp(); + Xasum(queue, nullptr).SetUp(); Xasum(queue, nullptr).SetUp(); Xasum(queue, nullptr).SetUp(); Xasum(queue, nullptr).SetUp(); + Xsum(queue, nullptr).SetUp(); Xsum(queue, nullptr).SetUp(); Xsum(queue, nullptr).SetUp(); Xsum(queue, nullptr).SetUp(); + Xamax(queue, nullptr).SetUp(); Xamax(queue, nullptr).SetUp(); Xamax(queue, nullptr).SetUp(); Xamax(queue, nullptr).SetUp(); + Xmax(queue, nullptr).SetUp(); Xmax(queue, nullptr).SetUp(); Xmax(queue, nullptr).SetUp(); Xmax(queue, nullptr).SetUp(); + + // Runs all the level 2 set-up functions + Xgemv(queue, nullptr).SetUp(); Xgemv(queue, nullptr).SetUp(); Xgemv(queue, nullptr).SetUp(); Xgemv(queue, nullptr).SetUp(); + Xgbmv(queue, nullptr).SetUp(); Xgbmv(queue, nullptr).SetUp(); Xgbmv(queue, nullptr).SetUp(); Xgbmv(queue, nullptr).SetUp(); + Xhemv(queue, nullptr).SetUp(); Xhemv(queue, nullptr).SetUp(); + Xhbmv(queue, nullptr).SetUp(); Xhbmv(queue, nullptr).SetUp(); + 
Xhpmv(queue, nullptr).SetUp(); Xhpmv(queue, nullptr).SetUp(); + Xsymv(queue, nullptr).SetUp(); Xsymv(queue, nullptr).SetUp(); + Xsbmv(queue, nullptr).SetUp(); Xsbmv(queue, nullptr).SetUp(); + Xspmv(queue, nullptr).SetUp(); Xspmv(queue, nullptr).SetUp(); + Xtrmv(queue, nullptr).SetUp(); Xtrmv(queue, nullptr).SetUp(); Xtrmv(queue, nullptr).SetUp(); Xtrmv(queue, nullptr).SetUp(); + Xtbmv(queue, nullptr).SetUp(); Xtbmv(queue, nullptr).SetUp(); Xtbmv(queue, nullptr).SetUp(); Xtbmv(queue, nullptr).SetUp(); + Xtpmv(queue, nullptr).SetUp(); Xtpmv(queue, nullptr).SetUp(); Xtpmv(queue, nullptr).SetUp(); Xtpmv(queue, nullptr).SetUp(); + Xger(queue, nullptr).SetUp(); Xger(queue, nullptr).SetUp(); + Xgeru(queue, nullptr).SetUp(); Xgeru(queue, nullptr).SetUp(); + Xgerc(queue, nullptr).SetUp(); Xgerc(queue, nullptr).SetUp(); + Xher(queue, nullptr).SetUp(); Xher(queue, nullptr).SetUp(); + Xhpr(queue, nullptr).SetUp(); Xhpr(queue, nullptr).SetUp(); + Xher2(queue, nullptr).SetUp(); Xher2(queue, nullptr).SetUp(); + Xhpr2(queue, nullptr).SetUp(); Xhpr2(queue, nullptr).SetUp(); + Xsyr(queue, nullptr).SetUp(); Xsyr(queue, nullptr).SetUp(); + Xspr(queue, nullptr).SetUp(); Xspr(queue, nullptr).SetUp(); + Xsyr2(queue, nullptr).SetUp(); Xsyr2(queue, nullptr).SetUp(); + Xspr2(queue, nullptr).SetUp(); Xspr2(queue, nullptr).SetUp(); + + // Runs all the level 1 set-up functions + Xgemm(queue, nullptr).SetUp(); Xgemm(queue, nullptr).SetUp(); Xgemm(queue, nullptr).SetUp(); Xgemm(queue, nullptr).SetUp(); + Xsymm(queue, nullptr).SetUp(); Xsymm(queue, nullptr).SetUp(); Xsymm(queue, nullptr).SetUp(); Xsymm(queue, nullptr).SetUp(); + Xhemm(queue, nullptr).SetUp(); Xhemm(queue, nullptr).SetUp(); + Xsyrk(queue, nullptr).SetUp(); Xsyrk(queue, nullptr).SetUp(); Xsyrk(queue, nullptr).SetUp(); Xsyrk(queue, nullptr).SetUp(); + Xherk(queue, nullptr).SetUp(); Xherk(queue, nullptr).SetUp(); + Xsyr2k(queue, nullptr).SetUp(); Xsyr2k(queue, nullptr).SetUp(); Xsyr2k(queue, nullptr).SetUp(); Xsyr2k(queue, 
nullptr).SetUp(); + Xher2k(queue, nullptr).SetUp(); Xher2k(queue, nullptr).SetUp(); + Xtrmm(queue, nullptr).SetUp(); Xtrmm(queue, nullptr).SetUp(); Xtrmm(queue, nullptr).SetUp(); Xtrmm(queue, nullptr).SetUp(); + + } catch (...) { return StatusCode::kBuildProgramFailure; } + return StatusCode::kSuccess; +} + // ================================================================================================= } // namespace clblast diff --git a/src/clblast_c.cc b/src/clblast_c.cc index 172bce64..47ab1798 100644 --- a/src/clblast_c.cc +++ b/src/clblast_c.cc @@ -2348,4 +2348,9 @@ StatusCode CLBlastClearCache() { return static_cast(clblast::ClearCache()); } +// Fills the cache with binaries for a specific device +StatusCode CLBlastFillCache(const cl_device_id device) { + return static_cast(clblast::FillCache(device)); +} + // ================================================================================================= From 2952390f27c07500bd2a24b5e6fdce5e282fc8dd Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Fri, 29 Apr 2016 23:33:36 +0200 Subject: [PATCH 41/60] Added an example to demonstrate the use of the ClearCache and FillCache functions --- CMakeLists.txt | 2 +- samples/cache.c | 133 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 134 insertions(+), 1 deletion(-) create mode 100644 samples/cache.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 39274ab9..d63105e4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -120,7 +120,7 @@ include_directories(${clblast_SOURCE_DIR}/include ${OPENCL_INCLUDE_DIRS}) # Sets the supported routines and the used kernels. New routines and kernels should be added here. 
set(KERNELS copy pad transpose padtranspose xaxpy xdot xger xgemm xgemv) set(SAMPLE_PROGRAMS_CPP sgemm) -set(SAMPLE_PROGRAMS_C sasum dgemv sgemm) +set(SAMPLE_PROGRAMS_C sasum dgemv sgemm cache) set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax) set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2) diff --git a/samples/cache.c b/samples/cache.c new file mode 100644 index 00000000..7f876be1 --- /dev/null +++ b/samples/cache.c @@ -0,0 +1,133 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file demonstrates the CLBlast kernel cache, which stores compiled OpenCL binaries for faster +// repeated kernel execution. The cache can be pre-initialized or cleared. +// +// Note that this example is meant for illustration purposes only. CLBlast provides other programs +// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx'). 
+// +// ================================================================================================= + +#include +#include +#include +#include + +// Includes the CLBlast library (C interface) +#include + +// Forward declaration +void run_example_routine(const cl_device_id device); + +// ================================================================================================= + +// Example use of the CLBlast kernel cache +int main(void) { + + // OpenCL platform/device settings + const size_t platform_id = 0; + const size_t device_id = 0; + + // Initializes the OpenCL platform + cl_uint num_platforms; + clGetPlatformIDs(0, NULL, &num_platforms); + cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id)); + clGetPlatformIDs(num_platforms, platforms, NULL); + cl_platform_id platform = platforms[platform_id]; + + // Initializes the OpenCL device + cl_uint num_devices; + clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices); + cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id)); + clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL); + cl_device_id device = devices[device_id]; + + // Run the routine multiple times in a row: after the first time the binary is already in the + // cache and compilation is no longer needed. + printf("Starting caching sample with an empty cache\n"); + run_example_routine(device); + run_example_routine(device); + run_example_routine(device); + + // Clearing the cache makes CLBlast re-compile the kernel once + printf("Clearing cache\n"); + CLBlastClearCache(); + run_example_routine(device); + run_example_routine(device); + + // When the cache is empty, it can be pre-initialized with compiled kernels for all routines by + // calling the CLBlastFillCache function, such that all other CLBlast calls can benefit from + // pre-compiled kernels and thus execute at maximum speed. 
+ printf("Clearing cache\n"); + CLBlastClearCache(); + printf("Filling cache (this might take a while)\n"); + CLBlastFillCache(device); + run_example_routine(device); + + // Clean-up + free(platforms); + free(devices); + return 0; +} + +// ================================================================================================= + +// Runs an example routine and reports the time +void run_example_routine(const cl_device_id device) { + + // Example SASUM arguments + const size_t n = 1024*128; + + // Creates the OpenCL context, queue, and an event + cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL); + cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL); + cl_event event = NULL; + + // Populate host data structures with some example data + float* host_input = (float*)malloc(sizeof(float)*n); + float* host_output = (float*)malloc(sizeof(float)*1); + for (size_t i=0; i success). + printf("Completed routine with status %d in %.3lf ms\n", status, time_ms); + + // Clean-up + free(host_input); + free(host_output); + clReleaseMemObject(device_input); + clReleaseMemObject(device_output); + clReleaseCommandQueue(queue); + clReleaseContext(context); +} + +// ================================================================================================= From e113ff0852d21ecb898b3b192145b70cad3f338a Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 30 Apr 2016 09:49:39 +0200 Subject: [PATCH 42/60] Added non-aboslute minimum counter-part IxMIN of the BLAS routine IxAMAX --- CHANGELOG | 4 +- README.md | 1 + include/clblast.h | 7 ++++ include/clblast_c.h | 18 +++++++++ include/internal/routines/level1/xmin.h | 49 +++++++++++++++++++++++++ scripts/generator/generator.py | 5 ++- scripts/generator/routine.py | 2 +- src/clblast.cc | 33 +++++++++++++++++ src/clblast_c.cc | 42 +++++++++++++++++++++ src/kernels/level1/xamax.opencl | 7 +++- 10 files changed, 162 insertions(+), 6 deletions(-) create mode 100644 
include/internal/routines/level1/xmin.h diff --git a/CHANGELOG b/CHANGELOG index 6dc1ed49..f68c2483 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -5,13 +5,15 @@ Development version (next release) - Performance and correctness tests can now (on top of clBLAS) be performed against CPU BLAS libraries - Fixed the use of events within the library - Changed the enum parameters to match the raw values of the cblas standard -- Fixed the cache of previously compiled binaries and added a function to clear it +- Fixed the cache of previously compiled binaries and added a function to fill or clear it +- Added additional sample programs - Added level-1 routines: * SNRM2/DNRM2/ScNRM2/DzNRM2 * SASUM/DASUM/ScASUM/DzASUM * SSUM/DSUM/ScSUM/DzSUM (non-absolute version of the above xASUM BLAS routines) * iSAMAX/iDAMAX/iCAMAX/iZAMAX * iSMAX/iDMAX/iCMAX/iZMAX (non-absolute version of the above ixAMAX BLAS routines) + * iSMIN/iDMIN/iCMIN/iZMIN (non-absolute minimum version of the above ixAMAX BLAS routines) Version 0.6.0 - Added support for MSVC (Visual Studio) 2015 diff --git a/README.md b/README.md index f2a85efc..0f7b7d3c 100644 --- a/README.md +++ b/README.md @@ -226,6 +226,7 @@ In addition, some non-BLAS routines are also supported by CLBlast. They are expe | -----------|---|---|---|---| | xSUM | ✔ | ✔ | ✔ | ✔ | | IxMAX | ✔ | ✔ | ✔ | ✔ | +| IxMIN | ✔ | ✔ | ✔ | ✔ | Some BLAS routines are not supported yet by CLBlast. 
They are shown in the following table: diff --git a/include/clblast.h b/include/clblast.h index 075ca93e..5df0f605 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -209,6 +209,13 @@ StatusCode Max(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); +// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN +template +StatusCode Min(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event = nullptr); + // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= diff --git a/include/clblast_c.h b/include/clblast_c.h index dd9b0f67..8b2bf73c 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -350,6 +350,24 @@ StatusCode PUBLIC_API CLBlastiZmax(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); +// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN +StatusCode PUBLIC_API CLBlastiSmin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastiDmin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastiCmin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastiZmin(const size_t n, + cl_mem imin_buffer, 
const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); + // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= diff --git a/include/internal/routines/level1/xmin.h b/include/internal/routines/level1/xmin.h new file mode 100644 index 00000000..4c99a5ad --- /dev/null +++ b/include/internal/routines/level1/xmin.h @@ -0,0 +1,49 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xmin routine. The precision is implemented using a template argument. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XMIN_H_ +#define CLBLAST_ROUTINES_XMIN_H_ + +#include "internal/routine.h" +#include "internal/routines/level1/xamax.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xmin: public Xamax { + public: + + // Members and methods from the base class + using Xamax::DoAmax; + + // Constructor + Xmin(Queue &queue, EventPointer event, const std::string &name = "MIN"): + Xamax(queue, event, name) { + } + + // Forwards to the regular max-absolute version. The implementation difference is realised in the + // kernel through a pre-processor macro based on the name of the routine. 
+ StatusCode DoMin(const size_t n, + const Buffer &imin_buffer, const size_t imin_offset, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { + return DoAmax(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XMIN_H_ +#endif diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index a9419f13..0fd05053 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -75,6 +75,7 @@ routines = [ Routine(True, False, "1", "sum", T, [S,D,Sc,Dz],["n"], [], ["x"], ["sum"], [], "n", "Sum of values in a vector (non-BLAS function)"), Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maximum value in a vector"), Routine(True, False, "1", "max", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)"), + Routine(True, False, "1", "min", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imin"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)"), ], [ # Level 2: matrix-vector Routine(True, True, "2a", "gemv", T, [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication"), @@ -298,8 +299,8 @@ files = [ path_clblast+"/test/wrapper_clblas.h", path_clblast+"/test/wrapper_cblas.h", ] -header_lines = [84, 70, 93, 22, 29, 38] -footer_lines = [17, 70, 19, 14, 6, 6] +header_lines = [84, 71, 93, 22, 29, 38] +footer_lines = [17, 71, 19, 14, 6, 6] # Checks whether the command-line arguments are valid; exists otherwise for f in files: diff --git a/scripts/generator/routine.py b/scripts/generator/routine.py index 2fd26e79..47790a55 100644 --- a/scripts/generator/routine.py +++ b/scripts/generator/routine.py @@ -73,7 +73,7 @@ class Routine(): # List of scalar buffers def 
ScalarBuffersFirst(self): - return ["dot","nrm2","asum","sum","imax"] + return ["dot","nrm2","asum","sum","imax","imin"] def ScalarBuffersSecond(self): return ["sa","sb","sc","ss","sd1","sd2","sx1","sy1","sparam"] diff --git a/src/clblast.cc b/src/clblast.cc index a5bb6b67..4d7c9986 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -32,6 +32,7 @@ #include "internal/routines/level1/xsum.h" // non-BLAS function #include "internal/routines/level1/xamax.h" #include "internal/routines/level1/xmax.h" // non-BLAS function +#include "internal/routines/level1/xmin.h" // non-BLAS function // BLAS level-2 includes #include "internal/routines/level2/xgemv.h" @@ -525,6 +526,37 @@ template StatusCode PUBLIC_API Max(const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN +template +StatusCode Min(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = Queue(*queue); + auto routine = Xmin(queue_cpp, event); + auto status = routine.SetUp(); + if (status != StatusCode::kSuccess) { return status; } + return routine.DoMin(n, + Buffer(imin_buffer), imin_offset, + Buffer(x_buffer), x_offset, x_inc); +} +template StatusCode PUBLIC_API Min(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Min(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Min(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Min(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + // 
================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= @@ -1880,6 +1912,7 @@ StatusCode FillCache(const cl_device_id device) { Xsum(queue, nullptr).SetUp(); Xsum(queue, nullptr).SetUp(); Xsum(queue, nullptr).SetUp(); Xsum(queue, nullptr).SetUp(); Xamax(queue, nullptr).SetUp(); Xamax(queue, nullptr).SetUp(); Xamax(queue, nullptr).SetUp(); Xamax(queue, nullptr).SetUp(); Xmax(queue, nullptr).SetUp(); Xmax(queue, nullptr).SetUp(); Xmax(queue, nullptr).SetUp(); Xmax(queue, nullptr).SetUp(); + Xmin(queue, nullptr).SetUp(); Xmin(queue, nullptr).SetUp(); Xmin(queue, nullptr).SetUp(); Xmin(queue, nullptr).SetUp(); // Runs all the level 2 set-up functions Xgemv(queue, nullptr).SetUp(); Xgemv(queue, nullptr).SetUp(); Xgemv(queue, nullptr).SetUp(); Xgemv(queue, nullptr).SetUp(); diff --git a/src/clblast_c.cc b/src/clblast_c.cc index 47ab1798..1fc63de2 100644 --- a/src/clblast_c.cc +++ b/src/clblast_c.cc @@ -601,6 +601,48 @@ StatusCode CLBlastiZmax(const size_t n, return static_cast(status); } +// MIN +StatusCode CLBlastiSmin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Min(n, + imin_buffer, imin_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastiDmin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Min(n, + imin_buffer, imin_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastiCmin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t 
x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Min(n, + imin_buffer, imin_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} +StatusCode CLBlastiZmin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Min(n, + imin_buffer, imin_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast(status); +} + // ================================================================================================= // BLAS level-2 (matrix-vector) routines // ================================================================================================= diff --git a/src/kernels/level1/xamax.opencl b/src/kernels/level1/xamax.opencl index 58b75ce2..48d0eb5c 100644 --- a/src/kernels/level1/xamax.opencl +++ b/src/kernels/level1/xamax.opencl @@ -41,7 +41,7 @@ __kernel void Xamax(const int n, const int num_groups = get_num_groups(0); // Performs loading and the first steps of the reduction - #if defined(ROUTINE_MAX) // non-absolute version + #if defined(ROUTINE_MAX) || defined(ROUTINE_MIN) // non-absolute version singlereal max = SMALLEST; #else singlereal max = ZERO; @@ -55,7 +55,10 @@ __kernel void Xamax(const int n, #else singlereal x = xgm[x_index]; #endif - #if defined(ROUTINE_MAX) // non-absolute version + #if defined(ROUTINE_MAX) // non-absolute maximum version + // nothing special here + #elif defined(ROUTINE_MIN) // non-absolute minimum version + x = -x; #else x = fabs(x); #endif From 9602c150aa3b7f0a392207bef8cbb6048b1da891 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 1 May 2016 12:56:08 +0200 Subject: [PATCH 43/60] Added a program cache (per-context) next to the per-device binary cache --- include/internal/cache.h | 40 ++++++++++-- include/internal/clpp11.h | 4 ++ include/internal/routine.h | 27 ++++---- 
src/cache.cc | 48 +++++++++++++-- src/routine.cc | 123 +++++++++++++++++++++---------------- 5 files changed, 167 insertions(+), 75 deletions(-) diff --git a/include/internal/cache.h b/include/internal/cache.h index fa33b78f..4a11b70f 100644 --- a/include/internal/cache.h +++ b/include/internal/cache.h @@ -7,7 +7,7 @@ // Author(s): // Cedric Nugteren // -// This file implements the caching functionality of compiled binaries. +// This file implements the caching functionality of compiled binaries and programs. // // ================================================================================================= @@ -46,18 +46,46 @@ static std::mutex binary_cache_mutex_; // ================================================================================================= -// Stores the compiled binary in the cache -void StoreBinaryToCache(const std::string& binary, const std::string &device_name, - const Precision &precision, const std::string &routine_name); +// The cache of compiled OpenCL programs, along with some meta-data +struct ProgramCache { + Program program; + ContextPointer context_ptr; + Precision precision; + std::string routine_name_; -// Queries the cache and retrieves a matching binary. Assumes that the match is available, throws -// otherwise. 
+ // Finds out whether the properties match + bool MatchInCache(const ContextPointer ref_context, const Precision &ref_precision, + const std::string &ref_routine) { + return (context_ptr == ref_context && + precision == ref_precision && + routine_name_ == ref_routine); + } +}; + +// The actual cache, implemented as a vector of the above data-type, and its mutex +static std::vector program_cache_; +static std::mutex program_cache_mutex_; + +// ================================================================================================= + +// Stores the compiled binary or program in the cache +void StoreBinaryToCache(const std::string &binary, const std::string &device_name, + const Precision &precision, const std::string &routine_name); +void StoreProgramToCache(const Program &program, const Context &context, + const Precision &precision, const std::string &routine_name); + +// Queries the cache and retrieves a matching binary or program. Assumes that the match is +// available, throws otherwise. 
const std::string& GetBinaryFromCache(const std::string &device_name, const Precision &precision, const std::string &routine_name); +const Program& GetProgramFromCache(const Context &context, const Precision &precision, + const std::string &routine_name); // Queries the cache to see whether or not the compiled kernel is already there bool BinaryIsInCache(const std::string &device_name, const Precision &precision, const std::string &routine_name); +bool ProgramIsInCache(const Context &context, const Precision &precision, + const std::string &routine_name); // ================================================================================================= diff --git a/include/internal/clpp11.h b/include/internal/clpp11.h index b865ab1e..e70f9000 100644 --- a/include/internal/clpp11.h +++ b/include/internal/clpp11.h @@ -269,10 +269,14 @@ class Context { // Accessor to the private data-member const cl_context& operator()() const { return *context_; } + cl_context* pointer() const { return &(*context_); } private: std::shared_ptr context_; }; +// Pointer to an OpenCL context +using ContextPointer = cl_context*; + // ================================================================================================= // Enumeration of build statuses of the run-time compilation process diff --git a/include/internal/routine.h b/include/internal/routine.h index 32be6012..f2f236ac 100644 --- a/include/internal/routine.h +++ b/include/internal/routine.h @@ -84,23 +84,28 @@ class Routine { const bool upper = false, const bool lower = false, const bool diagonal_imag_zero = false); - // Stores a newly compiled binary into the cache + // Stores a newly compiled binary/program into the cache void StoreBinaryToCache(const std::string& binary) const { - return cache::StoreBinaryToCache(binary, device_name_, precision_, routine_name_); + cache::StoreBinaryToCache(binary, device_name_, precision_, routine_name_); + } + void StoreProgramToCache(const Program& program) const { + 
cache::StoreProgramToCache(program, context_, precision_, routine_name_); } - // Queries the cache and retrieve either a matching program or a boolean whether a match exists. - // The first assumes that the program is available in the cache and will throw an exception - // otherwise. + // Queries the cache and retrieve either a matching binary/program or a boolean whether a match + // exists. The first assumes that the binary/program is available in the cache and will throw an + // exception otherwise. + std::string GetBinaryFromCache() const { + return cache::GetBinaryFromCache(device_name_, precision_, routine_name_); + } Program GetProgramFromCache() const { - auto& binary = cache::GetBinaryFromCache(device_name_, precision_, routine_name_); - auto program = Program(device_, context_, binary); - auto options = std::vector(); - program.Build(device_, options); - return program; + return cache::GetProgramFromCache(context_, precision_, routine_name_); + } + bool BinaryIsInCache() const { + return cache::BinaryIsInCache(device_name_, precision_, routine_name_); } bool ProgramIsInCache() const { - return cache::BinaryIsInCache(device_name_, precision_, routine_name_); + return cache::ProgramIsInCache(context_, precision_, routine_name_); } // Non-static variable for the precision. Note that the same variable (but static) might exist in diff --git a/src/cache.cc b/src/cache.cc index 18731a51..4dbdb711 100644 --- a/src/cache.cc +++ b/src/cache.cc @@ -7,7 +7,7 @@ // Author(s): // Cedric Nugteren // -// This file implements the caching functionality of compiled binaries. +// This file implements the caching functionality of compiled binaries and programs. 
// // ================================================================================================= @@ -22,13 +22,21 @@ namespace cache { // ================================================================================================= // Stores the compiled binary or IR in the cache -void StoreBinaryToCache(const std::string& binary, const std::string &device_name, +void StoreBinaryToCache(const std::string &binary, const std::string &device_name, const Precision &precision, const std::string &routine_name) { binary_cache_mutex_.lock(); - binary_cache_.push_back({binary, device_name, precision, routine_name}); + binary_cache_.push_back(BinaryCache{binary, device_name, precision, routine_name}); binary_cache_mutex_.unlock(); } +// Stores the compiled program in the cache +void StoreProgramToCache(const Program &program, const Context &context, + const Precision &precision, const std::string &routine_name) { + program_cache_mutex_.lock(); + program_cache_.push_back(ProgramCache{program, context.pointer(), precision, routine_name}); + program_cache_mutex_.unlock(); +} + // Queries the cache and retrieves a matching binary. Assumes that the match is available, throws // otherwise. const std::string& GetBinaryFromCache(const std::string &device_name, const Precision &precision, @@ -44,6 +52,21 @@ const std::string& GetBinaryFromCache(const std::string &device_name, const Prec throw std::runtime_error("Internal CLBlast error: Expected binary in cache, but found none."); } +// Queries the cache and retrieves a matching program. Assumes that the match is available, throws +// otherwise. 
+const Program& GetProgramFromCache(const Context &context, const Precision &precision, + const std::string &routine_name) { + program_cache_mutex_.lock(); + for (auto &cached_program: program_cache_) { + if (cached_program.MatchInCache(context.pointer(), precision, routine_name)) { + program_cache_mutex_.unlock(); + return cached_program.program; + } + } + program_cache_mutex_.unlock(); + throw std::runtime_error("Internal CLBlast error: Expected program in cache, but found none."); +} + // Queries the cache to see whether or not the compiled kernel is already there bool BinaryIsInCache(const std::string &device_name, const Precision &precision, const std::string &routine_name) { @@ -58,13 +81,30 @@ bool BinaryIsInCache(const std::string &device_name, const Precision &precision, return false; } +// Queries the cache to see whether or not the compiled kernel is already there +bool ProgramIsInCache(const Context &context, const Precision &precision, + const std::string &routine_name) { + program_cache_mutex_.lock(); + for (auto &cached_program: program_cache_) { + if (cached_program.MatchInCache(context.pointer(), precision, routine_name)) { + program_cache_mutex_.unlock(); + return true; + } + } + program_cache_mutex_.unlock(); + return false; +} + // ================================================================================================= -// Clears the cache of stored binaries +// Clears the cache of stored binaries and programs StatusCode ClearCache() { binary_cache_mutex_.lock(); binary_cache_.clear(); binary_cache_mutex_.unlock(); + program_cache_mutex_.lock(); + program_cache_.clear(); + program_cache_mutex_.unlock(); return StatusCode::kSuccess; } diff --git a/src/routine.cc b/src/routine.cc index cd4d82fb..35d0653c 100644 --- a/src/routine.cc +++ b/src/routine.cc @@ -42,66 +42,81 @@ Routine::Routine(Queue &queue, EventPointer event, const std::string &name, template StatusCode Routine::SetUp() { - // Queries the cache to see whether or not the 
compiled kernel is already there. If not, it will - // be built and added to the cache. - if (!ProgramIsInCache()) { + // Queries the cache to see whether or not the program (context-specific) is already there + if (ProgramIsInCache()) { return StatusCode::kSuccess; } - // Inspects whether or not cl_khr_fp64 is supported in case of double precision - auto extensions = device_.Capabilities(); - if (precision_ == Precision::kDouble || precision_ == Precision::kComplexDouble) { - if (extensions.find(kKhronosDoublePrecision) == std::string::npos) { - return StatusCode::kNoDoublePrecision; - } - } - - // As above, but for cl_khr_fp16 (half precision) - if (precision_ == Precision::kHalf) { - if (extensions.find(kKhronosHalfPrecision) == std::string::npos) { - return StatusCode::kNoHalfPrecision; - } - } - - // Loads the common header (typedefs and defines and such) - std::string common_header = - #include "kernels/common.opencl" - ; - - // Collects the parameters for this device in the form of defines, and adds the precision - auto defines = db_.GetDefines(); - defines += "#define PRECISION "+ToString(static_cast(precision_))+"\n"; - - // Adds the name of the routine as a define - defines += "#define ROUTINE_"+routine_name_+"\n"; - - // For specific devices, use the non-IEE754 compilant OpenCL mad() instruction. This can improve - // performance, but might result in a reduced accuracy. - if (device_.Vendor() == "AMD") { - defines += "#define USE_CL_MAD 1\n"; - } - - // Combines everything together into a single source string - auto source_string = defines + common_header + source_string_; - - // Compiles the kernel + // Queries the cache to see whether or not the binary (device-specific) is already there. 
If it + // is, a program is created and stored in the cache + if (BinaryIsInCache()) { try { - auto program = Program(context_, source_string); + auto& binary = cache::GetBinaryFromCache(device_name_, precision_, routine_name_); + auto program = Program(device_, context_, binary); auto options = std::vector(); - auto build_status = program.Build(device_, options); - - // Checks for compiler crashes/errors/warnings - if (build_status == BuildStatus::kError) { - auto message = program.GetBuildInfo(device_); - fprintf(stdout, "OpenCL compiler error/warning: %s\n", message.c_str()); - return StatusCode::kBuildProgramFailure; - } - if (build_status == BuildStatus::kInvalid) { return StatusCode::kInvalidBinary; } - - // Store the compiled kernel in the cache - auto binary = program.GetIR(); - StoreBinaryToCache(binary); + program.Build(device_, options); + StoreProgramToCache(program); } catch (...) { return StatusCode::kBuildProgramFailure; } + return StatusCode::kSuccess; } + // Otherwise, the kernel will be compiled and program will be built. Both the binary and the + // program will be added to the cache. 
+ + // Inspects whether or not cl_khr_fp64 is supported in case of double precision + auto extensions = device_.Capabilities(); + if (precision_ == Precision::kDouble || precision_ == Precision::kComplexDouble) { + if (extensions.find(kKhronosDoublePrecision) == std::string::npos) { + return StatusCode::kNoDoublePrecision; + } + } + + // As above, but for cl_khr_fp16 (half precision) + if (precision_ == Precision::kHalf) { + if (extensions.find(kKhronosHalfPrecision) == std::string::npos) { + return StatusCode::kNoHalfPrecision; + } + } + + // Loads the common header (typedefs and defines and such) + std::string common_header = + #include "kernels/common.opencl" + ; + + // Collects the parameters for this device in the form of defines, and adds the precision + auto defines = db_.GetDefines(); + defines += "#define PRECISION "+ToString(static_cast(precision_))+"\n"; + + // Adds the name of the routine as a define + defines += "#define ROUTINE_"+routine_name_+"\n"; + + // For specific devices, use the non-IEE754 compilant OpenCL mad() instruction. This can improve + // performance, but might result in a reduced accuracy. 
+ if (device_.Vendor() == "AMD") { + defines += "#define USE_CL_MAD 1\n"; + } + + // Combines everything together into a single source string + auto source_string = defines + common_header + source_string_; + + // Compiles the kernel + try { + auto program = Program(context_, source_string); + auto options = std::vector(); + auto build_status = program.Build(device_, options); + + // Checks for compiler crashes/errors/warnings + if (build_status == BuildStatus::kError) { + auto message = program.GetBuildInfo(device_); + fprintf(stdout, "OpenCL compiler error/warning: %s\n", message.c_str()); + return StatusCode::kBuildProgramFailure; + } + if (build_status == BuildStatus::kInvalid) { return StatusCode::kInvalidBinary; } + + // Store the compiled binary and program in the cache + const auto binary = program.GetIR(); + StoreBinaryToCache(binary); + StoreProgramToCache(program); + } catch (...) { return StatusCode::kBuildProgramFailure; } + // No errors, normal termination of this function return StatusCode::kSuccess; } From bee2f943ec51a3482e89cf635a2d12b6b6d96b04 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 1 May 2016 14:03:37 +0200 Subject: [PATCH 44/60] Changed the index buffer of IxAMAX routines to unsigned int for proper buffersize checking --- include/internal/routine.h | 6 +++++- include/internal/routines/level1/xamax.h | 4 ++-- include/internal/routines/level1/xmax.h | 2 +- include/internal/routines/level1/xmin.h | 2 +- scripts/generator/routine.py | 7 ++++++- src/clblast.cc | 6 +++--- src/routine.cc | 13 +++++++++++++ src/routines/level1/xamax.cc | 4 ++-- 8 files changed, 33 insertions(+), 11 deletions(-) diff --git a/include/internal/routine.h b/include/internal/routine.h index f2f236ac..6df186c5 100644 --- a/include/internal/routine.h +++ b/include/internal/routine.h @@ -62,13 +62,17 @@ class Routine { StatusCode TestMatrixAP(const size_t n, const Buffer &buffer, const size_t offset, const size_t data_size); - // Tests for valid inputs of 
vectors X and Y + // Tests for valid inputs of vector X and Y StatusCode TestVectorX(const size_t n, const Buffer &buffer, const size_t offset, const size_t inc, const size_t data_size); StatusCode TestVectorY(const size_t n, const Buffer &buffer, const size_t offset, const size_t inc, const size_t data_size); + + // Tests for valid inputs of other vectors StatusCode TestVectorDot(const size_t n, const Buffer &buffer, const size_t offset, const size_t data_size); + StatusCode TestVectorIndex(const size_t n, const Buffer &buffer, + const size_t offset, const size_t data_size); // Copies/transposes a matrix and padds/unpads it with zeroes. This method is also able to write // to symmetric and triangular matrices through optional arguments. diff --git a/include/internal/routines/level1/xamax.h b/include/internal/routines/level1/xamax.h index b815e8d2..c318115e 100644 --- a/include/internal/routines/level1/xamax.h +++ b/include/internal/routines/level1/xamax.h @@ -32,7 +32,7 @@ class Xamax: public Routine { using Routine::context_; using Routine::GetProgramFromCache; using Routine::TestVectorX; - using Routine::TestVectorDot; + using Routine::TestVectorIndex; using Routine::RunKernel; using Routine::ErrorIn; @@ -41,7 +41,7 @@ class Xamax: public Routine { // Templated-precision implementation of the routine StatusCode DoAmax(const size_t n, - const Buffer &imax_buffer, const size_t imax_offset, + const Buffer &imax_buffer, const size_t imax_offset, const Buffer &x_buffer, const size_t x_offset, const size_t x_inc); private: diff --git a/include/internal/routines/level1/xmax.h b/include/internal/routines/level1/xmax.h index 860a043b..a872cede 100644 --- a/include/internal/routines/level1/xmax.h +++ b/include/internal/routines/level1/xmax.h @@ -36,7 +36,7 @@ class Xmax: public Xamax { // Forwards to the regular absolute version. The implementation difference is realised in the // kernel through a pre-processor macro based on the name of the routine. 
StatusCode DoMax(const size_t n, - const Buffer &imax_buffer, const size_t imax_offset, + const Buffer &imax_buffer, const size_t imax_offset, const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { return DoAmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc); } diff --git a/include/internal/routines/level1/xmin.h b/include/internal/routines/level1/xmin.h index 4c99a5ad..700c81cc 100644 --- a/include/internal/routines/level1/xmin.h +++ b/include/internal/routines/level1/xmin.h @@ -36,7 +36,7 @@ class Xmin: public Xamax { // Forwards to the regular max-absolute version. The implementation difference is realised in the // kernel through a pre-processor macro based on the name of the routine. StatusCode DoMin(const size_t n, - const Buffer &imin_buffer, const size_t imin_offset, + const Buffer &imin_buffer, const size_t imin_offset, const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { return DoAmax(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc); } diff --git a/scripts/generator/routine.py b/scripts/generator/routine.py index 47790a55..95681da6 100644 --- a/scripts/generator/routine.py +++ b/scripts/generator/routine.py @@ -81,6 +81,10 @@ class Routine(): def OtherScalars(self): return ["cos","sin"] + # List of buffers with unsigned int type + def IndexBuffers(self): + return ["imax","imin"] + # List of buffers without 'inc' or 'ld' def BuffersWithoutLdInc(self): return self.ScalarBuffersFirst() + self.ScalarBuffersSecond() + ["ap"] @@ -145,7 +149,8 @@ class Routine(): # As above but with Claduc buffers def BufferCladuc(self, name): if (name in self.inputs) or (name in self.outputs): - a = ["Buffer<"+self.template.buffertype+">("+name+"_buffer)"] + buffertype = "unsigned int" if (name in self.IndexBuffers()) else self.template.buffertype + a = ["Buffer<"+buffertype+">("+name+"_buffer)"] b = [name+"_offset"] c = [name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else [] return [", ".join(a+b+c)] diff 
--git a/src/clblast.cc b/src/clblast.cc index 4d7c9986..8a9465c3 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -475,7 +475,7 @@ StatusCode Amax(const size_t n, auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoAmax(n, - Buffer(imax_buffer), imax_offset, + Buffer(imax_buffer), imax_offset, Buffer(x_buffer), x_offset, x_inc); } template StatusCode PUBLIC_API Amax(const size_t, @@ -506,7 +506,7 @@ StatusCode Max(const size_t n, auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoMax(n, - Buffer(imax_buffer), imax_offset, + Buffer(imax_buffer), imax_offset, Buffer(x_buffer), x_offset, x_inc); } template StatusCode PUBLIC_API Max(const size_t, @@ -537,7 +537,7 @@ StatusCode Min(const size_t n, auto status = routine.SetUp(); if (status != StatusCode::kSuccess) { return status; } return routine.DoMin(n, - Buffer(imin_buffer), imin_offset, + Buffer(imin_buffer), imin_offset, Buffer(x_buffer), x_offset, x_inc); } template StatusCode PUBLIC_API Min(const size_t, diff --git a/src/routine.cc b/src/routine.cc index 35d0653c..8fa4cc79 100644 --- a/src/routine.cc +++ b/src/routine.cc @@ -267,6 +267,19 @@ StatusCode Routine::TestVectorDot(const size_t n, const Buffer &buffer, co return StatusCode::kSuccess; } +// Tests vector index for validity: checks for a valid increment, a valid OpenCL buffer, and for a +// sufficient buffer size. +template +StatusCode Routine::TestVectorIndex(const size_t n, const Buffer &buffer, + const size_t offset, const size_t data_size) { + try { + auto required_size = (n + offset)*data_size; + auto buffer_size = buffer.GetSize(); + if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryDot; } + } catch (...) 
{ return StatusCode::kInvalidVectorDot; } + return StatusCode::kSuccess; +} + // ================================================================================================= // Copies or transposes a matrix and pads/unpads it with zeros diff --git a/src/routines/level1/xamax.cc b/src/routines/level1/xamax.cc index 33bd72a6..682e2b63 100644 --- a/src/routines/level1/xamax.cc +++ b/src/routines/level1/xamax.cc @@ -41,7 +41,7 @@ Xamax::Xamax(Queue &queue, EventPointer event, const std::string &name): // The main routine template StatusCode Xamax::DoAmax(const size_t n, - const Buffer &imax_buffer, const size_t imax_offset, + const Buffer &imax_buffer, const size_t imax_offset, const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { // Makes sure all dimensions are larger than zero @@ -50,7 +50,7 @@ StatusCode Xamax::DoAmax(const size_t n, // Tests the vectors for validity auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T)); if (ErrorIn(status)) { return status; } - status = TestVectorDot(1, imax_buffer, imax_offset, sizeof(T)); + status = TestVectorIndex(1, imax_buffer, imax_offset, sizeof(unsigned int)); if (ErrorIn(status)) { return status; } // Retrieves the Xamax kernels from the compiled binary From b9317d7d0c5ac7da9b6377183c9d7640defd6046 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 1 May 2016 14:39:44 +0200 Subject: [PATCH 45/60] Made the default xDOT tuning size smaller --- src/tuning/xdot.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tuning/xdot.cc b/src/tuning/xdot.cc index 48fa800b..cff656c3 100644 --- a/src/tuning/xdot.cc +++ b/src/tuning/xdot.cc @@ -44,7 +44,7 @@ class TuneXdot { // Sets the default values for the arguments static size_t DefaultM() { return 1; } // N/A for this kernel - static size_t DefaultN() { return 64*1024*1024; } + static size_t DefaultN() { return 2*1024*1024; } static size_t DefaultK() { return 1; } // N/A for this kernel static double DefaultFraction() { 
return 1.0; } // N/A for this kernel From c94b628318d3f074a684a0a0df5128343179bed0 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 1 May 2016 19:17:04 +0200 Subject: [PATCH 46/60] Updated tuning database for reduction/dot kernels based on the new tuner; partially repopulated the database --- include/internal/database/xdot.h | 174 +++++++++---------------------- scripts/database/database.py | 3 + 2 files changed, 55 insertions(+), 122 deletions(-) diff --git a/include/internal/database/xdot.h b/include/internal/database/xdot.h index b741e317..231eaf84 100644 --- a/include/internal/database/xdot.h +++ b/include/internal/database/xdot.h @@ -18,54 +18,36 @@ const Database::DatabaseEntry Database::XdotSingle = { "Xdot", Precision::kSingle, { { // AMD GPUs kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } }, - { "Tahiti", { {"VW",1}, {"WGS1",256}, {"WGS2",256} } }, - { "default", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } }, - } - }, - { // ARM GPUs - kDeviceTypeGPU, "ARM", { - { "Mali-T628", { {"VW",1}, {"WGS1",128}, {"WGS2",256} } }, - { "default", { {"VW",1}, {"WGS1",128}, {"WGS2",256} } }, + { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WGS2",32} } }, + { "Tahiti", { {"WGS1",128}, {"WGS2",32} } }, + { "default", { {"WGS1",128}, {"WGS2",32} } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } }, - { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } }, - { "default", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } }, + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",1024}, {"WGS2",32} } }, + { "default", { {"WGS1",1024}, {"WGS2",32} } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { - { "Iris", { {"VW",1}, {"WGS1",512}, {"WGS2",32} } }, - { "Iris Pro", { {"VW",1}, {"WGS1",128}, 
{"WGS2",512} } }, - { "default", { {"VW",1}, {"WGS1",128}, {"WGS2",32} } }, - } - }, - { // Intel accelerators - kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } }, - { "default", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } }, + { "Iris Pro", { {"WGS1",512}, {"WGS2",64} } }, + { "default", { {"WGS1",512}, {"WGS2",64} } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { - { "GeForce GTX 480", { {"VW",1}, {"WGS1",256}, {"WGS2",128} } }, - { "GeForce GTX 680", { {"VW",1}, {"WGS1",128}, {"WGS2",128} } }, - { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } }, - { "GeForce GTX 980", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } }, - { "GeForce GTX TITAN", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } }, - { "GeForce GTX TITAN X", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } }, - { "Tesla K20m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } }, - { "Tesla K40m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } }, - { "default", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } }, + { "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } }, + { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",128} } }, + { "GeForce GTX 980", { {"WGS1",256}, {"WGS2",32} } }, + { "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } }, + { "Tesla K20m", { {"WGS1",1024}, {"WGS2",32} } }, + { "default", { {"WGS1",128}, {"WGS2",32} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } }, + { "default", { {"WGS1",128}, {"WGS2",32} } }, } }, } @@ -77,54 +59,36 @@ const Database::DatabaseEntry Database::XdotComplexSingle = { "Xdot", Precision::kComplexSingle, { { // AMD GPUs kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } }, - { "Tahiti", { {"VW",1}, {"WGS1",64}, {"WGS2",256} } }, - { "default", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } }, - } - }, - { // ARM GPUs - kDeviceTypeGPU, "ARM", { - { "Mali-T628", { {"VW",1}, {"WGS1",128}, 
{"WGS2",512} } }, - { "default", { {"VW",1}, {"WGS1",128}, {"WGS2",512} } }, + { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",32} } }, + { "Tahiti", { {"WGS1",64}, {"WGS2",32} } }, + { "default", { {"WGS1",64}, {"WGS2",32} } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } }, - { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } }, - { "default", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } }, + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",1024}, {"WGS2",32} } }, + { "default", { {"WGS1",1024}, {"WGS2",32} } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { - { "Iris", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } }, - { "Iris Pro", { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } }, - { "default", { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } }, - } - }, - { // Intel accelerators - kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } }, - { "default", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } }, + { "Iris Pro", { {"WGS1",32}, {"WGS2",32} } }, + { "default", { {"WGS1",32}, {"WGS2",32} } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { - { "GeForce GTX 480", { {"VW",1}, {"WGS1",512}, {"WGS2",512} } }, - { "GeForce GTX 680", { {"VW",1}, {"WGS1",256}, {"WGS2",32} } }, - { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS1",128}, {"WGS2",32} } }, - { "GeForce GTX 980", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } }, - { "GeForce GTX TITAN", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } }, - { "GeForce GTX TITAN X", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } }, - { "Tesla K20m", { {"VW",1}, {"WGS1",256}, {"WGS2",512} } }, - { "Tesla K40m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } }, - { "default", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } }, + { "GeForce GTX 480", { {"WGS1",512}, 
{"WGS2",32} } }, + { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } }, + { "GeForce GTX 980", { {"WGS1",256}, {"WGS2",64} } }, + { "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } }, + { "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } }, + { "default", { {"WGS1",128}, {"WGS2",32} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } }, + { "default", { {"WGS1",32}, {"WGS2",32} } }, } }, } @@ -136,47 +100,30 @@ const Database::DatabaseEntry Database::XdotDouble = { "Xdot", Precision::kDouble, { { // AMD GPUs kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } }, - { "Tahiti", { {"VW",1}, {"WGS1",64}, {"WGS2",256} } }, - { "default", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } }, - } - }, - { // ARM GPUs - kDeviceTypeGPU, "ARM", { - { "Mali-T628", { {"VW",1}, {"WGS1",64}, {"WGS2",512} } }, - { "default", { {"VW",1}, {"WGS1",64}, {"WGS2",512} } }, + { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",128} } }, + { "Tahiti", { {"WGS1",256}, {"WGS2",32} } }, + { "default", { {"WGS1",64}, {"WGS2",32} } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS1",512}, {"WGS2",512} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } }, - { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS1",1024}, {"WGS2",512} } }, - { "default", { {"VW",1}, {"WGS1",64}, {"WGS2",512} } }, - } - }, - { // Intel accelerators - kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } }, - { "default", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } }, + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",64} } }, + { "default", { {"WGS1",512}, {"WGS2",64} } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { - { "GeForce GTX 480", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } }, - { 
"GeForce GTX 680", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } }, - { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } }, - { "GeForce GTX 980", { {"VW",1}, {"WGS1",32}, {"WGS2",512} } }, - { "GeForce GTX TITAN", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } }, - { "GeForce GTX TITAN X", { {"VW",1}, {"WGS1",128}, {"WGS2",128} } }, - { "Tesla K20m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } }, - { "Tesla K40m", { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } }, - { "default", { {"VW",1}, {"WGS1",32}, {"WGS2",128} } }, + { "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } }, + { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } }, + { "GeForce GTX 980", { {"WGS1",128}, {"WGS2",32} } }, + { "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } }, + { "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } }, + { "default", { {"WGS1",128}, {"WGS2",32} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW",1}, {"WGS1",32}, {"WGS2",128} } }, + { "default", { {"WGS1",64}, {"WGS2",32} } }, } }, } @@ -188,47 +135,30 @@ const Database::DatabaseEntry Database::XdotComplexDouble = { "Xdot", Precision::kComplexDouble, { { // AMD GPUs kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } }, - { "Tahiti", { {"VW",1}, {"WGS1",64}, {"WGS2",256} } }, - { "default", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } }, - } - }, - { // ARM GPUs - kDeviceTypeGPU, "ARM", { - { "Mali-T628", { {"VW",1}, {"WGS1",32}, {"WGS2",64} } }, - { "default", { {"VW",1}, {"WGS1",32}, {"WGS2",64} } }, + { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",32} } }, + { "Tahiti", { {"WGS1",256}, {"WGS2",32} } }, + { "default", { {"WGS1",64}, {"WGS2",32} } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } }, - { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, 
{"WGS1",512}, {"WGS2",1024} } }, - { "default", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } }, - } - }, - { // Intel accelerators - kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS1",32}, {"WGS2",1024} } }, - { "default", { {"VW",1}, {"WGS1",32}, {"WGS2",1024} } }, + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",1024}, {"WGS2",32} } }, + { "default", { {"WGS1",1024}, {"WGS2",32} } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { - { "GeForce GTX 480", { {"VW",1}, {"WGS1",512}, {"WGS2",512} } }, - { "GeForce GTX 680", { {"VW",1}, {"WGS1",256}, {"WGS2",64} } }, - { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS1",32}, {"WGS2",64} } }, - { "GeForce GTX 980", { {"VW",1}, {"WGS1",32}, {"WGS2",128} } }, - { "GeForce GTX TITAN", { {"VW",1}, {"WGS1",128}, {"WGS2",512} } }, - { "GeForce GTX TITAN X", { {"VW",1}, {"WGS1",128}, {"WGS2",128} } }, - { "Tesla K20m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } }, - { "Tesla K40m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } }, - { "default", { {"VW",1}, {"WGS1",32}, {"WGS2",64} } }, + { "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } }, + { "GeForce GTX 680", { {"WGS1",256}, {"WGS2",64} } }, + { "GeForce GTX 980", { {"WGS1",64}, {"WGS2",32} } }, + { "GeForce GTX TITAN X", { {"WGS1",128}, {"WGS2",32} } }, + { "Tesla K20m", { {"WGS1",128}, {"WGS2",32} } }, + { "default", { {"WGS1",64}, {"WGS2",32} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW",1}, {"WGS1",32}, {"WGS2",32} } }, + { "default", { {"WGS1",64}, {"WGS2",32} } }, } }, } diff --git a/scripts/database/database.py b/scripts/database/database.py index 7fd8c4d8..67c5669b 100644 --- a/scripts/database/database.py +++ b/scripts/database/database.py @@ -95,6 +95,9 @@ def RemoveDuplicates(df): def RemoveEntriesByDevice(df, devicename): return df[df["device"] != devicename] +def RemoveEntriesByKernelFamily(df, familyname): + return df[df["kernel_family"] != familyname] + def 
GetEntriesByField(df, field, value): return df[df[field] == value] From 27d0ac7f388d8d8f06eb704ee8173ffb55ac25a1 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 1 May 2016 19:33:50 +0200 Subject: [PATCH 47/60] Added tuning results for AMD Pitcairn (R9 270X) --- include/internal/database/copy.h | 8 ++++++-- include/internal/database/pad.h | 10 +++++++--- include/internal/database/padtranspose.h | 8 ++++++-- include/internal/database/transpose.h | 8 ++++++-- include/internal/database/xaxpy.h | 4 ++++ include/internal/database/xdot.h | 4 ++++ include/internal/database/xgemm.h | 8 ++++++-- include/internal/database/xgemv.h | 6 +++++- include/internal/database/xger.h | 10 +++++++--- scripts/database/database.py | 2 ++ 10 files changed, 53 insertions(+), 15 deletions(-) diff --git a/include/internal/database/copy.h b/include/internal/database/copy.h index 42c9c21c..9787ec18 100644 --- a/include/internal/database/copy.h +++ b/include/internal/database/copy.h @@ -19,8 +19,9 @@ const Database::DatabaseEntry Database::CopySingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, + { "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "Tahiti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, - { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, } }, { // ARM GPUs @@ -78,6 +79,7 @@ const Database::DatabaseEntry Database::CopyComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, { "default", { {"COPY_DIMX",8}, 
{"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, } @@ -129,6 +131,7 @@ const Database::DatabaseEntry Database::CopyDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "Pitcairn", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, } @@ -181,8 +184,9 @@ const Database::DatabaseEntry Database::CopyComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "Pitcairn", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, } }, { // ARM GPUs diff --git a/include/internal/database/pad.h b/include/internal/database/pad.h index 8e8ae966..c9bbe8fa 100644 --- a/include/internal/database/pad.h +++ b/include/internal/database/pad.h @@ -19,6 +19,7 @@ const Database::DatabaseEntry Database::PadSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "Pitcairn", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } @@ -78,8 +79,9 @@ const Database::DatabaseEntry Database::PadComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, 
{"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tahiti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } }, { // ARM GPUs @@ -124,7 +126,7 @@ const Database::DatabaseEntry Database::PadComplexSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } }, } @@ -137,8 +139,9 @@ const Database::DatabaseEntry Database::PadDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } }, { // ARM GPUs @@ -189,6 +192,7 @@ const Database::DatabaseEntry Database::PadComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "Pitcairn", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } diff --git a/include/internal/database/padtranspose.h b/include/internal/database/padtranspose.h index 6e210e84..08396086 100644 --- a/include/internal/database/padtranspose.h +++ b/include/internal/database/padtranspose.h @@ 
-19,6 +19,7 @@ const Database::DatabaseEntry Database::PadtransposeSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, + { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, } @@ -78,8 +79,9 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, + { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, } }, { // ARM GPUs @@ -137,8 +139,9 @@ const Database::DatabaseEntry Database::PadtransposeDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, + { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Tahiti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, } }, { // ARM GPUs @@ -189,6 +192,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, + { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } 
}, } diff --git a/include/internal/database/transpose.h b/include/internal/database/transpose.h index 005a6921..de826857 100644 --- a/include/internal/database/transpose.h +++ b/include/internal/database/transpose.h @@ -19,8 +19,9 @@ const Database::DatabaseEntry Database::TransposeSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } }, + { "Pitcairn", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, + { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, } }, { // ARM GPUs @@ -78,6 +79,7 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, + { "Pitcairn", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, } @@ -131,8 +133,9 @@ const Database::DatabaseEntry Database::TransposeDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, + { "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, + { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, } }, { // ARM GPUs @@ -183,6 +186,7 @@ const Database::DatabaseEntry Database::TransposeComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", 
{ { "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, + { "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, } diff --git a/include/internal/database/xaxpy.h b/include/internal/database/xaxpy.h index 71a4c7f2..09706fb0 100644 --- a/include/internal/database/xaxpy.h +++ b/include/internal/database/xaxpy.h @@ -19,6 +19,7 @@ const Database::DatabaseEntry Database::XaxpySingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, + { "Pitcairn", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, { "Tahiti", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, } @@ -78,6 +79,7 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW",2}, {"WGS",64}, {"WPT",8} } }, + { "Pitcairn", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, } @@ -137,6 +139,7 @@ const Database::DatabaseEntry Database::XaxpyDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, + { "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, } @@ -189,6 +192,7 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, + { "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "Tahiti", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, } diff 
--git a/include/internal/database/xdot.h b/include/internal/database/xdot.h index 231eaf84..d02bc03b 100644 --- a/include/internal/database/xdot.h +++ b/include/internal/database/xdot.h @@ -19,6 +19,7 @@ const Database::DatabaseEntry Database::XdotSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WGS2",32} } }, + { "Pitcairn", { {"WGS1",128}, {"WGS2",32} } }, { "Tahiti", { {"WGS1",128}, {"WGS2",32} } }, { "default", { {"WGS1",128}, {"WGS2",32} } }, } @@ -60,6 +61,7 @@ const Database::DatabaseEntry Database::XdotComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",32} } }, + { "Pitcairn", { {"WGS1",256}, {"WGS2",32} } }, { "Tahiti", { {"WGS1",64}, {"WGS2",32} } }, { "default", { {"WGS1",64}, {"WGS2",32} } }, } @@ -101,6 +103,7 @@ const Database::DatabaseEntry Database::XdotDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",128} } }, + { "Pitcairn", { {"WGS1",128}, {"WGS2",32} } }, { "Tahiti", { {"WGS1",256}, {"WGS2",32} } }, { "default", { {"WGS1",64}, {"WGS2",32} } }, } @@ -136,6 +139,7 @@ const Database::DatabaseEntry Database::XdotComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",32} } }, + { "Pitcairn", { {"WGS1",256}, {"WGS2",32} } }, { "Tahiti", { {"WGS1",256}, {"WGS2",32} } }, { "default", { {"WGS1",64}, {"WGS2",32} } }, } diff --git a/include/internal/database/xgemm.h b/include/internal/database/xgemm.h index c9fe03e4..7f005cf1 100644 --- a/include/internal/database/xgemm.h +++ b/include/internal/database/xgemm.h @@ -19,8 +19,9 @@ const Database::DatabaseEntry Database::XgemmSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, 
{"STRN",1}, {"VWM",4}, {"VWN",1} } }, + { "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } }, - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, } }, { // ARM GPUs @@ -78,6 +79,7 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } }, + { "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } }, { "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } }, { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } }, } @@ -137,8 +139,9 @@ const Database::DatabaseEntry Database::XgemmDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, 
{"STRN",1}, {"VWM",2}, {"VWN",8} } }, + { "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, { "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } }, - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } }, + { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, } }, { // ARM GPUs @@ -189,6 +192,7 @@ const Database::DatabaseEntry Database::XgemmComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, + { "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, } diff --git a/include/internal/database/xgemv.h b/include/internal/database/xgemv.h index c3ce3b20..14e428d9 100644 --- a/include/internal/database/xgemv.h +++ b/include/internal/database/xgemv.h @@ -19,6 +19,7 @@ const Database::DatabaseEntry Database::XgemvSingle = { { // AMD GPUs kDeviceTypeGPU, 
"AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, { "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, { "default", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, } @@ -71,8 +72,9 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",256}, {"WPT2",2}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, + { "Pitcairn", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, { "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, } }, { // Intel CPUs @@ -119,6 +121,7 @@ const Database::DatabaseEntry Database::XgemvDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, + { "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, { "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, } @@ -164,6 +167,7 @@ const Database::DatabaseEntry Database::XgemvComplexDouble = { { // AMD GPUs kDeviceTypeGPU, 
"AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, + { "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, { "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, } diff --git a/include/internal/database/xger.h b/include/internal/database/xger.h index d57e606f..03c7ce41 100644 --- a/include/internal/database/xger.h +++ b/include/internal/database/xger.h @@ -19,8 +19,9 @@ const Database::DatabaseEntry Database::XgerSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } }, + { "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, { "Tahiti", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } }, - { "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } }, + { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, } }, { // ARM GPUs @@ -65,6 +66,7 @@ const Database::DatabaseEntry Database::XgerComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } }, + { "Pitcairn", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } }, { "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } }, { "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } }, } @@ -111,8 +113,9 @@ const Database::DatabaseEntry Database::XgerDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } }, + { "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, { "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } }, - { "default", { {"WGS1",32}, {"WGS2",2}, {"WPT",1} } }, + { "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } }, } }, { // ARM GPUs @@ -138,7 +141,7 @@ const 
Database::DatabaseEntry Database::XgerDouble = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",16}, {"WGS2",2}, {"WPT",1} } }, + { "default", { {"WGS1",16}, {"WGS2",1}, {"WPT",1} } }, } }, } @@ -151,6 +154,7 @@ const Database::DatabaseEntry Database::XgerComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, + { "Pitcairn", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } }, { "Tahiti", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } }, { "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } }, } diff --git a/scripts/database/database.py b/scripts/database/database.py index 67c5669b..d14e36cc 100644 --- a/scripts/database/database.py +++ b/scripts/database/database.py @@ -101,6 +101,8 @@ def RemoveEntriesByKernelFamily(df, familyname): def GetEntriesByField(df, field, value): return df[df[field] == value] +# Example usage: +# df = UpdateDatabase(df, (df["kernel_family"] == "xdot") & (df["arg_n"] == "67108864"), "arg_n", "2097152") def UpdateDatabase(df, condition, field, value): df.loc[condition, field] = value return df From a8f109296c418240e4133f3a32b53ce6017683f2 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Mon, 2 May 2016 20:04:55 +0200 Subject: [PATCH 48/60] Fixed the calculation of the required buffer sizes in case of subvectors and submatrices --- src/routine.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/routine.cc b/src/routine.cc index 8fa4cc79..e0cc9a90 100644 --- a/src/routine.cc +++ b/src/routine.cc @@ -175,7 +175,7 @@ StatusCode Routine::TestMatrixA(const size_t one, const size_t two, const Buf const size_t offset, const size_t ld, const size_t data_size) { if (ld < one) { return StatusCode::kInvalidLeadDimA; } try { - auto required_size = (ld*two + offset)*data_size; + auto required_size = (ld*(two-1) + one + offset)*data_size; auto buffer_size = buffer.GetSize(); if (buffer_size < required_size) { return 
StatusCode::kInsufficientMemoryA; } } catch (...) { return StatusCode::kInvalidMatrixA; } @@ -189,7 +189,7 @@ StatusCode Routine::TestMatrixB(const size_t one, const size_t two, const Buf const size_t offset, const size_t ld, const size_t data_size) { if (ld < one) { return StatusCode::kInvalidLeadDimB; } try { - auto required_size = (ld*two + offset)*data_size; + auto required_size = (ld*(two-1) + one + offset)*data_size; auto buffer_size = buffer.GetSize(); if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryB; } } catch (...) { return StatusCode::kInvalidMatrixB; } @@ -203,7 +203,7 @@ StatusCode Routine::TestMatrixC(const size_t one, const size_t two, const Buf const size_t offset, const size_t ld, const size_t data_size) { if (ld < one) { return StatusCode::kInvalidLeadDimC; } try { - auto required_size = (ld*two + offset)*data_size; + auto required_size = (ld*(two-1) + one + offset)*data_size; auto buffer_size = buffer.GetSize(); if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryC; } } catch (...) { return StatusCode::kInvalidMatrixC; } @@ -231,7 +231,7 @@ StatusCode Routine::TestVectorX(const size_t n, const Buffer &buffer, cons const size_t inc, const size_t data_size) { if (inc == 0) { return StatusCode::kInvalidIncrementX; } try { - auto required_size = (n*inc + offset)*data_size; + auto required_size = ((n-1)*inc + 1 + offset)*data_size; auto buffer_size = buffer.GetSize(); if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryX; } } catch (...) 
{ return StatusCode::kInvalidVectorX; } @@ -245,7 +245,7 @@ StatusCode Routine::TestVectorY(const size_t n, const Buffer &buffer, cons const size_t inc, const size_t data_size) { if (inc == 0) { return StatusCode::kInvalidIncrementY; } try { - auto required_size = (n*inc + offset)*data_size; + auto required_size = ((n-1)*inc + 1 + offset)*data_size; auto buffer_size = buffer.GetSize(); if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryY; } } catch (...) { return StatusCode::kInvalidVectorY; } From 435729a43ecb3eacb3347ad7cf37f8479680d423 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Mon, 2 May 2016 20:20:23 +0200 Subject: [PATCH 49/60] Added tuning results for AMD Hawaii (R9 290X) --- CHANGELOG | 1 + README.md | 2 ++ include/internal/database/copy.h | 4 ++++ include/internal/database/pad.h | 4 ++++ include/internal/database/padtranspose.h | 4 ++++ include/internal/database/transpose.h | 6 +++++- include/internal/database/xaxpy.h | 6 +++++- include/internal/database/xdot.h | 4 ++++ include/internal/database/xgemm.h | 10 +++++++--- include/internal/database/xgemv.h | 4 ++++ include/internal/database/xger.h | 6 +++++- 11 files changed, 45 insertions(+), 6 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index f68c2483..dcfbaaae 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -7,6 +7,7 @@ Development version (next release) - Changed the enum parameters to match the raw values of the cblas standard - Fixed the cache of previously compiled binaries and added a function to fill or clear it - Added additional sample programs +- Added tuned parameters for various devices (see README) - Added level-1 routines: * SNRM2/DNRM2/ScNRM2/DzNRM2 * SASUM/DASUM/ScASUM/DzASUM diff --git a/README.md b/README.md index 0f7b7d3c..97b9ed0f 100644 --- a/README.md +++ b/README.md @@ -103,6 +103,8 @@ The CLBlast library will be tuned in the future for the most commonly used OpenC - Tesla K40m * AMD GPUs: - Tahiti + - Hawaii + - Pitcairn - R9 M370X * Intel GPUs: - Iris 
diff --git a/include/internal/database/copy.h b/include/internal/database/copy.h index 9787ec18..59a9e03a 100644 --- a/include/internal/database/copy.h +++ b/include/internal/database/copy.h @@ -19,6 +19,7 @@ const Database::DatabaseEntry Database::CopySingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, + { "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, { "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "Tahiti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, @@ -79,6 +80,7 @@ const Database::DatabaseEntry Database::CopyComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, @@ -131,6 +133,7 @@ const Database::DatabaseEntry Database::CopyDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "Pitcairn", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, @@ -184,6 +187,7 @@ const Database::DatabaseEntry Database::CopyComplexDouble = { { // AMD GPUs 
kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",8} } }, { "Pitcairn", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, diff --git a/include/internal/database/pad.h b/include/internal/database/pad.h index c9bbe8fa..d2de19e4 100644 --- a/include/internal/database/pad.h +++ b/include/internal/database/pad.h @@ -19,6 +19,7 @@ const Database::DatabaseEntry Database::PadSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } }, { "Pitcairn", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, @@ -79,6 +80,7 @@ const Database::DatabaseEntry Database::PadComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tahiti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, @@ -139,6 +141,7 @@ const Database::DatabaseEntry Database::PadDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, 
{"PAD_WPTY",1} } }, + { "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, @@ -192,6 +195,7 @@ const Database::DatabaseEntry Database::PadComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Pitcairn", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, diff --git a/include/internal/database/padtranspose.h b/include/internal/database/padtranspose.h index 08396086..b1db1b21 100644 --- a/include/internal/database/padtranspose.h +++ b/include/internal/database/padtranspose.h @@ -19,6 +19,7 @@ const Database::DatabaseEntry Database::PadtransposeSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, + { "Hawaii", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, @@ -79,6 +80,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, + { "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Pitcairn", { 
{"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, @@ -139,6 +141,7 @@ const Database::DatabaseEntry Database::PadtransposeDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, + { "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Tahiti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, @@ -192,6 +195,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, + { "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, diff --git a/include/internal/database/transpose.h b/include/internal/database/transpose.h index de826857..d87f79a6 100644 --- a/include/internal/database/transpose.h +++ b/include/internal/database/transpose.h @@ -19,6 +19,7 @@ const Database::DatabaseEntry Database::TransposeSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } }, + { "Hawaii", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } }, { "Pitcairn", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, 
{"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, @@ -79,6 +80,7 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, + { "Hawaii", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "Pitcairn", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, @@ -133,9 +135,10 @@ const Database::DatabaseEntry Database::TransposeDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, + { "Hawaii", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, + { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, } }, { // ARM GPUs @@ -186,6 +189,7 @@ const Database::DatabaseEntry Database::TransposeComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, + { "Hawaii", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, diff --git a/include/internal/database/xaxpy.h b/include/internal/database/xaxpy.h index 09706fb0..55be0bcb 100644 --- 
a/include/internal/database/xaxpy.h +++ b/include/internal/database/xaxpy.h @@ -19,6 +19,7 @@ const Database::DatabaseEntry Database::XaxpySingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, + { "Hawaii", { {"VW",2}, {"WGS",64}, {"WPT",2} } }, { "Pitcairn", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, { "Tahiti", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, @@ -79,6 +80,7 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW",2}, {"WGS",64}, {"WPT",8} } }, + { "Hawaii", { {"VW",1}, {"WGS",128}, {"WPT",2} } }, { "Pitcairn", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, @@ -139,6 +141,7 @@ const Database::DatabaseEntry Database::XaxpyDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, + { "Hawaii", { {"VW",1}, {"WGS",64}, {"WPT",2} } }, { "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, @@ -192,9 +195,10 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, + { "Hawaii", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, { "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "Tahiti", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, } }, { // ARM GPUs diff --git a/include/internal/database/xdot.h b/include/internal/database/xdot.h index d02bc03b..f9ae975b 100644 --- a/include/internal/database/xdot.h +++ b/include/internal/database/xdot.h @@ -19,6 +19,7 @@ const 
Database::DatabaseEntry Database::XdotSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WGS2",32} } }, + { "Hawaii", { {"WGS1",256}, {"WGS2",32} } }, { "Pitcairn", { {"WGS1",128}, {"WGS2",32} } }, { "Tahiti", { {"WGS1",128}, {"WGS2",32} } }, { "default", { {"WGS1",128}, {"WGS2",32} } }, @@ -61,6 +62,7 @@ const Database::DatabaseEntry Database::XdotComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",32} } }, + { "Hawaii", { {"WGS1",256}, {"WGS2",32} } }, { "Pitcairn", { {"WGS1",256}, {"WGS2",32} } }, { "Tahiti", { {"WGS1",64}, {"WGS2",32} } }, { "default", { {"WGS1",64}, {"WGS2",32} } }, @@ -103,6 +105,7 @@ const Database::DatabaseEntry Database::XdotDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",128} } }, + { "Hawaii", { {"WGS1",256}, {"WGS2",32} } }, { "Pitcairn", { {"WGS1",128}, {"WGS2",32} } }, { "Tahiti", { {"WGS1",256}, {"WGS2",32} } }, { "default", { {"WGS1",64}, {"WGS2",32} } }, @@ -139,6 +142,7 @@ const Database::DatabaseEntry Database::XdotComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",32} } }, + { "Hawaii", { {"WGS1",256}, {"WGS2",32} } }, { "Pitcairn", { {"WGS1",256}, {"WGS2",32} } }, { "Tahiti", { {"WGS1",256}, {"WGS2",32} } }, { "default", { {"WGS1",64}, {"WGS2",32} } }, diff --git a/include/internal/database/xgemm.h b/include/internal/database/xgemm.h index 7f005cf1..e24adb19 100644 --- a/include/internal/database/xgemm.h +++ b/include/internal/database/xgemm.h @@ -19,6 +19,7 @@ const Database::DatabaseEntry Database::XgemmSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",1} } }, + { 
"Hawaii", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } }, { "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } }, { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, @@ -79,9 +80,10 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } }, + { "Hawaii", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } }, { "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } }, - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, 
{"VWN",1} } }, } }, { // ARM GPUs @@ -139,9 +141,10 @@ const Database::DatabaseEntry Database::XgemmDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } }, + { "Hawaii", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, { "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, { "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } }, - { "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, } }, { // ARM GPUs @@ -192,9 +195,10 @@ const Database::DatabaseEntry Database::XgemmComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, + { "Hawaii", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, { "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, 
{"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, } }, { // ARM GPUs diff --git a/include/internal/database/xgemv.h b/include/internal/database/xgemv.h index 14e428d9..bbbe62f6 100644 --- a/include/internal/database/xgemv.h +++ b/include/internal/database/xgemv.h @@ -19,6 +19,7 @@ const Database::DatabaseEntry Database::XgemvSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Hawaii", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, { "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, { "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, { "default", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, @@ -72,6 +73,7 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",256}, {"WPT2",2}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, + { "Hawaii", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, { "Pitcairn", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, 
{"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, { "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, @@ -121,6 +123,7 @@ const Database::DatabaseEntry Database::XgemvDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, + { "Hawaii", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, { "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, { "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, @@ -167,6 +170,7 @@ const Database::DatabaseEntry Database::XgemvComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, + { "Hawaii", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, { "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, { "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, diff --git a/include/internal/database/xger.h b/include/internal/database/xger.h index 03c7ce41..dae857cd 100644 --- a/include/internal/database/xger.h +++ b/include/internal/database/xger.h @@ -19,6 +19,7 @@ const Database::DatabaseEntry Database::XgerSingle = { { 
// AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } }, + { "Hawaii", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } }, { "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, { "Tahiti", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } }, { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, @@ -66,9 +67,10 @@ const Database::DatabaseEntry Database::XgerComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } }, + { "Hawaii", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } }, { "Pitcairn", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } }, { "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } }, - { "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } }, + { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, } }, { // ARM GPUs @@ -113,6 +115,7 @@ const Database::DatabaseEntry Database::XgerDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } }, + { "Hawaii", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, { "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, { "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } }, { "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } }, @@ -154,6 +157,7 @@ const Database::DatabaseEntry Database::XgerComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, + { "Hawaii", { {"WGS1",128}, {"WGS2",1}, {"WPT",1} } }, { "Pitcairn", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } }, { "Tahiti", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } }, { "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } }, From aa97c836b14b4a449b3bfdf45e62e0588c33bec1 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 4 May 2016 19:16:09 +0200 Subject: [PATCH 50/60] Fixed an issue with linking against the ATLAS BLAS library --- cmake/Modules/FindCBLAS.cmake | 4 ++-- scripts/generator/generator.py | 2 +- test/wrapper_cblas.h 
| 5 ++++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/cmake/Modules/FindCBLAS.cmake b/cmake/Modules/FindCBLAS.cmake index 86f14515..ef3dc4bc 100644 --- a/cmake/Modules/FindCBLAS.cmake +++ b/cmake/Modules/FindCBLAS.cmake @@ -48,11 +48,11 @@ mark_as_advanced(CBLAS_INCLUDE_DIRS) # Finds the library find_library(CBLAS_LIBRARIES - NAMES blas mkl blis openblas atlas accelerate + NAMES blas cblas mkl blis openblas accelerate HINTS ${CBLAS_HINTS} PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 lib/import lib64/import - openblas/lib blis/lib + openblas/lib blis/lib lib/atlas-base PATHS ${CBLAS_PATHS} DOC "Netlib BLAS library" ) diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 0fd05053..75c0a093 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -299,7 +299,7 @@ files = [ path_clblast+"/test/wrapper_clblas.h", path_clblast+"/test/wrapper_cblas.h", ] -header_lines = [84, 71, 93, 22, 29, 38] +header_lines = [84, 71, 93, 22, 29, 41] footer_lines = [17, 71, 19, 14, 6, 6] # Checks whether the command-line arguments are valid; exists otherwise diff --git a/test/wrapper_cblas.h b/test/wrapper_cblas.h index 994b48b1..566c90e5 100644 --- a/test/wrapper_cblas.h +++ b/test/wrapper_cblas.h @@ -15,7 +15,10 @@ #ifndef CLBLAST_TEST_WRAPPER_CBLAS_H_ #define CLBLAST_TEST_WRAPPER_CBLAS_H_ -#include +extern "C" +{ + #include +} #include "internal/utilities.h" From e075dc347ad9648c666c81304e2abd17e11753b6 Mon Sep 17 00:00:00 2001 From: Anton Lokhmotov Date: Thu, 5 May 2016 14:38:10 +0000 Subject: [PATCH 51/60] Locate the C BLAS library before the F77 one. 
--- cmake/Modules/FindCBLAS.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/Modules/FindCBLAS.cmake b/cmake/Modules/FindCBLAS.cmake index ef3dc4bc..1439bfcb 100644 --- a/cmake/Modules/FindCBLAS.cmake +++ b/cmake/Modules/FindCBLAS.cmake @@ -48,7 +48,7 @@ mark_as_advanced(CBLAS_INCLUDE_DIRS) # Finds the library find_library(CBLAS_LIBRARIES - NAMES blas cblas mkl blis openblas accelerate + NAMES cblas blas mkl blis openblas accelerate HINTS ${CBLAS_HINTS} PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 lib/import lib64/import From 56aa1701c955546e049ec0dbe5b2777d592b5fc1 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Thu, 5 May 2016 23:09:57 +0200 Subject: [PATCH 52/60] Added printing of indices when testing in verbose mode --- test/correctness/testblas.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/correctness/testblas.cc b/test/correctness/testblas.cc index a5ccefe0..6bcba267 100644 --- a/test/correctness/testblas.cc +++ b/test/correctness/testblas.cc @@ -143,7 +143,8 @@ void TestBlas::TestRegular(std::vector> &test_vector, const st if (!TestSimilarity(result1[index], result2[index])) { errors++; if (verbose_) { - fprintf(stdout, "\n Incorrect value found: "); + if (get_id2_(args) == 1) { fprintf(stdout, "\n Error at index %lu: ", id1); } + else { fprintf(stdout, "\n Error at %lu,%lu: ", id1, id2); } std::cout << result1[index]; fprintf(stdout, " (reference) versus "); std::cout << result2[index]; From 6c9e08c5e288767d9afedb118c37694f63739cae Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 7 May 2016 12:22:06 +0200 Subject: [PATCH 53/60] Added an option to the tests to control whether to test against clBLAS or a CPU BLAS library --- CHANGELOG | 1 + README.md | 13 ++++++---- test/correctness/testblas.cc | 23 ++++++++++-------- test/correctness/testblas.h | 20 ++++++++++----- test/correctness/tester.cc | 47 +++++++++++++++++++++++++++++++----- test/correctness/tester.h | 5 ++++ 6 
files changed, 82 insertions(+), 27 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index dcfbaaae..cbe67951 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -6,6 +6,7 @@ Development version (next release) - Fixed the use of events within the library - Changed the enum parameters to match the raw values of the cblas standard - Fixed the cache of previously compiled binaries and added a function to fill or clear it +- Various minor fixes and enhancements - Added additional sample programs - Added tuned parameters for various devices (see README) - Added level-1 routines: diff --git a/README.md b/README.md index 97b9ed0f..ae236622 100644 --- a/README.md +++ b/README.md @@ -138,14 +138,14 @@ In summary, tuning the entire library for your device can be done as follows (st make -Compiling the tests (optional) +Compiling the correctness and performance tests (optional) ------------- To make sure CLBlast is working correctly on your device (recommended), compile with the tests enabled: cmake -DTESTS=ON .. -Afterwards, executables in the form of `clblast_test_xxxxx` are available, in which `xxxxx` is the name of a routine (e.g. `xgemm`). Note that CLBlast is best tested against [clBLAS](http://github.com/clMathLibraries/clBLAS) for correctness. If the library clBLAS is not installed on your system, it will use a regular CPU BLAS library to test against. +Afterwards, executables in the form of `clblast_test_xxxxx` are available, in which `xxxxx` is the name of a routine (e.g. `xgemm`). Note that CLBlast is best tested against [clBLAS](http://github.com/clMathLibraries/clBLAS) for correctness. If the library clBLAS is not installed on your system, it will use a regular CPU BLAS library to test against. If both are present, setting the command-line option `-clblas 1` or `-cblas 1` will select the library to test against for the `clblast_test_xxxxx` executables. With the `-DTESTS=ON` flag, additional performance tests are compiled. 
These come in the form of client executables named `clblast_client_xxxxx`, in which `xxxxx` is the name of a routine (e.g. `xgemm`). These clients take a bunch of configuration options and directly run CLBlast in a head-to-head performance test against clBLAS and/or a CPU BLAS library. @@ -249,9 +249,12 @@ Contributing Contributions are welcome in the form of tuning results for OpenCL devices previously untested. Furthermore, merge requests are welcome as long as they contain unit additions or modifications. Furthermore, they should follow the CLBlast coding style, which is based on the [Google C++ style guide](https://google-styleguide.googlecode.com/svn/trunk/cppguide.html) and the Effective C++ books by Scott Meyers. -The contributing authors so far are: +The contributing authors (code, pull requests, testing) so far are: * [Cedric Nugteren](http://www.cedricnugteren.nl) +* [Anton Lokhmotov](https://github.com/psyhtest) +* [Dragan Djuric](https://github.com/blueberry) +* [Hugh Perkins](https://github.com/hughperkins) Tuning and testing on a variety of OpenCL devices was made possible by: @@ -263,11 +266,11 @@ Tuning and testing on a variety of OpenCL devices was made possible by: Support us ------------- -This project started in March 2015 as an evenings and weekends free-time project next to a full-time job. If you are in the position to support the project by OpenCL-hardware donations or otherwise, please find contact information on the [website of the main author](http://www.cedricnugteren.nl). +This project started in March 2015 as an evenings and weekends free-time project next to a full-time job for Cedric Nugteren. If you are in the position to support the project by OpenCL-hardware donations or otherwise, please find contact information on the [website of the main author](http://www.cedricnugteren.nl). To-do list before release of version 1.0 ------------- -- Support all routines supported by clBLAS - Add half-precision routines (e.g. 
HGEMM) +- Add API documentation diff --git a/test/correctness/testblas.cc b/test/correctness/testblas.cc index 6bcba267..1f83c59b 100644 --- a/test/correctness/testblas.cc +++ b/test/correctness/testblas.cc @@ -33,17 +33,22 @@ template <> const std::vector TestBlas::kTransposes = template TestBlas::TestBlas(int argc, char *argv[], const bool silent, const std::string &name, const std::vector &options, - const Routine run_routine, const Routine run_reference, + const Routine run_routine, + const Routine run_reference1, const Routine run_reference2, const ResultGet get_result, const ResultIndex get_index, const ResultIterator get_id1, const ResultIterator get_id2): Tester(argc, argv, silent, name, options), run_routine_(run_routine), - run_reference_(run_reference), get_result_(get_result), get_index_(get_index), get_id1_(get_id1), get_id2_(get_id2) { + // Sets the reference to test against + if (compare_clblas_) { run_reference_ = run_reference1; } + else if (compare_cblas_) { run_reference_ = run_reference2; } + else { throw std::runtime_error("Invalid configuration: no reference to test against"); } + // Computes the maximum sizes. This allows for a single set of input/output buffers. 
auto max_vec = *std::max_element(kVectorDims.begin(), kVectorDims.end()); auto max_inc = *std::max_element(kIncrements.begin(), kIncrements.end()); @@ -98,14 +103,11 @@ void TestBlas::TestRegular(std::vector> &test_vector, const st auto buffers2 = Buffers{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2}; auto status2 = run_routine_(args, buffers2, queue_); - #ifndef CLBLAST_REF_CLBLAS - // Don't continue with CBLAS if there are incorrect parameters - if (status2 != StatusCode::kSuccess) { - // TODO: Mark this as a skipped test instead of a succesfull test - TestErrorCodes(status2, status2, args); - continue; - } - #endif + // Don't continue with CBLAS if there are incorrect parameters + if (compare_cblas_ && status2 != StatusCode::kSuccess) { + TestErrorCodes(status2, status2, args); + continue; + } // Runs the reference BLAS code auto x_vec1 = Buffer(context_, args.x_size); @@ -168,6 +170,7 @@ void TestBlas::TestRegular(std::vector> &test_vector, const st template void TestBlas::TestInvalid(std::vector> &test_vector, const std::string &name) { if (!PrecisionSupported(device_)) { return; } + if (!compare_clblas_) { return; } TestStart("invalid buffer sizes", name); // Iterates over all the to-be-tested combinations of arguments diff --git a/test/correctness/testblas.h b/test/correctness/testblas.h index 8fd1b1e2..4ffc1558 100644 --- a/test/correctness/testblas.h +++ b/test/correctness/testblas.h @@ -37,6 +37,8 @@ class TestBlas: public Tester { using Tester::full_test_; using Tester::verbose_; using Tester::device_; + using Tester::compare_clblas_; + using Tester::compare_cblas_; // Uses several helper functions from the Tester class using Tester::TestStart; @@ -77,7 +79,8 @@ class TestBlas: public Tester { // Constructor, initializes the base class tester and input data TestBlas(int argc, char *argv[], const bool silent, const std::string &name, const std::vector &options, - const Routine run_routine, const Routine run_reference, + const Routine 
run_routine, + const Routine run_reference1, const Routine run_reference2, const ResultGet get_result, const ResultIndex get_index, const ResultIterator get_id1, const ResultIterator get_id2); @@ -113,16 +116,21 @@ template void RunTests(int argc, char *argv[], const bool silent, const std::string &name) { // Sets the reference to test against - #ifdef CLBLAST_REF_CLBLAS - const auto reference_routine = C::RunReference1; // clBLAS when available - #else - const auto reference_routine = C::RunReference2; // otherwise CBLAS + #if defined(CLBLAST_REF_CLBLAS) && defined(CLBLAST_REF_CBLAS) + const auto reference_routine1 = C::RunReference1; // clBLAS + const auto reference_routine2 = C::RunReference2; // CBLAS + #elif CLBLAST_REF_CLBLAS + const auto reference_routine1 = C::RunReference1; // clBLAS + const auto reference_routine2 = C::RunReference1; // not used, dummy + #elif CLBLAST_REF_CBLAS + const auto reference_routine1 = C::RunReference2; // not used, dummy + const auto reference_routine2 = C::RunReference2; // CBLAS #endif // Creates a tester auto options = C::GetOptions(); TestBlas tester{argc, argv, silent, name, options, - C::RunRoutine, reference_routine, + C::RunRoutine, reference_routine1, reference_routine2, C::DownloadResult, C::GetResultIndex, C::ResultID1, C::ResultID2}; // This variable holds the arguments relevant for this routine diff --git a/test/correctness/tester.cc b/test/correctness/tester.cc index 51d83362..82926c3c 100644 --- a/test/correctness/tester.cc +++ b/test/correctness/tester.cc @@ -43,9 +43,32 @@ Tester::Tester(int argc, char *argv[], const bool silent, tests_failed_{0}, options_{options} { + // Determines which reference to test against + #if defined(CLBLAST_REF_CLBLAS) && defined(CLBLAST_REF_CBLAS) + compare_clblas_ = GetArgument(argc, argv, help_, kArgCompareclblas, 1); + compare_cblas_ = GetArgument(argc, argv, help_, kArgComparecblas, 0); + #elif CLBLAST_REF_CLBLAS + compare_clblas_ = GetArgument(argc, argv, help_, 
kArgCompareclblas, 1); + compare_cblas_ = 0; + #elif CLBLAST_REF_CBLAS + compare_clblas_ = 0; + compare_cblas_ = GetArgument(argc, argv, help_, kArgComparecblas, 1); + #else + compare_clblas_ = 0; + compare_cblas_ = 0; + #endif + // Prints the help message (command-line arguments) if (!silent) { fprintf(stdout, "\n* %s\n", help_.c_str()); } + // Can only test against a single reference (not two, not zero) + if (compare_clblas_ && compare_cblas_) { + throw std::runtime_error("Cannot test against both clBLAS and CBLAS references; choose one using the -cblas and -clblas arguments"); + } + if (!compare_clblas_ && !compare_cblas_) { + throw std::runtime_error("Choose one reference (clBLAS or CBLAS) to test against using the -cblas and -clblas arguments"); + } + // Prints the header fprintf(stdout, "* Running on OpenCL device '%s'.\n", device_.Name().c_str()); fprintf(stdout, "* Starting tests for the %s'%s'%s routine.", @@ -68,12 +91,16 @@ Tester::Tester(int argc, char *argv[], const bool silent, kSkippedCompilation.c_str()); fprintf(stdout, " %s -> Test not executed: Unsupported precision\n", kUnsupportedPrecision.c_str()); + fprintf(stdout, " %s -> Test not completed: Reference CBLAS doesn't output error codes\n", + kUnsupportedReference.c_str()); // Initializes clBLAS #ifdef CLBLAST_REF_CLBLAS - auto status = clblasSetup(); - if (status != CL_SUCCESS) { - throw std::runtime_error("clBLAS setup error: "+ToString(static_cast(status))); + if (compare_clblas_) { + auto status = clblasSetup(); + if (status != CL_SUCCESS) { + throw std::runtime_error("clBLAS setup error: "+ToString(static_cast(status))); + } } #endif } @@ -93,7 +120,9 @@ Tester::~Tester() { // Cleans-up clBLAS #ifdef CLBLAST_REF_CLBLAS - clblasTeardown(); + if (compare_clblas_) { + clblasTeardown(); + } #endif } @@ -124,7 +153,7 @@ template void Tester::TestEnd() { fprintf(stdout, "\n"); tests_passed_ += num_passed_; - tests_failed_ += num_skipped_; + tests_skipped_ += num_skipped_; tests_failed_ += 
num_failed_; // Prints the errors @@ -174,8 +203,14 @@ template void Tester::TestErrorCodes(const StatusCode clblas_status, const StatusCode clblast_status, const Arguments &args) { + // Cannot compare error codes against a library other than clBLAS + if (compare_cblas_) { + PrintTestResult(kUnsupportedReference); + ReportSkipped(); + } + // Finished successfully - if (clblas_status == clblast_status) { + else if (clblas_status == clblast_status) { PrintTestResult(kSuccessStatus); ReportPass(); } diff --git a/test/correctness/tester.h b/test/correctness/tester.h index 3534dffb..46d88caf 100644 --- a/test/correctness/tester.h +++ b/test/correctness/tester.h @@ -58,6 +58,7 @@ class Tester { const std::string kErrorStatus{kPrintError + "/" + kPrintEnd}; const std::string kSkippedCompilation{kPrintWarning + "\\" + kPrintEnd}; const std::string kUnsupportedPrecision{kPrintWarning + "o" + kPrintEnd}; + const std::string kUnsupportedReference{kPrintWarning + "." + kPrintEnd}; // This structure combines the above log-entry with a status code an error percentage struct ErrorLogEntry { @@ -102,6 +103,10 @@ class Tester { // Retrieves the offset values to test with const std::vector GetOffsets() const; + // Testing against reference implementations + int compare_cblas_; + int compare_clblas_; + private: // Internal methods to report a passed, skipped, or failed test From ed2904a34471b10fcfc60dd4034e4a76eb5428cf Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 8 May 2016 09:49:00 +0200 Subject: [PATCH 54/60] Added preliminary generated API documentation --- CHANGELOG | 1 + doc/clblast.md | 2434 ++++++++++++++++++++++++++++++++ scripts/generator/generator.py | 161 ++- scripts/generator/routine.py | 75 +- 4 files changed, 2619 insertions(+), 52 deletions(-) create mode 100644 doc/clblast.md diff --git a/CHANGELOG b/CHANGELOG index cbe67951..51f529d3 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -7,6 +7,7 @@ Development version (next release) - Changed the enum parameters to 
match the raw values of the cblas standard - Fixed the cache of previously compiled binaries and added a function to fill or clear it - Various minor fixes and enhancements +- Added a preliminary version of the API documentation - Added additional sample programs - Added tuned parameters for various devices (see README) - Added level-1 routines: diff --git a/doc/clblast.md b/doc/clblast.md new file mode 100644 index 00000000..9c9b9a6f --- /dev/null +++ b/doc/clblast.md @@ -0,0 +1,2434 @@ +CLBlast: API reference +================ + + +xSWAP: Swap two vectors +------------- + +Interchanges the contents of vectors x and y. + +C++ API: +``` +template <typename T> +StatusCode Swap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastSswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastDswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastCswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastZswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to SWAP: + +* `const size_t n`: Integer size argument. +* `cl_mem x_buffer`: OpenCL buffer to store the output x vector. +* `const size_t x_offset`: The offset in elements from the start of the output x vector. 
+* `const size_t x_inc`: Stride/increment of the output x vector. +* `cl_mem y_buffer`: OpenCL buffer to store the output y vector. +* `const size_t y_offset`: The offset in elements from the start of the output y vector. +* `const size_t y_inc`: Stride/increment of the output y vector. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + +xSCAL: Vector scaling +------------- + +Multiplies all elements of vector x by a scalar constant alpha. + +C++ API: +``` +template <typename T> +StatusCode Scal(const size_t n, + const T alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastSscal(const size_t n, + const float alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastDscal(const size_t n, + const double alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastCscal(const size_t n, + const cl_float2 alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastZscal(const size_t n, + const cl_double2 alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to SCAL: + +* `const size_t n`: Integer size argument. +* `const T alpha`: Input scalar constant. +* `cl_mem x_buffer`: OpenCL buffer to store the output x vector. +* `const size_t x_offset`: The offset in elements from the start of the output x vector. +* `const size_t x_inc`: Stride/increment of the output x vector. 
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + +xCOPY: Vector copy +------------- + +Copies the contents of vector x into vector y. + +C++ API: +``` +template <typename T> +StatusCode Copy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastScopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastDcopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastCcopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastZcopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to COPY: + +* `const size_t n`: Integer size argument. +* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. +* `const size_t x_offset`: The offset in elements from the start of the input x vector. +* `const size_t x_inc`: Stride/increment of the input x vector. +* `cl_mem y_buffer`: OpenCL buffer to store the output y vector. +* `const size_t y_offset`: The offset in elements from the start of the output y vector. 
+* `const size_t y_inc`: Stride/increment of the output y vector. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + +xAXPY: Vector-times-constant plus vector +------------- + +Performs the operation y = alpha * x + y, in which x and y are vectors and alpha is a scalar constant. + +C++ API: +``` +template <typename T> +StatusCode Axpy(const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastSaxpy(const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastDaxpy(const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastCaxpy(const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastZaxpy(const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to AXPY: + +* `const size_t n`: Integer size argument. +* `const T alpha`: Input scalar constant. +* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. 
+* `const size_t x_offset`: The offset in elements from the start of the input x vector. +* `const size_t x_inc`: Stride/increment of the input x vector. +* `cl_mem y_buffer`: OpenCL buffer to store the output y vector. +* `const size_t y_offset`: The offset in elements from the start of the output y vector. +* `const size_t y_inc`: Stride/increment of the output y vector. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + +xDOT: Dot product of two vectors +------------- + +Multiplies the vectors x and y element-wise and accumulates the results. The sum is stored in the dot buffer. + +C++ API: +``` +template <typename T> +StatusCode Dot(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastSdot(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastDdot(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to DOT: + +* `const size_t n`: Integer size argument. +* `cl_mem dot_buffer`: OpenCL buffer to store the output dot vector. +* `const size_t dot_offset`: The offset in elements from the start of the output dot vector. +* `cl_mem dot_buffer`: OpenCL buffer to store the output dot vector. 
+* `const size_t dot_offset`: The offset in elements from the start of the output dot vector. +* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. +* `const size_t x_offset`: The offset in elements from the start of the input x vector. +* `const size_t x_inc`: Stride/increment of the input x vector. +* `const cl_mem y_buffer`: OpenCL buffer to store the input y vector. +* `const size_t y_offset`: The offset in elements from the start of the input y vector. +* `const size_t y_inc`: Stride/increment of the input y vector. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + +xDOTU: Dot product of two complex vectors +------------- + +See the regular xDOT routine. + +C++ API: +``` +template <typename T> +StatusCode Dotu(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastCdotu(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastZdotu(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to DOTU: + +* `const size_t n`: Integer size argument. +* `cl_mem dot_buffer`: OpenCL buffer to store the output dot vector. +* `const size_t dot_offset`: The offset in elements from the start of the output dot vector. 
+* `cl_mem dot_buffer`: OpenCL buffer to store the output dot vector. +* `const size_t dot_offset`: The offset in elements from the start of the output dot vector. +* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. +* `const size_t x_offset`: The offset in elements from the start of the input x vector. +* `const size_t x_inc`: Stride/increment of the input x vector. +* `const cl_mem y_buffer`: OpenCL buffer to store the input y vector. +* `const size_t y_offset`: The offset in elements from the start of the input y vector. +* `const size_t y_inc`: Stride/increment of the input y vector. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + +xDOTC: Dot product of two complex vectors, one conjugated +------------- + +See the regular xDOT routine. + +C++ API: +``` +template <typename T> +StatusCode Dotc(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastCdotc(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastZdotc(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to DOTC: + +* `const size_t n`: Integer size argument. +* `cl_mem dot_buffer`: OpenCL buffer to store the output dot vector. 
+* `const size_t dot_offset`: The offset in elements from the start of the output dot vector. +* `cl_mem dot_buffer`: OpenCL buffer to store the output dot vector. +* `const size_t dot_offset`: The offset in elements from the start of the output dot vector. +* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. +* `const size_t x_offset`: The offset in elements from the start of the input x vector. +* `const size_t x_inc`: Stride/increment of the input x vector. +* `const cl_mem y_buffer`: OpenCL buffer to store the input y vector. +* `const size_t y_offset`: The offset in elements from the start of the input y vector. +* `const size_t y_inc`: Stride/increment of the input y vector. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + +xNRM2: Euclidean norm of a vector +------------- + +Accumulates the square of each element in the x vector and takes the square root. The resulting L2 norm is stored in the nrm2 buffer. 
+ +C++ API: +``` +template <typename T> +StatusCode Nrm2(const size_t n, + cl_mem nrm2_buffer, const size_t nrm2_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastSnrm2(const size_t n, + cl_mem nrm2_buffer, const size_t nrm2_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastDnrm2(const size_t n, + cl_mem nrm2_buffer, const size_t nrm2_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastScnrm2(const size_t n, + cl_mem nrm2_buffer, const size_t nrm2_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastDznrm2(const size_t n, + cl_mem nrm2_buffer, const size_t nrm2_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to NRM2: + +* `const size_t n`: Integer size argument. +* `cl_mem nrm2_buffer`: OpenCL buffer to store the output nrm2 vector. +* `const size_t nrm2_offset`: The offset in elements from the start of the output nrm2 vector. +* `cl_mem nrm2_buffer`: OpenCL buffer to store the output nrm2 vector. +* `const size_t nrm2_offset`: The offset in elements from the start of the output nrm2 vector. +* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. +* `const size_t x_offset`: The offset in elements from the start of the input x vector. +* `const size_t x_inc`: Stride/increment of the input x vector. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. 
+ + + +xASUM: Absolute sum of values in a vector +------------- + +Accumulates the absolute value of each element in the x vector. The results are stored in the asum buffer. + +C++ API: +``` +template <typename T> +StatusCode Asum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastSasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastDasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastScasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastDzasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to ASUM: + +* `const size_t n`: Integer size argument. +* `cl_mem asum_buffer`: OpenCL buffer to store the output asum vector. +* `const size_t asum_offset`: The offset in elements from the start of the output asum vector. +* `cl_mem asum_buffer`: OpenCL buffer to store the output asum vector. +* `const size_t asum_offset`: The offset in elements from the start of the output asum vector. +* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. +* `const size_t x_offset`: The offset in elements from the start of the input x vector. +* `const size_t x_inc`: Stride/increment of the input x vector. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. 
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + +xSUM: Sum of values in a vector (non-BLAS function) +------------- + +Accumulates the values of each element in the x vector. The results are stored in the sum buffer. This routine is the non-absolute version of the xASUM BLAS routine. + +C++ API: +``` +template <typename T> +StatusCode Sum(const size_t n, + cl_mem sum_buffer, const size_t sum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastSsum(const size_t n, + cl_mem sum_buffer, const size_t sum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastDsum(const size_t n, + cl_mem sum_buffer, const size_t sum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastScsum(const size_t n, + cl_mem sum_buffer, const size_t sum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastDzsum(const size_t n, + cl_mem sum_buffer, const size_t sum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to SUM: + +* `const size_t n`: Integer size argument. +* `cl_mem sum_buffer`: OpenCL buffer to store the output sum vector. +* `const size_t sum_offset`: The offset in elements from the start of the output sum vector. +* `cl_mem sum_buffer`: OpenCL buffer to store the output sum vector. +* `const size_t sum_offset`: The offset in elements from the start of the output sum vector. +* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. +* `const size_t x_offset`: The offset in elements from the start of the input x vector. 
+* `const size_t x_inc`: Stride/increment of the input x vector. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + +xAMAX: Index of absolute maximum value in a vector +------------- + +Finds the index of the maximum of the absolute values in the x vector. The resulting integer index is stored in the imax buffer. + +C++ API: +``` +template <typename T> +StatusCode Amax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastiSamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastiDamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastiCamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastiZamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to AMAX: + +* `const size_t n`: Integer size argument. +* `cl_mem imax_buffer`: OpenCL buffer to store the output imax vector. +* `const size_t imax_offset`: The offset in elements from the start of the output imax vector. +* `cl_mem imax_buffer`: OpenCL buffer to store the output imax vector. 
+* `const size_t imax_offset`: The offset in elements from the start of the output imax vector. +* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. +* `const size_t x_offset`: The offset in elements from the start of the input x vector. +* `const size_t x_inc`: Stride/increment of the input x vector. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + +xMAX: Index of maximum value in a vector (non-BLAS function) +------------- + +Finds the index of the maximum of the values in the x vector. The resulting integer index is stored in the imax buffer. This routine is the non-absolute version of the IxAMAX BLAS routine. + +C++ API: +``` +template <typename T> +StatusCode Max(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastiSmax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastiDmax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastiCmax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastiZmax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to MAX: + +* `const size_t n`: Integer size argument. 
+* `cl_mem imax_buffer`: OpenCL buffer to store the output imax vector. +* `const size_t imax_offset`: The offset in elements from the start of the output imax vector. +* `cl_mem imax_buffer`: OpenCL buffer to store the output imax vector. +* `const size_t imax_offset`: The offset in elements from the start of the output imax vector. +* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. +* `const size_t x_offset`: The offset in elements from the start of the input x vector. +* `const size_t x_inc`: Stride/increment of the input x vector. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + +xMIN: Index of minimum value in a vector (non-BLAS function) +------------- + +Finds the index of the minimum of the values in the x vector. The resulting integer index is stored in the imin buffer. This routine is the non-absolute, minimum-value counterpart of the IxAMAX BLAS routine. 
+ +C++ API: +``` +template <typename T> +StatusCode Min(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastiSmin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastiDmin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastiCmin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastiZmin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to MIN: + +* `const size_t n`: Integer size argument. +* `cl_mem imin_buffer`: OpenCL buffer to store the output imin vector. +* `const size_t imin_offset`: The offset in elements from the start of the output imin vector. +* `cl_mem imin_buffer`: OpenCL buffer to store the output imin vector. +* `const size_t imin_offset`: The offset in elements from the start of the output imin vector. +* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. +* `const size_t x_offset`: The offset in elements from the start of the input x vector. +* `const size_t x_inc`: Stride/increment of the input x vector. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. 
+ + + +xGEMV: General matrix-vector multiplication +------------- + +Performs the operation y = alpha * A * x + beta * y, in which x is an input vector, y is an input and output vector, A is an input matrix, and alpha and beta are scalars. The matrix A can optionally be transposed before performing the operation. + +C++ API: +``` +template <typename T> +StatusCode Gemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastSgemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastDgemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastCgemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastZgemv(const Layout layout, const Transpose a_transpose, + 
const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to GEMV: + +* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout. +* `const Transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose. +* `const size_t m`: Integer size argument. +* `const size_t n`: Integer size argument. +* `const T alpha`: Input scalar constant. +* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix. +* `const size_t a_offset`: The offset in elements from the start of the input A matrix. +* `const size_t a_ld`: Leading dimension of the input A matrix. +* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. +* `const size_t x_offset`: The offset in elements from the start of the input x vector. +* `const size_t x_inc`: Stride/increment of the input x vector. +* `const T beta`: Input scalar constant. +* `cl_mem y_buffer`: OpenCL buffer to store the output y vector. +* `const size_t y_offset`: The offset in elements from the start of the output y vector. +* `const size_t y_inc`: Stride/increment of the output y vector. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + +xGBMV: General banded matrix-vector multiplication +------------- + +Same operation as xGEMV, but matrix A is banded instead. 
+ +C++ API: +``` +template <typename T> +StatusCode Gbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastSgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastDgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastCgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastZgbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem 
x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to GBMV: + +* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout. +* `const Transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose. +* `const size_t m`: Integer size argument. +* `const size_t n`: Integer size argument. +* `const size_t kl`: Integer size argument. +* `const size_t ku`: Integer size argument. +* `const T alpha`: Input scalar constant. +* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix. +* `const size_t a_offset`: The offset in elements from the start of the input A matrix. +* `const size_t a_ld`: Leading dimension of the input A matrix. +* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. +* `const size_t x_offset`: The offset in elements from the start of the input x vector. +* `const size_t x_inc`: Stride/increment of the input x vector. +* `const T beta`: Input scalar constant. +* `cl_mem y_buffer`: OpenCL buffer to store the output y vector. +* `const size_t y_offset`: The offset in elements from the start of the output y vector. +* `const size_t y_inc`: Stride/increment of the output y vector. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + +xHEMV: Hermitian matrix-vector multiplication +------------- + +Same operation as xGEMV, but matrix A is an Hermitian matrix instead. 
+ +C++ API: +``` +template <typename T> +StatusCode Hemv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastChemv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastZhemv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to HEMV: + +* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout. +* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122). +* `const size_t n`: Integer size argument. +* `const T alpha`: Input scalar constant. +* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix. +* `const size_t a_offset`: The offset in elements from the start of the input A matrix. +* `const size_t a_ld`: Leading dimension of the input A matrix. +* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. +* `const size_t x_offset`: The offset in elements from the start of the input x vector. 
+* `const size_t x_inc`: Stride/increment of the input x vector. +* `const T beta`: Input scalar constant. +* `cl_mem y_buffer`: OpenCL buffer to store the output y vector. +* `const size_t y_offset`: The offset in elements from the start of the output y vector. +* `const size_t y_inc`: Stride/increment of the output y vector. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + +xHBMV: Hermitian banded matrix-vector multiplication +------------- + +Same operation as xGEMV, but matrix A is an Hermitian banded matrix instead. + +C++ API: +``` +template <typename T> +StatusCode Hbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastChbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastZhbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to HBMV: + 
+* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout. +* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122). +* `const size_t n`: Integer size argument. +* `const size_t k`: Integer size argument. +* `const T alpha`: Input scalar constant. +* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix. +* `const size_t a_offset`: The offset in elements from the start of the input A matrix. +* `const size_t a_ld`: Leading dimension of the input A matrix. +* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. +* `const size_t x_offset`: The offset in elements from the start of the input x vector. +* `const size_t x_inc`: Stride/increment of the input x vector. +* `const T beta`: Input scalar constant. +* `cl_mem y_buffer`: OpenCL buffer to store the output y vector. +* `const size_t y_offset`: The offset in elements from the start of the output y vector. +* `const size_t y_inc`: Stride/increment of the output y vector. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + +xHPMV: Hermitian packed matrix-vector multiplication +------------- + +Same operation as xGEMV, but matrix A is an Hermitian packed matrix instead and represented as AP. 
+ +C++ API: +``` +template <typename T> +StatusCode Hpmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastChpmv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_float2 alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_float2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastZhpmv(const Layout layout, const Triangle triangle, + const size_t n, + const cl_double2 alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_double2 beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to HPMV: + +* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout. +* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122). +* `const size_t n`: Integer size argument. +* `const T alpha`: Input scalar constant. +* `const cl_mem ap_buffer`: OpenCL buffer to store the input AP matrix. +* `const size_t ap_offset`: The offset in elements from the start of the input AP matrix. +* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. +* `const size_t x_offset`: The offset in elements from the start of the input x vector. +* `const size_t x_inc`: Stride/increment of the input x vector. +* `const T beta`: Input scalar constant. 
+* `cl_mem y_buffer`: OpenCL buffer to store the output y vector. +* `const size_t y_offset`: The offset in elements from the start of the output y vector. +* `const size_t y_inc`: Stride/increment of the output y vector. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + +xSYMV: Symmetric matrix-vector multiplication +------------- + +Same operation as xGEMV, but matrix A is symmetric instead. + +C++ API: +``` +template <typename T> +StatusCode Symv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastSsymv(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastDsymv(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to SYMV: + +* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout. 
+* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122). +* `const size_t n`: Integer size argument. +* `const T alpha`: Input scalar constant. +* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix. +* `const size_t a_offset`: The offset in elements from the start of the input A matrix. +* `const size_t a_ld`: Leading dimension of the input A matrix. +* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. +* `const size_t x_offset`: The offset in elements from the start of the input x vector. +* `const size_t x_inc`: Stride/increment of the input x vector. +* `const T beta`: Input scalar constant. +* `cl_mem y_buffer`: OpenCL buffer to store the output y vector. +* `const size_t y_offset`: The offset in elements from the start of the output y vector. +* `const size_t y_inc`: Stride/increment of the output y vector. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + +xSBMV: Symmetric banded matrix-vector multiplication +------------- + +Same operation as xGEMV, but matrix A is symmetric and banded instead. 
+ +C++ API: +``` +template +StatusCode Sbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastSsbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastDsbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to SBMV: + +* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout. +* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122). +* `const size_t n`: Integer size argument. +* `const size_t k`: Integer size argument. +* `const T alpha`: Input scalar constant. +* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix. +* `const size_t a_offset`: The offset in elements from the start of the input A matrix. +* `const size_t a_ld`: Leading dimension of the input A matrix. +* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. 
+* `const size_t x_offset`: The offset in elements from the start of the input x vector. +* `const size_t x_inc`: Stride/increment of the input x vector. +* `const T beta`: Input scalar constant. +* `cl_mem y_buffer`: OpenCL buffer to store the output y vector. +* `const size_t y_offset`: The offset in elements from the start of the output y vector. +* `const size_t y_inc`: Stride/increment of the output y vector. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + +xSPMV: Symmetric packed matrix-vector multiplication +------------- + +Same operation as xGEMV, but matrix A is a symmetric packed matrix instead and represented as AP. + +C++ API: +``` +template +StatusCode Spmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastSspmv(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastDspmv(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to SPMV: + +* 
`const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout. +* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122). +* `const size_t n`: Integer size argument. +* `const T alpha`: Input scalar constant. +* `const cl_mem ap_buffer`: OpenCL buffer to store the input AP matrix. +* `const size_t ap_offset`: The offset in elements from the start of the input AP matrix. +* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. +* `const size_t x_offset`: The offset in elements from the start of the input x vector. +* `const size_t x_inc`: Stride/increment of the input x vector. +* `const T beta`: Input scalar constant. +* `cl_mem y_buffer`: OpenCL buffer to store the output y vector. +* `const size_t y_offset`: The offset in elements from the start of the output y vector. +* `const size_t y_inc`: Stride/increment of the output y vector. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + +xTRMV: Triangular matrix-vector multiplication +------------- + +Same operation as xGEMV, but matrix A is triangular instead. 
+ +C++ API: +``` +template +StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastStrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastDtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastCtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastZtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to TRMV: + +* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout. +* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122). 
+* `const Transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose. +* `const Diagonal`: The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal. +* `const size_t n`: Integer size argument. +* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix. +* `const size_t a_offset`: The offset in elements from the start of the input A matrix. +* `const size_t a_ld`: Leading dimension of the input A matrix. +* `cl_mem x_buffer`: OpenCL buffer to store the output x vector. +* `const size_t x_offset`: The offset in elements from the start of the output x vector. +* `const size_t x_inc`: Stride/increment of the output x vector. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + +xTBMV: Triangular banded matrix-vector multiplication +------------- + +Same operation as xGEMV, but matrix A is triangular and banded instead.
+ +C++ API: +``` +template +StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastStbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastDtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastCtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastZtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to TBMV: + +* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout. 
+* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122). +* `const Transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose. +* `const Diagonal`: The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal. +* `const size_t n`: Integer size argument. +* `const size_t k`: Integer size argument. +* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix. +* `const size_t a_offset`: The offset in elements from the start of the input A matrix. +* `const size_t a_ld`: Leading dimension of the input A matrix. +* `cl_mem x_buffer`: OpenCL buffer to store the output x vector. +* `const size_t x_offset`: The offset in elements from the start of the output x vector. +* `const size_t x_inc`: Stride/increment of the output x vector. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + +xTPMV: Triangular packed matrix-vector multiplication +------------- + +Same operation as xGEMV, but matrix A is a triangular packed matrix instead and represented as AP.
+ +C++ API: +``` +template +StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastStpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastDtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastCtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastZtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to TPMV: + +* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout. +* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122). 
+* `const Transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose. +* `const Diagonal`: The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal. +* `const size_t n`: Integer size argument. +* `const cl_mem ap_buffer`: OpenCL buffer to store the input AP matrix. +* `const size_t ap_offset`: The offset in elements from the start of the input AP matrix. +* `cl_mem x_buffer`: OpenCL buffer to store the output x vector. +* `const size_t x_offset`: The offset in elements from the start of the output x vector. +* `const size_t x_inc`: Stride/increment of the output x vector. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+ + + +xGER: General rank-1 matrix update +------------- + + + +C++ API: +``` +template +StatusCode Ger(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastSger(const Layout layout, + const size_t m, const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastDger(const Layout layout, + const size_t m, const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to GER: + +* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout. +* `const size_t m`: Integer size argument. +* `const size_t n`: Integer size argument. +* `const T alpha`: Input scalar constant. +* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. +* `const size_t x_offset`: The offset in elements from the start of the input x vector. +* `const size_t x_inc`: Stride/increment of the input x vector. +* `const cl_mem y_buffer`: OpenCL buffer to store the input y vector. +* `const size_t y_offset`: The offset in elements from the start of the input y vector. +* `const size_t y_inc`: Stride/increment of the input y vector. +* `cl_mem a_buffer`: OpenCL buffer to store the output A matrix. 
+* `const size_t a_offset`: The offset in elements from the start of the output A matrix. +* `const size_t a_ld`: Leading dimension of the output A matrix. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + +xGERU: General rank-1 complex matrix update +------------- + + + +C++ API: +``` +template +StatusCode Geru(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastCgeru(const Layout layout, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastZgeru(const Layout layout, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to GERU: + +* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout. +* `const size_t m`: Integer size argument. +* `const size_t n`: Integer size argument. +* `const T alpha`: Input scalar constant. +* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. 
+* `const size_t x_offset`: The offset in elements from the start of the input x vector. +* `const size_t x_inc`: Stride/increment of the input x vector. +* `const cl_mem y_buffer`: OpenCL buffer to store the input y vector. +* `const size_t y_offset`: The offset in elements from the start of the input y vector. +* `const size_t y_inc`: Stride/increment of the input y vector. +* `cl_mem a_buffer`: OpenCL buffer to store the output A matrix. +* `const size_t a_offset`: The offset in elements from the start of the output A matrix. +* `const size_t a_ld`: Leading dimension of the output A matrix. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + +xGERC: General rank-1 complex conjugated matrix update +------------- + + + +C++ API: +``` +template +StatusCode Gerc(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastCgerc(const Layout layout, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastZgerc(const Layout layout, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + 
cl_command_queue* queue, cl_event* event) +``` + +Arguments to GERC: + +* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout. +* `const size_t m`: Integer size argument. +* `const size_t n`: Integer size argument. +* `const T alpha`: Input scalar constant. +* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. +* `const size_t x_offset`: The offset in elements from the start of the input x vector. +* `const size_t x_inc`: Stride/increment of the input x vector. +* `const cl_mem y_buffer`: OpenCL buffer to store the input y vector. +* `const size_t y_offset`: The offset in elements from the start of the input y vector. +* `const size_t y_inc`: Stride/increment of the input y vector. +* `cl_mem a_buffer`: OpenCL buffer to store the output A matrix. +* `const size_t a_offset`: The offset in elements from the start of the output A matrix. +* `const size_t a_ld`: Leading dimension of the output A matrix. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. 
+ + + +xHER: Hermitian rank-1 matrix update +------------- + + + +C++ API: +``` +template +StatusCode Her(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastCher(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastZher(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to HER: + +* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout. +* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122). +* `const size_t n`: Integer size argument. +* `const T alpha`: Input scalar constant. +* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. +* `const size_t x_offset`: The offset in elements from the start of the input x vector. +* `const size_t x_inc`: Stride/increment of the input x vector. +* `cl_mem a_buffer`: OpenCL buffer to store the output A matrix. +* `const size_t a_offset`: The offset in elements from the start of the output A matrix. +* `const size_t a_ld`: Leading dimension of the output A matrix. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. 
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + +xHPR: Hermitian packed rank-1 matrix update +------------- + + + +C++ API: +``` +template +StatusCode Hpr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastChpr(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastZhpr(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to HPR: + +* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout. +* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122). +* `const size_t n`: Integer size argument. +* `const T alpha`: Input scalar constant. +* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. +* `const size_t x_offset`: The offset in elements from the start of the input x vector. +* `const size_t x_inc`: Stride/increment of the input x vector. +* `cl_mem ap_buffer`: OpenCL buffer to store the output AP matrix. +* `const size_t ap_offset`: The offset in elements from the start of the output AP matrix. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. 
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + +xHER2: Hermitian rank-2 matrix update +------------- + + + +C++ API: +``` +template +StatusCode Her2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastCher2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastZher2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to HER2: + +* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout. +* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122). +* `const size_t n`: Integer size argument. +* `const T alpha`: Input scalar constant. +* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. +* `const size_t x_offset`: The offset in elements from the start of the input x vector. +* `const size_t x_inc`: Stride/increment of the input x vector. 
+* `const cl_mem y_buffer`: OpenCL buffer to store the input y vector. +* `const size_t y_offset`: The offset in elements from the start of the input y vector. +* `const size_t y_inc`: Stride/increment of the input y vector. +* `cl_mem a_buffer`: OpenCL buffer to store the output A matrix. +* `const size_t a_offset`: The offset in elements from the start of the output A matrix. +* `const size_t a_ld`: Leading dimension of the output A matrix. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + +xHPR2: Hermitian packed rank-2 matrix update +------------- + + + +C++ API: +``` +template +StatusCode Hpr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastChpr2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_float2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastZhpr2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_double2 alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to HPR2: + +* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` 
(102) for column-major data-layout. +* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122). +* `const size_t n`: Integer size argument. +* `const T alpha`: Input scalar constant. +* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. +* `const size_t x_offset`: The offset in elements from the start of the input x vector. +* `const size_t x_inc`: Stride/increment of the input x vector. +* `const cl_mem y_buffer`: OpenCL buffer to store the input y vector. +* `const size_t y_offset`: The offset in elements from the start of the input y vector. +* `const size_t y_inc`: Stride/increment of the input y vector. +* `cl_mem ap_buffer`: OpenCL buffer to store the output AP matrix. +* `const size_t ap_offset`: The offset in elements from the start of the output AP matrix. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. 
+ + + +xSYR: Symmetric rank-1 matrix update +------------- + + + +C++ API: +``` +template +StatusCode Syr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastSsyr(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastDsyr(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to SYR: + +* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout. +* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122). +* `const size_t n`: Integer size argument. +* `const T alpha`: Input scalar constant. +* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. +* `const size_t x_offset`: The offset in elements from the start of the input x vector. +* `const size_t x_inc`: Stride/increment of the input x vector. +* `cl_mem a_buffer`: OpenCL buffer to store the output A matrix. +* `const size_t a_offset`: The offset in elements from the start of the output A matrix. +* `const size_t a_ld`: Leading dimension of the output A matrix. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. 
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + +xSPR: Symmetric packed rank-1 matrix update +------------- + + + +C++ API: +``` +template +StatusCode Spr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastSspr(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastDspr(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to SPR: + +* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout. +* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122). +* `const size_t n`: Integer size argument. +* `const T alpha`: Input scalar constant. +* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. +* `const size_t x_offset`: The offset in elements from the start of the input x vector. +* `const size_t x_inc`: Stride/increment of the input x vector. +* `cl_mem ap_buffer`: OpenCL buffer to store the output AP matrix. +* `const size_t ap_offset`: The offset in elements from the start of the output AP matrix. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. 
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + +xSYR2: Symmetric rank-2 matrix update +------------- + + + +C++ API: +``` +template +StatusCode Syr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastSsyr2(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastDsyr2(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to SYR2: + +* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout. +* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122). +* `const size_t n`: Integer size argument. +* `const T alpha`: Input scalar constant. +* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. +* `const size_t x_offset`: The offset in elements from the start of the input x vector. +* `const size_t x_inc`: Stride/increment of the input x vector. 
+* `const cl_mem y_buffer`: OpenCL buffer to store the input y vector. +* `const size_t y_offset`: The offset in elements from the start of the input y vector. +* `const size_t y_inc`: Stride/increment of the input y vector. +* `cl_mem a_buffer`: OpenCL buffer to store the output A matrix. +* `const size_t a_offset`: The offset in elements from the start of the output A matrix. +* `const size_t a_ld`: Leading dimension of the output A matrix. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + +xSPR2: Symmetric packed rank-2 matrix update +------------- + + + +C++ API: +``` +template +StatusCode Spr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastSspr2(const Layout layout, const Triangle triangle, + const size_t n, + const float alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastDspr2(const Layout layout, const Triangle triangle, + const size_t n, + const double alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to SPR2: + +* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for 
column-major data-layout. +* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122). +* `const size_t n`: Integer size argument. +* `const T alpha`: Input scalar constant. +* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. +* `const size_t x_offset`: The offset in elements from the start of the input x vector. +* `const size_t x_inc`: Stride/increment of the input x vector. +* `const cl_mem y_buffer`: OpenCL buffer to store the input y vector. +* `const size_t y_offset`: The offset in elements from the start of the input y vector. +* `const size_t y_inc`: Stride/increment of the input y vector. +* `cl_mem ap_buffer`: OpenCL buffer to store the output AP matrix. +* `const size_t ap_offset`: The offset in elements from the start of the output AP matrix. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. 
+ + + +xGEMM: General matrix-matrix multiplication +------------- + + + +C++ API: +``` +template +StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastSgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastDgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastCgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_float2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastZgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t 
k, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_double2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to GEMM: + +* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout. +* `const Transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose. +* `const Transpose`: Transposing the input matrix B, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose. +* `const size_t m`: Integer size argument. +* `const size_t n`: Integer size argument. +* `const size_t k`: Integer size argument. +* `const T alpha`: Input scalar constant. +* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix. +* `const size_t a_offset`: The offset in elements from the start of the input A matrix. +* `const size_t a_ld`: Leading dimension of the input A matrix. +* `const cl_mem b_buffer`: OpenCL buffer to store the input B matrix. +* `const size_t b_offset`: The offset in elements from the start of the input B matrix. +* `const size_t b_ld`: Leading dimension of the input B matrix. +* `const T beta`: Input scalar constant. +* `cl_mem c_buffer`: OpenCL buffer to store the output C matrix. +* `const size_t c_offset`: The offset in elements from the start of the output C matrix. +* `const size_t c_ld`: Leading dimension of the output C matrix. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). 
This is an optional argument. + + + +xSYMM: Symmetric matrix-matrix multiplication +------------- + + + +C++ API: +``` +template +StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastSsymm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastDsymm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastCsymm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_float2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastZsymm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const 
size_t b_offset, const size_t b_ld, + const cl_double2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to SYMM: + +* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout. +* `const Side`: The horizontal position of the triangular matrix, either `Side::kLeft` (141) or `Side::kRight` (142). +* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122). +* `const size_t m`: Integer size argument. +* `const size_t n`: Integer size argument. +* `const T alpha`: Input scalar constant. +* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix. +* `const size_t a_offset`: The offset in elements from the start of the input A matrix. +* `const size_t a_ld`: Leading dimension of the input A matrix. +* `const cl_mem b_buffer`: OpenCL buffer to store the input B matrix. +* `const size_t b_offset`: The offset in elements from the start of the input B matrix. +* `const size_t b_ld`: Leading dimension of the input B matrix. +* `const T beta`: Input scalar constant. +* `cl_mem c_buffer`: OpenCL buffer to store the output C matrix. +* `const size_t c_offset`: The offset in elements from the start of the output C matrix. +* `const size_t c_ld`: Leading dimension of the output C matrix. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. 
+ + + +xHEMM: Hermitian matrix-matrix multiplication +------------- + + + +C++ API: +``` +template +StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastChemm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_float2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastZhemm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_double2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to HEMM: + +* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout. +* `const Side`: The horizontal position of the triangular matrix, either `Side::kLeft` (141) or `Side::kRight` (142). +* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122). +* `const size_t m`: Integer size argument. +* `const size_t n`: Integer size argument. +* `const T alpha`: Input scalar constant. +* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix. 
+* `const size_t a_offset`: The offset in elements from the start of the input A matrix. +* `const size_t a_ld`: Leading dimension of the input A matrix. +* `const cl_mem b_buffer`: OpenCL buffer to store the input B matrix. +* `const size_t b_offset`: The offset in elements from the start of the input B matrix. +* `const size_t b_ld`: Leading dimension of the input B matrix. +* `const T beta`: Input scalar constant. +* `cl_mem c_buffer`: OpenCL buffer to store the output C matrix. +* `const size_t c_offset`: The offset in elements from the start of the output C matrix. +* `const size_t c_ld`: Leading dimension of the output C matrix. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + +xSYRK: Rank-K update of a symmetric matrix +------------- + + + +C++ API: +``` +template +StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastSsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastDsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, 
const size_t c_ld, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastCsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_float2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastZsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_double2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to SYRK: + +* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout. +* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122). +* `const Transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose. +* `const size_t n`: Integer size argument. +* `const size_t k`: Integer size argument. +* `const T alpha`: Input scalar constant. +* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix. +* `const size_t a_offset`: The offset in elements from the start of the input A matrix. +* `const size_t a_ld`: Leading dimension of the input A matrix. +* `const T beta`: Input scalar constant. +* `cl_mem c_buffer`: OpenCL buffer to store the output C matrix. +* `const size_t c_offset`: The offset in elements from the start of the output C matrix. +* `const size_t c_ld`: Leading dimension of the output C matrix. 
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + +xHERK: Rank-K update of a hermitian matrix +------------- + + + +C++ API: +``` +template +StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastCherk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastZherk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to HERK: + +* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout. +* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122). +* `const Transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose. +* `const size_t n`: Integer size argument. 
+* `const size_t k`: Integer size argument. +* `const T alpha`: Input scalar constant. +* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix. +* `const size_t a_offset`: The offset in elements from the start of the input A matrix. +* `const size_t a_ld`: Leading dimension of the input A matrix. +* `const T beta`: Input scalar constant. +* `cl_mem c_buffer`: OpenCL buffer to store the output C matrix. +* `const size_t c_offset`: The offset in elements from the start of the output C matrix. +* `const size_t c_ld`: Leading dimension of the output C matrix. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + +xSYR2K: Rank-2K update of a symmetric matrix +------------- + + + +C++ API: +``` +template +StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastSsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastDsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const 
size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastCsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_float2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastZsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const cl_double2 beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to SYR2K: + +* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout. +* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122). +* `const Transpose`: Transposing the input matrices A and B, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose. +* `const size_t n`: Integer size argument. +* `const size_t k`: Integer size argument. +* `const T alpha`: Input scalar constant. +* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix. +* `const size_t a_offset`: The offset in elements from the start of the input A matrix. +* `const size_t a_ld`: Leading dimension of the input A matrix. +* `const cl_mem b_buffer`: OpenCL buffer to store the input B matrix. 
+* `const size_t b_offset`: The offset in elements from the start of the input B matrix. +* `const size_t b_ld`: Leading dimension of the input B matrix. +* `const T beta`: Input scalar constant. +* `cl_mem c_buffer`: OpenCL buffer to store the output C matrix. +* `const size_t c_offset`: The offset in elements from the start of the output C matrix. +* `const size_t c_ld`: Leading dimension of the output C matrix. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + +xHER2K: Rank-2K update of a hermitian matrix +------------- + + + +C++ API: +``` +template <typename T, typename U> +StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const U beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastCher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastZher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + cl_mem c_buffer, const size_t c_offset, const 
size_t c_ld, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to HER2K: + +* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout. +* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122). +* `const Transpose`: Transposing the input matrices A and B, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose. +* `const size_t n`: Integer size argument. +* `const size_t k`: Integer size argument. +* `const T alpha`: Input scalar constant. +* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix. +* `const size_t a_offset`: The offset in elements from the start of the input A matrix. +* `const size_t a_ld`: Leading dimension of the input A matrix. +* `const cl_mem b_buffer`: OpenCL buffer to store the input B matrix. +* `const size_t b_offset`: The offset in elements from the start of the input B matrix. +* `const size_t b_ld`: Leading dimension of the input B matrix. +* `const U beta`: Input scalar constant. +* `cl_mem c_buffer`: OpenCL buffer to store the output C matrix. +* `const size_t c_offset`: The offset in elements from the start of the output C matrix. +* `const size_t c_ld`: Leading dimension of the output C matrix. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. 
+ + + +xTRMM: Triangular matrix-matrix multiplication +------------- + + + +C++ API: +``` +template +StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +StatusCode CLBlastStrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastDtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastCtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) +StatusCode CLBlastZtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to TRMM: + +* 
`const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout. +* `const Side`: The horizontal position of the triangular matrix, either `Side::kLeft` (141) or `Side::kRight` (142). +* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122). +* `const Transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose. +* `const Diagonal`: The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal. +* `const size_t m`: Integer size argument. +* `const size_t n`: Integer size argument. +* `const T alpha`: Input scalar constant. +* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix. +* `const size_t a_offset`: The offset in elements from the start of the input A matrix. +* `const size_t a_ld`: Leading dimension of the input A matrix. +* `cl_mem b_buffer`: OpenCL buffer to store the output B matrix. +* `const size_t b_offset`: The offset in elements from the start of the output B matrix. +* `const size_t b_ld`: Leading dimension of the output B matrix. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. 
+ + + diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 75c0a093..47972714 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -18,6 +18,7 @@ # It also generates the main functions for the correctness and performance tests as found in # test/correctness/routines/levelX/xYYYY.cc # test/performance/routines/levelX/xYYYY.cc +# It also produces the API documentation found in doc/clblast.md # # ================================================================================================== @@ -59,62 +60,62 @@ TU = DataType("TU", "typename T, typename U", "T,U", ["T", "U", "T", "U"], "T") # Populates a list of routines routines = [ [ # Level 1: vector-vector - Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation"), - Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], [], "", "Generate modified givens plane rotation"), - Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation"), - Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [], "", "Apply modified givens plane rotation"), - Routine(True, True, "1", "swap", T, [S,D,C,Z], ["n"], [], [], ["x","y"], [], "", "Swap two vectors"), - Routine(True, True, "1", "scal", T, [S,D,C,Z], ["n"], [], [], ["x"], ["alpha"], "", "Vector scaling"), - Routine(True, True, "1", "copy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], [], "", "Vector copy"), - Routine(True, True, "1", "axpy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], ["alpha"], "", "Vector-times-constant plus vector"), - Routine(True, True, "1", "dot", T, [S,D], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two vectors"), - Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors"), - Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], 
[], "n", "Dot product of two complex vectors, one conjugated"), - Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz],["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector"), - Routine(True, True, "1", "asum", T, [S,D,Sc,Dz],["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector"), - Routine(True, False, "1", "sum", T, [S,D,Sc,Dz],["n"], [], ["x"], ["sum"], [], "n", "Sum of values in a vector (non-BLAS function)"), - Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maximum value in a vector"), - Routine(True, False, "1", "max", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)"), - Routine(True, False, "1", "min", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imin"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)"), + Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation", "", []), + Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], [], "", "Generate modified givens plane rotation", "", []), + Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation", "", []), + Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [], "", "Apply modified givens plane rotation", "", []), + Routine(True, True, "1", "swap", T, [S,D,C,Z], ["n"], [], [], ["x","y"], [], "", "Swap two vectors", "Interchanges the contents of vectors x and y.", []), + Routine(True, True, "1", "scal", T, [S,D,C,Z], ["n"], [], [], ["x"], ["alpha"], "", "Vector scaling", "Multiplies all elements of vector x by a scalar constant alpha.", []), + Routine(True, True, "1", "copy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], [], "", "Vector copy", "Copies the contents of vector x into vector y.", []), + Routine(True, True, "1", "axpy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], 
["alpha"], "", "Vector-times-constant plus vector", "Performs the operation y = alpha * x + y, in which x and y are vectors and alpha is a scalar constant.", []), + Routine(True, True, "1", "dot", T, [S,D], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two vectors", "Multiplies the vectors x and y element-wise and accumulates the results. The sum is stored in the dot buffer.", []), + Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []), + Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []), + Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz],["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of each element in the x vector and takes the square root. The resulting L2 norm is stored in the nrm2 buffer.", []), + Routine(True, True, "1", "asum", T, [S,D,Sc,Dz],["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of each element in the x vector. The results are stored in the asum buffer.", []), + Routine(True, False, "1", "sum", T, [S,D,Sc,Dz],["n"], [], ["x"], ["sum"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of each element in the x vector. The results are stored in the sum buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []), + Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the x vector. 
The resulting integer index is stored in the imax buffer.", []), + Routine(True, False, "1", "max", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the x vector. The resulting integer index is stored in the imax buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []), + Routine(True, False, "1", "min", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imin"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the x vector. The resulting integer index is stored in the imin buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []), ], [ # Level 2: matrix-vector - Routine(True, True, "2a", "gemv", T, [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication"), - Routine(True, True, "2a", "gbmv", T, [S,D,C,Z], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General banded matrix-vector multiplication"), - Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian matrix-vector multiplication"), - Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication"), - Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication"), - Routine(True, True, "2a", "symv", T, [S,D], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric matrix-vector multiplication"), - Routine(True, True, "2a", "sbmv", T, [S,D], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication"), - Routine(True, True, 
"2a", "spmv", T, [S,D], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication"), - Routine(True, True, "2a", "trmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular matrix-vector multiplication"), - Routine(True, True, "2a", "tbmv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular banded matrix-vector multiplication"), - Routine(True, True, "2a", "tpmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "n", "Triangular packed matrix-vector multiplication"), - Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a triangular system of equations"), - Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a banded triangular system of equations"), - Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "", "Solves a packed triangular system of equations"), + Routine(True, True, "2a", "gemv", T, [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation y = alpha * A * x + beta * y, in which x is an input vector, y is an input and output vector, A is an input matrix, and alpha and beta are scalars. 
The matrix A can optionally be transposed before performing the operation.", []), + Routine(True, True, "2a", "gbmv", T, [S,D,C,Z], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is banded instead.", []), + Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix A is an Hermitian matrix instead.", []), + Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is an Hermitian banded matrix instead.", []), + Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix A is an Hermitian packed matrix instead and represented as AP.", []), + Routine(True, True, "2a", "symv", T, [S,D], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix A is symmetric instead.", []), + Routine(True, True, "2a", "sbmv", T, [S,D], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is symmetric and banded instead.", []), + Routine(True, True, "2a", "spmv", T, [S,D], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix A is a symmetric packed matrix instead and represented as AP.", []), + Routine(True, True, "2a", "trmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular matrix-vector 
multiplication", "Same operation as xGEMV, but matrix A is triangular instead.", []), + Routine(True, True, "2a", "tbmv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is triangular and banded instead.", []), + Routine(True, True, "2a", "tpmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix A is a triangular packed matrix instead and represented as AP.", []), + Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a triangular system of equations", "", []), + Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a banded triangular system of equations", "", []), + Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "", "Solves a packed triangular system of equations", "", []), # Level 2: matrix update - Routine(True, True, "2b", "ger", T, [S,D], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 matrix update"), - Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex matrix update"), - Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex conjugated matrix update"), - Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Hermitian rank-1 matrix update"), - Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Hermitian packed rank-1 matrix update"), - Routine(True, True, "2b", "her2", T, [C,Z], ["n"], 
["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Hermitian rank-2 matrix update"), - Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Hermitian packed rank-2 matrix update"), - Routine(True, True, "2b", "syr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Symmetric rank-1 matrix update"), - Routine(True, True, "2b", "spr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Symmetric packed rank-1 matrix update"), - Routine(True, True, "2b", "syr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Symmetric rank-2 matrix update"), - Routine(True, True, "2b", "spr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Symmetric packed rank-2 matrix update"), + Routine(True, True, "2b", "ger", T, [S,D], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 matrix update", "", []), + Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex matrix update", "", []), + Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex conjugated matrix update", "", []), + Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Hermitian rank-1 matrix update", "", []), + Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Hermitian packed rank-1 matrix update", "", []), + Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Hermitian rank-2 matrix update", "", []), + Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Hermitian packed rank-2 matrix update", "", []), + Routine(True, True, "2b", "syr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["a"], 
["alpha"], "", "Symmetric rank-1 matrix update", "", []), + Routine(True, True, "2b", "spr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Symmetric packed rank-1 matrix update", "", []), + Routine(True, True, "2b", "syr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Symmetric rank-2 matrix update", "", []), + Routine(True, True, "2b", "spr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Symmetric packed rank-2 matrix update", "", []), ], [ # Level 3: matrix-matrix - Routine(True, True, "3", "gemm", T, [S,D,C,Z], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "General matrix-matrix multiplication"), - Routine(True, True, "3", "symm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication"), - Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication"), - Routine(True, True, "3", "syrk", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a symmetric matrix"), - Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a hermitian matrix"), - Routine(True, True, "3", "syr2k", T, [S,D,C,Z], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix"), - Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix"), - Routine(True, True, "3", "trmm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Triangular matrix-matrix multiplication"), - Routine(False, 
True, "3", "trsm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Solves a triangular system of equations"), + Routine(True, True, "3", "gemm", T, [S,D,C,Z], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "General matrix-matrix multiplication", "", []), + Routine(True, True, "3", "symm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "", []), + Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "", []), + Routine(True, True, "3", "syrk", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "", []), + Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "", []), + Routine(True, True, "3", "syr2k", T, [S,D,C,Z], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "", []), + Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "", []), + Routine(True, True, "3", "trmm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Triangular matrix-matrix multiplication", "", []), + Routine(False, True, "3", "trsm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Solves a triangular system of equations", "", []), ]] # ================================================================================================== @@ -401,3 +402,61 @@ for level in 
[1,2,3]: f.write(footer) # ================================================================================================== + +# Outputs the API documentation +filename = path_clblast+"/doc/clblast.md" +with open(filename, "w") as f: + + # Outputs the header + f.write("CLBlast: API reference\n") + f.write("================\n") + f.write("\n\n") + + # Loops over the routines + for level in [1,2,3]: + for routine in routines[level-1]: + if routine.implemented: + + # Routine header + f.write("x"+routine.name.upper()+": "+routine.description+"\n") + f.write("-------------\n") + f.write("\n") + f.write(routine.details+"\n") + f.write("\n") + + # Routine API + f.write("C++ API:\n") + f.write("```\n") + f.write(routine.RoutineHeaderCPP(12, "")+"\n") + f.write("```\n") + f.write("\n") + f.write("C API:\n") + f.write("```\n") + for flavour in routine.flavours: + f.write(routine.RoutineHeaderC(flavour, 20, "")+"\n") + f.write("```\n") + f.write("\n") + + # Routine arguments + f.write("Arguments to "+routine.name.upper()+":\n") + f.write("\n") + for argument in routine.ArgumentsDoc(): + f.write("* "+argument+"\n") + f.write("* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.\n") + f.write("* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). 
This is an optional argument.\n") + f.write("\n") + + # Routine requirements + if len(routine.RequirementsDoc()) > 0: + f.write("Requirements for "+routine.name.upper()+":\n") + f.write("\n") + for requirement in routine.RequirementsDoc(): + f.write("* "+requirement+"\n") + f.write("\n") + + + # Routine footer + f.write("\n\n") + + +# ================================================================================================== diff --git a/scripts/generator/routine.py b/scripts/generator/routine.py index 95681da6..e5059c61 100644 --- a/scripts/generator/routine.py +++ b/scripts/generator/routine.py @@ -51,12 +51,24 @@ def OptionToWrapperC(x): 'diagonal': "CBLAS_DIAG", }[x] +# Translates an option name to a documentation string +def OptionToDoc(x): + return { + 'layout': "Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.", + 'a_transpose': "Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.", + 'b_transpose': "Transposing the input matrix B, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.", + 'ab_transpose': "Transposing the input matrices A and B, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.", + 'side': "The horizontal position of the triangular matrix, either `Side::kLeft` (141) or `Side::kRight` (142).", + 'triangle': "The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).", + 'diagonal': "The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal.", + }[x] + # 
================================================================================================== # Class holding routine-specific information (e.g. name, which arguments, which precisions) class Routine(): def __init__(self, implemented, has_tests, level, name, template, flavours, sizes, options, - inputs, outputs, scalars, scratch, description): + inputs, outputs, scalars, scratch, description, details, requirements): self.implemented = implemented self.has_tests = has_tests self.level = level @@ -70,6 +82,8 @@ class Routine(): self.scalars = scalars self.scratch = scratch # Scratch buffer (e.g. for xDOT) self.description = description + self.details = details + self.requirements = requirements # List of scalar buffers def ScalarBuffersFirst(self): @@ -115,6 +129,12 @@ class Routine(): return ["ap","a","b","c"] return ["y","c"] + # Distinguish between vectors and matrices + def BuffersVector(self): + return ["x","y"] + def BuffersMatrix(self): + return ["a","b","c","ap"] + # ============================================================================================== # Retrieves a variable name for a specific input/output vector/matrix (e.g. 
'x') @@ -197,6 +217,19 @@ class Routine(): return [", ".join(a+b+c)] return [] + # Retrieves the documentation of the buffers + def BufferDoc(self, name): + prefix = "const " if (name in self.inputs) else "" + inout = "input" if (name in self.inputs) else "output" + if (name in self.inputs) or (name in self.outputs): + math_name = name.upper()+" matrix" if (name in self.BuffersMatrix()) else name+" vector" + incld_description = "Leading dimension " if (name in self.BuffersMatrix()) else "Stride/increment " + a = ["`"+prefix+"cl_mem "+name+"_buffer`: OpenCL buffer to store the "+inout+" "+math_name+"."] + b = ["`const size_t "+name+"_offset`: The offset in elements from the start of the "+inout+" "+math_name+"."] + c = ["`const size_t "+name+"_"+self.Postfix(name)+"`: "+incld_description+"of the "+inout+" "+math_name+"."] if (name not in self.BuffersWithoutLdInc()) else [] + return a+b+c + return [] + # ============================================================================================== # Retrieves the name of a scalar (alpha/beta) @@ -257,6 +290,14 @@ class Routine(): return ["const "+flavour.beta_cpp] return [] + # Retrieves the documentation of a scalar + def ScalarDoc(self, name): + if name in self.scalars: + if name == "alpha": + return ["`const "+self.template.alpha_cpp+" "+name+"`: Input scalar constant."] + return ["`const "+self.template.beta_cpp+" "+name+"`: Input scalar constant."] + return [] + # ============================================================================================== # Retrieves a list of comma-separated sizes (m, n, k) @@ -277,6 +318,13 @@ class Routine(): return [", ".join(["const size_t" for s in self.sizes])] return [] + # Retrieves the documentation of the sizes + def SizesDoc(self): + if self.sizes: + definitions = ["`const size_t "+s+"`: Integer size argument." 
for s in self.sizes] + return definitions + return [] + # ============================================================================================== # Retrieves a list of options @@ -320,6 +368,13 @@ class Routine(): return [", ".join(definitions)] return [] + # Retrieves the documentation of the options + def OptionsDoc(self): + if self.options: + definitions = ["`const "+OptionToCLBlast(o)+"`: "+OptionToDoc(o) for o in self.options] + return definitions + return [] + # ============================================================================================== # Retrieves a combination of all the argument names, with Claduc casts @@ -408,6 +463,24 @@ class Routine(): list(chain(*[self.BufferType(b) for b in self.BuffersSecond()])) + list(chain(*[self.BufferType(b) for b in self.ScalarBuffersSecond()])) + list(chain(*[self.ScalarType(s, flavour) for s in self.OtherScalars()]))) + + # Retrieves a combination of all the argument types + def ArgumentsDoc(self): + return (self.OptionsDoc() + self.SizesDoc() + + list(chain(*[self.BufferDoc(b) for b in self.ScalarBuffersFirst()])) + + list(chain(*[self.BufferDoc(b) for b in self.ScalarBuffersFirst()])) + + self.ScalarDoc("alpha") + + list(chain(*[self.BufferDoc(b) for b in self.BuffersFirst()])) + + self.ScalarDoc("beta") + + list(chain(*[self.BufferDoc(b) for b in self.BuffersSecond()])) + + list(chain(*[self.BufferDoc(b) for b in self.ScalarBuffersSecond()])) + + list(chain(*[self.ScalarDoc(s) for s in self.OtherScalars()]))) + + # ============================================================================================== + + # Retrieves a list of routine requirements for documentation + def RequirementsDoc(self): + return [] # ============================================================================================== From 1acb31896c2e6cabea2b2d8fe9511d3726743b54 Mon Sep 17 00:00:00 2001 From: cnugteren Date: Sun, 8 May 2016 10:06:06 +0200 Subject: [PATCH 55/60] Fixed an issue with computing the GFLOPS 
numbers for the xGEMM performance tests for non-square matrices --- test/routines/level3/xgemm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/routines/level3/xgemm.h b/test/routines/level3/xgemm.h index 10fc2803..cd5c2acd 100644 --- a/test/routines/level3/xgemm.h +++ b/test/routines/level3/xgemm.h @@ -153,10 +153,10 @@ class TestXgemm { // Describes how to compute performance metrics static size_t GetFlops(const Arguments &args) { - return 2 * args.m * args.n * args.m; + return 2 * args.m * args.n * args.k; } static size_t GetBytes(const Arguments &args) { - return (args.m*args.m + args.m*args.n + 2*args.m*args.n) * sizeof(T); + return (args.m*args.k + args.k*args.n + 2*args.m*args.n) * sizeof(T); } }; From 25a25dbd6f6065420392e59c726902e05c0d4a5a Mon Sep 17 00:00:00 2001 From: cnugteren Date: Sun, 8 May 2016 17:30:31 +0200 Subject: [PATCH 56/60] Fixed errors in xAXPY and xSCAL tests on AMD hardware --- src/kernels/level1/xaxpy.opencl | 7 +++++-- src/kernels/level1/xscal.opencl | 6 ++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl index 1f1e8ce0..574beb43 100644 --- a/src/kernels/level1/xaxpy.opencl +++ b/src/kernels/level1/xaxpy.opencl @@ -30,7 +30,8 @@ __kernel void Xaxpy(const int n, const real alpha, // Loops over the work that needs to be done (allows for an arbitrary number of threads) #pragma unroll for (int id = get_global_id(0); id Date: Sun, 8 May 2016 18:07:55 +0200 Subject: [PATCH 57/60] Fixed an issue where the xNRM2 and xASUM testers would incorrectly report failures for complex inputs --- scripts/generator/generator.py | 10 ++++++++-- test/wrapper_cblas.h | 16 ++++++++-------- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 47972714..874074d1 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -266,6 +266,7 @@ def 
wrapper_cblas(routines): # Special case for scalar outputs assignment = "" postfix = "" + endofline = "" extra_argument = "" for output_buffer in routine.outputs: if output_buffer in routine.ScalarBuffersFirst(): @@ -274,12 +275,17 @@ def wrapper_cblas(routines): indent += " " extra_argument += ",\n"+indent+"reinterpret_cast(&"+output_buffer+"_buffer["+output_buffer+"_offset])" else: - assignment = output_buffer+"_buffer["+output_buffer+"_offset] = " + assignment = output_buffer+"_buffer["+output_buffer+"_offset]" + if (flavour.name in ["Sc","Dz"]): + assignment = assignment+".real(" + endofline += ")" + else: + assignment = assignment+" = " indent += " "*len(assignment) result += " "+assignment+"cblas_"+flavour.name.lower()+routine.name+postfix+"(" result += (",\n"+indent).join([a for a in arguments]) - result += extra_argument+");" + result += extra_argument+endofline+");" result += "\n}\n" return result diff --git a/test/wrapper_cblas.h b/test/wrapper_cblas.h index 566c90e5..1b6977c0 100644 --- a/test/wrapper_cblas.h +++ b/test/wrapper_cblas.h @@ -338,14 +338,14 @@ void cblasXnrm2(const size_t n, void cblasXnrm2(const size_t n, std::vector& nrm2_buffer, const size_t nrm2_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { - nrm2_buffer[nrm2_offset] = cblas_scnrm2(n, - reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); + nrm2_buffer[nrm2_offset].real(cblas_scnrm2(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc))); } void cblasXnrm2(const size_t n, std::vector& nrm2_buffer, const size_t nrm2_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { - nrm2_buffer[nrm2_offset] = cblas_dznrm2(n, - reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); + nrm2_buffer[nrm2_offset].real(cblas_dznrm2(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc))); } // Forwards the Netlib BLAS calls for SASUM/DASUM/ScASUM/DzASUM @@ -364,14 +364,14 @@ void cblasXasum(const size_t n, void 
cblasXasum(const size_t n, std::vector& asum_buffer, const size_t asum_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { - asum_buffer[asum_offset] = cblas_scasum(n, - reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); + asum_buffer[asum_offset].real(cblas_scasum(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc))); } void cblasXasum(const size_t n, std::vector& asum_buffer, const size_t asum_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { - asum_buffer[asum_offset] = cblas_dzasum(n, - reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); + asum_buffer[asum_offset].real(cblas_dzasum(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc))); } // Forwards the Netlib BLAS calls for iSAMAX/iDAMAX/iCAMAX/iZAMAX From 3b81ee2c088355d60aabf1e9d919384fb47b808e Mon Sep 17 00:00:00 2001 From: cnugteren Date: Sun, 8 May 2016 18:28:01 +0200 Subject: [PATCH 58/60] Fixed an issue where the xAMAX tester would incorrectly report failures when testing against CBLAS --- scripts/generator/generator.py | 3 +++ test/wrapper_cblas.h | 16 ++++++++-------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 874074d1..210f371f 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -274,6 +274,9 @@ def wrapper_cblas(routines): postfix += "_sub" indent += " " extra_argument += ",\n"+indent+"reinterpret_cast(&"+output_buffer+"_buffer["+output_buffer+"_offset])" + elif output_buffer in routine.IndexBuffers(): + assignment = "((int*)&"+output_buffer+"_buffer[0])["+output_buffer+"_offset] = " + indent += " "*len(assignment) else: assignment = output_buffer+"_buffer["+output_buffer+"_offset]" if (flavour.name in ["Sc","Dz"]): diff --git a/test/wrapper_cblas.h b/test/wrapper_cblas.h index 1b6977c0..529acfbf 100644 --- a/test/wrapper_cblas.h +++ b/test/wrapper_cblas.h @@ -378,26 +378,26 @@ 
void cblasXasum(const size_t n, void cblasXamax(const size_t n, std::vector& imax_buffer, const size_t imax_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { - imax_buffer[imax_offset] = cblas_isamax(n, - &x_buffer[x_offset], static_cast(x_inc)); + ((int*)&imax_buffer[0])[imax_offset] = cblas_isamax(n, + &x_buffer[x_offset], static_cast(x_inc)); } void cblasXamax(const size_t n, std::vector& imax_buffer, const size_t imax_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { - imax_buffer[imax_offset] = cblas_idamax(n, - &x_buffer[x_offset], static_cast(x_inc)); + ((int*)&imax_buffer[0])[imax_offset] = cblas_idamax(n, + &x_buffer[x_offset], static_cast(x_inc)); } void cblasXamax(const size_t n, std::vector& imax_buffer, const size_t imax_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { - imax_buffer[imax_offset] = cblas_icamax(n, - reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); + ((int*)&imax_buffer[0])[imax_offset] = cblas_icamax(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); } void cblasXamax(const size_t n, std::vector& imax_buffer, const size_t imax_offset, const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { - imax_buffer[imax_offset] = cblas_izamax(n, - reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); + ((int*)&imax_buffer[0])[imax_offset] = cblas_izamax(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); } // ================================================================================================= From c5730c8b438c2c2a03f3f93c14b111877b98a03f Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 8 May 2016 20:29:41 +0200 Subject: [PATCH 59/60] Updated to version 0.7.0 --- CHANGELOG | 2 +- CMakeLists.txt | 2 +- README.md | 2 +- samples/sasum.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 51f529d3..c77e5e48 100644 --- a/CHANGELOG 
+++ b/CHANGELOG @@ -1,5 +1,5 @@ -Development version (next release) +Version 0.7.0 - Added exports to be able to create a DLL on Windows (thanks to Marco Hutter) - Made the library thread-safe - Performance and correctness tests can now (on top of clBLAS) be performed against CPU BLAS libraries diff --git a/CMakeLists.txt b/CMakeLists.txt index d63105e4..44524537 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,7 +13,7 @@ cmake_minimum_required(VERSION 2.8.10) project("clblast" C CXX) set(clblast_VERSION_MAJOR 0) -set(clblast_VERSION_MINOR 6) +set(clblast_VERSION_MINOR 7) set(clblast_VERSION_PATCH 0) # Options and their default values diff --git a/README.md b/README.md index ae236622..8d9220a6 100644 --- a/README.md +++ b/README.md @@ -84,7 +84,7 @@ Or alternatively the plain C version: #include -Afterwards, any of CLBlast's routines can be called directly: there is no need to initialize the library. The available routines and the required arguments are described in the `clblast.h` include file. Additionally, a couple of stand-alone example programs are included in `samples/`. +Afterwards, any of CLBlast's routines can be called directly: there is no need to initialize the library. The available routines and the required arguments are described in the `clblast.h` include file and the included [API documentation](doc/api.md). Additionally, a couple of stand-alone example programs are included in `samples/`. Using the tuners (optional) diff --git a/samples/sasum.c b/samples/sasum.c index 3b20d301..3fdbb0eb 100644 --- a/samples/sasum.c +++ b/samples/sasum.c @@ -79,7 +79,7 @@ int main(void) { clEnqueueReadBuffer(queue, device_output, CL_TRUE, 0, 1*sizeof(float), host_output, 0, NULL, NULL); // Example completed. See "clblast_c.h" for status codes (0 -> success). 
- printf("Completed SASUM with status %d: %d * |%.1lf| = %.1lf\n", status, n, input_value, host_output[0]); + printf("Completed SASUM with status %d: %zu * |%.1lf| = %.1lf\n", status, n, input_value, host_output[0]); // Clean-up free(platforms); From 942912daeb4e1d84820d813c1d3c03eae5361449 Mon Sep 17 00:00:00 2001 From: CNugteren Date: Sun, 8 May 2016 21:11:37 +0200 Subject: [PATCH 60/60] Fixes for compilation of the tests under Visual Studio 2015 --- test/correctness/testblas.cc | 4 ++-- test/correctness/tester.cc | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test/correctness/testblas.cc b/test/correctness/testblas.cc index 1f83c59b..e70c0361 100644 --- a/test/correctness/testblas.cc +++ b/test/correctness/testblas.cc @@ -145,8 +145,8 @@ void TestBlas::TestRegular(std::vector> &test_vector, const st if (!TestSimilarity(result1[index], result2[index])) { errors++; if (verbose_) { - if (get_id2_(args) == 1) { fprintf(stdout, "\n Error at index %lu: ", id1); } - else { fprintf(stdout, "\n Error at %lu,%lu: ", id1, id2); } + if (get_id2_(args) == 1) { fprintf(stdout, "\n Error at index %zu: ", id1); } + else { fprintf(stdout, "\n Error at %zu,%zu: ", id1, id2); } std::cout << result1[index]; fprintf(stdout, " (reference) versus "); std::cout << result2[index]; diff --git a/test/correctness/tester.cc b/test/correctness/tester.cc index 82926c3c..26c4ba59 100644 --- a/test/correctness/tester.cc +++ b/test/correctness/tester.cc @@ -333,8 +333,8 @@ bool TestSimilarity(const T val1, const T val2) { const auto difference = std::fabs(val1 - val2); // Set the allowed error margin for floating-point comparisons - constexpr auto kErrorMarginRelative = T{0.025}; - constexpr auto kErrorMarginAbsolute = T{1.0e-4}; + constexpr auto kErrorMarginRelative = T(0.025); + constexpr auto kErrorMarginAbsolute = T(1.0e-4); // Shortcut, handles infinities if (val1 == val2) {