diff --git a/CHANGELOG b/CHANGELOG index c52e041d..db14f037 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,7 @@ Development version (next release) - Added exports to be able to create a DLL on Windows (thanks to Marco Hutter) - Made the library thread-safe +- Performance and correctness tests can now (on top of clBLAS) be performed against CPU BLAS libraries - Fixed the use of events within the library - Added level-1 routines: * SNRM2/DNRM2/ScNRM2/DzNRM2 diff --git a/CMakeLists.txt b/CMakeLists.txt index 8316a49a..21254ded 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -66,7 +66,7 @@ else () set(FLAGS "${FLAGS} -Wno-attributes -Wno-unused-variable") endif() elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") - set(FLAGS "${FLAGS} -Weverything -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded") + set(FLAGS "${FLAGS} -Wall -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded") set(FLAGS "${FLAGS} -Wno-missing-prototypes -Wno-float-equal -Wno-switch-enum -Wno-switch") set(FLAGS "${FLAGS} -Wno-exit-time-destructors -Wno-global-constructors -Wno-missing-noreturn") endif() @@ -98,11 +98,13 @@ if(TUNERS) endif() endif() -# Locates the clBLAS library in case the tests need to be compiled. "FindclBLAS.cmake" is included. +# Locates the reference BLAS libraries in case the tests need to be compiled. The "FindclBLAS.cmake" +# and "FindCBLAS.cmake" are included. if(TESTS) find_package(clBLAS) - if(NOT CLBLAS_FOUND) - message(STATUS "Could NOT find clBLAS, disabling the compilation of the tests") + find_package(CBLAS) + if(NOT CLBLAS_FOUND AND NOT CBLAS_FOUND) + message(STATUS "Could NOT find clBLAS nor a CPU BLAS, disabling the compilation of the tests") set(TESTS OFF) endif() endif() @@ -215,11 +217,33 @@ endif() # ================================================================================================== # Down from here is all test (performance and correctness) related. 
Note that these tests require -# the presence of the clBLAS library to act as a reference. +# the presence of clBLAS and/or a BLAS library to act as a reference. if(TESTS) - # Adds new include directories for the reference clBLAS - include_directories(${clblast_SOURCE_DIR}/test ${CLBLAS_INCLUDE_DIRS}) + # Sets the specifics for the reference BLAS libraries + set(REF_INCLUDES ) + set(REF_LIBRARIES ) + if(CLBLAS_FOUND) + set(REF_INCLUDES ${REF_INCLUDES} ${CLBLAS_INCLUDE_DIRS}) + set(REF_LIBRARIES ${REF_LIBRARIES} ${CLBLAS_LIBRARIES}) + if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") + add_definitions(" /DCLBLAST_REF_CLBLAS") + else() + add_definitions(" -DCLBLAST_REF_CLBLAS") + endif() + endif() + if(CBLAS_FOUND) + set(REF_INCLUDES ${REF_INCLUDES} ${CBLAS_INCLUDE_DIRS}) + set(REF_LIBRARIES ${REF_LIBRARIES} ${CBLAS_LIBRARIES}) + if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") + add_definitions(" /DCLBLAST_REF_CBLAS") + else() + add_definitions(" -DCLBLAST_REF_CBLAS") + endif() + endif() + + # Sets the include directories + include_directories(${clblast_SOURCE_DIR}/test ${REF_INCLUDES}) # Creates the common correctness-tests objects (requires CMake 2.8.8) add_library(test_correctness_common OBJECT @@ -239,7 +263,7 @@ if(TESTS) test/correctness/routines/level3/${ROUTINE}.cc) endforeach() foreach(ROUTINE ${ROUTINES}) - target_link_libraries(clblast_test_${ROUTINE} clblast ${CLBLAS_LIBRARIES} ${OPENCL_LIBRARIES}) + target_link_libraries(clblast_test_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES}) install(TARGETS clblast_test_${ROUTINE} DESTINATION bin) endforeach() @@ -269,7 +293,7 @@ if(TESTS) test/performance/routines/level3/${ROUTINE}.cc) endforeach() foreach(ROUTINE ${ROUTINES}) - target_link_libraries(clblast_client_${ROUTINE} clblast ${CLBLAS_LIBRARIES} ${OPENCL_LIBRARIES}) + target_link_libraries(clblast_client_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES}) install(TARGETS clblast_client_${ROUTINE} DESTINATION bin) endforeach() diff --git 
a/README.md b/README.md index ac614026..d69ad552 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,14 @@ The pre-requisites for compilation of CLBlast are: - Intel OpenCL - Beignet +Furthermore, to build the (optional) correctness and performance tests, another BLAS library is needed to serve as a reference. This can be either: + +* The OpenCL BLAS library [clBLAS](http://github.com/clMathLibraries/clBLAS) (maintained by AMD) +* A regular CPU Netlib BLAS library, e.g.: + - OpenBLAS + - BLIS + - Accelerate + An example of an out-of-source build (starting from the root of the CLBlast folder): mkdir build @@ -135,9 +143,9 @@ To make sure CLBlast is working correctly on your device (recommended), compile cmake -DTESTS=ON .. -Afterwards, executables in the form of `clblast_test_xxxxx` are available, in which `xxxxx` is the name of a routine (e.g. `xgemm`). Note that CLBlast is tested against [clBLAS](http://github.com/clMathLibraries/clBLAS) for correctness. The library clBLAS is therefore required to be installed on your system for the CLBlast tests. +Afterwards, executables in the form of `clblast_test_xxxxx` are available, in which `xxxxx` is the name of a routine (e.g. `xgemm`). Note that CLBlast is best tested against [clBLAS](http://github.com/clMathLibraries/clBLAS) for correctness. If the library clBLAS is not installed on your system, it will use a regular CPU BLAS library to test against. -With the `-DTESTS=ON` flag, additional performance tests are compiled. These come in the form of client executables named `clblast_client_xxxxx`, in which `xxxxx` is the name of a routine (e.g. `xgemm`). These clients take a bunch of configuration options and directly run both CLBlast and clBLAS in a head-to-head performance test. +With the `-DTESTS=ON` flag, additional performance tests are compiled. These come in the form of client executables named `clblast_client_xxxxx`, in which `xxxxx` is the name of a routine (e.g. `xgemm`). 
These clients take a bunch of configuration options and directly run CLBlast in a head-to-head performance test against clBLAS and/or a CPU BLAS library. Performance remarks @@ -249,4 +257,3 @@ To-do list before release of version 1.0 - Support all routines supported by clBLAS - Allow the user control over events and synchronization - Add half-precision routines (e.g. HGEMM) -- Enable correctness and performance testing against a CPU-based BLAS library diff --git a/cmake/Modules/FindCBLAS.cmake b/cmake/Modules/FindCBLAS.cmake new file mode 100644 index 00000000..86f14515 --- /dev/null +++ b/cmake/Modules/FindCBLAS.cmake @@ -0,0 +1,75 @@ + +# ================================================================================================== +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +# project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +# width of 100 characters per line. +# +# Author(s): +# Cedric Nugteren +# +# ================================================================================================== +# +# Defines the following variables: +# CBLAS_FOUND Boolean holding whether or not the Netlib BLAS library was found +# CBLAS_INCLUDE_DIRS The Netlib BLAS include directory +# CBLAS_LIBRARIES The Netlib BLAS library +# +# In case BLAS is not installed in the default directory, set the CBLAS_ROOT variable to point to +# the root of BLAS, such that 'cblas.h' can be found in $CBLAS_ROOT/include. This can either be +# done using an environmental variable (e.g. export CBLAS_ROOT=/path/to/BLAS) or using a CMake +# variable (e.g. cmake -DCBLAS_ROOT=/path/to/BLAS ..). 
+# +# ================================================================================================== + +# Sets the possible install locations +set(CBLAS_HINTS + ${CBLAS_ROOT} + $ENV{CBLAS_ROOT} +) +set(CBLAS_PATHS + /usr + /usr/local + /usr/local/opt + /System/Library/Frameworks +) + +# Finds the include directories +find_path(CBLAS_INCLUDE_DIRS + NAMES cblas.h + HINTS ${CBLAS_HINTS} + PATH_SUFFIXES + include inc include/x86_64 include/x64 + openblas/include include/blis blis/include blis/include/blis + Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Versions/Current/Headers + PATHS ${CBLAS_PATHS} + DOC "Netlib BLAS include header cblas.h" +) +mark_as_advanced(CBLAS_INCLUDE_DIRS) + +# Finds the library +find_library(CBLAS_LIBRARIES + NAMES blas mkl blis openblas atlas accelerate + HINTS ${CBLAS_HINTS} + PATH_SUFFIXES + lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 lib/import lib64/import + openblas/lib blis/lib + PATHS ${CBLAS_PATHS} + DOC "Netlib BLAS library" +) +mark_as_advanced(CBLAS_LIBRARIES) + +# ================================================================================================== + +# Notification messages +if(NOT CBLAS_INCLUDE_DIRS) + message(STATUS "Could NOT find 'cblas.h', install a CPU Netlib BLAS or set CBLAS_ROOT") +endif() +if(NOT CBLAS_LIBRARIES) + message(STATUS "Could NOT find a CPU Netlib BLAS library, install it or set CBLAS_ROOT") +endif() + +# Determines whether or not BLAS was found +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(CBLAS DEFAULT_MSG CBLAS_INCLUDE_DIRS CBLAS_LIBRARIES) + +# ================================================================================================== diff --git a/include/clblast.h b/include/clblast.h index 5e5c5a46..431f2510 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -100,7 +100,7 @@ template StatusCode Rotmg(cl_mem sd1_buffer, const size_t sd1_offset, cl_mem sd2_buffer, const size_t sd2_offset, cl_mem sx1_buffer, const size_t 
sx1_offset, - cl_mem sy1_buffer, const size_t sy1_offset, + const cl_mem sy1_buffer, const size_t sy1_offset, cl_mem sparam_buffer, const size_t sparam_offset, cl_command_queue* queue, cl_event* event = nullptr); diff --git a/include/clblast_c.h b/include/clblast_c.h index dcb3ae3a..f72cff3a 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -112,13 +112,13 @@ StatusCode PUBLIC_API CLBlastDrotg(cl_mem sa_buffer, const size_t sa_offset, StatusCode PUBLIC_API CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset, cl_mem sd2_buffer, const size_t sd2_offset, cl_mem sx1_buffer, const size_t sx1_offset, - cl_mem sy1_buffer, const size_t sy1_offset, + const cl_mem sy1_buffer, const size_t sy1_offset, cl_mem sparam_buffer, const size_t sparam_offset, cl_command_queue* queue, cl_event* event); StatusCode PUBLIC_API CLBlastDrotmg(cl_mem sd1_buffer, const size_t sd1_offset, cl_mem sd2_buffer, const size_t sd2_offset, cl_mem sx1_buffer, const size_t sx1_offset, - cl_mem sy1_buffer, const size_t sy1_offset, + const cl_mem sy1_buffer, const size_t sy1_offset, cl_mem sparam_buffer, const size_t sparam_offset, cl_command_queue* queue, cl_event* event); diff --git a/include/internal/clpp11.h b/include/internal/clpp11.h index aac66396..00905ef7 100644 --- a/include/internal/clpp11.h +++ b/include/internal/clpp11.h @@ -465,31 +465,33 @@ class Buffer { } // Copies from device to host: reading the device buffer a-synchronously - void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) { + void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const { if (access_ == BufferAccess::kWriteOnly) { Error("reading from a write-only buffer"); } CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T), host, 0, nullptr, nullptr)); } void ReadAsync(const Queue &queue, const size_t size, std::vector &host, - const size_t offset = 0) { + const size_t offset = 0) const { if (host.size() 
< size) { Error("target host buffer is too small"); } ReadAsync(queue, size, host.data(), offset); } void ReadAsync(const Queue &queue, const size_t size, BufferHost &host, - const size_t offset = 0) { + const size_t offset = 0) const { if (host.size() < size) { Error("target host buffer is too small"); } ReadAsync(queue, size, host.data(), offset); } // Copies from device to host: reading the device buffer - void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) { + void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const { ReadAsync(queue, size, host, offset); queue.Finish(); } - void Read(const Queue &queue, const size_t size, std::vector &host, const size_t offset = 0) { + void Read(const Queue &queue, const size_t size, std::vector &host, + const size_t offset = 0) const { Read(queue, size, host.data(), offset); } - void Read(const Queue &queue, const size_t size, BufferHost &host, const size_t offset = 0) { + void Read(const Queue &queue, const size_t size, BufferHost &host, + const size_t offset = 0) const { Read(queue, size, host.data(), offset); } diff --git a/include/internal/utilities.h b/include/internal/utilities.h index 35f76722..6adc1d0a 100644 --- a/include/internal/utilities.h +++ b/include/internal/utilities.h @@ -35,6 +35,9 @@ using double2 = std::complex; const std::string kKhronosHalfPrecision = "cl_khr_fp16"; const std::string kKhronosDoublePrecision = "cl_khr_fp64"; +// Caught an unknown error +constexpr auto kUnknownError = -999; + // ================================================================================================= // The routine-specific arguments in string form @@ -70,6 +73,7 @@ constexpr auto kArgFraction = "fraction"; // The client-specific arguments in string form constexpr auto kArgCompareclblas = "clblas"; +constexpr auto kArgComparecblas = "cblas"; constexpr auto kArgStepSize = "step"; constexpr auto kArgNumSteps = "num_steps"; constexpr auto kArgNumRuns = 
"runs"; @@ -128,6 +132,7 @@ struct Arguments { double fraction = 1.0; // Client-specific arguments int compare_clblas = 1; + int compare_cblas = 1; size_t step = 1; size_t num_steps = 0; size_t num_runs = 10; diff --git a/scripts/generator/datatype.py b/scripts/generator/datatype.py index 9323bc4d..5a58ab53 100644 --- a/scripts/generator/datatype.py +++ b/scripts/generator/datatype.py @@ -58,5 +58,10 @@ class DataType(): return "<"+self.buffertype+","+self.beta_cpp+">, "+self.buffertype+", "+self.beta_cpp return "<"+self.buffertype+">, "+self.buffertype+", "+self.beta_cpp + # Current scalar is complex + def IsComplex(self, scalar): + return ((scalar == "alpha" and self.alpha_cpp in [FLT2, DBL2]) or + (scalar == "beta" and self.beta_cpp in [FLT2, DBL2])) + # ================================================================================================== diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 6e2b2ed2..bdf6b9d7 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -8,12 +8,13 @@ # Cedric Nugteren # # This script automatically generates the bodies of the following files, creating the full CLBlast -# API interface and implementation (C, C++, and clBLAS wrapper): +# API interface and implementation (C, C++, and reference BLAS wrappers): # clblast.h # clblast.cc # clblast_c.h # clblast_c.cc # wrapper_clblas.h +# wrapper_cblas.h # It also generates the main functions for the correctness and performance tests as found in # test/correctness/routines/levelX/xYYYY.cc # test/performance/routines/levelX/xYYYY.cc @@ -55,7 +56,7 @@ TU = DataType("TU", "typename T, typename U", "T,U", ["T", "U", "T", "U"], "T") routines = [ [ # Level 1: vector-vector Routine(False, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation"), - Routine(False, "1", "rotmg", T, [S,D], [], [], [], ["sd1","sd2","sx1","sy1","sparam"], [], "", "Generate modified givens plane rotation"), + 
Routine(False, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], [], "", "Generate modified givens plane rotation"), Routine(False, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation"), Routine(False, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [], "", "Apply modified givens plane rotation"), Routine(True, "1", "swap", T, [S,D,C,Z], ["n"], [], [], ["x","y"], [], "", "Swap two vectors"), @@ -220,11 +221,11 @@ def wrapper_clblas(routines): for routine in routines: result += "\n// Forwards the clBLAS calls for %s\n" % (routine.ShortNames()) if routine.NoScalars(): - result += routine.RoutineHeaderWrapper(routine.template, True, 21)+";\n" + result += routine.RoutineHeaderWrapperCL(routine.template, True, 21)+";\n" for flavour in routine.flavours: indent = " "*(17 + routine.Length()) - result += routine.RoutineHeaderWrapper(flavour, False, 21)+" {\n" - arguments = routine.ArgumentsWrapper(flavour) + result += routine.RoutineHeaderWrapperCL(flavour, False, 21)+" {\n" + arguments = routine.ArgumentsWrapperCL(flavour) if routine.scratch: result += " auto queue = Queue(queues[0]);\n" result += " auto context = queue.GetContext();\n" @@ -236,6 +237,41 @@ def wrapper_clblas(routines): result += "\n}\n" return result +# The wrapper to the reference CBLAS routines (for performance/correctness testing) +def wrapper_cblas(routines): + result = "" + for routine in routines: + result += "\n// Forwards the Netlib BLAS calls for %s\n" % (routine.ShortNames()) + for flavour in routine.flavours: + indent = " "*(10 + routine.Length()) + result += routine.RoutineHeaderWrapperC(flavour, False, 12)+" {\n" + arguments = routine.ArgumentsWrapperC(flavour) + + # Double-precision scalars + for scalar in routine.scalars: + if flavour.IsComplex(scalar): + result += " const auto "+scalar+"_array = std::vector<"+flavour.buffertype[:-1]+">{"+scalar+".real(), "+scalar+".imag()};\n" + + # Special case for scalar outputs + 
assignment = "" + postfix = "" + extra_argument = "" + for output_buffer in routine.outputs: + if output_buffer in routine.ScalarBuffersFirst(): + if flavour in [C,Z]: + postfix += "_sub" + indent += " " + extra_argument += ",\n"+indent+"reinterpret_cast(&"+output_buffer+"_buffer["+output_buffer+"_offset])" + else: + assignment = output_buffer+"_buffer["+output_buffer+"_offset] = " + indent += " "*len(assignment) + + result += " "+assignment+"cblas_"+flavour.name.lower()+routine.name+postfix+"(" + result += (",\n"+indent).join([a for a in arguments]) + result += extra_argument+");" + result += "\n}\n" + return result + # ================================================================================================== # Checks for the number of command-line arguments @@ -251,9 +287,10 @@ files = [ path_clblast+"/include/clblast_c.h", path_clblast+"/src/clblast_c.cc", path_clblast+"/test/wrapper_clblas.h", + path_clblast+"/test/wrapper_cblas.h", ] -header_lines = [84, 65, 93, 22, 22] -footer_lines = [6, 3, 9, 2, 6] +header_lines = [84, 65, 93, 22, 22, 38] +footer_lines = [6, 3, 9, 2, 6, 6] # Checks whether the command-line arguments are valid; exists otherwise for f in files: @@ -287,6 +324,8 @@ for i in xrange(0,len(files)): body += clblast_c_cc(routines[level-1]) if i == 4: body += wrapper_clblas(routines[level-1]) + if i == 5: + body += wrapper_cblas(routines[level-1]) f.write("".join(file_header)) f.write(body) f.write("".join(file_footer)) diff --git a/scripts/generator/routine.py b/scripts/generator/routine.py index 02040583..fffa19f6 100644 --- a/scripts/generator/routine.py +++ b/scripts/generator/routine.py @@ -28,7 +28,7 @@ def OptionToCLBlast(x): }[x] # As above, but for clBLAS data-types -def OptionToWrapper(x): +def OptionToWrapperCL(x): return { 'layout': "clblasOrder", 'a_transpose': "clblasTranspose", @@ -39,6 +39,18 @@ def OptionToWrapper(x): 'diagonal': "clblasDiag", }[x] +# As above, but for CBLAS data-types +def OptionToWrapperC(x): + return { + 
'layout': "CBLAS_ORDER", + 'a_transpose': "CBLAS_TRANSPOSE", + 'b_transpose': "CBLAS_TRANSPOSE", + 'ab_transpose': "CBLAS_TRANSPOSE", + 'side': "CBLAS_SIDE", + 'triangle': "CBLAS_UPLO", + 'diagonal': "CBLAS_DIAG", + }[x] + # ================================================================================================== # Class holding routine-specific information (e.g. name, which arguments, which precisions) @@ -119,6 +131,16 @@ class Routine(): return [", ".join(a+b+c)] return [] + # As above but as vectors + def BufferDefVector(self, name, flavour): + prefix = "const " if (name in self.inputs) else "" + if (name in self.inputs) or (name in self.outputs): + a = [prefix+"std::vector<"+flavour.buffertype+">& "+name+"_buffer"] + b = ["const size_t "+name+"_offset"] + c = ["const size_t "+name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else [] + return [", ".join(a+b+c)] + return [] + # As above but with Claduc buffers def BufferCladuc(self, name): if (name in self.inputs) or (name in self.outputs): @@ -129,7 +151,7 @@ class Routine(): return [] # As above but with a static cast for clBLAS wrapper - def BufferWrapper(self, name): + def BufferWrapperCL(self, name): if (name in self.inputs) or (name in self.outputs): a = [name+"_buffer"] b = [name+"_offset"] @@ -141,6 +163,24 @@ class Routine(): return [", ".join(a+b+c)] return [] + # As above but with a static cast for CBLAS wrapper + def BufferWrapperC(self, name, flavour): + prefix = "const " if (name in self.inputs) else "" + if (name in self.inputs) or (name in self.outputs): + if name == "sy1": + a = [name+"_buffer["+name+"_offset]"] + elif flavour.precision_name in ["C","Z"]: + a = ["reinterpret_cast<"+prefix+flavour.buffertype[:-1]+"*>(&"+name+"_buffer["+name+"_offset])"] + else: + a = ["&"+name+"_buffer["+name+"_offset]"] + c = [] + if (name in ["x","y"]): + c = ["static_cast("+name+"_"+self.Postfix(name)+")"] + elif (name in ["a","b","c"]): + c = [name+"_"+self.Postfix(name)] + 
return [", ".join(a+c)] + return [] + # As above, but only data-types def BufferType(self, name): prefix = "const " if (name in self.inputs) else "" @@ -179,6 +219,14 @@ class Routine(): return [name] return [] + # Retrieves the use of a scalar for CBLAS (alpha/beta) + def ScalarUseWrapperC(self, name, flavour): + if name in self.scalars: + if flavour.IsComplex(name): + return [name+"_array.data()"] + return [name] + return [] + # Retrieves the definition of a scalar (alpha/beta) def ScalarDef(self, name, flavour): if name in self.scalars: @@ -246,9 +294,16 @@ class Routine(): return [] # As above, but now using clBLAS data-types - def OptionsDefWrapper(self): + def OptionsDefWrapperCL(self): if self.options: - definitions = ["const "+OptionToWrapper(o)+" "+o for o in self.options] + definitions = ["const "+OptionToWrapperCL(o)+" "+o for o in self.options] + return [", ".join(definitions)] + return [] + + # As above, but now using CBLAS data-types + def OptionsDefWrapperC(self): + if self.options: + definitions = ["const "+OptionToWrapperC(o)+" "+o for o in self.options] return [", ".join(definitions)] return [] @@ -284,16 +339,26 @@ class Routine(): list(chain(*[self.ScalarUse(s, flavour) for s in self.OtherScalars()]))) # As above, but for the clBLAS wrapper - def ArgumentsWrapper(self, flavour): + def ArgumentsWrapperCL(self, flavour): return (self.Options() + self.Sizes() + - list(chain(*[self.BufferWrapper(b) for b in self.ScalarBuffersFirst()])) + + list(chain(*[self.BufferWrapperCL(b) for b in self.ScalarBuffersFirst()])) + self.ScalarUseWrapper("alpha", flavour) + - list(chain(*[self.BufferWrapper(b) for b in self.BuffersFirst()])) + + list(chain(*[self.BufferWrapperCL(b) for b in self.BuffersFirst()])) + self.ScalarUseWrapper("beta", flavour) + - list(chain(*[self.BufferWrapper(b) for b in self.BuffersSecond()])) + - list(chain(*[self.BufferWrapper(b) for b in self.ScalarBuffersSecond()])) + + list(chain(*[self.BufferWrapperCL(b) for b in 
self.BuffersSecond()])) + + list(chain(*[self.BufferWrapperCL(b) for b in self.ScalarBuffersSecond()])) + list(chain(*[self.ScalarUseWrapper(s, flavour) for s in self.OtherScalars()]))) + # As above, but for the CBLAS wrapper + def ArgumentsWrapperC(self, flavour): + return (self.Options() + self.Sizes() + + self.ScalarUseWrapperC("alpha", flavour) + + list(chain(*[self.BufferWrapperC(b, flavour) for b in self.BuffersFirst()])) + + self.ScalarUseWrapperC("beta", flavour) + + list(chain(*[self.BufferWrapperC(b, flavour) for b in self.BuffersSecond()])) + + list(chain(*[self.BufferWrapperC(b, flavour) for b in self.ScalarBuffersSecond()])) + + list(chain(*[self.ScalarUseWrapperC(s, flavour) for s in self.OtherScalars()]))) + # Retrieves a combination of all the argument definitions def ArgumentsDef(self, flavour): return (self.OptionsDef() + self.SizesDef() + @@ -306,8 +371,8 @@ class Routine(): list(chain(*[self.ScalarDef(s, flavour) for s in self.OtherScalars()]))) # As above, but clBLAS wrapper plain datatypes - def ArgumentsDefWrapper(self, flavour): - return (self.OptionsDefWrapper() + self.SizesDef() + + def ArgumentsDefWrapperCL(self, flavour): + return (self.OptionsDefWrapperCL() + self.SizesDef() + list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersFirst()])) + self.ScalarDefPlain("alpha", flavour) + list(chain(*[self.BufferDef(b) for b in self.BuffersFirst()])) + @@ -315,6 +380,17 @@ class Routine(): list(chain(*[self.BufferDef(b) for b in self.BuffersSecond()])) + list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersSecond()])) + list(chain(*[self.ScalarDefPlain(s, flavour) for s in self.OtherScalars()]))) + + # As above, but CBLAS wrapper plain datatypes + def ArgumentsDefWrapperC(self, flavour): + return (self.OptionsDefWrapperC() + self.SizesDef() + + list(chain(*[self.BufferDefVector(b, flavour) for b in self.ScalarBuffersFirst()])) + + self.ScalarDefPlain("alpha", flavour) + + list(chain(*[self.BufferDefVector(b, flavour) for b in 
self.BuffersFirst()])) + + self.ScalarDefPlain("beta", flavour) + + list(chain(*[self.BufferDefVector(b, flavour) for b in self.BuffersSecond()])) + + list(chain(*[self.BufferDefVector(b, flavour) for b in self.ScalarBuffersSecond()])) + + list(chain(*[self.ScalarDefPlain(s, flavour) for s in self.OtherScalars()]))) # Retrieves a combination of all the argument types def ArgumentsType(self, flavour): @@ -356,7 +432,7 @@ class Routine(): return result # As above, but now for the clBLAS wrapper - def RoutineHeaderWrapper(self, flavour, def_only, spaces): + def RoutineHeaderWrapperCL(self, flavour, def_only, spaces): template = "<"+flavour.template+">" if self.NoScalars() and not def_only else "" indent = " "*(spaces + self.Length() + len(template)) result = "" @@ -366,9 +442,16 @@ class Routine(): result += flavour.name result += ">\n" result += "clblasStatus clblasX"+self.name+template+"(" - result += (",\n"+indent).join([a for a in self.ArgumentsDefWrapper(flavour)]) + result += (",\n"+indent).join([a for a in self.ArgumentsDefWrapperCL(flavour)]) result += ",\n"+indent+"cl_uint num_queues, cl_command_queue *queues" result += ",\n"+indent+"cl_uint num_wait_events, const cl_event *wait_events, cl_event *events)" return result + # As above, but now for the CBLAS wrapper + def RoutineHeaderWrapperC(self, flavour, def_only, spaces): + indent = " "*(spaces + self.Length()) + result = "void cblasX"+self.name+"(" + result += (",\n"+indent).join([a for a in self.ArgumentsDefWrapperC(flavour)])+")" + return result + # ================================================================================================== diff --git a/src/clblast.cc b/src/clblast.cc index fc50ffae..75893ee9 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -93,7 +93,7 @@ template StatusCode Rotmg(cl_mem, const size_t, cl_mem, const size_t, cl_mem, const size_t, - cl_mem, const size_t, + const cl_mem, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*) { return 
StatusCode::kNotImplemented; @@ -101,13 +101,13 @@ StatusCode Rotmg(cl_mem, const size_t, template StatusCode PUBLIC_API Rotmg(cl_mem, const size_t, cl_mem, const size_t, cl_mem, const size_t, - cl_mem, const size_t, + const cl_mem, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Rotmg(cl_mem, const size_t, cl_mem, const size_t, cl_mem, const size_t, - cl_mem, const size_t, + const cl_mem, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*); diff --git a/src/clblast_c.cc b/src/clblast_c.cc index 6d10c686..23e97bd5 100644 --- a/src/clblast_c.cc +++ b/src/clblast_c.cc @@ -55,7 +55,7 @@ StatusCode CLBlastDrotg(cl_mem sa_buffer, const size_t sa_offset, StatusCode CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset, cl_mem sd2_buffer, const size_t sd2_offset, cl_mem sx1_buffer, const size_t sx1_offset, - cl_mem sy1_buffer, const size_t sy1_offset, + const cl_mem sy1_buffer, const size_t sy1_offset, cl_mem sparam_buffer, const size_t sparam_offset, cl_command_queue* queue, cl_event* event) { auto status = clblast::Rotmg(sd1_buffer, sd1_offset, @@ -69,7 +69,7 @@ StatusCode CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset, StatusCode CLBlastDrotmg(cl_mem sd1_buffer, const size_t sd1_offset, cl_mem sd2_buffer, const size_t sd2_offset, cl_mem sx1_buffer, const size_t sx1_offset, - cl_mem sy1_buffer, const size_t sy1_offset, + const cl_mem sy1_buffer, const size_t sy1_offset, cl_mem sparam_buffer, const size_t sparam_offset, cl_command_queue* queue, cl_event* event) { auto status = clblast::Rotmg(sd1_buffer, sd1_offset, diff --git a/test/correctness/testblas.cc b/test/correctness/testblas.cc index 1329b2c5..cc9a5adb 100644 --- a/test/correctness/testblas.cc +++ b/test/correctness/testblas.cc @@ -79,24 +79,6 @@ void TestBlas::TestRegular(std::vector> &test_vector, const st // Iterates over all the to-be-tested combinations of arguments for (auto &args: test_vector) { - // Runs the reference clBLAS 
code - auto x_vec1 = Buffer(context_, args.x_size); - auto y_vec1 = Buffer(context_, args.y_size); - auto a_mat1 = Buffer(context_, args.a_size); - auto b_mat1 = Buffer(context_, args.b_size); - auto c_mat1 = Buffer(context_, args.c_size); - auto ap_mat1 = Buffer(context_, args.ap_size); - auto scalar1 = Buffer(context_, args.scalar_size); - x_vec1.Write(queue_, args.x_size, x_source_); - y_vec1.Write(queue_, args.y_size, y_source_); - a_mat1.Write(queue_, args.a_size, a_source_); - b_mat1.Write(queue_, args.b_size, b_source_); - c_mat1.Write(queue_, args.c_size, c_source_); - ap_mat1.Write(queue_, args.ap_size, ap_source_); - scalar1.Write(queue_, args.scalar_size, scalar_source_); - auto buffers1 = Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1}; - auto status1 = run_reference_(args, buffers1, queue_); - // Runs the CLBlast code auto x_vec2 = Buffer(context_, args.x_size); auto y_vec2 = Buffer(context_, args.y_size); @@ -115,6 +97,33 @@ void TestBlas::TestRegular(std::vector> &test_vector, const st auto buffers2 = Buffers{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2}; auto status2 = run_routine_(args, buffers2, queue_); + #ifndef CLBLAST_REF_CLBLAS + // Don't continue with CBLAS if there are incorrect parameters + if (status2 != StatusCode::kSuccess) { + // TODO: Mark this as a skipped test instead of a successful test + TestErrorCodes(status2, status2, args); + continue; + } + #endif + + // Runs the reference BLAS code + auto x_vec1 = Buffer(context_, args.x_size); + auto y_vec1 = Buffer(context_, args.y_size); + auto a_mat1 = Buffer(context_, args.a_size); + auto b_mat1 = Buffer(context_, args.b_size); + auto c_mat1 = Buffer(context_, args.c_size); + auto ap_mat1 = Buffer(context_, args.ap_size); + auto scalar1 = Buffer(context_, args.scalar_size); + x_vec1.Write(queue_, args.x_size, x_source_); + y_vec1.Write(queue_, args.y_size, y_source_); + a_mat1.Write(queue_, args.a_size, a_source_); + b_mat1.Write(queue_, args.b_size, 
b_source_); + c_mat1.Write(queue_, args.c_size, c_source_); + ap_mat1.Write(queue_, args.ap_size, ap_source_); + scalar1.Write(queue_, args.scalar_size, scalar_source_); + auto buffers1 = Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1}; + auto status1 = run_reference_(args, buffers1, queue_); + // Tests for equality of the two status codes if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) { TestErrorCodes(status1, status2, args); diff --git a/test/correctness/testblas.h b/test/correctness/testblas.h index 7c9032bd..8181aaf6 100644 --- a/test/correctness/testblas.h +++ b/test/correctness/testblas.h @@ -68,7 +68,7 @@ class TestBlas: public Tester { static const std::vector kTransposes; // Data-type dependent, see .cc-file // Shorthand for the routine-specific functions passed to the tester - using Routine = std::function&, const Buffers&, Queue&)>; + using Routine = std::function&, Buffers&, Queue&)>; using ResultGet = std::function(const Arguments&, Buffers&, Queue&)>; using ResultIndex = std::function&, const size_t, const size_t)>; using ResultIterator = std::function&)>; @@ -76,8 +76,9 @@ class TestBlas: public Tester { // Constructor, initializes the base class tester and input data TestBlas(int argc, char *argv[], const bool silent, const std::string &name, const std::vector &options, - const Routine run_routine, const Routine run_reference, const ResultGet get_result, - const ResultIndex get_index, const ResultIterator get_id1, const ResultIterator get_id2); + const Routine run_routine, const Routine run_reference, + const ResultGet get_result, const ResultIndex get_index, + const ResultIterator get_id1, const ResultIterator get_id2); // The test functions, taking no inputs void TestRegular(std::vector> &test_vector, const std::string &name); @@ -110,9 +111,17 @@ class TestBlas: public Tester { template void RunTests(int argc, char *argv[], const bool silent, const std::string &name) { + // Sets the reference to test 
against + #ifdef CLBLAST_REF_CLBLAS + const auto reference_routine = C::RunReference1; // clBLAS when available + #else + const auto reference_routine = C::RunReference2; // otherwise CBLAS + #endif + // Creates a tester auto options = C::GetOptions(); - TestBlas tester{argc, argv, silent, name, options, C::RunRoutine, C::RunReference, + TestBlas tester{argc, argv, silent, name, options, + C::RunRoutine, reference_routine, C::DownloadResult, C::GetResultIndex, C::ResultID1, C::ResultID2}; // This variable holds the arguments relevant for this routine @@ -250,23 +259,25 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name } // Creates the arguments vector for the invalid-buffer tests - auto invalid_test_vector = std::vector>{}; - auto i_args = args; - i_args.m = i_args.n = i_args.k = i_args.kl = i_args.ku = tester.kBufferSize; - i_args.a_ld = i_args.b_ld = i_args.c_ld = tester.kBufferSize; - for (auto &x_size: x_sizes) { i_args.x_size = x_size; - for (auto &y_size: y_sizes) { i_args.y_size = y_size; - for (auto &a_size: a_sizes) { i_args.a_size = a_size; - for (auto &b_size: b_sizes) { i_args.b_size = b_size; - for (auto &c_size: c_sizes) { i_args.c_size = c_size; - for (auto &ap_size: ap_sizes) { i_args.ap_size = ap_size; - invalid_test_vector.push_back(i_args); + #ifdef CLBLAST_REF_CLBLAS + auto invalid_test_vector = std::vector>{}; + auto i_args = args; + i_args.m = i_args.n = i_args.k = i_args.kl = i_args.ku = tester.kBufferSize; + i_args.a_ld = i_args.b_ld = i_args.c_ld = tester.kBufferSize; + for (auto &x_size: x_sizes) { i_args.x_size = x_size; + for (auto &y_size: y_sizes) { i_args.y_size = y_size; + for (auto &a_size: a_sizes) { i_args.a_size = a_size; + for (auto &b_size: b_sizes) { i_args.b_size = b_size; + for (auto &c_size: c_sizes) { i_args.c_size = c_size; + for (auto &ap_size: ap_sizes) { i_args.ap_size = ap_size; + invalid_test_vector.push_back(i_args); + } } } } } } - } + #endif // Sets the name of this test-case auto 
names = std::vector{}; @@ -287,7 +298,9 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name // Runs the tests tester.TestRegular(regular_test_vector, case_name); - tester.TestInvalid(invalid_test_vector, case_name); + #ifdef CLBLAST_REF_CLBLAS + tester.TestInvalid(invalid_test_vector, case_name); + #endif } } } diff --git a/test/correctness/tester.cc b/test/correctness/tester.cc index 8169f700..872a131a 100644 --- a/test/correctness/tester.cc +++ b/test/correctness/tester.cc @@ -69,10 +69,12 @@ Tester::Tester(int argc, char *argv[], const bool silent, kUnsupportedPrecision.c_str()); // Initializes clBLAS - auto status = clblasSetup(); - if (status != CL_SUCCESS) { - throw std::runtime_error("clBLAS setup error: "+ToString(static_cast(status))); - } + #ifdef CLBLAST_REF_CLBLAS + auto status = clblasSetup(); + if (status != CL_SUCCESS) { + throw std::runtime_error("clBLAS setup error: "+ToString(static_cast(status))); + } + #endif } // Destructor prints the summary of the test cases and cleans-up the clBLAS library @@ -87,7 +89,11 @@ Tester::~Tester() { fprintf(stdout, " %zu test(s) failed%s\n", tests_failed_, kPrintEnd.c_str()); } fprintf(stdout, "\n"); - clblasTeardown(); + + // Cleans-up clBLAS + #ifdef CLBLAST_REF_CLBLAS + clblasTeardown(); + #endif } // ================================================================================================= diff --git a/test/correctness/tester.h b/test/correctness/tester.h index db714f3d..d489f829 100644 --- a/test/correctness/tester.h +++ b/test/correctness/tester.h @@ -23,7 +23,9 @@ #include // The libraries -#include +#ifdef CLBLAST_REF_CLBLAS + #include +#endif #include "clblast.h" #include "internal/utilities.h" @@ -92,7 +94,7 @@ class Tester { Queue queue_; // Whether or not to run the full test-suite or just a smoke test - bool full_test_; + const bool full_test_; // Retrieves the offset values to test with const std::vector GetOffsets() const; diff --git 
a/test/performance/client.cc b/test/performance/client.cc index 17f54231..56ab8c8d 100644 --- a/test/performance/client.cc +++ b/test/performance/client.cc @@ -24,11 +24,13 @@ namespace clblast { // Constructor template -Client::Client(const Routine run_routine, const Routine run_reference, +Client::Client(const Routine run_routine, + const Routine run_reference1, const Routine run_reference2, const std::vector &options, const GetMetric get_flops, const GetMetric get_bytes): run_routine_(run_routine), - run_reference_(run_reference), + run_reference1_(run_reference1), + run_reference2_(run_reference2), options_(options), get_flops_(get_flops), get_bytes_(get_bytes) { @@ -90,7 +92,16 @@ Arguments Client::ParseArguments(int argc, char *argv[], const GetMetric args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0}); args.device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0}); args.precision = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle); - args.compare_clblas = GetArgument(argc, argv, help, kArgCompareclblas, 1); + #ifdef CLBLAST_REF_CLBLAS + args.compare_clblas = GetArgument(argc, argv, help, kArgCompareclblas, 1); + #else + args.compare_clblas = 0; + #endif + #ifdef CLBLAST_REF_CBLAS + args.compare_cblas = GetArgument(argc, argv, help, kArgComparecblas, 1); + #else + args.compare_cblas = 0; + #endif args.step = GetArgument(argc, argv, help, kArgStepSize, size_t{1}); args.num_steps = GetArgument(argc, argv, help, kArgNumSteps, size_t{0}); args.num_runs = GetArgument(argc, argv, help, kArgNumRuns, size_t{10}); @@ -120,7 +131,9 @@ void Client::PerformanceTest(Arguments &args, const SetMetric set_sizes) auto device = Device(platform, args.device_id); auto context = Context(device); auto queue = Queue(context, device); - if (args.compare_clblas) { clblasSetup(); } + #ifdef CLBLAST_REF_CLBLAS + if (args.compare_clblas) { clblasSetup(); } + #endif // Iterates over all "num_step" values jumping by "step" each time auto s = 
size_t{0}; @@ -167,9 +180,13 @@ void Client::PerformanceTest(Arguments &args, const SetMetric set_sizes) auto ms_clblast = TimedExecution(args.num_runs, args, buffers, queue, run_routine_, "CLBlast"); timings.push_back(std::pair("CLBlast", ms_clblast)); if (args.compare_clblas) { - auto ms_clblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference_, "clBLAS"); + auto ms_clblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference1_, "clBLAS"); timings.push_back(std::pair("clBLAS", ms_clblas)); } + if (args.compare_cblas) { + auto ms_cblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference2_, "CPU BLAS"); + timings.push_back(std::pair("CPU BLAS", ms_cblas)); + } // Prints the performance of the tested libraries PrintTableRow(args, timings); @@ -186,7 +203,9 @@ void Client::PerformanceTest(Arguments &args, const SetMetric set_sizes) } // Cleans-up and returns - if (args.compare_clblas) { clblasTeardown(); } + #ifdef CLBLAST_REF_CLBLAS + if (args.compare_clblas) { clblasTeardown(); } + #endif } // ================================================================================================= @@ -196,14 +215,17 @@ void Client::PerformanceTest(Arguments &args, const SetMetric set_sizes) // value found in the vector of timing results. The return value is in milliseconds. template double Client::TimedExecution(const size_t num_runs, const Arguments &args, - const Buffers &buffers, Queue &queue, + Buffers &buffers, Queue &queue, Routine run_blas, const std::string &library_name) { auto timings = std::vector(num_runs); for (auto &timing: timings) { auto start_time = std::chrono::steady_clock::now(); // Executes the main computation - auto status = run_blas(args, buffers, queue); + auto status = StatusCode::kSuccess; + try { + status = run_blas(args, buffers, queue); + } catch (...) 
{ status = static_cast(kUnknownError); } if (status != StatusCode::kSuccess) { throw std::runtime_error(library_name+" error: "+ToString(static_cast(status))); } @@ -226,6 +248,7 @@ void Client::PrintTableHeader(const Arguments& args) { for (auto i=size_t{0}; i"); if (args.compare_clblas) { fprintf(stdout, " | <-- clBLAS -->"); } + if (args.compare_cblas) { fprintf(stdout, " | <-- CPU BLAS -->"); } fprintf(stdout, " |\n"); } @@ -233,6 +256,7 @@ void Client::PrintTableHeader(const Arguments& args) { for (auto &option: options_) { fprintf(stdout, "%9s;", option.c_str()); } fprintf(stdout, "%9s;%9s;%9s", "ms_1", "GFLOPS_1", "GBs_1"); if (args.compare_clblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_2", "GFLOPS_2", "GBs_2"); } + if (args.compare_cblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_3", "GFLOPS_3", "GBs_3"); } fprintf(stdout, "\n"); } diff --git a/test/performance/client.h b/test/performance/client.h index 5805b8a5..8d0597d7 100644 --- a/test/performance/client.h +++ b/test/performance/client.h @@ -26,7 +26,9 @@ #include // The libraries to test -#include +#ifdef CLBLAST_REF_CLBLAS + #include +#endif #include "clblast.h" #include "internal/utilities.h" @@ -40,12 +42,12 @@ class Client { public: // Shorthand for the routine-specific functions passed to the tester - using Routine = std::function&, const Buffers&, Queue&)>; + using Routine = std::function&, Buffers&, Queue&)>; using SetMetric = std::function&)>; using GetMetric = std::function&)>; // The constructor - Client(const Routine run_routine, const Routine run_reference, + Client(const Routine run_routine, const Routine run_reference1, const Routine run_reference2, const std::vector &options, const GetMetric get_flops, const GetMetric get_bytes); @@ -61,7 +63,7 @@ class Client { private: // Runs a function a given number of times and returns the execution time of the shortest instance - double TimedExecution(const size_t num_runs, const Arguments &args, const Buffers &buffers, + double TimedExecution(const 
size_t num_runs, const Arguments &args, Buffers &buffers, Queue &queue, Routine run_blas, const std::string &library_name); // Prints the header of a performance-data table @@ -73,7 +75,8 @@ class Client { // The routine-specific functions passed to the tester const Routine run_routine_; - const Routine run_reference_; + const Routine run_reference1_; + const Routine run_reference2_; const std::vector options_; const GetMetric get_flops_; const GetMetric get_bytes_; @@ -81,13 +84,31 @@ class Client { // ================================================================================================= +// Bogus reference function, in case a comparison library is not available +template +static StatusCode ReferenceNotAvailable(const Arguments &, Buffers &, Queue &) { + return StatusCode::kNotImplemented; +} + // The interface to the performance client. This is a separate function in the header such that it // is automatically compiled for each routine, templated by the parameter "C". template void RunClient(int argc, char *argv[]) { + // Sets the reference to test against + #ifdef CLBLAST_REF_CLBLAS + const auto reference1 = C::RunReference1; // clBLAS when available + #else + const auto reference1 = ReferenceNotAvailable; + #endif + #ifdef CLBLAST_REF_CBLAS + const auto reference2 = C::RunReference2; // CBLAS when available + #else + const auto reference2 = ReferenceNotAvailable; + #endif + // Creates a new client - auto client = Client(C::RunRoutine, C::RunReference, C::GetOptions(), + auto client = Client(C::RunRoutine, reference1, reference2, C::GetOptions(), C::GetFlops, C::GetBytes); // Simple command line argument parser with defaults diff --git a/test/routines/level1/xaxpy.h b/test/routines/level1/xaxpy.h index 50480f46..8f72f570 100644 --- a/test/routines/level1/xaxpy.h +++ b/test/routines/level1/xaxpy.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS 
+ #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -65,7 +70,7 @@ class TestXaxpy { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Axpy(args.n, args.alpha, @@ -77,16 +82,33 @@ class TestXaxpy { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXaxpy(args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXaxpy(args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXaxpy(args.n, 
args.alpha, + x_vec_cpu, args.x_offset, args.x_inc, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level1/xcopy.h b/test/routines/level1/xcopy.h index 8d324d88..0527ca6a 100644 --- a/test/routines/level1/xcopy.h +++ b/test/routines/level1/xcopy.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -64,7 +69,7 @@ class TestXcopy { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Copy(args.n, @@ -76,16 +81,33 @@ class TestXcopy { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXcopy(args.n, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = 
clblasXcopy(args.n, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXcopy(args.n, + x_vec_cpu, args.x_offset, args.x_inc, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level1/xdot.h b/test/routines/level1/xdot.h index 04669f52..d1c34c0f 100644 --- a/test/routines/level1/xdot.h +++ b/test/routines/level1/xdot.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -68,7 +73,7 @@ class TestXdot { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Dot(args.n, @@ -81,17 +86,37 @@ class TestXdot { } // 
Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXdot(args.n, - buffers.scalar(), args.dot_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXdot(args.n, + buffers.scalar(), args.dot_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector scalar_cpu(args.scalar_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXdot(args.n, + scalar_cpu, args.dot_offset, + x_vec_cpu, args.x_offset, args.x_inc, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level1/xdotc.h b/test/routines/level1/xdotc.h index 
e5b42ef4..a2742cb0 100644 --- a/test/routines/level1/xdotc.h +++ b/test/routines/level1/xdotc.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -68,7 +73,7 @@ class TestXdotc { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Dotc(args.n, @@ -81,17 +86,37 @@ class TestXdotc { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXdotc(args.n, - buffers.scalar(), args.dot_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXdotc(args.n, + buffers.scalar(), args.dot_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode 
RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector scalar_cpu(args.scalar_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXdotc(args.n, + scalar_cpu, args.dot_offset, + x_vec_cpu, args.x_offset, args.x_inc, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level1/xdotu.h b/test/routines/level1/xdotu.h index 6430148c..06ce979e 100644 --- a/test/routines/level1/xdotu.h +++ b/test/routines/level1/xdotu.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -68,7 +73,7 @@ class TestXdotu { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Dotu(args.n, @@ -81,17 +86,37 @@ class TestXdotu { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = 
queue(); - auto event = cl_event{}; - auto status = clblasXdotu(args.n, - buffers.scalar(), args.dot_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXdotu(args.n, + buffers.scalar(), args.dot_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector scalar_cpu(args.scalar_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXdotu(args.n, + scalar_cpu, args.dot_offset, + x_vec_cpu, args.x_offset, args.x_inc, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level1/xnrm2.h b/test/routines/level1/xnrm2.h index e3f77ee4..d8a0de4e 100644 --- a/test/routines/level1/xnrm2.h +++ b/test/routines/level1/xnrm2.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + 
#include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -64,7 +69,7 @@ class TestXnrm2 { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Nrm2(args.n, @@ -76,16 +81,33 @@ class TestXnrm2 { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXnrm2(args.n, - buffers.scalar(), args.nrm2_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXnrm2(args.n, + buffers.scalar(), args.nrm2_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector scalar_cpu(args.scalar_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + buffers.scalar.Read(queue, args.scalar_size, scalar_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); 
+ cblasXnrm2(args.n, + scalar_cpu, args.nrm2_offset, + x_vec_cpu, args.x_offset, args.x_inc); + buffers.scalar.Write(queue, args.scalar_size, scalar_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level1/xscal.h b/test/routines/level1/xscal.h index d990afcc..35855dbd 100644 --- a/test/routines/level1/xscal.h +++ b/test/routines/level1/xscal.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -61,7 +66,7 @@ class TestXscal { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Scal(args.n, args.alpha, @@ -72,15 +77,29 @@ class TestXscal { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXscal(args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = 
clblasXscal(args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector x_vec_cpu(args.x_size, static_cast(0)); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + cblasXscal(args.n, args.alpha, + x_vec_cpu, args.x_offset, args.x_inc); + buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level1/xswap.h b/test/routines/level1/xswap.h index 2096a2c3..ae69d3be 100644 --- a/test/routines/level1/xswap.h +++ b/test/routines/level1/xswap.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -64,7 +69,7 @@ class TestXswap { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Swap(args.n, @@ -76,16 +81,34 @@ class TestXswap { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue 
&queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXswap(args.n, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXswap(args.n, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXswap(args.n, + x_vec_cpu, args.x_offset, args.x_inc, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xgbmv.h b/test/routines/level2/xgbmv.h index 0e238804..b875075d 100644 --- a/test/routines/level2/xgbmv.h +++ b/test/routines/level2/xgbmv.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // 
================================================================================================= @@ -76,7 +81,7 @@ class TestXgbmv { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Gbmv(args.layout, args.a_transpose, @@ -90,19 +95,41 @@ class TestXgbmv { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXgbmv(static_cast(args.layout), - static_cast(args.a_transpose), - args.m, args.n, args.kl, args.ku, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXgbmv(static_cast(args.layout), + static_cast(args.a_transpose), + args.m, args.n, args.kl, args.ku, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { 
+ std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXgbmv(convertToCBLAS(args.layout), + convertToCBLAS(args.a_transpose), + args.m, args.n, args.kl, args.ku, args.alpha, + a_mat_cpu, args.a_offset, args.a_ld, + x_vec_cpu, args.x_offset, args.x_inc, args.beta, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xgemv.h b/test/routines/level2/xgemv.h index 2924d498..a70ccd34 100644 --- a/test/routines/level2/xgemv.h +++ b/test/routines/level2/xgemv.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -76,7 +81,7 @@ class TestXgemv { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Gemv(args.layout, args.a_transpose, @@ -90,19 +95,41 @@ class TestXgemv { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const 
Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXgemv(static_cast(args.layout), - static_cast(args.a_transpose), - args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXgemv(static_cast(args.layout), + static_cast(args.a_transpose), + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXgemv(convertToCBLAS(args.layout), + convertToCBLAS(args.a_transpose), + args.m, args.n, args.alpha, + a_mat_cpu, args.a_offset, args.a_ld, + x_vec_cpu, args.x_offset, args.x_inc, args.beta, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector 
DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xger.h b/test/routines/level2/xger.h index 98296e92..32c2a505 100644 --- a/test/routines/level2/xger.h +++ b/test/routines/level2/xger.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -72,7 +77,7 @@ class TestXger { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Ger(args.layout, @@ -86,18 +91,39 @@ class TestXger { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXger(static_cast(args.layout), - args.m, args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXger(static_cast(args.layout), + args.m, args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + 1, 
&queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXger(convertToCBLAS(args.layout), + args.m, args.n, args.alpha, + x_vec_cpu, args.x_offset, args.x_inc, + y_vec_cpu, args.y_offset, args.y_inc, + a_mat_cpu, args.a_offset, args.a_ld); + buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xgerc.h b/test/routines/level2/xgerc.h index 77258d92..4b6954f6 100644 --- a/test/routines/level2/xgerc.h +++ b/test/routines/level2/xgerc.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -72,7 +77,7 @@ class TestXgerc { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = 
cl_event{}; auto status = Gerc(args.layout, @@ -86,18 +91,39 @@ class TestXgerc { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXgerc(static_cast(args.layout), - args.m, args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXgerc(static_cast(args.layout), + args.m, args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXgerc(convertToCBLAS(args.layout), + args.m, args.n, args.alpha, + x_vec_cpu, args.x_offset, args.x_inc, + y_vec_cpu, args.y_offset, args.y_inc, + a_mat_cpu, args.a_offset, args.a_ld); + buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to 
download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xgeru.h b/test/routines/level2/xgeru.h index e5f5f235..295e69e5 100644 --- a/test/routines/level2/xgeru.h +++ b/test/routines/level2/xgeru.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -72,7 +77,7 @@ class TestXgeru { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Geru(args.layout, @@ -86,18 +91,39 @@ class TestXgeru { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXgeru(static_cast(args.layout), - args.m, args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXgeru(static_cast(args.layout), + args.m, args.n, args.alpha, + buffers.x_vec(), args.x_offset, 
args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXgeru(convertToCBLAS(args.layout), + args.m, args.n, args.alpha, + x_vec_cpu, args.x_offset, args.x_inc, + y_vec_cpu, args.y_offset, args.y_inc, + a_mat_cpu, args.a_offset, args.a_ld); + buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xhbmv.h b/test/routines/level2/xhbmv.h index 34e1502f..e0bdc4da 100644 --- a/test/routines/level2/xhbmv.h +++ b/test/routines/level2/xhbmv.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -70,7 +75,7 @@ class TestXhbmv { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode 
RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Hbmv(args.layout, args.triangle, @@ -84,19 +89,41 @@ class TestXhbmv { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXhbmv(static_cast(args.layout), - static_cast(args.triangle), - args.n, args.kl, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXhbmv(static_cast(args.layout), + static_cast(args.triangle), + args.n, args.kl, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXhbmv(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.kl, args.alpha, 
+ a_mat_cpu, args.a_offset, args.a_ld, + x_vec_cpu, args.x_offset, args.x_inc, args.beta, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xhemv.h b/test/routines/level2/xhemv.h index 80e22157..fa242961 100644 --- a/test/routines/level2/xhemv.h +++ b/test/routines/level2/xhemv.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -70,7 +75,7 @@ class TestXhemv { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Hemv(args.layout, args.triangle, @@ -84,19 +89,41 @@ class TestXhemv { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXhemv(static_cast(args.layout), - static_cast(args.triangle), - args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + 
#ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXhemv(static_cast(args.layout), + static_cast(args.triangle), + args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXhemv(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.alpha, + a_mat_cpu, args.a_offset, args.a_ld, + x_vec_cpu, args.x_offset, args.x_inc, args.beta, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xher.h b/test/routines/level2/xher.h index 53c4200f..7d0e8cc3 100644 --- a/test/routines/level2/xher.h +++ b/test/routines/level2/xher.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // 
================================================================================================= @@ -66,7 +71,7 @@ class TestXher { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Her(args.layout, args.triangle, @@ -79,18 +84,37 @@ class TestXher { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXher(static_cast(args.layout), - static_cast(args.triangle), - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.a_mat(), args.a_offset, args.a_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXher(static_cast(args.layout), + static_cast(args.triangle), + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + 
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + cblasXher(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.alpha, + x_vec_cpu, args.x_offset, args.x_inc, + a_mat_cpu, args.a_offset, args.a_ld); + buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xher2.h b/test/routines/level2/xher2.h index c12ff827..445bba74 100644 --- a/test/routines/level2/xher2.h +++ b/test/routines/level2/xher2.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -70,7 +75,7 @@ class TestXher2 { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Her2(args.layout, args.triangle, @@ -84,19 +89,41 @@ class TestXher2 { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXher2(static_cast(args.layout), - static_cast(args.triangle), - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, - 1, &queue_plain, 0, 
nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXher2(static_cast(args.layout), + static_cast(args.triangle), + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXher2(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.alpha, + x_vec_cpu, args.x_offset, args.x_inc, + y_vec_cpu, args.y_offset, args.y_inc, + a_mat_cpu, args.a_offset, args.a_ld); + buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xhpmv.h b/test/routines/level2/xhpmv.h index 8fd85b62..406e564f 100644 --- a/test/routines/level2/xhpmv.h +++ b/test/routines/level2/xhpmv.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include 
"wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -70,7 +75,7 @@ class TestXhpmv { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Hpmv(args.layout, args.triangle, @@ -84,19 +89,41 @@ class TestXhpmv { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXhpmv(static_cast(args.layout), - static_cast(args.triangle), - args.n, args.alpha, - buffers.ap_mat(), args.ap_offset, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXhpmv(static_cast(args.layout), + static_cast(args.triangle), + args.n, args.alpha, + buffers.ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector 
ap_mat_cpu(args.ap_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXhpmv(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.alpha, + ap_mat_cpu, args.ap_offset, + x_vec_cpu, args.x_offset, args.x_inc, args.beta, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xhpr.h b/test/routines/level2/xhpr.h index 03599ddc..6f56d3f3 100644 --- a/test/routines/level2/xhpr.h +++ b/test/routines/level2/xhpr.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -66,7 +71,7 @@ class TestXhpr { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Hpr(args.layout, args.triangle, @@ -79,18 +84,37 @@ class TestXhpr { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); 
- auto event = cl_event{}; - auto status = clblasXhpr(static_cast(args.layout), - static_cast(args.triangle), - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.ap_mat(), args.ap_offset, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXhpr(static_cast(args.layout), + static_cast(args.triangle), + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.ap_mat(), args.ap_offset, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector ap_mat_cpu(args.ap_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + cblasXhpr(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.alpha, + x_vec_cpu, args.x_offset, args.x_inc, + ap_mat_cpu, args.ap_offset); + buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xhpr2.h b/test/routines/level2/xhpr2.h index 68fbc76d..43889cb9 100644 --- a/test/routines/level2/xhpr2.h +++ b/test/routines/level2/xhpr2.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef 
CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -70,7 +75,7 @@ class TestXhpr2 { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Hpr2(args.layout, args.triangle, @@ -84,19 +89,41 @@ class TestXhpr2 { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXhpr2(static_cast(args.layout), - static_cast(args.triangle), - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.ap_mat(), args.ap_offset, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXhpr2(static_cast(args.layout), + static_cast(args.triangle), + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.ap_mat(), args.ap_offset, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector 
ap_mat_cpu(args.ap_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXhpr2(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.alpha, + x_vec_cpu, args.x_offset, args.x_inc, + y_vec_cpu, args.y_offset, args.y_inc, + ap_mat_cpu, args.ap_offset); + buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xsbmv.h b/test/routines/level2/xsbmv.h index 5bc17e49..9a5c5140 100644 --- a/test/routines/level2/xsbmv.h +++ b/test/routines/level2/xsbmv.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -70,7 +75,7 @@ class TestXsbmv { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Sbmv(args.layout, args.triangle, @@ -84,19 +89,41 @@ class TestXsbmv { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); 
- auto event = cl_event{}; - auto status = clblasXsbmv(static_cast(args.layout), - static_cast(args.triangle), - args.n, args.kl, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXsbmv(static_cast(args.layout), + static_cast(args.triangle), + args.n, args.kl, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXsbmv(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.kl, args.alpha, + a_mat_cpu, args.a_offset, args.a_ld, + x_vec_cpu, args.x_offset, args.x_inc, args.beta, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff 
--git a/test/routines/level2/xspmv.h b/test/routines/level2/xspmv.h index e335da42..913af0cd 100644 --- a/test/routines/level2/xspmv.h +++ b/test/routines/level2/xspmv.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -70,7 +75,7 @@ class TestXspmv { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Spmv(args.layout, args.triangle, @@ -84,19 +89,41 @@ class TestXspmv { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXspmv(static_cast(args.layout), - static_cast(args.triangle), - args.n, args.alpha, - buffers.ap_mat(), args.ap_offset, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXspmv(static_cast(args.layout), + static_cast(args.triangle), + args.n, args.alpha, + buffers.ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, 
&queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector ap_mat_cpu(args.ap_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXspmv(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.alpha, + ap_mat_cpu, args.ap_offset, + x_vec_cpu, args.x_offset, args.x_inc, args.beta, + y_vec_cpu, args.y_offset, args.y_inc); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xspr.h b/test/routines/level2/xspr.h index 819b1ca8..bab5c541 100644 --- a/test/routines/level2/xspr.h +++ b/test/routines/level2/xspr.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -66,7 +71,7 @@ class TestXspr { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain 
= queue(); auto event = cl_event{}; auto status = Spr(args.layout, args.triangle, @@ -79,18 +84,37 @@ class TestXspr { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXspr(static_cast(args.layout), - static_cast(args.triangle), - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.ap_mat(), args.ap_offset, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXspr(static_cast(args.layout), + static_cast(args.triangle), + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.ap_mat(), args.ap_offset, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector ap_mat_cpu(args.ap_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + cblasXspr(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.alpha, + x_vec_cpu, args.x_offset, args.x_inc, + ap_mat_cpu, args.ap_offset); + buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue 
&queue) { diff --git a/test/routines/level2/xspr2.h b/test/routines/level2/xspr2.h index 43d66c9e..41a04cc0 100644 --- a/test/routines/level2/xspr2.h +++ b/test/routines/level2/xspr2.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -70,7 +75,7 @@ class TestXspr2 { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Spr2(args.layout, args.triangle, @@ -84,19 +89,41 @@ class TestXspr2 { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXspr2(static_cast(args.layout), - static_cast(args.triangle), - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.ap_mat(), args.ap_offset, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXspr2(static_cast(args.layout), + static_cast(args.triangle), + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.ap_mat(), args.ap_offset, + 1, &queue_plain, 
0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector ap_mat_cpu(args.ap_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXspr2(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.alpha, + x_vec_cpu, args.x_offset, args.x_inc, + y_vec_cpu, args.y_offset, args.y_inc, + ap_mat_cpu, args.ap_offset); + buffers.ap_mat.Write(queue, args.ap_size, ap_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xsymv.h b/test/routines/level2/xsymv.h index 13473a3e..0576bc1f 100644 --- a/test/routines/level2/xsymv.h +++ b/test/routines/level2/xsymv.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -70,7 +75,7 @@ class TestXsymv { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto 
event = cl_event{}; auto status = Symv(args.layout, args.triangle, @@ -84,19 +89,41 @@ class TestXsymv { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXsymv(static_cast(args.layout), - static_cast(args.triangle), - args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXsymv(static_cast(args.layout), + static_cast(args.triangle), + args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, args.beta, + buffers.y_vec(), args.y_offset, args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXsymv(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.alpha, + a_mat_cpu, args.a_offset, args.a_ld, + x_vec_cpu, args.x_offset, args.x_inc, args.beta, + y_vec_cpu, args.y_offset, 
args.y_inc); + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xsyr.h b/test/routines/level2/xsyr.h index 66b75c0c..062eea5a 100644 --- a/test/routines/level2/xsyr.h +++ b/test/routines/level2/xsyr.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -66,7 +71,7 @@ class TestXsyr { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Syr(args.layout, args.triangle, @@ -79,18 +84,37 @@ class TestXsyr { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXsyr(static_cast(args.layout), - static_cast(args.triangle), - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.a_mat(), args.a_offset, args.a_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status 
= clblasXsyr(static_cast(args.layout), + static_cast(args.triangle), + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + cblasXsyr(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.alpha, + x_vec_cpu, args.x_offset, args.x_inc, + a_mat_cpu, args.a_offset, args.a_ld); + buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xsyr2.h b/test/routines/level2/xsyr2.h index 32497a61..50bc3cea 100644 --- a/test/routines/level2/xsyr2.h +++ b/test/routines/level2/xsyr2.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -70,7 +75,7 @@ class TestXsyr2 { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers 
&buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Syr2(args.layout, args.triangle, @@ -84,19 +89,41 @@ class TestXsyr2 { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXsyr2(static_cast(args.layout), - static_cast(args.triangle), - args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXsyr2(static_cast(args.layout), + static_cast(args.triangle), + args.n, args.alpha, + buffers.x_vec(), args.x_offset, args.x_inc, + buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat(), args.a_offset, args.a_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + std::vector y_vec_cpu(args.y_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + cblasXsyr2(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + args.n, args.alpha, + x_vec_cpu, args.x_offset, args.x_inc, + y_vec_cpu, args.y_offset, args.y_inc, + 
a_mat_cpu, args.a_offset, args.a_ld); + buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xtbmv.h b/test/routines/level2/xtbmv.h index dbdddb65..600b4131 100644 --- a/test/routines/level2/xtbmv.h +++ b/test/routines/level2/xtbmv.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -65,7 +70,7 @@ class TestXtbmv { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Tbmv(args.layout, args.triangle, args.a_transpose, args.diagonal, @@ -78,20 +83,41 @@ class TestXtbmv { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXtbmv(static_cast(args.layout), - static_cast(args.triangle), - static_cast(args.a_transpose), - static_cast(args.diagonal), - args.n, args.kl, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode 
RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXtbmv(static_cast(args.layout), + static_cast(args.triangle), + static_cast(args.a_transpose), + static_cast(args.diagonal), + args.n, args.kl, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + cblasXtbmv(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + convertToCBLAS(args.a_transpose), + convertToCBLAS(args.diagonal), + args.n, args.kl, + a_mat_cpu, args.a_offset, args.a_ld, + x_vec_cpu, args.x_offset, args.x_inc); + buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xtpmv.h b/test/routines/level2/xtpmv.h index 4425765e..fc0cf393 100644 --- a/test/routines/level2/xtpmv.h +++ b/test/routines/level2/xtpmv.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -65,7 +70,7 @@ class TestXtpmv { static Transposes 
GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Tpmv(args.layout, args.triangle, args.a_transpose, args.diagonal, @@ -78,20 +83,41 @@ class TestXtpmv { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXtpmv(static_cast(args.layout), - static_cast(args.triangle), - static_cast(args.a_transpose), - static_cast(args.diagonal), - args.n, - buffers.ap_mat(), args.ap_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXtpmv(static_cast(args.layout), + static_cast(args.triangle), + static_cast(args.a_transpose), + static_cast(args.diagonal), + args.n, + buffers.ap_mat(), args.ap_offset, + buffers.x_vec(), args.x_offset, args.x_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector ap_mat_cpu(args.ap_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, 
x_vec_cpu); + cblasXtpmv(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + convertToCBLAS(args.a_transpose), + convertToCBLAS(args.diagonal), + args.n, + ap_mat_cpu, args.ap_offset, + x_vec_cpu, args.x_offset, args.x_inc); + buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level2/xtrmv.h b/test/routines/level2/xtrmv.h index 1c0c6fd8..fec72124 100644 --- a/test/routines/level2/xtrmv.h +++ b/test/routines/level2/xtrmv.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -65,7 +70,7 @@ class TestXtrmv { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Trmv(args.layout, args.triangle, args.a_transpose, args.diagonal, @@ -78,20 +83,41 @@ class TestXtrmv { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXtrmv(static_cast(args.layout), - static_cast(args.triangle), - static_cast(args.a_transpose), - static_cast(args.diagonal), - args.n, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), 
args.x_offset, args.x_inc, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXtrmv(static_cast(args.layout), + static_cast(args.triangle), + static_cast(args.a_transpose), + static_cast(args.diagonal), + args.n, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + cblasXtrmv(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + convertToCBLAS(args.a_transpose), + convertToCBLAS(args.diagonal), + args.n, + a_mat_cpu, args.a_offset, args.a_ld, + x_vec_cpu, args.x_offset, args.x_inc); + buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level3/xgemm.h b/test/routines/level3/xgemm.h index 695b58b7..49a92936 100644 --- a/test/routines/level3/xgemm.h +++ b/test/routines/level3/xgemm.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // 
================================================================================================= @@ -78,7 +83,7 @@ class TestXgemm { static Transposes GetBTransposes(const Transposes &all) { return all; } // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Gemm(args.layout, args.a_transpose, args.b_transpose, @@ -92,20 +97,43 @@ class TestXgemm { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXgemm(static_cast(args.layout), - static_cast(args.a_transpose), - static_cast(args.b_transpose), - args.m, args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXgemm(static_cast(args.layout), + static_cast(args.a_transpose), + static_cast(args.b_transpose), + args.m, args.n, args.k, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments 
&args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector b_mat_cpu(args.b_size, static_cast(0)); + std::vector c_mat_cpu(args.c_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); + buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + cblasXgemm(convertToCBLAS(args.layout), + convertToCBLAS(args.a_transpose), + convertToCBLAS(args.b_transpose), + args.m, args.n, args.k, args.alpha, + a_mat_cpu, args.a_offset, args.a_ld, + b_mat_cpu, args.b_offset, args.b_ld, args.beta, + c_mat_cpu, args.c_offset, args.c_ld); + buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level3/xhemm.h b/test/routines/level3/xhemm.h index 7b7134e5..40538417 100644 --- a/test/routines/level3/xhemm.h +++ b/test/routines/level3/xhemm.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -78,7 +83,7 @@ class TestXhemm { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Hemm(args.layout, args.side, args.triangle, @@ -92,20 +97,43 @@ class TestXhemm { } // Describes how to run the clBLAS routine (for correctness/performance 
comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXhemm(static_cast(args.layout), - static_cast(args.side), - static_cast(args.triangle), - args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXhemm(static_cast(args.layout), + static_cast(args.side), + static_cast(args.triangle), + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector b_mat_cpu(args.b_size, static_cast(0)); + std::vector c_mat_cpu(args.c_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); + buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + cblasXhemm(convertToCBLAS(args.layout), + convertToCBLAS(args.side), + convertToCBLAS(args.triangle), + args.m, args.n, args.alpha, + a_mat_cpu, args.a_offset, args.a_ld, + b_mat_cpu, args.b_offset, args.b_ld, args.beta, + c_mat_cpu, args.c_offset, args.c_ld); + buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + return 
StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level3/xher2k.h b/test/routines/level3/xher2k.h index a7fbfcbe..1ea2ad36 100644 --- a/test/routines/level3/xher2k.h +++ b/test/routines/level3/xher2k.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -76,7 +81,7 @@ class TestXher2k { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto alpha2 = T{args.alpha, args.alpha}; @@ -91,21 +96,45 @@ class TestXher2k { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto alpha2 = T{args.alpha, args.alpha}; - auto status = clblasXher2k(static_cast(args.layout), - static_cast(args.triangle), - static_cast(args.a_transpose), - args.n, args.k, alpha2, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + 
auto queue_plain = queue(); + auto event = cl_event{}; + auto alpha2 = T{args.alpha, args.alpha}; + auto status = clblasXher2k(static_cast(args.layout), + static_cast(args.triangle), + static_cast(args.a_transpose), + args.n, args.k, alpha2, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector b_mat_cpu(args.b_size, static_cast(0)); + std::vector c_mat_cpu(args.c_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); + buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + auto alpha2 = T{args.alpha, args.alpha}; + cblasXher2k(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + convertToCBLAS(args.a_transpose), + args.n, args.k, alpha2, + a_mat_cpu, args.a_offset, args.a_ld, + b_mat_cpu, args.b_offset, args.b_ld, args.beta, + c_mat_cpu, args.c_offset, args.c_ld); + buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level3/xherk.h b/test/routines/level3/xherk.h index f097672f..75a7c405 100644 --- a/test/routines/level3/xherk.h +++ b/test/routines/level3/xherk.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif 
namespace clblast { // ================================================================================================= @@ -69,7 +74,7 @@ class TestXherk { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Herk(args.layout, args.triangle, args.a_transpose, @@ -82,19 +87,39 @@ class TestXherk { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXherk(static_cast(args.layout), - static_cast(args.triangle), - static_cast(args.a_transpose), - args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXherk(static_cast(args.layout), + static_cast(args.triangle), + static_cast(args.a_transpose), + args.n, args.k, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector 
a_mat_cpu(args.a_size, static_cast(0)); + std::vector c_mat_cpu(args.c_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + cblasXherk(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + convertToCBLAS(args.a_transpose), + args.n, args.k, args.alpha, + a_mat_cpu, args.a_offset, args.a_ld, args.beta, + c_mat_cpu, args.c_offset, args.c_ld); + buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level3/xsymm.h b/test/routines/level3/xsymm.h index 03cf5de9..f867c238 100644 --- a/test/routines/level3/xsymm.h +++ b/test/routines/level3/xsymm.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -78,7 +83,7 @@ class TestXsymm { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Symm(args.layout, args.side, args.triangle, @@ -92,20 +97,43 @@ class TestXsymm { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXsymm(static_cast(args.layout), 
- static_cast(args.side), - static_cast(args.triangle), - args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXsymm(static_cast(args.layout), + static_cast(args.side), + static_cast(args.triangle), + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector b_mat_cpu(args.b_size, static_cast(0)); + std::vector c_mat_cpu(args.c_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); + buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + cblasXsymm(convertToCBLAS(args.layout), + convertToCBLAS(args.side), + convertToCBLAS(args.triangle), + args.m, args.n, args.alpha, + a_mat_cpu, args.a_offset, args.a_ld, + b_mat_cpu, args.b_offset, args.b_ld, args.beta, + c_mat_cpu, args.c_offset, args.c_ld); + buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git 
a/test/routines/level3/xsyr2k.h b/test/routines/level3/xsyr2k.h index 89e77f83..be4e1851 100644 --- a/test/routines/level3/xsyr2k.h +++ b/test/routines/level3/xsyr2k.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -76,7 +81,7 @@ class TestXsyr2k { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Syr2k(args.layout, args.triangle, args.a_transpose, @@ -90,20 +95,43 @@ class TestXsyr2k { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXsyr2k(static_cast(args.layout), - static_cast(args.triangle), - static_cast(args.a_transpose), - args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXsyr2k(static_cast(args.layout), + static_cast(args.triangle), + static_cast(args.a_transpose), + args.n, args.k, args.alpha, + buffers.a_mat(), args.a_offset, 
args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector b_mat_cpu(args.b_size, static_cast(0)); + std::vector c_mat_cpu(args.c_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); + buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + cblasXsyr2k(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + convertToCBLAS(args.a_transpose), + args.n, args.k, args.alpha, + a_mat_cpu, args.a_offset, args.a_ld, + b_mat_cpu, args.b_offset, args.b_ld, args.beta, + c_mat_cpu, args.c_offset, args.c_ld); + buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level3/xsyrk.h b/test/routines/level3/xsyrk.h index 8dacb5b3..7675e2aa 100644 --- a/test/routines/level3/xsyrk.h +++ b/test/routines/level3/xsyrk.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -69,7 +74,7 @@ class TestXsyrk { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode 
RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Syrk(args.layout, args.triangle, args.a_transpose, @@ -82,19 +87,39 @@ class TestXsyrk { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXsyrk(static_cast(args.layout), - static_cast(args.triangle), - static_cast(args.a_transpose), - args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, - 1, &queue_plain, 0, nullptr, &event); - clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXsyrk(static_cast(args.layout), + static_cast(args.triangle), + static_cast(args.a_transpose), + args.n, args.k, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, args.beta, + buffers.c_mat(), args.c_offset, args.c_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector c_mat_cpu(args.c_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.c_mat.Read(queue, args.c_size, c_mat_cpu); + cblasXsyrk(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + convertToCBLAS(args.a_transpose), + args.n, args.k, 
args.alpha, + a_mat_cpu, args.a_offset, args.a_ld, args.beta, + c_mat_cpu, args.c_offset, args.c_ld); + buffers.c_mat.Write(queue, args.c_size, c_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/routines/level3/xtrmm.h b/test/routines/level3/xtrmm.h index 152cdf58..a085cb15 100644 --- a/test/routines/level3/xtrmm.h +++ b/test/routines/level3/xtrmm.h @@ -19,7 +19,12 @@ #include #include -#include "wrapper_clblas.h" +#ifdef CLBLAST_REF_CLBLAS + #include "wrapper_clblas.h" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "wrapper_cblas.h" +#endif namespace clblast { // ================================================================================================= @@ -69,7 +74,7 @@ class TestXtrmm { static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine // Describes how to run the CLBlast routine - static StatusCode RunRoutine(const Arguments &args, const Buffers &buffers, Queue &queue) { + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { auto queue_plain = queue(); auto event = cl_event{}; auto status = Trmm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal, @@ -82,21 +87,43 @@ class TestXtrmm { } // Describes how to run the clBLAS routine (for correctness/performance comparison) - static StatusCode RunReference(const Arguments &args, const Buffers &buffers, Queue &queue) { - auto queue_plain = queue(); - auto event = cl_event{}; - auto status = clblasXtrmm(static_cast(args.layout), - static_cast(args.side), - static_cast(args.triangle), - static_cast(args.a_transpose), - static_cast(args.diagonal), - args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, - 1, &queue_plain, 0, nullptr, &event); - 
clWaitForEvents(1, &event); - return static_cast(status); - } + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXtrmm(static_cast(args.layout), + static_cast(args.side), + static_cast(args.triangle), + static_cast(args.a_transpose), + static_cast(args.diagonal), + args.m, args.n, args.alpha, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.b_mat(), args.b_offset, args.b_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector b_mat_cpu(args.b_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.b_mat.Read(queue, args.b_size, b_mat_cpu); + cblasXtrmm(convertToCBLAS(args.layout), + convertToCBLAS(args.side), + convertToCBLAS(args.triangle), + convertToCBLAS(args.a_transpose), + convertToCBLAS(args.diagonal), + args.m, args.n, args.alpha, + a_mat_cpu, args.a_offset, args.a_ld, + b_mat_cpu, args.b_offset, args.b_ld); + buffers.b_mat.Write(queue, args.b_size, b_mat_cpu); + return StatusCode::kSuccess; + } + #endif // Describes how to download the results of the computation (more importantly: which buffer) static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { diff --git a/test/wrapper_cblas.h b/test/wrapper_cblas.h new file mode 100644 index 00000000..dec272b0 --- /dev/null +++ b/test/wrapper_cblas.h @@ -0,0 +1,1674 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements a wrapper around a CPU BLAS library, such that its routines can be called +// in a similar way as the CLBlast routines: using alpha and beta to determine the precision. +// +// ================================================================================================= + +#ifndef CLBLAST_TEST_WRAPPER_CBLAS_H_ +#define CLBLAST_TEST_WRAPPER_CBLAS_H_ + +#include + +#include "internal/utilities.h" + +namespace clblast { + +// Conversions from CLBlast types +CBLAS_ORDER convertToCBLAS(const Layout v) { return (v == Layout::kRowMajor) ? CblasRowMajor : CblasColMajor; } +CBLAS_TRANSPOSE convertToCBLAS(const Transpose v) { return (v == Transpose::kNo) ? CblasNoTrans : (v == Transpose::kYes) ? CblasTrans : CblasConjTrans; } +CBLAS_UPLO convertToCBLAS(const Triangle v) { return (v == Triangle::kUpper) ? CblasUpper : CblasLower; } +CBLAS_DIAG convertToCBLAS(const Diagonal v) { return (v == Diagonal::kUnit) ? CblasUnit : CblasNonUnit; } +CBLAS_SIDE convertToCBLAS(const Side v) { return (v == Side::kLeft) ? 
CblasLeft : CblasRight; } + +// OpenBLAS is not fully Netlib CBLAS compatible +#ifdef OPENBLAS_VERSION + using return_pointer_float = openblas_complex_float*; + using return_pointer_double = openblas_complex_double*; +#else + using return_pointer_float = void*; + using return_pointer_double = void*; +#endif + +// ================================================================================================= +// BLAS level-1 (vector-vector) routines +// ================================================================================================= + +// Forwards the Netlib BLAS calls for SROTG/DROTG +void cblasXrotg(std::vector& sa_buffer, const size_t sa_offset, + std::vector& sb_buffer, const size_t sb_offset, + std::vector& sc_buffer, const size_t sc_offset, + std::vector& ss_buffer, const size_t ss_offset) { + cblas_srotg(&sa_buffer[sa_offset], + &sb_buffer[sb_offset], + &sc_buffer[sc_offset], + &ss_buffer[ss_offset]); +} +void cblasXrotg(std::vector& sa_buffer, const size_t sa_offset, + std::vector& sb_buffer, const size_t sb_offset, + std::vector& sc_buffer, const size_t sc_offset, + std::vector& ss_buffer, const size_t ss_offset) { + cblas_drotg(&sa_buffer[sa_offset], + &sb_buffer[sb_offset], + &sc_buffer[sc_offset], + &ss_buffer[ss_offset]); +} + +// Forwards the Netlib BLAS calls for SROTMG/DROTMG +void cblasXrotmg(std::vector& sd1_buffer, const size_t sd1_offset, + std::vector& sd2_buffer, const size_t sd2_offset, + std::vector& sx1_buffer, const size_t sx1_offset, + const std::vector& sy1_buffer, const size_t sy1_offset, + std::vector& sparam_buffer, const size_t sparam_offset) { + cblas_srotmg(&sd1_buffer[sd1_offset], + &sd2_buffer[sd2_offset], + &sx1_buffer[sx1_offset], + sy1_buffer[sy1_offset], + &sparam_buffer[sparam_offset]); +} +void cblasXrotmg(std::vector& sd1_buffer, const size_t sd1_offset, + std::vector& sd2_buffer, const size_t sd2_offset, + std::vector& sx1_buffer, const size_t sx1_offset, + const std::vector& sy1_buffer, const size_t 
sy1_offset, + std::vector& sparam_buffer, const size_t sparam_offset) { + cblas_drotmg(&sd1_buffer[sd1_offset], + &sd2_buffer[sd2_offset], + &sx1_buffer[sx1_offset], + sy1_buffer[sy1_offset], + &sparam_buffer[sparam_offset]); +} + +// Forwards the Netlib BLAS calls for SROT/DROT +void cblasXrot(const size_t n, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + const float cos, + const float sin) { + cblas_srot(n, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + cos, + sin); +} +void cblasXrot(const size_t n, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + const double cos, + const double sin) { + cblas_drot(n, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + cos, + sin); +} + +// Forwards the Netlib BLAS calls for SROTM/DROTM +void cblasXrotm(const size_t n, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& sparam_buffer, const size_t sparam_offset) { + cblas_srotm(n, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + &sparam_buffer[sparam_offset]); +} +void cblasXrotm(const size_t n, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& sparam_buffer, const size_t sparam_offset) { + cblas_drotm(n, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + &sparam_buffer[sparam_offset]); +} + +// Forwards the Netlib BLAS calls for SSWAP/DSWAP/CSWAP/ZSWAP +void cblasXswap(const size_t n, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_sswap(n, + 
&x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXswap(const size_t n, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_dswap(n, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXswap(const size_t n, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_cswap(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} +void cblasXswap(const size_t n, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_zswap(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} + +// Forwards the Netlib BLAS calls for SSCAL/DSCAL/CSCAL/ZSCAL +void cblasXscal(const size_t n, + const float alpha, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_sscal(n, + alpha, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXscal(const size_t n, + const double alpha, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_dscal(n, + alpha, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXscal(const size_t n, + const float2 alpha, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_cscal(n, + alpha_array.data(), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} +void cblasXscal(const size_t n, + const double2 alpha, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_zscal(n, + alpha_array.data(), + 
reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} + +// Forwards the Netlib BLAS calls for SCOPY/DCOPY/CCOPY/ZCOPY +void cblasXcopy(const size_t n, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_scopy(n, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXcopy(const size_t n, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_dcopy(n, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXcopy(const size_t n, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_ccopy(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} +void cblasXcopy(const size_t n, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_zcopy(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} + +// Forwards the Netlib BLAS calls for SAXPY/DAXPY/CAXPY/ZAXPY +void cblasXaxpy(const size_t n, + const float alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_saxpy(n, + alpha, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXaxpy(const size_t n, + const double alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_daxpy(n, + alpha, + &x_buffer[x_offset], static_cast(x_inc), + 
&y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXaxpy(const size_t n, + const float2 alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_caxpy(n, + alpha_array.data(), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} +void cblasXaxpy(const size_t n, + const double2 alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_zaxpy(n, + alpha_array.data(), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} + +// Forwards the Netlib BLAS calls for SDOT/DDOT +void cblasXdot(const size_t n, + std::vector& dot_buffer, const size_t dot_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + dot_buffer[dot_offset] = cblas_sdot(n, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXdot(const size_t n, + std::vector& dot_buffer, const size_t dot_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + dot_buffer[dot_offset] = cblas_ddot(n, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc)); +} + +// Forwards the Netlib BLAS calls for CDOTU/ZDOTU +void cblasXdotu(const size_t n, + std::vector& dot_buffer, const size_t dot_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_cdotu_sub(n, 
+ reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&dot_buffer[dot_offset])); +} +void cblasXdotu(const size_t n, + std::vector& dot_buffer, const size_t dot_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_zdotu_sub(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&dot_buffer[dot_offset])); +} + +// Forwards the Netlib BLAS calls for CDOTC/ZDOTC +void cblasXdotc(const size_t n, + std::vector& dot_buffer, const size_t dot_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_cdotc_sub(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&dot_buffer[dot_offset])); +} +void cblasXdotc(const size_t n, + std::vector& dot_buffer, const size_t dot_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_zdotc_sub(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&dot_buffer[dot_offset])); +} + +// Forwards the Netlib BLAS calls for SNRM2/DNRM2/ScNRM2/DzNRM2 +void cblasXnrm2(const size_t n, + std::vector& nrm2_buffer, const size_t nrm2_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + nrm2_buffer[nrm2_offset] = cblas_snrm2(n, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXnrm2(const size_t n, + std::vector& nrm2_buffer, const size_t nrm2_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + 
nrm2_buffer[nrm2_offset] = cblas_dnrm2(n, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXnrm2(const size_t n, + std::vector& nrm2_buffer, const size_t nrm2_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + nrm2_buffer[nrm2_offset] = cblas_scnrm2(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} +void cblasXnrm2(const size_t n, + std::vector& nrm2_buffer, const size_t nrm2_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + nrm2_buffer[nrm2_offset] = cblas_dznrm2(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} + +// ================================================================================================= +// BLAS level-2 (matrix-vector) routines +// ================================================================================================= + +// Forwards the Netlib BLAS calls for SGEMV/DGEMV/CGEMV/ZGEMV +void cblasXgemv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, + const size_t m, const size_t n, + const float alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_sgemv(layout, a_transpose, + m, n, + alpha, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc), + beta, + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXgemv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, + const size_t m, const size_t n, + const double alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_dgemv(layout, a_transpose, + m, n, + alpha, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc), + 
beta, + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXgemv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, + const size_t m, const size_t n, + const float2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const float2 beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_cgemv(layout, a_transpose, + m, n, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + beta_array.data(), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} +void cblasXgemv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, + const size_t m, const size_t n, + const double2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const double2 beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_zgemv(layout, a_transpose, + m, n, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + beta_array.data(), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} + +// Forwards the Netlib BLAS calls for SGBMV/DGBMV/CGBMV/ZGBMV +void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const float alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + std::vector& 
y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_sgbmv(layout, a_transpose, + m, n, kl, ku, + alpha, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc), + beta, + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const double alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_dgbmv(layout, a_transpose, + m, n, kl, ku, + alpha, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc), + beta, + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const float2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const float2 beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_cgbmv(layout, a_transpose, + m, n, kl, ku, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + beta_array.data(), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} +void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const double2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const double2 beta, + std::vector& 
y_buffer, const size_t y_offset, const size_t y_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_zgbmv(layout, a_transpose, + m, n, kl, ku, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + beta_array.data(), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} + +// Forwards the Netlib BLAS calls for CHEMV/ZHEMV +void cblasXhemv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const float2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const float2 beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_chemv(layout, triangle, + n, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + beta_array.data(), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} +void cblasXhemv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const double2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const double2 beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_zhemv(layout, triangle, + n, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + beta_array.data(), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} + +// Forwards the 
Netlib BLAS calls for CHBMV/ZHBMV +void cblasXhbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, const size_t k, + const float2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const float2 beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_chbmv(layout, triangle, + n, k, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + beta_array.data(), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} +void cblasXhbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, const size_t k, + const double2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const double2 beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_zhbmv(layout, triangle, + n, k, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + beta_array.data(), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} + +// Forwards the Netlib BLAS calls for CHPMV/ZHPMV +void cblasXhpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const float2 alpha, + const std::vector& ap_buffer, const size_t ap_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const float2 beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + 
const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_chpmv(layout, triangle, + n, + alpha_array.data(), + reinterpret_cast(&ap_buffer[ap_offset]), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + beta_array.data(), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} +void cblasXhpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const double2 alpha, + const std::vector& ap_buffer, const size_t ap_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const double2 beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_zhpmv(layout, triangle, + n, + alpha_array.data(), + reinterpret_cast(&ap_buffer[ap_offset]), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + beta_array.data(), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} + +// Forwards the Netlib BLAS calls for SSYMV/DSYMV +void cblasXsymv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const float alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_ssymv(layout, triangle, + n, + alpha, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc), + beta, + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXsymv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const double alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_dsymv(layout, triangle, + n, + alpha, + 
&a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc), + beta, + &y_buffer[y_offset], static_cast(y_inc)); +} + +// Forwards the Netlib BLAS calls for SSBMV/DSBMV +void cblasXsbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, const size_t k, + const float alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_ssbmv(layout, triangle, + n, k, + alpha, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc), + beta, + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXsbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, const size_t k, + const double alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_dsbmv(layout, triangle, + n, k, + alpha, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc), + beta, + &y_buffer[y_offset], static_cast(y_inc)); +} + +// Forwards the Netlib BLAS calls for SSPMV/DSPMV +void cblasXspmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const float alpha, + const std::vector& ap_buffer, const size_t ap_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_sspmv(layout, triangle, + n, + alpha, + &ap_buffer[ap_offset], + &x_buffer[x_offset], static_cast(x_inc), + beta, + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXspmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const double alpha, + const std::vector& ap_buffer, const size_t ap_offset, + const std::vector& 
x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_dspmv(layout, triangle, + n, + alpha, + &ap_buffer[ap_offset], + &x_buffer[x_offset], static_cast(x_inc), + beta, + &y_buffer[y_offset], static_cast(y_inc)); +} + +// Forwards the Netlib BLAS calls for STRMV/DTRMV/CTRMV/ZTRMV +void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_strmv(layout, triangle, a_transpose, diagonal, + n, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_dtrmv(layout, triangle, a_transpose, diagonal, + n, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_ctrmv(layout, triangle, a_transpose, diagonal, + n, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} +void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + 
cblas_ztrmv(layout, triangle, a_transpose, diagonal, + n, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} + +// Forwards the Netlib BLAS calls for STBMV/DTBMV/CTBMV/ZTBMV +void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, const size_t k, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_stbmv(layout, triangle, a_transpose, diagonal, + n, k, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, const size_t k, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_dtbmv(layout, triangle, a_transpose, diagonal, + n, k, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, const size_t k, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_ctbmv(layout, triangle, a_transpose, diagonal, + n, k, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} +void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, const size_t k, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_ztbmv(layout, triangle, a_transpose, diagonal, + n, k, + 
reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} + +// Forwards the Netlib BLAS calls for STPMV/DTPMV/CTPMV/ZTPMV +void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& ap_buffer, const size_t ap_offset, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_stpmv(layout, triangle, a_transpose, diagonal, + n, + &ap_buffer[ap_offset], + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& ap_buffer, const size_t ap_offset, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_dtpmv(layout, triangle, a_transpose, diagonal, + n, + &ap_buffer[ap_offset], + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& ap_buffer, const size_t ap_offset, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_ctpmv(layout, triangle, a_transpose, diagonal, + n, + reinterpret_cast(&ap_buffer[ap_offset]), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} +void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& ap_buffer, const size_t ap_offset, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_ztpmv(layout, triangle, a_transpose, diagonal, + n, + reinterpret_cast(&ap_buffer[ap_offset]), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} + +// Forwards the Netlib BLAS calls for STRSV/DTRSV/CTRSV/ZTRSV +void cblasXtrsv(const CBLAS_ORDER layout, const 
CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_strsv(layout, triangle, a_transpose, diagonal, + n, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXtrsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_dtrsv(layout, triangle, a_transpose, diagonal, + n, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXtrsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_ctrsv(layout, triangle, a_transpose, diagonal, + n, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} +void cblasXtrsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_ztrsv(layout, triangle, a_transpose, diagonal, + n, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} + +// Forwards the Netlib BLAS calls for STBSV/DTBSV/CTBSV/ZTBSV +void cblasXtbsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, const size_t k, + const std::vector& 
a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_stbsv(layout, triangle, a_transpose, diagonal, + n, k, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXtbsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, const size_t k, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_dtbsv(layout, triangle, a_transpose, diagonal, + n, k, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXtbsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, const size_t k, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_ctbsv(layout, triangle, a_transpose, diagonal, + n, k, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} +void cblasXtbsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, const size_t k, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_ztbsv(layout, triangle, a_transpose, diagonal, + n, k, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} + +// Forwards the Netlib BLAS calls for STPSV/DTPSV/CTPSV/ZTPSV +void cblasXtpsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& ap_buffer, const size_t ap_offset, + std::vector& x_buffer, const size_t x_offset, 
const size_t x_inc) { + cblas_stpsv(layout, triangle, a_transpose, diagonal, + n, + &ap_buffer[ap_offset], + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXtpsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& ap_buffer, const size_t ap_offset, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_dtpsv(layout, triangle, a_transpose, diagonal, + n, + &ap_buffer[ap_offset], + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXtpsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& ap_buffer, const size_t ap_offset, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_ctpsv(layout, triangle, a_transpose, diagonal, + n, + reinterpret_cast(&ap_buffer[ap_offset]), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} +void cblasXtpsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& ap_buffer, const size_t ap_offset, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_ztpsv(layout, triangle, a_transpose, diagonal, + n, + reinterpret_cast(&ap_buffer[ap_offset]), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} + +// Forwards the Netlib BLAS calls for SGER/DGER +void cblasXger(const CBLAS_ORDER layout, + const size_t m, const size_t n, + const float alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + cblas_sger(layout, + m, n, + alpha, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + &a_buffer[a_offset], a_ld); +} +void cblasXger(const 
CBLAS_ORDER layout, + const size_t m, const size_t n, + const double alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + cblas_dger(layout, + m, n, + alpha, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + &a_buffer[a_offset], a_ld); +} + +// Forwards the Netlib BLAS calls for CGERU/ZGERU +void cblasXgeru(const CBLAS_ORDER layout, + const size_t m, const size_t n, + const float2 alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_cgeru(layout, + m, n, + alpha_array.data(), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&a_buffer[a_offset]), a_ld); +} +void cblasXgeru(const CBLAS_ORDER layout, + const size_t m, const size_t n, + const double2 alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_zgeru(layout, + m, n, + alpha_array.data(), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&a_buffer[a_offset]), a_ld); +} + +// Forwards the Netlib BLAS calls for CGERC/ZGERC +void cblasXgerc(const CBLAS_ORDER layout, + const size_t m, const size_t n, + const float2 alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t 
y_offset, const size_t y_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_cgerc(layout, + m, n, + alpha_array.data(), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&a_buffer[a_offset]), a_ld); +} +void cblasXgerc(const CBLAS_ORDER layout, + const size_t m, const size_t n, + const double2 alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_zgerc(layout, + m, n, + alpha_array.data(), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&a_buffer[a_offset]), a_ld); +} + +// Forwards the Netlib BLAS calls for CHER/ZHER +void cblasXher(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const float alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + cblas_cher(layout, triangle, + n, + alpha, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&a_buffer[a_offset]), a_ld); +} +void cblasXher(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const double alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + cblas_zher(layout, triangle, + n, + alpha, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&a_buffer[a_offset]), a_ld); +} + +// Forwards the Netlib BLAS calls for CHPR/ZHPR +void cblasXhpr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const 
size_t n, + const float alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& ap_buffer, const size_t ap_offset) { + cblas_chpr(layout, triangle, + n, + alpha, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&ap_buffer[ap_offset])); +} +void cblasXhpr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const double alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& ap_buffer, const size_t ap_offset) { + cblas_zhpr(layout, triangle, + n, + alpha, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&ap_buffer[ap_offset])); +} + +// Forwards the Netlib BLAS calls for CHER2/ZHER2 +void cblasXher2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const float2 alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_cher2(layout, triangle, + n, + alpha_array.data(), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&a_buffer[a_offset]), a_ld); +} +void cblasXher2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const double2 alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_zher2(layout, triangle, + n, + alpha_array.data(), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&a_buffer[a_offset]), 
a_ld); +} + +// Forwards the Netlib BLAS calls for CHPR2/ZHPR2 +void cblasXhpr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const float2 alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& ap_buffer, const size_t ap_offset) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_chpr2(layout, triangle, + n, + alpha_array.data(), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&ap_buffer[ap_offset])); +} +void cblasXhpr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const double2 alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& ap_buffer, const size_t ap_offset) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_zhpr2(layout, triangle, + n, + alpha_array.data(), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&ap_buffer[ap_offset])); +} + +// Forwards the Netlib BLAS calls for SSYR/DSYR +void cblasXsyr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const float alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + cblas_ssyr(layout, triangle, + n, + alpha, + &x_buffer[x_offset], static_cast(x_inc), + &a_buffer[a_offset], a_ld); +} +void cblasXsyr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const double alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + cblas_dsyr(layout, triangle, + n, + alpha, + 
&x_buffer[x_offset], static_cast(x_inc), + &a_buffer[a_offset], a_ld); +} + +// Forwards the Netlib BLAS calls for SSPR/DSPR +void cblasXspr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const float alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& ap_buffer, const size_t ap_offset) { + cblas_sspr(layout, triangle, + n, + alpha, + &x_buffer[x_offset], static_cast(x_inc), + &ap_buffer[ap_offset]); +} +void cblasXspr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const double alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& ap_buffer, const size_t ap_offset) { + cblas_dspr(layout, triangle, + n, + alpha, + &x_buffer[x_offset], static_cast(x_inc), + &ap_buffer[ap_offset]); +} + +// Forwards the Netlib BLAS calls for SSYR2/DSYR2 +void cblasXsyr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const float alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + cblas_ssyr2(layout, triangle, + n, + alpha, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + &a_buffer[a_offset], a_ld); +} +void cblasXsyr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const double alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + cblas_dsyr2(layout, triangle, + n, + alpha, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + &a_buffer[a_offset], a_ld); +} + +// Forwards the Netlib BLAS calls for SSPR2/DSPR2 +void cblasXspr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const 
size_t n, + const float alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& ap_buffer, const size_t ap_offset) { + cblas_sspr2(layout, triangle, + n, + alpha, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + &ap_buffer[ap_offset]); +} +void cblasXspr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const double alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& ap_buffer, const size_t ap_offset) { + cblas_dspr2(layout, triangle, + n, + alpha, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + &ap_buffer[ap_offset]); +} + +// ================================================================================================= +// BLAS level-3 (matrix-matrix) routines +// ================================================================================================= + +// Forwards the Netlib BLAS calls for SGEMM/DGEMM/CGEMM/ZGEMM +void cblasXgemm(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const CBLAS_TRANSPOSE b_transpose, + const size_t m, const size_t n, const size_t k, + const float alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + cblas_sgemm(layout, a_transpose, b_transpose, + m, n, k, + alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld, + beta, + &c_buffer[c_offset], c_ld); +} +void cblasXgemm(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const CBLAS_TRANSPOSE b_transpose, + const size_t m, const size_t n, const size_t k, + const double alpha, + const std::vector& a_buffer, const size_t 
a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + cblas_dgemm(layout, a_transpose, b_transpose, + m, n, k, + alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld, + beta, + &c_buffer[c_offset], c_ld); +} +void cblasXgemm(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const CBLAS_TRANSPOSE b_transpose, + const size_t m, const size_t n, const size_t k, + const float2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const float2 beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_cgemm(layout, a_transpose, b_transpose, + m, n, k, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld, + beta_array.data(), + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} +void cblasXgemm(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const CBLAS_TRANSPOSE b_transpose, + const size_t m, const size_t n, const size_t k, + const double2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const double2 beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_zgemm(layout, a_transpose, b_transpose, + m, n, k, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld, + beta_array.data(), + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} + +// Forwards the Netlib BLAS calls 
for SSYMM/DSYMM/CSYMM/ZSYMM +void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, + const size_t m, const size_t n, + const float alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + cblas_ssymm(layout, side, triangle, + m, n, + alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld, + beta, + &c_buffer[c_offset], c_ld); +} +void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, + const size_t m, const size_t n, + const double alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + cblas_dsymm(layout, side, triangle, + m, n, + alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld, + beta, + &c_buffer[c_offset], c_ld); +} +void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, + const size_t m, const size_t n, + const float2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const float2 beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_csymm(layout, side, triangle, + m, n, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld, + beta_array.data(), + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} +void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, + const size_t m, const size_t n, + const double2 
alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const double2 beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_zsymm(layout, side, triangle, + m, n, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld, + beta_array.data(), + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} + +// Forwards the Netlib BLAS calls for CHEMM/ZHEMM +void cblasXhemm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, + const size_t m, const size_t n, + const float2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const float2 beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_chemm(layout, side, triangle, + m, n, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld, + beta_array.data(), + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} +void cblasXhemm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, + const size_t m, const size_t n, + const double2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const double2 beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_zhemm(layout, side, triangle, + m, n, + 
alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld, + beta_array.data(), + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} + +// Forwards the Netlib BLAS calls for SSYRK/DSYRK/CSYRK/ZSYRK +void cblasXsyrk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, + const size_t n, const size_t k, + const float alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const float beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + cblas_ssyrk(layout, triangle, a_transpose, + n, k, + alpha, + &a_buffer[a_offset], a_ld, + beta, + &c_buffer[c_offset], c_ld); +} +void cblasXsyrk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, + const size_t n, const size_t k, + const double alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const double beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + cblas_dsyrk(layout, triangle, a_transpose, + n, k, + alpha, + &a_buffer[a_offset], a_ld, + beta, + &c_buffer[c_offset], c_ld); +} +void cblasXsyrk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, + const size_t n, const size_t k, + const float2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const float2 beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_csyrk(layout, triangle, a_transpose, + n, k, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + beta_array.data(), + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} +void cblasXsyrk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, + const size_t n, const size_t k, + const double2 alpha, + const std::vector& 
a_buffer, const size_t a_offset, const size_t a_ld, + const double2 beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_zsyrk(layout, triangle, a_transpose, + n, k, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + beta_array.data(), + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} + +// Forwards the Netlib BLAS calls for CHERK/ZHERK +void cblasXherk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, + const size_t n, const size_t k, + const float alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const float beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + cblas_cherk(layout, triangle, a_transpose, + n, k, + alpha, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + beta, + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} +void cblasXherk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, + const size_t n, const size_t k, + const double alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const double beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + cblas_zherk(layout, triangle, a_transpose, + n, k, + alpha, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + beta, + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} + +// Forwards the Netlib BLAS calls for SSYR2K/DSYR2K/CSYR2K/ZSYR2K +void cblasXsyr2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE ab_transpose, + const size_t n, const size_t k, + const float alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + cblas_ssyr2k(layout, 
triangle, ab_transpose, + n, k, + alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld, + beta, + &c_buffer[c_offset], c_ld); +} +void cblasXsyr2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE ab_transpose, + const size_t n, const size_t k, + const double alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + cblas_dsyr2k(layout, triangle, ab_transpose, + n, k, + alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld, + beta, + &c_buffer[c_offset], c_ld); +} +void cblasXsyr2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE ab_transpose, + const size_t n, const size_t k, + const float2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const float2 beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_csyr2k(layout, triangle, ab_transpose, + n, k, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld, + beta_array.data(), + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} +void cblasXsyr2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE ab_transpose, + const size_t n, const size_t k, + const double2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const double2 beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), 
beta.imag()}; + cblas_zsyr2k(layout, triangle, ab_transpose, + n, k, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld, + beta_array.data(), + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} + +// Forwards the Netlib BLAS calls for CHER2K/ZHER2K +void cblasXher2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE ab_transpose, + const size_t n, const size_t k, + const float2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_cher2k(layout, triangle, ab_transpose, + n, k, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld, + beta, + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} +void cblasXher2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE ab_transpose, + const size_t n, const size_t k, + const double2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_zher2k(layout, triangle, ab_transpose, + n, k, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld, + beta, + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} + +// Forwards the Netlib BLAS calls for STRMM/DTRMM/CTRMM/ZTRMM +void cblasXtrmm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t m, const size_t n, + const float alpha, + const 
std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& b_buffer, const size_t b_offset, const size_t b_ld) { + cblas_strmm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld); +} +void cblasXtrmm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t m, const size_t n, + const double alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& b_buffer, const size_t b_offset, const size_t b_ld) { + cblas_dtrmm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld); +} +void cblasXtrmm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t m, const size_t n, + const float2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& b_buffer, const size_t b_offset, const size_t b_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_ctrmm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld); +} +void cblasXtrmm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t m, const size_t n, + const double2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& b_buffer, const size_t b_offset, const size_t b_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_ztrmm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld); +} + +// 
Forwards the Netlib BLAS calls for STRSM/DTRSM/CTRSM/ZTRSM +void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t m, const size_t n, + const float alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& b_buffer, const size_t b_offset, const size_t b_ld) { + cblas_strsm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld); +} +void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t m, const size_t n, + const double alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& b_buffer, const size_t b_offset, const size_t b_ld) { + cblas_dtrsm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld); +} +void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t m, const size_t n, + const float2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& b_buffer, const size_t b_offset, const size_t b_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_ctrsm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld); +} +void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t m, const size_t n, + const double2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& b_buffer, const size_t b_offset, const size_t 
b_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_ztrsm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld); +} + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_WRAPPER_CBLAS_H_ +#endif diff --git a/test/wrapper_clblas.h b/test/wrapper_clblas.h index fb6e83aa..89b708b8 100644 --- a/test/wrapper_clblas.h +++ b/test/wrapper_clblas.h @@ -65,7 +65,7 @@ template clblasStatus clblasXrotmg(cl_mem sd1_buffer, const size_t sd1_offset, cl_mem sd2_buffer, const size_t sd2_offset, cl_mem sx1_buffer, const size_t sx1_offset, - cl_mem sy1_buffer, const size_t sy1_offset, + const cl_mem sy1_buffer, const size_t sy1_offset, cl_mem sparam_buffer, const size_t sparam_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); @@ -73,7 +73,7 @@ template <> clblasStatus clblasXrotmg(cl_mem sd1_buffer, const size_t sd1_offset, cl_mem sd2_buffer, const size_t sd2_offset, cl_mem sx1_buffer, const size_t sx1_offset, - cl_mem sy1_buffer, const size_t sy1_offset, + const cl_mem sy1_buffer, const size_t sy1_offset, cl_mem sparam_buffer, const size_t sparam_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { @@ -88,7 +88,7 @@ template <> clblasStatus clblasXrotmg(cl_mem sd1_buffer, const size_t sd1_offset, cl_mem sd2_buffer, const size_t sd2_offset, cl_mem sx1_buffer, const size_t sx1_offset, - cl_mem sy1_buffer, const size_t sy1_offset, + const cl_mem sy1_buffer, const size_t sy1_offset, cl_mem sparam_buffer, const size_t sparam_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {