Merge branch 'cpu_blas' into development

This commit is contained in:
cnugteren 2016-04-03 16:08:48 -07:00
commit 2981ca4d3c
59 changed files with 3624 additions and 657 deletions

View file

@ -2,6 +2,7 @@
Development version (next release)
- Added exports to be able to create a DLL on Windows (thanks to Marco Hutter)
- Made the library thread-safe
- Performance and correctness tests can now (on top of clBLAS) be performed against CPU BLAS libraries
- Fixed the use of events within the library
- Added level-1 routines:
* SNRM2/DNRM2/ScNRM2/DzNRM2

View file

@ -66,7 +66,7 @@ else ()
set(FLAGS "${FLAGS} -Wno-attributes -Wno-unused-variable")
endif()
elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
set(FLAGS "${FLAGS} -Weverything -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded")
set(FLAGS "${FLAGS} -Wall -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded")
set(FLAGS "${FLAGS} -Wno-missing-prototypes -Wno-float-equal -Wno-switch-enum -Wno-switch")
set(FLAGS "${FLAGS} -Wno-exit-time-destructors -Wno-global-constructors -Wno-missing-noreturn")
endif()
@ -98,11 +98,13 @@ if(TUNERS)
endif()
endif()
# Locates the clBLAS library in case the tests need to be compiled. "FindclBLAS.cmake" is included.
# Locates the reference BLAS libraries in case the tests need to be compiled. The "FindclBLAS.cmake"
# and "FindCBLAS.cmake" are included.
if(TESTS)
find_package(clBLAS)
if(NOT CLBLAS_FOUND)
message(STATUS "Could NOT find clBLAS, disabling the compilation of the tests")
find_package(CBLAS)
if(NOT CLBLAS_FOUND AND NOT CBLAS_FOUND)
message(STATUS "Could NOT find clBLAS nor a CPU BLAS, disabling the compilation of the tests")
set(TESTS OFF)
endif()
endif()
@ -215,11 +217,33 @@ endif()
# ==================================================================================================
# Down from here is all test (performance and correctness) related. Note that these tests require
# the presence of the clBLAS library to act as a reference.
# the presence of clBLAS and/or a BLAS library to act as a reference.
if(TESTS)
# Adds new include directories for the reference clBLAS
include_directories(${clblast_SOURCE_DIR}/test ${CLBLAS_INCLUDE_DIRS})
# Sets the specifics for the reference BLAS libraries
set(REF_INCLUDES )
set(REF_LIBRARIES )
if(CLBLAS_FOUND)
set(REF_INCLUDES ${REF_INCLUDES} ${CLBLAS_INCLUDE_DIRS})
set(REF_LIBRARIES ${REF_LIBRARIES} ${CLBLAS_LIBRARIES})
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
add_definitions(" /DCLBLAST_REF_CLBLAS")
else()
add_definitions(" -DCLBLAST_REF_CLBLAS")
endif()
endif()
if(CBLAS_FOUND)
set(REF_INCLUDES ${REF_INCLUDES} ${CBLAS_INCLUDE_DIRS})
set(REF_LIBRARIES ${REF_LIBRARIES} ${CBLAS_LIBRARIES})
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
add_definitions(" /DCLBLAST_REF_CBLAS")
else()
add_definitions(" -DCLBLAST_REF_CBLAS")
endif()
endif()
# Sets the include directories
include_directories(${clblast_SOURCE_DIR}/test ${REF_INCLUDES})
# Creates the common correctness-tests objects (requires CMake 2.8.8)
add_library(test_correctness_common OBJECT
@ -239,7 +263,7 @@ if(TESTS)
test/correctness/routines/level3/${ROUTINE}.cc)
endforeach()
foreach(ROUTINE ${ROUTINES})
target_link_libraries(clblast_test_${ROUTINE} clblast ${CLBLAS_LIBRARIES} ${OPENCL_LIBRARIES})
target_link_libraries(clblast_test_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
install(TARGETS clblast_test_${ROUTINE} DESTINATION bin)
endforeach()
@ -269,7 +293,7 @@ if(TESTS)
test/performance/routines/level3/${ROUTINE}.cc)
endforeach()
foreach(ROUTINE ${ROUTINES})
target_link_libraries(clblast_client_${ROUTINE} clblast ${CLBLAS_LIBRARIES} ${OPENCL_LIBRARIES})
target_link_libraries(clblast_client_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
install(TARGETS clblast_client_${ROUTINE} DESTINATION bin)
endforeach()

View file

@ -52,6 +52,14 @@ The pre-requisites for compilation of CLBlast are:
- Intel OpenCL
- Beignet
Furthermore, to build the (optional) correctness and performance tests, another BLAS library is needed to serve as a reference. This can be either:
* The OpenCL BLAS library [clBLAS](http://github.com/clMathLibraries/clBLAS) (maintained by AMD)
* A regular CPU Netlib BLAS library, e.g.:
- OpenBLAS
- BLIS
- Accelerate
An example of an out-of-source build (starting from the root of the CLBlast folder):
mkdir build
@ -135,9 +143,9 @@ To make sure CLBlast is working correctly on your device (recommended), compile
cmake -DTESTS=ON ..
Afterwards, executables in the form of `clblast_test_xxxxx` are available, in which `xxxxx` is the name of a routine (e.g. `xgemm`). Note that CLBlast is tested against [clBLAS](http://github.com/clMathLibraries/clBLAS) for correctness. The library clBLAS is therefore required to be installed on your system for the CLBlast tests.
Afterwards, executables in the form of `clblast_test_xxxxx` are available, in which `xxxxx` is the name of a routine (e.g. `xgemm`). Note that CLBlast is best tested against [clBLAS](http://github.com/clMathLibraries/clBLAS) for correctness. If the library clBLAS is not installed on your system, it will use a regular CPU BLAS library to test against.
With the `-DTESTS=ON` flag, additional performance tests are compiled. These come in the form of client executables named `clblast_client_xxxxx`, in which `xxxxx` is the name of a routine (e.g. `xgemm`). These clients take a bunch of configuration options and directly run both CLBlast and clBLAS in a head-to-head performance test.
With the `-DTESTS=ON` flag, additional performance tests are compiled. These come in the form of client executables named `clblast_client_xxxxx`, in which `xxxxx` is the name of a routine (e.g. `xgemm`). These clients take a bunch of configuration options and directly run CLBlast in a head-to-head performance test against clBLAS and/or a CPU BLAS library.
Performance remarks
@ -249,4 +257,3 @@ To-do list before release of version 1.0
- Support all routines supported by clBLAS
- Allow the user control over events and synchronization
- Add half-precision routines (e.g. HGEMM)
- Enable correctness and performance testing against a CPU-based BLAS library

View file

@ -0,0 +1,75 @@
# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
# width of 100 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
#
# ==================================================================================================
#
# Defines the following variables:
# CBLAS_FOUND Boolean holding whether or not the Netlib BLAS library was found
# CBLAS_INCLUDE_DIRS The Netlib BLAS include directory
# CBLAS_LIBRARIES The Netlib BLAS library
#
# In case BLAS is not installed in the default directory, set the CBLAS_ROOT variable to point to
# the root of BLAS, such that 'cblas.h' can be found in $CBLAS_ROOT/include. This can either be
# done using an environment variable (e.g. export CBLAS_ROOT=/path/to/BLAS) or using a CMake
# variable (e.g. cmake -DCBLAS_ROOT=/path/to/BLAS ..).
#
# ==================================================================================================
# Sets the possible install locations
# User-supplied locations: the CBLAS_ROOT CMake variable and the CBLAS_ROOT environment variable.
# Both searches below receive these as HINTS.
set(CBLAS_HINTS
${CBLAS_ROOT}
$ENV{CBLAS_ROOT}
)
# Fallback locations, given to both searches below as PATHS. The last entry is combined with the
# 'Accelerate.framework/...' path suffix used in the header search.
set(CBLAS_PATHS
/usr
/usr/local
/usr/local/opt
/System/Library/Frameworks
)
# Finds the include directories
# Looks for 'cblas.h' and stores the directory containing it in CBLAS_INCLUDE_DIRS. The path
# suffixes cover plain include layouts as well as OpenBLAS-, BLIS- and Accelerate-style layouts.
find_path(CBLAS_INCLUDE_DIRS
NAMES cblas.h
HINTS ${CBLAS_HINTS}
PATH_SUFFIXES
include inc include/x86_64 include/x64
openblas/include include/blis blis/include blis/include/blis
Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Versions/Current/Headers
PATHS ${CBLAS_PATHS}
DOC "Netlib BLAS include header cblas.h"
)
# Marks the cache entry as advanced so it is hidden from the default view of the CMake GUIs
mark_as_advanced(CBLAS_INCLUDE_DIRS)
# Finds the library
# Looks for a library under any of the listed names and stores its path in CBLAS_LIBRARIES. The
# same hints and fallback paths as for the header search are used, with library path suffixes.
find_library(CBLAS_LIBRARIES
NAMES blas mkl blis openblas atlas accelerate
HINTS ${CBLAS_HINTS}
PATH_SUFFIXES
lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 lib/import lib64/import
openblas/lib blis/lib
PATHS ${CBLAS_PATHS}
DOC "Netlib BLAS library"
)
# Marks the cache entry as advanced so it is hidden from the default view of the CMake GUIs
mark_as_advanced(CBLAS_LIBRARIES)
# ==================================================================================================
# Notification messages
# These are informational only: they tell the user which part is missing and how to fix it. The
# found/not-found decision itself is made by find_package_handle_standard_args below.
if(NOT CBLAS_INCLUDE_DIRS)
message(STATUS "Could NOT find 'cblas.h', install a CPU Netlib BLAS or set CBLAS_ROOT")
endif()
if(NOT CBLAS_LIBRARIES)
message(STATUS "Could NOT find a CPU Netlib BLAS library, install it or set CBLAS_ROOT")
endif()
# Determines whether or not BLAS was found
# Sets CBLAS_FOUND to true only when both CBLAS_INCLUDE_DIRS and CBLAS_LIBRARIES hold valid
# results, and prints the standard found/not-found message for the package.
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(CBLAS DEFAULT_MSG CBLAS_INCLUDE_DIRS CBLAS_LIBRARIES)
# ==================================================================================================

View file

@ -100,7 +100,7 @@ template <typename T>
StatusCode Rotmg(cl_mem sd1_buffer, const size_t sd1_offset,
cl_mem sd2_buffer, const size_t sd2_offset,
cl_mem sx1_buffer, const size_t sx1_offset,
cl_mem sy1_buffer, const size_t sy1_offset,
const cl_mem sy1_buffer, const size_t sy1_offset,
cl_mem sparam_buffer, const size_t sparam_offset,
cl_command_queue* queue, cl_event* event = nullptr);

View file

@ -112,13 +112,13 @@ StatusCode PUBLIC_API CLBlastDrotg(cl_mem sa_buffer, const size_t sa_offset,
StatusCode PUBLIC_API CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
cl_mem sd2_buffer, const size_t sd2_offset,
cl_mem sx1_buffer, const size_t sx1_offset,
cl_mem sy1_buffer, const size_t sy1_offset,
const cl_mem sy1_buffer, const size_t sy1_offset,
cl_mem sparam_buffer, const size_t sparam_offset,
cl_command_queue* queue, cl_event* event);
StatusCode PUBLIC_API CLBlastDrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
cl_mem sd2_buffer, const size_t sd2_offset,
cl_mem sx1_buffer, const size_t sx1_offset,
cl_mem sy1_buffer, const size_t sy1_offset,
const cl_mem sy1_buffer, const size_t sy1_offset,
cl_mem sparam_buffer, const size_t sparam_offset,
cl_command_queue* queue, cl_event* event);

View file

@ -465,31 +465,33 @@ class Buffer {
}
// Copies from device to host: reading the device buffer asynchronously
void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) {
void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
if (access_ == BufferAccess::kWriteOnly) { Error("reading from a write-only buffer"); }
CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
host, 0, nullptr, nullptr));
}
void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host,
const size_t offset = 0) {
const size_t offset = 0) const {
if (host.size() < size) { Error("target host buffer is too small"); }
ReadAsync(queue, size, host.data(), offset);
}
void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host,
const size_t offset = 0) {
const size_t offset = 0) const {
if (host.size() < size) { Error("target host buffer is too small"); }
ReadAsync(queue, size, host.data(), offset);
}
// Copies from device to host: reading the device buffer
void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) {
void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
ReadAsync(queue, size, host, offset);
queue.Finish();
}
void Read(const Queue &queue, const size_t size, std::vector<T> &host, const size_t offset = 0) {
void Read(const Queue &queue, const size_t size, std::vector<T> &host,
const size_t offset = 0) const {
Read(queue, size, host.data(), offset);
}
void Read(const Queue &queue, const size_t size, BufferHost<T> &host, const size_t offset = 0) {
void Read(const Queue &queue, const size_t size, BufferHost<T> &host,
const size_t offset = 0) const {
Read(queue, size, host.data(), offset);
}

View file

@ -35,6 +35,9 @@ using double2 = std::complex<double>;
const std::string kKhronosHalfPrecision = "cl_khr_fp16";
const std::string kKhronosDoublePrecision = "cl_khr_fp64";
// Error code used when an unknown error was caught
constexpr auto kUnknownError = -999;
// =================================================================================================
// The routine-specific arguments in string form
@ -70,6 +73,7 @@ constexpr auto kArgFraction = "fraction";
// The client-specific arguments in string form
constexpr auto kArgCompareclblas = "clblas";
constexpr auto kArgComparecblas = "cblas";
constexpr auto kArgStepSize = "step";
constexpr auto kArgNumSteps = "num_steps";
constexpr auto kArgNumRuns = "runs";
@ -128,6 +132,7 @@ struct Arguments {
double fraction = 1.0;
// Client-specific arguments
int compare_clblas = 1;
int compare_cblas = 1;
size_t step = 1;
size_t num_steps = 0;
size_t num_runs = 10;

View file

@ -58,5 +58,10 @@ class DataType():
return "<"+self.buffertype+","+self.beta_cpp+">, "+self.buffertype+", "+self.beta_cpp
return "<"+self.buffertype+">, "+self.buffertype+", "+self.beta_cpp
# Returns whether the given scalar ("alpha" or "beta") has a complex type for this data-type
def IsComplex(self, scalar):
return ((scalar == "alpha" and self.alpha_cpp in [FLT2, DBL2]) or
(scalar == "beta" and self.beta_cpp in [FLT2, DBL2]))
# ==================================================================================================

View file

@ -8,12 +8,13 @@
# Cedric Nugteren <www.cedricnugteren.nl>
#
# This script automatically generates the bodies of the following files, creating the full CLBlast
# API interface and implementation (C, C++, and clBLAS wrapper):
# API interface and implementation (C, C++, and reference BLAS wrappers):
# clblast.h
# clblast.cc
# clblast_c.h
# clblast_c.cc
# wrapper_clblas.h
# wrapper_cblas.h
# It also generates the main functions for the correctness and performance tests as found in
# test/correctness/routines/levelX/xYYYY.cc
# test/performance/routines/levelX/xYYYY.cc
@ -55,7 +56,7 @@ TU = DataType("TU", "typename T, typename U", "T,U", ["T", "U", "T", "U"], "T")
routines = [
[ # Level 1: vector-vector
Routine(False, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation"),
Routine(False, "1", "rotmg", T, [S,D], [], [], [], ["sd1","sd2","sx1","sy1","sparam"], [], "", "Generate modified givens plane rotation"),
Routine(False, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], [], "", "Generate modified givens plane rotation"),
Routine(False, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation"),
Routine(False, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [], "", "Apply modified givens plane rotation"),
Routine(True, "1", "swap", T, [S,D,C,Z], ["n"], [], [], ["x","y"], [], "", "Swap two vectors"),
@ -220,11 +221,11 @@ def wrapper_clblas(routines):
for routine in routines:
result += "\n// Forwards the clBLAS calls for %s\n" % (routine.ShortNames())
if routine.NoScalars():
result += routine.RoutineHeaderWrapper(routine.template, True, 21)+";\n"
result += routine.RoutineHeaderWrapperCL(routine.template, True, 21)+";\n"
for flavour in routine.flavours:
indent = " "*(17 + routine.Length())
result += routine.RoutineHeaderWrapper(flavour, False, 21)+" {\n"
arguments = routine.ArgumentsWrapper(flavour)
result += routine.RoutineHeaderWrapperCL(flavour, False, 21)+" {\n"
arguments = routine.ArgumentsWrapperCL(flavour)
if routine.scratch:
result += " auto queue = Queue(queues[0]);\n"
result += " auto context = queue.GetContext();\n"
@ -236,6 +237,41 @@ def wrapper_clblas(routines):
result += "\n}\n"
return result
# The wrapper to the reference CBLAS routines (for performance/correctness testing)
def wrapper_cblas(routines):
result = ""
for routine in routines:
result += "\n// Forwards the Netlib BLAS calls for %s\n" % (routine.ShortNames())
for flavour in routine.flavours:
indent = " "*(10 + routine.Length())
result += routine.RoutineHeaderWrapperC(flavour, False, 12)+" {\n"
arguments = routine.ArgumentsWrapperC(flavour)
# Complex scalars: packs the real and imaginary parts into a two-element array
for scalar in routine.scalars:
if flavour.IsComplex(scalar):
result += " const auto "+scalar+"_array = std::vector<"+flavour.buffertype[:-1]+">{"+scalar+".real(), "+scalar+".imag()};\n"
# Special case for scalar outputs
assignment = ""
postfix = ""
extra_argument = ""
for output_buffer in routine.outputs:
if output_buffer in routine.ScalarBuffersFirst():
if flavour in [C,Z]:
postfix += "_sub"
indent += " "
extra_argument += ",\n"+indent+"reinterpret_cast<return_pointer_"+flavour.buffertype[:-1]+">(&"+output_buffer+"_buffer["+output_buffer+"_offset])"
else:
assignment = output_buffer+"_buffer["+output_buffer+"_offset] = "
indent += " "*len(assignment)
result += " "+assignment+"cblas_"+flavour.name.lower()+routine.name+postfix+"("
result += (",\n"+indent).join([a for a in arguments])
result += extra_argument+");"
result += "\n}\n"
return result
# ==================================================================================================
# Checks for the number of command-line arguments
@ -251,9 +287,10 @@ files = [
path_clblast+"/include/clblast_c.h",
path_clblast+"/src/clblast_c.cc",
path_clblast+"/test/wrapper_clblas.h",
path_clblast+"/test/wrapper_cblas.h",
]
header_lines = [84, 65, 93, 22, 22]
footer_lines = [6, 3, 9, 2, 6]
header_lines = [84, 65, 93, 22, 22, 38]
footer_lines = [6, 3, 9, 2, 6, 6]
# Checks whether the command-line arguments are valid; exits otherwise
for f in files:
@ -287,6 +324,8 @@ for i in xrange(0,len(files)):
body += clblast_c_cc(routines[level-1])
if i == 4:
body += wrapper_clblas(routines[level-1])
if i == 5:
body += wrapper_cblas(routines[level-1])
f.write("".join(file_header))
f.write(body)
f.write("".join(file_footer))

View file

@ -28,7 +28,7 @@ def OptionToCLBlast(x):
}[x]
# As above, but for clBLAS data-types
def OptionToWrapper(x):
def OptionToWrapperCL(x):
return {
'layout': "clblasOrder",
'a_transpose': "clblasTranspose",
@ -39,6 +39,18 @@ def OptionToWrapper(x):
'diagonal': "clblasDiag",
}[x]
# As above, but for CBLAS data-types
def OptionToWrapperC(x):
return {
'layout': "CBLAS_ORDER",
'a_transpose': "CBLAS_TRANSPOSE",
'b_transpose': "CBLAS_TRANSPOSE",
'ab_transpose': "CBLAS_TRANSPOSE",
'side': "CBLAS_SIDE",
'triangle': "CBLAS_UPLO",
'diagonal': "CBLAS_DIAG",
}[x]
# ==================================================================================================
# Class holding routine-specific information (e.g. name, which arguments, which precisions)
@ -119,6 +131,16 @@ class Routine():
return [", ".join(a+b+c)]
return []
# As above but as vectors
def BufferDefVector(self, name, flavour):
prefix = "const " if (name in self.inputs) else ""
if (name in self.inputs) or (name in self.outputs):
a = [prefix+"std::vector<"+flavour.buffertype+">& "+name+"_buffer"]
b = ["const size_t "+name+"_offset"]
c = ["const size_t "+name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else []
return [", ".join(a+b+c)]
return []
# As above but with Claduc buffers
def BufferCladuc(self, name):
if (name in self.inputs) or (name in self.outputs):
@ -129,7 +151,7 @@ class Routine():
return []
# As above but with a static cast for clBLAS wrapper
def BufferWrapper(self, name):
def BufferWrapperCL(self, name):
if (name in self.inputs) or (name in self.outputs):
a = [name+"_buffer"]
b = [name+"_offset"]
@ -141,6 +163,24 @@ class Routine():
return [", ".join(a+b+c)]
return []
# As above but with a static cast for CBLAS wrapper
def BufferWrapperC(self, name, flavour):
prefix = "const " if (name in self.inputs) else ""
if (name in self.inputs) or (name in self.outputs):
if name == "sy1":
a = [name+"_buffer["+name+"_offset]"]
elif flavour.precision_name in ["C","Z"]:
a = ["reinterpret_cast<"+prefix+flavour.buffertype[:-1]+"*>(&"+name+"_buffer["+name+"_offset])"]
else:
a = ["&"+name+"_buffer["+name+"_offset]"]
c = []
if (name in ["x","y"]):
c = ["static_cast<int>("+name+"_"+self.Postfix(name)+")"]
elif (name in ["a","b","c"]):
c = [name+"_"+self.Postfix(name)]
return [", ".join(a+c)]
return []
# As above, but only data-types
def BufferType(self, name):
prefix = "const " if (name in self.inputs) else ""
@ -179,6 +219,14 @@ class Routine():
return [name]
return []
# Retrieves the use of a scalar for CBLAS (alpha/beta)
def ScalarUseWrapperC(self, name, flavour):
if name in self.scalars:
if flavour.IsComplex(name):
return [name+"_array.data()"]
return [name]
return []
# Retrieves the definition of a scalar (alpha/beta)
def ScalarDef(self, name, flavour):
if name in self.scalars:
@ -246,9 +294,16 @@ class Routine():
return []
# As above, but now using clBLAS data-types
def OptionsDefWrapper(self):
def OptionsDefWrapperCL(self):
if self.options:
definitions = ["const "+OptionToWrapper(o)+" "+o for o in self.options]
definitions = ["const "+OptionToWrapperCL(o)+" "+o for o in self.options]
return [", ".join(definitions)]
return []
# As above, but now using CBLAS data-types
def OptionsDefWrapperC(self):
if self.options:
definitions = ["const "+OptionToWrapperC(o)+" "+o for o in self.options]
return [", ".join(definitions)]
return []
@ -284,16 +339,26 @@ class Routine():
list(chain(*[self.ScalarUse(s, flavour) for s in self.OtherScalars()])))
# As above, but for the clBLAS wrapper
def ArgumentsWrapper(self, flavour):
def ArgumentsWrapperCL(self, flavour):
return (self.Options() + self.Sizes() +
list(chain(*[self.BufferWrapper(b) for b in self.ScalarBuffersFirst()])) +
list(chain(*[self.BufferWrapperCL(b) for b in self.ScalarBuffersFirst()])) +
self.ScalarUseWrapper("alpha", flavour) +
list(chain(*[self.BufferWrapper(b) for b in self.BuffersFirst()])) +
list(chain(*[self.BufferWrapperCL(b) for b in self.BuffersFirst()])) +
self.ScalarUseWrapper("beta", flavour) +
list(chain(*[self.BufferWrapper(b) for b in self.BuffersSecond()])) +
list(chain(*[self.BufferWrapper(b) for b in self.ScalarBuffersSecond()])) +
list(chain(*[self.BufferWrapperCL(b) for b in self.BuffersSecond()])) +
list(chain(*[self.BufferWrapperCL(b) for b in self.ScalarBuffersSecond()])) +
list(chain(*[self.ScalarUseWrapper(s, flavour) for s in self.OtherScalars()])))
# As above, but for the CBLAS wrapper
def ArgumentsWrapperC(self, flavour):
return (self.Options() + self.Sizes() +
self.ScalarUseWrapperC("alpha", flavour) +
list(chain(*[self.BufferWrapperC(b, flavour) for b in self.BuffersFirst()])) +
self.ScalarUseWrapperC("beta", flavour) +
list(chain(*[self.BufferWrapperC(b, flavour) for b in self.BuffersSecond()])) +
list(chain(*[self.BufferWrapperC(b, flavour) for b in self.ScalarBuffersSecond()])) +
list(chain(*[self.ScalarUseWrapperC(s, flavour) for s in self.OtherScalars()])))
# Retrieves a combination of all the argument definitions
def ArgumentsDef(self, flavour):
return (self.OptionsDef() + self.SizesDef() +
@ -306,8 +371,8 @@ class Routine():
list(chain(*[self.ScalarDef(s, flavour) for s in self.OtherScalars()])))
# As above, but clBLAS wrapper plain datatypes
def ArgumentsDefWrapper(self, flavour):
return (self.OptionsDefWrapper() + self.SizesDef() +
def ArgumentsDefWrapperCL(self, flavour):
return (self.OptionsDefWrapperCL() + self.SizesDef() +
list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersFirst()])) +
self.ScalarDefPlain("alpha", flavour) +
list(chain(*[self.BufferDef(b) for b in self.BuffersFirst()])) +
@ -315,6 +380,17 @@ class Routine():
list(chain(*[self.BufferDef(b) for b in self.BuffersSecond()])) +
list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersSecond()])) +
list(chain(*[self.ScalarDefPlain(s, flavour) for s in self.OtherScalars()])))
# As above, but CBLAS wrapper plain datatypes
def ArgumentsDefWrapperC(self, flavour):
return (self.OptionsDefWrapperC() + self.SizesDef() +
list(chain(*[self.BufferDefVector(b, flavour) for b in self.ScalarBuffersFirst()])) +
self.ScalarDefPlain("alpha", flavour) +
list(chain(*[self.BufferDefVector(b, flavour) for b in self.BuffersFirst()])) +
self.ScalarDefPlain("beta", flavour) +
list(chain(*[self.BufferDefVector(b, flavour) for b in self.BuffersSecond()])) +
list(chain(*[self.BufferDefVector(b, flavour) for b in self.ScalarBuffersSecond()])) +
list(chain(*[self.ScalarDefPlain(s, flavour) for s in self.OtherScalars()])))
# Retrieves a combination of all the argument types
def ArgumentsType(self, flavour):
@ -356,7 +432,7 @@ class Routine():
return result
# As above, but now for the clBLAS wrapper
def RoutineHeaderWrapper(self, flavour, def_only, spaces):
def RoutineHeaderWrapperCL(self, flavour, def_only, spaces):
template = "<"+flavour.template+">" if self.NoScalars() and not def_only else ""
indent = " "*(spaces + self.Length() + len(template))
result = ""
@ -366,9 +442,16 @@ class Routine():
result += flavour.name
result += ">\n"
result += "clblasStatus clblasX"+self.name+template+"("
result += (",\n"+indent).join([a for a in self.ArgumentsDefWrapper(flavour)])
result += (",\n"+indent).join([a for a in self.ArgumentsDefWrapperCL(flavour)])
result += ",\n"+indent+"cl_uint num_queues, cl_command_queue *queues"
result += ",\n"+indent+"cl_uint num_wait_events, const cl_event *wait_events, cl_event *events)"
return result
# As above, but now for the CBLAS wrapper
def RoutineHeaderWrapperC(self, flavour, def_only, spaces):
indent = " "*(spaces + self.Length())
result = "void cblasX"+self.name+"("
result += (",\n"+indent).join([a for a in self.ArgumentsDefWrapperC(flavour)])+")"
return result
# ==================================================================================================

View file

@ -93,7 +93,7 @@ template <typename T>
StatusCode Rotmg(cl_mem, const size_t,
cl_mem, const size_t,
cl_mem, const size_t,
cl_mem, const size_t,
const cl_mem, const size_t,
cl_mem, const size_t,
cl_command_queue*, cl_event*) {
return StatusCode::kNotImplemented;
@ -101,13 +101,13 @@ StatusCode Rotmg(cl_mem, const size_t,
template StatusCode PUBLIC_API Rotmg<float>(cl_mem, const size_t,
cl_mem, const size_t,
cl_mem, const size_t,
cl_mem, const size_t,
const cl_mem, const size_t,
cl_mem, const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Rotmg<double>(cl_mem, const size_t,
cl_mem, const size_t,
cl_mem, const size_t,
cl_mem, const size_t,
const cl_mem, const size_t,
cl_mem, const size_t,
cl_command_queue*, cl_event*);

View file

@ -55,7 +55,7 @@ StatusCode CLBlastDrotg(cl_mem sa_buffer, const size_t sa_offset,
StatusCode CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
cl_mem sd2_buffer, const size_t sd2_offset,
cl_mem sx1_buffer, const size_t sx1_offset,
cl_mem sy1_buffer, const size_t sy1_offset,
const cl_mem sy1_buffer, const size_t sy1_offset,
cl_mem sparam_buffer, const size_t sparam_offset,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Rotmg<float>(sd1_buffer, sd1_offset,
@ -69,7 +69,7 @@ StatusCode CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
StatusCode CLBlastDrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
cl_mem sd2_buffer, const size_t sd2_offset,
cl_mem sx1_buffer, const size_t sx1_offset,
cl_mem sy1_buffer, const size_t sy1_offset,
const cl_mem sy1_buffer, const size_t sy1_offset,
cl_mem sparam_buffer, const size_t sparam_offset,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Rotmg<double>(sd1_buffer, sd1_offset,

View file

@ -79,24 +79,6 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
// Iterates over all the to-be-tested combinations of arguments
for (auto &args: test_vector) {
// Runs the reference clBLAS code
auto x_vec1 = Buffer<T>(context_, args.x_size);
auto y_vec1 = Buffer<T>(context_, args.y_size);
auto a_mat1 = Buffer<T>(context_, args.a_size);
auto b_mat1 = Buffer<T>(context_, args.b_size);
auto c_mat1 = Buffer<T>(context_, args.c_size);
auto ap_mat1 = Buffer<T>(context_, args.ap_size);
auto scalar1 = Buffer<T>(context_, args.scalar_size);
x_vec1.Write(queue_, args.x_size, x_source_);
y_vec1.Write(queue_, args.y_size, y_source_);
a_mat1.Write(queue_, args.a_size, a_source_);
b_mat1.Write(queue_, args.b_size, b_source_);
c_mat1.Write(queue_, args.c_size, c_source_);
ap_mat1.Write(queue_, args.ap_size, ap_source_);
scalar1.Write(queue_, args.scalar_size, scalar_source_);
auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1};
auto status1 = run_reference_(args, buffers1, queue_);
// Runs the CLBlast code
auto x_vec2 = Buffer<T>(context_, args.x_size);
auto y_vec2 = Buffer<T>(context_, args.y_size);
@ -115,6 +97,33 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
auto buffers2 = Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2};
auto status2 = run_routine_(args, buffers2, queue_);
#ifndef CLBLAST_REF_CLBLAS
// Don't continue with CBLAS if there are incorrect parameters
if (status2 != StatusCode::kSuccess) {
// TODO: Mark this as a skipped test instead of a successful test
TestErrorCodes(status2, status2, args);
continue;
}
#endif
// Runs the reference BLAS code
auto x_vec1 = Buffer<T>(context_, args.x_size);
auto y_vec1 = Buffer<T>(context_, args.y_size);
auto a_mat1 = Buffer<T>(context_, args.a_size);
auto b_mat1 = Buffer<T>(context_, args.b_size);
auto c_mat1 = Buffer<T>(context_, args.c_size);
auto ap_mat1 = Buffer<T>(context_, args.ap_size);
auto scalar1 = Buffer<T>(context_, args.scalar_size);
x_vec1.Write(queue_, args.x_size, x_source_);
y_vec1.Write(queue_, args.y_size, y_source_);
a_mat1.Write(queue_, args.a_size, a_source_);
b_mat1.Write(queue_, args.b_size, b_source_);
c_mat1.Write(queue_, args.c_size, c_source_);
ap_mat1.Write(queue_, args.ap_size, ap_source_);
scalar1.Write(queue_, args.scalar_size, scalar_source_);
auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1};
auto status1 = run_reference_(args, buffers1, queue_);
// Tests for equality of the two status codes
if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
TestErrorCodes(status1, status2, args);

View file

@ -68,7 +68,7 @@ class TestBlas: public Tester<T,U> {
static const std::vector<Transpose> kTransposes; // Data-type dependent, see .cc-file
// Shorthand for the routine-specific functions passed to the tester
using Routine = std::function<StatusCode(const Arguments<U>&, const Buffers<T>&, Queue&)>;
using Routine = std::function<StatusCode(const Arguments<U>&, Buffers<T>&, Queue&)>;
using ResultGet = std::function<std::vector<T>(const Arguments<U>&, Buffers<T>&, Queue&)>;
using ResultIndex = std::function<size_t(const Arguments<U>&, const size_t, const size_t)>;
using ResultIterator = std::function<size_t(const Arguments<U>&)>;
@ -76,8 +76,9 @@ class TestBlas: public Tester<T,U> {
// Constructor, initializes the base class tester and input data
TestBlas(int argc, char *argv[], const bool silent,
const std::string &name, const std::vector<std::string> &options,
const Routine run_routine, const Routine run_reference, const ResultGet get_result,
const ResultIndex get_index, const ResultIterator get_id1, const ResultIterator get_id2);
const Routine run_routine, const Routine run_reference,
const ResultGet get_result, const ResultIndex get_index,
const ResultIterator get_id1, const ResultIterator get_id2);
// The test functions, taking no inputs
void TestRegular(std::vector<Arguments<U>> &test_vector, const std::string &name);
@ -110,9 +111,17 @@ class TestBlas: public Tester<T,U> {
template <typename C, typename T, typename U>
void RunTests(int argc, char *argv[], const bool silent, const std::string &name) {
// Sets the reference to test against
#ifdef CLBLAST_REF_CLBLAS
const auto reference_routine = C::RunReference1; // clBLAS when available
#else
const auto reference_routine = C::RunReference2; // otherwise CBLAS
#endif
// Creates a tester
auto options = C::GetOptions();
TestBlas<T,U> tester{argc, argv, silent, name, options, C::RunRoutine, C::RunReference,
TestBlas<T,U> tester{argc, argv, silent, name, options,
C::RunRoutine, reference_routine,
C::DownloadResult, C::GetResultIndex, C::ResultID1, C::ResultID2};
// This variable holds the arguments relevant for this routine
@ -250,23 +259,25 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
}
// Creates the arguments vector for the invalid-buffer tests
auto invalid_test_vector = std::vector<Arguments<U>>{};
auto i_args = args;
i_args.m = i_args.n = i_args.k = i_args.kl = i_args.ku = tester.kBufferSize;
i_args.a_ld = i_args.b_ld = i_args.c_ld = tester.kBufferSize;
for (auto &x_size: x_sizes) { i_args.x_size = x_size;
for (auto &y_size: y_sizes) { i_args.y_size = y_size;
for (auto &a_size: a_sizes) { i_args.a_size = a_size;
for (auto &b_size: b_sizes) { i_args.b_size = b_size;
for (auto &c_size: c_sizes) { i_args.c_size = c_size;
for (auto &ap_size: ap_sizes) { i_args.ap_size = ap_size;
invalid_test_vector.push_back(i_args);
#ifdef CLBLAST_REF_CLBLAS
auto invalid_test_vector = std::vector<Arguments<U>>{};
auto i_args = args;
i_args.m = i_args.n = i_args.k = i_args.kl = i_args.ku = tester.kBufferSize;
i_args.a_ld = i_args.b_ld = i_args.c_ld = tester.kBufferSize;
for (auto &x_size: x_sizes) { i_args.x_size = x_size;
for (auto &y_size: y_sizes) { i_args.y_size = y_size;
for (auto &a_size: a_sizes) { i_args.a_size = a_size;
for (auto &b_size: b_sizes) { i_args.b_size = b_size;
for (auto &c_size: c_sizes) { i_args.c_size = c_size;
for (auto &ap_size: ap_sizes) { i_args.ap_size = ap_size;
invalid_test_vector.push_back(i_args);
}
}
}
}
}
}
}
#endif
// Sets the name of this test-case
auto names = std::vector<std::string>{};
@ -287,7 +298,9 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
// Runs the tests
tester.TestRegular(regular_test_vector, case_name);
tester.TestInvalid(invalid_test_vector, case_name);
#ifdef CLBLAST_REF_CLBLAS
tester.TestInvalid(invalid_test_vector, case_name);
#endif
}
}
}

View file

@ -69,10 +69,12 @@ Tester<T,U>::Tester(int argc, char *argv[], const bool silent,
kUnsupportedPrecision.c_str());
// Initializes clBLAS
auto status = clblasSetup();
if (status != CL_SUCCESS) {
throw std::runtime_error("clBLAS setup error: "+ToString(static_cast<int>(status)));
}
#ifdef CLBLAST_REF_CLBLAS
auto status = clblasSetup();
if (status != CL_SUCCESS) {
throw std::runtime_error("clBLAS setup error: "+ToString(static_cast<int>(status)));
}
#endif
}
// Destructor prints the summary of the test cases and cleans-up the clBLAS library
@ -87,7 +89,11 @@ Tester<T,U>::~Tester() {
fprintf(stdout, " %zu test(s) failed%s\n", tests_failed_, kPrintEnd.c_str());
}
fprintf(stdout, "\n");
clblasTeardown();
// Cleans-up clBLAS
#ifdef CLBLAST_REF_CLBLAS
clblasTeardown();
#endif
}
// =================================================================================================

View file

@ -23,7 +23,9 @@
#include <memory>
// The libraries
#include <clBLAS.h>
#ifdef CLBLAST_REF_CLBLAS
#include <clBLAS.h>
#endif
#include "clblast.h"
#include "internal/utilities.h"
@ -92,7 +94,7 @@ class Tester {
Queue queue_;
// Whether or not to run the full test-suite or just a smoke test
bool full_test_;
const bool full_test_;
// Retrieves the offset values to test with
const std::vector<size_t> GetOffsets() const;

View file

@ -24,11 +24,13 @@ namespace clblast {
// Constructor
template <typename T, typename U>
Client<T,U>::Client(const Routine run_routine, const Routine run_reference,
Client<T,U>::Client(const Routine run_routine,
const Routine run_reference1, const Routine run_reference2,
const std::vector<std::string> &options,
const GetMetric get_flops, const GetMetric get_bytes):
run_routine_(run_routine),
run_reference_(run_reference),
run_reference1_(run_reference1),
run_reference2_(run_reference2),
options_(options),
get_flops_(get_flops),
get_bytes_(get_bytes) {
@ -90,7 +92,16 @@ Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const GetMetric
args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0});
args.device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0});
args.precision = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle);
args.compare_clblas = GetArgument(argc, argv, help, kArgCompareclblas, 1);
#ifdef CLBLAST_REF_CLBLAS
args.compare_clblas = GetArgument(argc, argv, help, kArgCompareclblas, 1);
#else
args.compare_clblas = 0;
#endif
#ifdef CLBLAST_REF_CBLAS
args.compare_cblas = GetArgument(argc, argv, help, kArgComparecblas, 1);
#else
args.compare_cblas = 0;
#endif
args.step = GetArgument(argc, argv, help, kArgStepSize, size_t{1});
args.num_steps = GetArgument(argc, argv, help, kArgNumSteps, size_t{0});
args.num_runs = GetArgument(argc, argv, help, kArgNumRuns, size_t{10});
@ -120,7 +131,9 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
auto device = Device(platform, args.device_id);
auto context = Context(device);
auto queue = Queue(context, device);
if (args.compare_clblas) { clblasSetup(); }
#ifdef CLBLAST_REF_CLBLAS
if (args.compare_clblas) { clblasSetup(); }
#endif
// Iterates over all "num_step" values jumping by "step" each time
auto s = size_t{0};
@ -167,9 +180,13 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
auto ms_clblast = TimedExecution(args.num_runs, args, buffers, queue, run_routine_, "CLBlast");
timings.push_back(std::pair<std::string, double>("CLBlast", ms_clblast));
if (args.compare_clblas) {
auto ms_clblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference_, "clBLAS");
auto ms_clblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference1_, "clBLAS");
timings.push_back(std::pair<std::string, double>("clBLAS", ms_clblas));
}
if (args.compare_cblas) {
auto ms_cblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference2_, "CPU BLAS");
timings.push_back(std::pair<std::string, double>("CPU BLAS", ms_cblas));
}
// Prints the performance of the tested libraries
PrintTableRow(args, timings);
@ -186,7 +203,9 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
}
// Cleans-up and returns
if (args.compare_clblas) { clblasTeardown(); }
#ifdef CLBLAST_REF_CLBLAS
if (args.compare_clblas) { clblasTeardown(); }
#endif
}
// =================================================================================================
@ -196,14 +215,17 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
// value found in the vector of timing results. The return value is in milliseconds.
template <typename T, typename U>
double Client<T,U>::TimedExecution(const size_t num_runs, const Arguments<U> &args,
const Buffers<T> &buffers, Queue &queue,
Buffers<T> &buffers, Queue &queue,
Routine run_blas, const std::string &library_name) {
auto timings = std::vector<double>(num_runs);
for (auto &timing: timings) {
auto start_time = std::chrono::steady_clock::now();
// Executes the main computation
auto status = run_blas(args, buffers, queue);
auto status = StatusCode::kSuccess;
try {
status = run_blas(args, buffers, queue);
} catch (...) { status = static_cast<StatusCode>(kUnknownError); }
if (status != StatusCode::kSuccess) {
throw std::runtime_error(library_name+" error: "+ToString(static_cast<int>(status)));
}
@ -226,6 +248,7 @@ void Client<T,U>::PrintTableHeader(const Arguments<U>& args) {
for (auto i=size_t{0}; i<options_.size(); ++i) { fprintf(stdout, "%9s ", ""); }
fprintf(stdout, " | <-- CLBlast -->");
if (args.compare_clblas) { fprintf(stdout, " | <-- clBLAS -->"); }
if (args.compare_cblas) { fprintf(stdout, " | <-- CPU BLAS -->"); }
fprintf(stdout, " |\n");
}
@ -233,6 +256,7 @@ void Client<T,U>::PrintTableHeader(const Arguments<U>& args) {
for (auto &option: options_) { fprintf(stdout, "%9s;", option.c_str()); }
fprintf(stdout, "%9s;%9s;%9s", "ms_1", "GFLOPS_1", "GBs_1");
if (args.compare_clblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_2", "GFLOPS_2", "GBs_2"); }
if (args.compare_cblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_3", "GFLOPS_3", "GBs_3"); }
fprintf(stdout, "\n");
}

View file

@ -26,7 +26,9 @@
#include <utility>
// The libraries to test
#include <clBLAS.h>
#ifdef CLBLAST_REF_CLBLAS
#include <clBLAS.h>
#endif
#include "clblast.h"
#include "internal/utilities.h"
@ -40,12 +42,12 @@ class Client {
public:
// Shorthand for the routine-specific functions passed to the tester
using Routine = std::function<StatusCode(const Arguments<U>&, const Buffers<T>&, Queue&)>;
using Routine = std::function<StatusCode(const Arguments<U>&, Buffers<T>&, Queue&)>;
using SetMetric = std::function<void(Arguments<U>&)>;
using GetMetric = std::function<size_t(const Arguments<U>&)>;
// The constructor
Client(const Routine run_routine, const Routine run_reference,
Client(const Routine run_routine, const Routine run_reference1, const Routine run_reference2,
const std::vector<std::string> &options,
const GetMetric get_flops, const GetMetric get_bytes);
@ -61,7 +63,7 @@ class Client {
private:
// Runs a function a given number of times and returns the execution time of the shortest instance
double TimedExecution(const size_t num_runs, const Arguments<U> &args, const Buffers<T> &buffers,
double TimedExecution(const size_t num_runs, const Arguments<U> &args, Buffers<T> &buffers,
Queue &queue, Routine run_blas, const std::string &library_name);
// Prints the header of a performance-data table
@ -73,7 +75,8 @@ class Client {
// The routine-specific functions passed to the tester
const Routine run_routine_;
const Routine run_reference_;
const Routine run_reference1_;
const Routine run_reference2_;
const std::vector<std::string> options_;
const GetMetric get_flops_;
const GetMetric get_bytes_;
@ -81,13 +84,31 @@ class Client {
// =================================================================================================
// Bogus reference function, in case a comparison library is not available. It ignores all three
// of its (unnamed) arguments and always returns StatusCode::kNotImplemented. The signature takes
// the same argument types as the other reference functions, so it can be used in their place.
template <typename T, typename U>
static StatusCode ReferenceNotAvailable(const Arguments<U> &, Buffers<T> &, Queue &) {
return StatusCode::kNotImplemented;
}
// The interface to the performance client. This is a separate function in the header such that it
// is automatically compiled for each routine, templated by the parameter "C".
template <typename C, typename T, typename U>
void RunClient(int argc, char *argv[]) {
// Sets the reference to test against
#ifdef CLBLAST_REF_CLBLAS
const auto reference1 = C::RunReference1; // clBLAS when available
#else
const auto reference1 = ReferenceNotAvailable<T,U>;
#endif
#ifdef CLBLAST_REF_CBLAS
const auto reference2 = C::RunReference2; // CBLAS when available
#else
const auto reference2 = ReferenceNotAvailable<T,U>;
#endif
// Creates a new client
auto client = Client<T,U>(C::RunRoutine, C::RunReference, C::GetOptions(),
auto client = Client<T,U>(C::RunRoutine, reference1, reference2, C::GetOptions(),
C::GetFlops, C::GetBytes);
// Simple command line argument parser with defaults

View file

@ -19,7 +19,12 @@
#include <vector>
#include <string>
#include "wrapper_clblas.h"
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
@ -65,7 +70,7 @@ class TestXaxpy {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Axpy(args.n, args.alpha,
@ -77,16 +82,33 @@ class TestXaxpy {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = clblasXaxpy(args.n, args.alpha,
buffers.x_vec(), args.x_offset, args.x_inc,
buffers.y_vec(), args.y_offset, args.y_inc,
1, &queue_plain, 0, nullptr, &event);
clWaitForEvents(1, &event);
return static_cast<StatusCode>(status);
}
#ifdef CLBLAST_REF_CLBLAS
// Runs the clBLAS AXPY wrapper on the x and y device buffers using the plain OpenCL queue, waits
// for completion, and returns the clBLAS status converted to a CLBlast StatusCode.
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXaxpy(args.n, args.alpha,
                            buffers.x_vec(), args.x_offset, args.x_inc,
                            buffers.y_vec(), args.y_offset, args.y_inc,
                            1, &queue_plain, 0, nullptr, &event);
  // The event starts out null; only wait on it when the call actually filled it in, and release
  // it afterwards (it was previously leaked on every invocation).
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
// CPU BLAS reference for AXPY: downloads the x and y device buffers into host vectors, calls
// cblasXaxpy on those vectors, and uploads the y vector to the device again.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

  // Host-side copies of the device data
  auto host_x = std::vector<T>(args.x_size, static_cast<T>(0));
  auto host_y = std::vector<T>(args.y_size, static_cast<T>(0));
  buffers.x_vec.Read(queue, args.x_size, host_x);
  buffers.y_vec.Read(queue, args.y_size, host_y);

  // The computation itself, on the host
  cblasXaxpy(args.n, args.alpha,
             host_x, args.x_offset, args.x_inc,
             host_y, args.y_offset, args.y_inc);

  // Only the y vector is written back
  buffers.y_vec.Write(queue, args.y_size, host_y);
  return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

View file

@ -19,7 +19,12 @@
#include <vector>
#include <string>
#include "wrapper_clblas.h"
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
@ -64,7 +69,7 @@ class TestXcopy {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Copy<T>(args.n,
@ -76,16 +81,33 @@ class TestXcopy {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = clblasXcopy<T>(args.n,
buffers.x_vec(), args.x_offset, args.x_inc,
buffers.y_vec(), args.y_offset, args.y_inc,
1, &queue_plain, 0, nullptr, &event);
clWaitForEvents(1, &event);
return static_cast<StatusCode>(status);
}
#ifdef CLBLAST_REF_CLBLAS
// Runs the clBLAS COPY wrapper on the x and y device buffers using the plain OpenCL queue, waits
// for completion, and returns the clBLAS status converted to a CLBlast StatusCode.
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXcopy<T>(args.n,
                               buffers.x_vec(), args.x_offset, args.x_inc,
                               buffers.y_vec(), args.y_offset, args.y_inc,
                               1, &queue_plain, 0, nullptr, &event);
  // The event starts out null; only wait on it when the call actually filled it in, and release
  // it afterwards (it was previously leaked on every invocation).
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
// CPU BLAS reference for COPY: downloads the x and y device buffers into host vectors, calls
// cblasXcopy on those vectors, and uploads the y vector to the device again.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

  // Host-side copies of the device data
  auto host_x = std::vector<T>(args.x_size, static_cast<T>(0));
  auto host_y = std::vector<T>(args.y_size, static_cast<T>(0));
  buffers.x_vec.Read(queue, args.x_size, host_x);
  buffers.y_vec.Read(queue, args.y_size, host_y);

  // The computation itself, on the host
  cblasXcopy(args.n,
             host_x, args.x_offset, args.x_inc,
             host_y, args.y_offset, args.y_inc);

  // Only the y vector is written back
  buffers.y_vec.Write(queue, args.y_size, host_y);
  return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

View file

@ -19,7 +19,12 @@
#include <vector>
#include <string>
#include "wrapper_clblas.h"
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
@ -68,7 +73,7 @@ class TestXdot {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Dot<T>(args.n,
@ -81,17 +86,37 @@ class TestXdot {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = clblasXdot<T>(args.n,
buffers.scalar(), args.dot_offset,
buffers.x_vec(), args.x_offset, args.x_inc,
buffers.y_vec(), args.y_offset, args.y_inc,
1, &queue_plain, 0, nullptr, &event);
clWaitForEvents(1, &event);
return static_cast<StatusCode>(status);
}
#ifdef CLBLAST_REF_CLBLAS
// Runs the clBLAS DOT wrapper on the scalar, x and y device buffers using the plain OpenCL queue,
// waits for completion, and returns the clBLAS status converted to a CLBlast StatusCode.
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXdot<T>(args.n,
                              buffers.scalar(), args.dot_offset,
                              buffers.x_vec(), args.x_offset, args.x_inc,
                              buffers.y_vec(), args.y_offset, args.y_inc,
                              1, &queue_plain, 0, nullptr, &event);
  // The event starts out null; only wait on it when the call actually filled it in, and release
  // it afterwards (it was previously leaked on every invocation).
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
// CPU BLAS reference for DOT: downloads the scalar, x and y device buffers into host vectors,
// calls cblasXdot on those vectors, and uploads the scalar buffer to the device again.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

  // Host-side copies of the device data
  auto host_scalar = std::vector<T>(args.scalar_size, static_cast<T>(0));
  auto host_x = std::vector<T>(args.x_size, static_cast<T>(0));
  auto host_y = std::vector<T>(args.y_size, static_cast<T>(0));
  buffers.scalar.Read(queue, args.scalar_size, host_scalar);
  buffers.x_vec.Read(queue, args.x_size, host_x);
  buffers.y_vec.Read(queue, args.y_size, host_y);

  // The computation itself, on the host
  cblasXdot(args.n,
            host_scalar, args.dot_offset,
            host_x, args.x_offset, args.x_inc,
            host_y, args.y_offset, args.y_inc);

  // Only the scalar buffer is written back
  buffers.scalar.Write(queue, args.scalar_size, host_scalar);
  return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

View file

@ -19,7 +19,12 @@
#include <vector>
#include <string>
#include "wrapper_clblas.h"
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
@ -68,7 +73,7 @@ class TestXdotc {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Dotc<T>(args.n,
@ -81,17 +86,37 @@ class TestXdotc {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = clblasXdotc<T>(args.n,
buffers.scalar(), args.dot_offset,
buffers.x_vec(), args.x_offset, args.x_inc,
buffers.y_vec(), args.y_offset, args.y_inc,
1, &queue_plain, 0, nullptr, &event);
clWaitForEvents(1, &event);
return static_cast<StatusCode>(status);
}
#ifdef CLBLAST_REF_CLBLAS
// Runs the clBLAS DOTC wrapper on the scalar, x and y device buffers using the plain OpenCL
// queue, waits for completion, and returns the clBLAS status converted to a CLBlast StatusCode.
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXdotc<T>(args.n,
                               buffers.scalar(), args.dot_offset,
                               buffers.x_vec(), args.x_offset, args.x_inc,
                               buffers.y_vec(), args.y_offset, args.y_inc,
                               1, &queue_plain, 0, nullptr, &event);
  // The event starts out null; only wait on it when the call actually filled it in, and release
  // it afterwards (it was previously leaked on every invocation).
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
// CPU BLAS reference for DOTC: downloads the scalar, x and y device buffers into host vectors,
// calls cblasXdotc on those vectors, and uploads the scalar buffer to the device again.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

  // Host-side copies of the device data
  auto host_scalar = std::vector<T>(args.scalar_size, static_cast<T>(0));
  auto host_x = std::vector<T>(args.x_size, static_cast<T>(0));
  auto host_y = std::vector<T>(args.y_size, static_cast<T>(0));
  buffers.scalar.Read(queue, args.scalar_size, host_scalar);
  buffers.x_vec.Read(queue, args.x_size, host_x);
  buffers.y_vec.Read(queue, args.y_size, host_y);

  // The computation itself, on the host
  cblasXdotc(args.n,
             host_scalar, args.dot_offset,
             host_x, args.x_offset, args.x_inc,
             host_y, args.y_offset, args.y_inc);

  // Only the scalar buffer is written back
  buffers.scalar.Write(queue, args.scalar_size, host_scalar);
  return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

View file

@ -19,7 +19,12 @@
#include <vector>
#include <string>
#include "wrapper_clblas.h"
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
@ -68,7 +73,7 @@ class TestXdotu {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Dotu<T>(args.n,
@ -81,17 +86,37 @@ class TestXdotu {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = clblasXdotu<T>(args.n,
buffers.scalar(), args.dot_offset,
buffers.x_vec(), args.x_offset, args.x_inc,
buffers.y_vec(), args.y_offset, args.y_inc,
1, &queue_plain, 0, nullptr, &event);
clWaitForEvents(1, &event);
return static_cast<StatusCode>(status);
}
#ifdef CLBLAST_REF_CLBLAS
// Runs the clBLAS DOTU wrapper on the scalar, x and y device buffers using the plain OpenCL
// queue, waits for completion, and returns the clBLAS status converted to a CLBlast StatusCode.
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXdotu<T>(args.n,
                               buffers.scalar(), args.dot_offset,
                               buffers.x_vec(), args.x_offset, args.x_inc,
                               buffers.y_vec(), args.y_offset, args.y_inc,
                               1, &queue_plain, 0, nullptr, &event);
  // The event starts out null; only wait on it when the call actually filled it in, and release
  // it afterwards (it was previously leaked on every invocation).
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
// CPU BLAS reference for DOTU: downloads the scalar, x and y device buffers into host vectors,
// calls cblasXdotu on those vectors, and uploads the scalar buffer to the device again.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

  // Host-side copies of the device data
  auto host_scalar = std::vector<T>(args.scalar_size, static_cast<T>(0));
  auto host_x = std::vector<T>(args.x_size, static_cast<T>(0));
  auto host_y = std::vector<T>(args.y_size, static_cast<T>(0));
  buffers.scalar.Read(queue, args.scalar_size, host_scalar);
  buffers.x_vec.Read(queue, args.x_size, host_x);
  buffers.y_vec.Read(queue, args.y_size, host_y);

  // The computation itself, on the host
  cblasXdotu(args.n,
             host_scalar, args.dot_offset,
             host_x, args.x_offset, args.x_inc,
             host_y, args.y_offset, args.y_inc);

  // Only the scalar buffer is written back
  buffers.scalar.Write(queue, args.scalar_size, host_scalar);
  return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

View file

@ -19,7 +19,12 @@
#include <vector>
#include <string>
#include "wrapper_clblas.h"
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
@ -64,7 +69,7 @@ class TestXnrm2 {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Nrm2<T>(args.n,
@ -76,16 +81,33 @@ class TestXnrm2 {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = clblasXnrm2<T>(args.n,
buffers.scalar(), args.nrm2_offset,
buffers.x_vec(), args.x_offset, args.x_inc,
1, &queue_plain, 0, nullptr, &event);
clWaitForEvents(1, &event);
return static_cast<StatusCode>(status);
}
#ifdef CLBLAST_REF_CLBLAS
// Runs the clBLAS NRM2 wrapper on the scalar and x device buffers using the plain OpenCL queue,
// waits for completion, and returns the clBLAS status converted to a CLBlast StatusCode.
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXnrm2<T>(args.n,
                               buffers.scalar(), args.nrm2_offset,
                               buffers.x_vec(), args.x_offset, args.x_inc,
                               1, &queue_plain, 0, nullptr, &event);
  // The event starts out null; only wait on it when the call actually filled it in, and release
  // it afterwards (it was previously leaked on every invocation).
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
// CPU BLAS reference for NRM2: downloads the scalar and x device buffers into host vectors,
// calls cblasXnrm2 on those vectors, and uploads the scalar buffer to the device again.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

  // Host-side copies of the device data
  auto host_scalar = std::vector<T>(args.scalar_size, static_cast<T>(0));
  auto host_x = std::vector<T>(args.x_size, static_cast<T>(0));
  buffers.scalar.Read(queue, args.scalar_size, host_scalar);
  buffers.x_vec.Read(queue, args.x_size, host_x);

  // The computation itself, on the host
  cblasXnrm2(args.n,
             host_scalar, args.nrm2_offset,
             host_x, args.x_offset, args.x_inc);

  // Only the scalar buffer is written back
  buffers.scalar.Write(queue, args.scalar_size, host_scalar);
  return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

View file

@ -19,7 +19,12 @@
#include <vector>
#include <string>
#include "wrapper_clblas.h"
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
@ -61,7 +66,7 @@ class TestXscal {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Scal(args.n, args.alpha,
@ -72,15 +77,29 @@ class TestXscal {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = clblasXscal(args.n, args.alpha,
buffers.x_vec(), args.x_offset, args.x_inc,
1, &queue_plain, 0, nullptr, &event);
clWaitForEvents(1, &event);
return static_cast<StatusCode>(status);
}
#ifdef CLBLAST_REF_CLBLAS
// Runs the clBLAS SCAL wrapper on the x device buffer using the plain OpenCL queue, waits for
// completion, and returns the clBLAS status converted to a CLBlast StatusCode.
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXscal(args.n, args.alpha,
                            buffers.x_vec(), args.x_offset, args.x_inc,
                            1, &queue_plain, 0, nullptr, &event);
  // The event starts out null; only wait on it when the call actually filled it in, and release
  // it afterwards (it was previously leaked on every invocation).
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
// CPU BLAS reference for SCAL: downloads the x device buffer into a host vector, calls
// cblasXscal on that vector, and uploads it to the device again.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

  // Host-side copy of the device data
  auto host_x = std::vector<T>(args.x_size, static_cast<T>(0));
  buffers.x_vec.Read(queue, args.x_size, host_x);

  // The computation itself, on the host
  cblasXscal(args.n, args.alpha,
             host_x, args.x_offset, args.x_inc);

  // The x vector is written back
  buffers.x_vec.Write(queue, args.x_size, host_x);
  return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

View file

@ -19,7 +19,12 @@
#include <vector>
#include <string>
#include "wrapper_clblas.h"
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
@ -64,7 +69,7 @@ class TestXswap {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Swap<T>(args.n,
@ -76,16 +81,34 @@ class TestXswap {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = clblasXswap<T>(args.n,
buffers.x_vec(), args.x_offset, args.x_inc,
buffers.y_vec(), args.y_offset, args.y_inc,
1, &queue_plain, 0, nullptr, &event);
clWaitForEvents(1, &event);
return static_cast<StatusCode>(status);
}
#ifdef CLBLAST_REF_CLBLAS
// Runs the clBLAS SWAP wrapper on the x and y device buffers using the plain OpenCL queue, waits
// for completion, and returns the clBLAS status converted to a CLBlast StatusCode.
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXswap<T>(args.n,
                               buffers.x_vec(), args.x_offset, args.x_inc,
                               buffers.y_vec(), args.y_offset, args.y_inc,
                               1, &queue_plain, 0, nullptr, &event);
  // The event starts out null; only wait on it when the call actually filled it in, and release
  // it afterwards (it was previously leaked on every invocation).
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
// CPU BLAS reference for SWAP: downloads the x and y device buffers into host vectors, calls
// cblasXswap on those vectors, and uploads both vectors to the device again.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

  // Host-side copies of the device data
  auto host_x = std::vector<T>(args.x_size, static_cast<T>(0));
  auto host_y = std::vector<T>(args.y_size, static_cast<T>(0));
  buffers.x_vec.Read(queue, args.x_size, host_x);
  buffers.y_vec.Read(queue, args.y_size, host_y);

  // The computation itself, on the host
  cblasXswap(args.n,
             host_x, args.x_offset, args.x_inc,
             host_y, args.y_offset, args.y_inc);

  // Both vectors are written back, x first
  buffers.x_vec.Write(queue, args.x_size, host_x);
  buffers.y_vec.Write(queue, args.y_size, host_y);
  return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

View file

@ -19,7 +19,12 @@
#include <vector>
#include <string>
#include "wrapper_clblas.h"
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
@ -76,7 +81,7 @@ class TestXgbmv {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Gbmv(args.layout, args.a_transpose,
@ -90,19 +95,41 @@ class TestXgbmv {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
// Runs the clBLAS GBMV reference and waits for it to finish; returns the converted status.
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXgbmv(static_cast<clblasOrder>(args.layout),
                            static_cast<clblasTranspose>(args.a_transpose),
                            args.m, args.n, args.kl, args.ku, args.alpha,
                            buffers.a_mat(), args.a_offset, args.a_ld,
                            buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                            buffers.y_vec(), args.y_offset, args.y_inc,
                            1, &queue_plain, 0, nullptr, &event);
  // Only wait when an event was produced, and release it so that it is not leaked
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#ifdef CLBLAST_REF_CLBLAS
// Runs the clBLAS GBMV reference on the device buffers and waits for it to finish.
// Returns the clBLAS status converted to a StatusCode.
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXgbmv(static_cast<clblasOrder>(args.layout),
                            static_cast<clblasTranspose>(args.a_transpose),
                            args.m, args.n, args.kl, args.ku, args.alpha,
                            buffers.a_mat(), args.a_offset, args.a_ld,
                            buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                            buffers.y_vec(), args.y_offset, args.y_inc,
                            1, &queue_plain, 0, nullptr, &event);
  // Only wait when an event was produced, and release it so that it is not leaked
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
// CPU BLAS reference for GBMV: copies A, x and y to the host, runs the CBLAS wrapper, and
// copies the y vector back into its device buffer.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  const auto zero = static_cast<T>(0);
  auto host_a = std::vector<T>(args.a_size, zero);
  auto host_x = std::vector<T>(args.x_size, zero);
  auto host_y = std::vector<T>(args.y_size, zero);
  buffers.a_mat.Read(queue, args.a_size, host_a);
  buffers.x_vec.Read(queue, args.x_size, host_x);
  buffers.y_vec.Read(queue, args.y_size, host_y);
  cblasXgbmv(convertToCBLAS(args.layout),
             convertToCBLAS(args.a_transpose),
             args.m, args.n, args.kl, args.ku, args.alpha,
             host_a, args.a_offset, args.a_ld,
             host_x, args.x_offset, args.x_inc, args.beta,
             host_y, args.y_offset, args.y_inc);
  // Only y is written back to the device
  buffers.y_vec.Write(queue, args.y_size, host_y);
  return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

View file

@ -19,7 +19,12 @@
#include <vector>
#include <string>
#include "wrapper_clblas.h"
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
@ -76,7 +81,7 @@ class TestXgemv {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Gemv(args.layout, args.a_transpose,
@ -90,19 +95,41 @@ class TestXgemv {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
// Runs the clBLAS GEMV reference and waits for it to finish; returns the converted status.
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXgemv(static_cast<clblasOrder>(args.layout),
                            static_cast<clblasTranspose>(args.a_transpose),
                            args.m, args.n, args.alpha,
                            buffers.a_mat(), args.a_offset, args.a_ld,
                            buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                            buffers.y_vec(), args.y_offset, args.y_inc,
                            1, &queue_plain, 0, nullptr, &event);
  // Only wait when an event was produced, and release it so that it is not leaked
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#ifdef CLBLAST_REF_CLBLAS
// Runs the clBLAS GEMV reference on the device buffers and waits for it to finish.
// Returns the clBLAS status converted to a StatusCode.
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXgemv(static_cast<clblasOrder>(args.layout),
                            static_cast<clblasTranspose>(args.a_transpose),
                            args.m, args.n, args.alpha,
                            buffers.a_mat(), args.a_offset, args.a_ld,
                            buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                            buffers.y_vec(), args.y_offset, args.y_inc,
                            1, &queue_plain, 0, nullptr, &event);
  // Only wait when an event was produced, and release it so that it is not leaked
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
// CPU BLAS reference for GEMV: copies A, x and y to the host, runs the CBLAS wrapper, and
// copies the y vector back into its device buffer.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  const auto zero = static_cast<T>(0);
  auto host_a = std::vector<T>(args.a_size, zero);
  auto host_x = std::vector<T>(args.x_size, zero);
  auto host_y = std::vector<T>(args.y_size, zero);
  buffers.a_mat.Read(queue, args.a_size, host_a);
  buffers.x_vec.Read(queue, args.x_size, host_x);
  buffers.y_vec.Read(queue, args.y_size, host_y);
  cblasXgemv(convertToCBLAS(args.layout),
             convertToCBLAS(args.a_transpose),
             args.m, args.n, args.alpha,
             host_a, args.a_offset, args.a_ld,
             host_x, args.x_offset, args.x_inc, args.beta,
             host_y, args.y_offset, args.y_inc);
  // Only y is written back to the device
  buffers.y_vec.Write(queue, args.y_size, host_y);
  return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

View file

@ -19,7 +19,12 @@
#include <vector>
#include <string>
#include "wrapper_clblas.h"
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
@ -72,7 +77,7 @@ class TestXger {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Ger(args.layout,
@ -86,18 +91,39 @@ class TestXger {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
// Runs the clBLAS GER reference and waits for it to finish; returns the converted status.
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXger(static_cast<clblasOrder>(args.layout),
                           args.m, args.n, args.alpha,
                           buffers.x_vec(), args.x_offset, args.x_inc,
                           buffers.y_vec(), args.y_offset, args.y_inc,
                           buffers.a_mat(), args.a_offset, args.a_ld,
                           1, &queue_plain, 0, nullptr, &event);
  // Only wait when an event was produced, and release it so that it is not leaked
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#ifdef CLBLAST_REF_CLBLAS
// Runs the clBLAS GER reference on the device buffers and waits for it to finish.
// Returns the clBLAS status converted to a StatusCode.
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXger(static_cast<clblasOrder>(args.layout),
                           args.m, args.n, args.alpha,
                           buffers.x_vec(), args.x_offset, args.x_inc,
                           buffers.y_vec(), args.y_offset, args.y_inc,
                           buffers.a_mat(), args.a_offset, args.a_ld,
                           1, &queue_plain, 0, nullptr, &event);
  // Only wait when an event was produced, and release it so that it is not leaked
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
// CPU BLAS reference for GER: copies A, x and y to the host, runs the CBLAS wrapper, and
// copies the A matrix back into its device buffer.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  const auto zero = static_cast<T>(0);
  auto host_a = std::vector<T>(args.a_size, zero);
  auto host_x = std::vector<T>(args.x_size, zero);
  auto host_y = std::vector<T>(args.y_size, zero);
  buffers.a_mat.Read(queue, args.a_size, host_a);
  buffers.x_vec.Read(queue, args.x_size, host_x);
  buffers.y_vec.Read(queue, args.y_size, host_y);
  cblasXger(convertToCBLAS(args.layout),
            args.m, args.n, args.alpha,
            host_x, args.x_offset, args.x_inc,
            host_y, args.y_offset, args.y_inc,
            host_a, args.a_offset, args.a_ld);
  // Only A is written back to the device
  buffers.a_mat.Write(queue, args.a_size, host_a);
  return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

View file

@ -19,7 +19,12 @@
#include <vector>
#include <string>
#include "wrapper_clblas.h"
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
@ -72,7 +77,7 @@ class TestXgerc {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Gerc(args.layout,
@ -86,18 +91,39 @@ class TestXgerc {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
// Runs the clBLAS GERC reference and waits for it to finish; returns the converted status.
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXgerc(static_cast<clblasOrder>(args.layout),
                            args.m, args.n, args.alpha,
                            buffers.x_vec(), args.x_offset, args.x_inc,
                            buffers.y_vec(), args.y_offset, args.y_inc,
                            buffers.a_mat(), args.a_offset, args.a_ld,
                            1, &queue_plain, 0, nullptr, &event);
  // Only wait when an event was produced, and release it so that it is not leaked
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#ifdef CLBLAST_REF_CLBLAS
// Runs the clBLAS GERC reference on the device buffers and waits for it to finish.
// Returns the clBLAS status converted to a StatusCode.
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXgerc(static_cast<clblasOrder>(args.layout),
                            args.m, args.n, args.alpha,
                            buffers.x_vec(), args.x_offset, args.x_inc,
                            buffers.y_vec(), args.y_offset, args.y_inc,
                            buffers.a_mat(), args.a_offset, args.a_ld,
                            1, &queue_plain, 0, nullptr, &event);
  // Only wait when an event was produced, and release it so that it is not leaked
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
// CPU BLAS reference for GERC: copies A, x and y to the host, runs the CBLAS wrapper, and
// copies the A matrix back into its device buffer.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  const auto zero = static_cast<T>(0);
  auto host_a = std::vector<T>(args.a_size, zero);
  auto host_x = std::vector<T>(args.x_size, zero);
  auto host_y = std::vector<T>(args.y_size, zero);
  buffers.a_mat.Read(queue, args.a_size, host_a);
  buffers.x_vec.Read(queue, args.x_size, host_x);
  buffers.y_vec.Read(queue, args.y_size, host_y);
  cblasXgerc(convertToCBLAS(args.layout),
             args.m, args.n, args.alpha,
             host_x, args.x_offset, args.x_inc,
             host_y, args.y_offset, args.y_inc,
             host_a, args.a_offset, args.a_ld);
  // Only A is written back to the device
  buffers.a_mat.Write(queue, args.a_size, host_a);
  return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

View file

@ -19,7 +19,12 @@
#include <vector>
#include <string>
#include "wrapper_clblas.h"
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
@ -72,7 +77,7 @@ class TestXgeru {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Geru(args.layout,
@ -86,18 +91,39 @@ class TestXgeru {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
// Runs the clBLAS GERU reference and waits for it to finish; returns the converted status.
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXgeru(static_cast<clblasOrder>(args.layout),
                            args.m, args.n, args.alpha,
                            buffers.x_vec(), args.x_offset, args.x_inc,
                            buffers.y_vec(), args.y_offset, args.y_inc,
                            buffers.a_mat(), args.a_offset, args.a_ld,
                            1, &queue_plain, 0, nullptr, &event);
  // Only wait when an event was produced, and release it so that it is not leaked
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#ifdef CLBLAST_REF_CLBLAS
// Runs the clBLAS GERU reference on the device buffers and waits for it to finish.
// Returns the clBLAS status converted to a StatusCode.
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXgeru(static_cast<clblasOrder>(args.layout),
                            args.m, args.n, args.alpha,
                            buffers.x_vec(), args.x_offset, args.x_inc,
                            buffers.y_vec(), args.y_offset, args.y_inc,
                            buffers.a_mat(), args.a_offset, args.a_ld,
                            1, &queue_plain, 0, nullptr, &event);
  // Only wait when an event was produced, and release it so that it is not leaked
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
// CPU BLAS reference for GERU: copies A, x and y to the host, runs the CBLAS wrapper, and
// copies the A matrix back into its device buffer.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  const auto zero = static_cast<T>(0);
  auto host_a = std::vector<T>(args.a_size, zero);
  auto host_x = std::vector<T>(args.x_size, zero);
  auto host_y = std::vector<T>(args.y_size, zero);
  buffers.a_mat.Read(queue, args.a_size, host_a);
  buffers.x_vec.Read(queue, args.x_size, host_x);
  buffers.y_vec.Read(queue, args.y_size, host_y);
  cblasXgeru(convertToCBLAS(args.layout),
             args.m, args.n, args.alpha,
             host_x, args.x_offset, args.x_inc,
             host_y, args.y_offset, args.y_inc,
             host_a, args.a_offset, args.a_ld);
  // Only A is written back to the device
  buffers.a_mat.Write(queue, args.a_size, host_a);
  return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

View file

@ -19,7 +19,12 @@
#include <vector>
#include <string>
#include "wrapper_clblas.h"
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
@ -70,7 +75,7 @@ class TestXhbmv {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Hbmv(args.layout, args.triangle,
@ -84,19 +89,41 @@ class TestXhbmv {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
// Runs the clBLAS HBMV reference and waits for it to finish; returns the converted status.
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXhbmv(static_cast<clblasOrder>(args.layout),
                            static_cast<clblasUplo>(args.triangle),
                            args.n, args.kl, args.alpha,
                            buffers.a_mat(), args.a_offset, args.a_ld,
                            buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                            buffers.y_vec(), args.y_offset, args.y_inc,
                            1, &queue_plain, 0, nullptr, &event);
  // Only wait when an event was produced, and release it so that it is not leaked
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#ifdef CLBLAST_REF_CLBLAS
// Runs the clBLAS HBMV reference on the device buffers and waits for it to finish.
// Returns the clBLAS status converted to a StatusCode.
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXhbmv(static_cast<clblasOrder>(args.layout),
                            static_cast<clblasUplo>(args.triangle),
                            args.n, args.kl, args.alpha,
                            buffers.a_mat(), args.a_offset, args.a_ld,
                            buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                            buffers.y_vec(), args.y_offset, args.y_inc,
                            1, &queue_plain, 0, nullptr, &event);
  // Only wait when an event was produced, and release it so that it is not leaked
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
// CPU BLAS reference for HBMV: copies A, x and y to the host, runs the CBLAS wrapper, and
// copies the y vector back into its device buffer.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  const auto zero = static_cast<T>(0);
  auto host_a = std::vector<T>(args.a_size, zero);
  auto host_x = std::vector<T>(args.x_size, zero);
  auto host_y = std::vector<T>(args.y_size, zero);
  buffers.a_mat.Read(queue, args.a_size, host_a);
  buffers.x_vec.Read(queue, args.x_size, host_x);
  buffers.y_vec.Read(queue, args.y_size, host_y);
  cblasXhbmv(convertToCBLAS(args.layout),
             convertToCBLAS(args.triangle),
             args.n, args.kl, args.alpha,
             host_a, args.a_offset, args.a_ld,
             host_x, args.x_offset, args.x_inc, args.beta,
             host_y, args.y_offset, args.y_inc);
  // Only y is written back to the device
  buffers.y_vec.Write(queue, args.y_size, host_y);
  return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

View file

@ -19,7 +19,12 @@
#include <vector>
#include <string>
#include "wrapper_clblas.h"
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
@ -70,7 +75,7 @@ class TestXhemv {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Hemv(args.layout, args.triangle,
@ -84,19 +89,41 @@ class TestXhemv {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
// Runs the clBLAS HEMV reference and waits for it to finish; returns the converted status.
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXhemv(static_cast<clblasOrder>(args.layout),
                            static_cast<clblasUplo>(args.triangle),
                            args.n, args.alpha,
                            buffers.a_mat(), args.a_offset, args.a_ld,
                            buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                            buffers.y_vec(), args.y_offset, args.y_inc,
                            1, &queue_plain, 0, nullptr, &event);
  // Only wait when an event was produced, and release it so that it is not leaked
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#ifdef CLBLAST_REF_CLBLAS
// Runs the clBLAS HEMV reference on the device buffers and waits for it to finish.
// Returns the clBLAS status converted to a StatusCode.
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXhemv(static_cast<clblasOrder>(args.layout),
                            static_cast<clblasUplo>(args.triangle),
                            args.n, args.alpha,
                            buffers.a_mat(), args.a_offset, args.a_ld,
                            buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                            buffers.y_vec(), args.y_offset, args.y_inc,
                            1, &queue_plain, 0, nullptr, &event);
  // Only wait when an event was produced, and release it so that it is not leaked
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
// CPU BLAS reference for HEMV: copies A, x and y to the host, runs the CBLAS wrapper, and
// copies the y vector back into its device buffer.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  const auto zero = static_cast<T>(0);
  auto host_a = std::vector<T>(args.a_size, zero);
  auto host_x = std::vector<T>(args.x_size, zero);
  auto host_y = std::vector<T>(args.y_size, zero);
  buffers.a_mat.Read(queue, args.a_size, host_a);
  buffers.x_vec.Read(queue, args.x_size, host_x);
  buffers.y_vec.Read(queue, args.y_size, host_y);
  cblasXhemv(convertToCBLAS(args.layout),
             convertToCBLAS(args.triangle),
             args.n, args.alpha,
             host_a, args.a_offset, args.a_ld,
             host_x, args.x_offset, args.x_inc, args.beta,
             host_y, args.y_offset, args.y_inc);
  // Only y is written back to the device
  buffers.y_vec.Write(queue, args.y_size, host_y);
  return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

View file

@ -19,7 +19,12 @@
#include <vector>
#include <string>
#include "wrapper_clblas.h"
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
@ -66,7 +71,7 @@ class TestXher {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) {
static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Her(args.layout, args.triangle,
@ -79,18 +84,37 @@ class TestXher {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
// Runs the clBLAS HER reference and waits for it to finish; returns the converted status.
static StatusCode RunReference(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXher(static_cast<clblasOrder>(args.layout),
                           static_cast<clblasUplo>(args.triangle),
                           args.n, args.alpha,
                           buffers.x_vec(), args.x_offset, args.x_inc,
                           buffers.a_mat(), args.a_offset, args.a_ld,
                           1, &queue_plain, 0, nullptr, &event);
  // Only wait when an event was produced, and release it so that it is not leaked
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#ifdef CLBLAST_REF_CLBLAS
// Runs the clBLAS HER reference on the device buffers and waits for it to finish.
// Returns the clBLAS status converted to a StatusCode.
static StatusCode RunReference1(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXher(static_cast<clblasOrder>(args.layout),
                           static_cast<clblasUplo>(args.triangle),
                           args.n, args.alpha,
                           buffers.x_vec(), args.x_offset, args.x_inc,
                           buffers.a_mat(), args.a_offset, args.a_ld,
                           1, &queue_plain, 0, nullptr, &event);
  // Only wait when an event was produced, and release it so that it is not leaked
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
// CPU BLAS reference for HER: copies A and x to the host, runs the CBLAS wrapper, and copies
// the A matrix back into its device buffer.
static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
  const auto zero = static_cast<T>(0);
  auto host_a = std::vector<T>(args.a_size, zero);
  auto host_x = std::vector<T>(args.x_size, zero);
  buffers.a_mat.Read(queue, args.a_size, host_a);
  buffers.x_vec.Read(queue, args.x_size, host_x);
  cblasXher(convertToCBLAS(args.layout),
            convertToCBLAS(args.triangle),
            args.n, args.alpha,
            host_x, args.x_offset, args.x_inc,
            host_a, args.a_offset, args.a_ld);
  // Only A is written back to the device
  buffers.a_mat.Write(queue, args.a_size, host_a);
  return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {

View file

@ -19,7 +19,12 @@
#include <vector>
#include <string>
#include "wrapper_clblas.h"
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
@ -70,7 +75,7 @@ class TestXher2 {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Her2(args.layout, args.triangle,
@ -84,19 +89,41 @@ class TestXher2 {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
// Runs the clBLAS HER2 reference and waits for it to finish; returns the converted status.
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXher2(static_cast<clblasOrder>(args.layout),
                            static_cast<clblasUplo>(args.triangle),
                            args.n, args.alpha,
                            buffers.x_vec(), args.x_offset, args.x_inc,
                            buffers.y_vec(), args.y_offset, args.y_inc,
                            buffers.a_mat(), args.a_offset, args.a_ld,
                            1, &queue_plain, 0, nullptr, &event);
  // Only wait when an event was produced, and release it so that it is not leaked
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#ifdef CLBLAST_REF_CLBLAS
// Runs the clBLAS HER2 reference on the device buffers and waits for it to finish.
// Returns the clBLAS status converted to a StatusCode.
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXher2(static_cast<clblasOrder>(args.layout),
                            static_cast<clblasUplo>(args.triangle),
                            args.n, args.alpha,
                            buffers.x_vec(), args.x_offset, args.x_inc,
                            buffers.y_vec(), args.y_offset, args.y_inc,
                            buffers.a_mat(), args.a_offset, args.a_ld,
                            1, &queue_plain, 0, nullptr, &event);
  // Only wait when an event was produced, and release it so that it is not leaked
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
// CPU BLAS reference for HER2: copies A, x and y to the host, runs the CBLAS wrapper, and
// copies the A matrix back into its device buffer.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  const auto zero = static_cast<T>(0);
  auto host_a = std::vector<T>(args.a_size, zero);
  auto host_x = std::vector<T>(args.x_size, zero);
  auto host_y = std::vector<T>(args.y_size, zero);
  buffers.a_mat.Read(queue, args.a_size, host_a);
  buffers.x_vec.Read(queue, args.x_size, host_x);
  buffers.y_vec.Read(queue, args.y_size, host_y);
  cblasXher2(convertToCBLAS(args.layout),
             convertToCBLAS(args.triangle),
             args.n, args.alpha,
             host_x, args.x_offset, args.x_inc,
             host_y, args.y_offset, args.y_inc,
             host_a, args.a_offset, args.a_ld);
  // Only A is written back to the device
  buffers.a_mat.Write(queue, args.a_size, host_a);
  return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

View file

@ -19,7 +19,12 @@
#include <vector>
#include <string>
#include "wrapper_clblas.h"
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
@ -70,7 +75,7 @@ class TestXhpmv {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Hpmv(args.layout, args.triangle,
@ -84,19 +89,41 @@ class TestXhpmv {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
// Runs the clBLAS HPMV reference and waits for it to finish; returns the converted status.
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXhpmv(static_cast<clblasOrder>(args.layout),
                            static_cast<clblasUplo>(args.triangle),
                            args.n, args.alpha,
                            buffers.ap_mat(), args.ap_offset,
                            buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                            buffers.y_vec(), args.y_offset, args.y_inc,
                            1, &queue_plain, 0, nullptr, &event);
  // Only wait when an event was produced, and release it so that it is not leaked
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#ifdef CLBLAST_REF_CLBLAS
// Runs the clBLAS HPMV reference on the device buffers and waits for it to finish.
// Returns the clBLAS status converted to a StatusCode.
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXhpmv(static_cast<clblasOrder>(args.layout),
                            static_cast<clblasUplo>(args.triangle),
                            args.n, args.alpha,
                            buffers.ap_mat(), args.ap_offset,
                            buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                            buffers.y_vec(), args.y_offset, args.y_inc,
                            1, &queue_plain, 0, nullptr, &event);
  // Only wait when an event was produced, and release it so that it is not leaked
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
// CPU BLAS reference for HPMV: copies AP, x and y to the host, runs the CBLAS wrapper, and
// copies the y vector back into its device buffer.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  const auto zero = static_cast<T>(0);
  auto host_ap = std::vector<T>(args.ap_size, zero);
  auto host_x = std::vector<T>(args.x_size, zero);
  auto host_y = std::vector<T>(args.y_size, zero);
  buffers.ap_mat.Read(queue, args.ap_size, host_ap);
  buffers.x_vec.Read(queue, args.x_size, host_x);
  buffers.y_vec.Read(queue, args.y_size, host_y);
  cblasXhpmv(convertToCBLAS(args.layout),
             convertToCBLAS(args.triangle),
             args.n, args.alpha,
             host_ap, args.ap_offset,
             host_x, args.x_offset, args.x_inc, args.beta,
             host_y, args.y_offset, args.y_inc);
  // Only y is written back to the device
  buffers.y_vec.Write(queue, args.y_size, host_y);
  return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

View file

@ -19,7 +19,12 @@
#include <vector>
#include <string>
#include "wrapper_clblas.h"
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
@ -66,7 +71,7 @@ class TestXhpr {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) {
static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Hpr(args.layout, args.triangle,
@ -79,18 +84,37 @@ class TestXhpr {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
// Runs the clBLAS HPR reference and waits for it to finish; returns the converted status.
static StatusCode RunReference(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXhpr(static_cast<clblasOrder>(args.layout),
                           static_cast<clblasUplo>(args.triangle),
                           args.n, args.alpha,
                           buffers.x_vec(), args.x_offset, args.x_inc,
                           buffers.ap_mat(), args.ap_offset,
                           1, &queue_plain, 0, nullptr, &event);
  // Only wait when an event was produced, and release it so that it is not leaked
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#ifdef CLBLAST_REF_CLBLAS
// Runs the clBLAS HPR reference on the device buffers and waits for it to finish.
// Returns the clBLAS status converted to a StatusCode.
static StatusCode RunReference1(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXhpr(static_cast<clblasOrder>(args.layout),
                           static_cast<clblasUplo>(args.triangle),
                           args.n, args.alpha,
                           buffers.x_vec(), args.x_offset, args.x_inc,
                           buffers.ap_mat(), args.ap_offset,
                           1, &queue_plain, 0, nullptr, &event);
  // Only wait when an event was produced, and release it so that it is not leaked
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
// CPU BLAS reference for HPR: copies AP and x to the host, runs the CBLAS wrapper, and copies
// the AP matrix back into its device buffer.
static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
  const auto zero = static_cast<T>(0);
  auto host_ap = std::vector<T>(args.ap_size, zero);
  auto host_x = std::vector<T>(args.x_size, zero);
  buffers.ap_mat.Read(queue, args.ap_size, host_ap);
  buffers.x_vec.Read(queue, args.x_size, host_x);
  cblasXhpr(convertToCBLAS(args.layout),
            convertToCBLAS(args.triangle),
            args.n, args.alpha,
            host_x, args.x_offset, args.x_inc,
            host_ap, args.ap_offset);
  // Only AP is written back to the device
  buffers.ap_mat.Write(queue, args.ap_size, host_ap);
  return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {

View file

@ -19,7 +19,12 @@
#include <vector>
#include <string>
#include "wrapper_clblas.h"
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
@ -70,7 +75,7 @@ class TestXhpr2 {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Hpr2(args.layout, args.triangle,
@ -84,19 +89,41 @@ class TestXhpr2 {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
// Runs the clBLAS Xhpr2 wrapper on the OpenCL buffers in 'buffers' (x_vec, y_vec and ap_mat)
// using the command queue wrapped by 'queue'. Returns the clBLAS status as a 'StatusCode'.
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  const auto status = clblasXhpr2(static_cast<clblasOrder>(args.layout),
                                  static_cast<clblasUplo>(args.triangle),
                                  args.n, args.alpha,
                                  buffers.x_vec(), args.x_offset, args.x_inc,
                                  buffers.y_vec(), args.y_offset, args.y_inc,
                                  buffers.ap_mat(), args.ap_offset,
                                  1, &queue_plain, 0, nullptr, &event);
  // Wait only if an event was actually produced, then release it: the caller owns the returned
  // event and it would otherwise leak on every invocation.
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#ifdef CLBLAST_REF_CLBLAS
// Runs the clBLAS Xhpr2 wrapper on the OpenCL buffers in 'buffers' (x_vec, y_vec and ap_mat)
// using the command queue wrapped by 'queue'. Returns the clBLAS status as a 'StatusCode'.
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  const auto status = clblasXhpr2(static_cast<clblasOrder>(args.layout),
                                  static_cast<clblasUplo>(args.triangle),
                                  args.n, args.alpha,
                                  buffers.x_vec(), args.x_offset, args.x_inc,
                                  buffers.y_vec(), args.y_offset, args.y_inc,
                                  buffers.ap_mat(), args.ap_offset,
                                  1, &queue_plain, 0, nullptr, &event);
  // Wait only if an event was actually produced, then release it: the caller owns the returned
  // event and it would otherwise leak on every invocation.
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
// Runs the CPU BLAS Xhpr2 wrapper: the ap_mat, x_vec and y_vec device buffers are read into
// host vectors, the wrapper is called on those, and the ap_mat host data is written back to the
// device. Always returns 'StatusCode::kSuccess'.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  // Host-side staging copies, zero-initialised before being filled from the device
  auto host_ap = std::vector<T>(args.ap_size, static_cast<T>(0));
  auto host_x = std::vector<T>(args.x_size, static_cast<T>(0));
  auto host_y = std::vector<T>(args.y_size, static_cast<T>(0));
  buffers.ap_mat.Read(queue, args.ap_size, host_ap);
  buffers.x_vec.Read(queue, args.x_size, host_x);
  buffers.y_vec.Read(queue, args.y_size, host_y);

  // Reference computation on the host
  const auto layout = convertToCBLAS(args.layout);
  const auto triangle = convertToCBLAS(args.triangle);
  cblasXhpr2(layout, triangle,
             args.n, args.alpha,
             host_x, args.x_offset, args.x_inc,
             host_y, args.y_offset, args.y_inc,
             host_ap, args.ap_offset);

  // Only ap_mat is copied back to the device
  buffers.ap_mat.Write(queue, args.ap_size, host_ap);
  return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

View file

@ -19,7 +19,12 @@
#include <vector>
#include <string>
#include "wrapper_clblas.h"
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
@ -70,7 +75,7 @@ class TestXsbmv {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Sbmv(args.layout, args.triangle,
@ -84,19 +89,41 @@ class TestXsbmv {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
// Runs the clBLAS Xsbmv wrapper on the OpenCL buffers in 'buffers' (a_mat, x_vec and y_vec)
// using the command queue wrapped by 'queue'. Returns the clBLAS status as a 'StatusCode'.
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  const auto status = clblasXsbmv(static_cast<clblasOrder>(args.layout),
                                  static_cast<clblasUplo>(args.triangle),
                                  args.n, args.kl, args.alpha,
                                  buffers.a_mat(), args.a_offset, args.a_ld,
                                  buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                                  buffers.y_vec(), args.y_offset, args.y_inc,
                                  1, &queue_plain, 0, nullptr, &event);
  // Wait only if an event was actually produced, then release it: the caller owns the returned
  // event and it would otherwise leak on every invocation.
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#ifdef CLBLAST_REF_CLBLAS
// Runs the clBLAS Xsbmv wrapper on the OpenCL buffers in 'buffers' (a_mat, x_vec and y_vec)
// using the command queue wrapped by 'queue'. Returns the clBLAS status as a 'StatusCode'.
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  const auto status = clblasXsbmv(static_cast<clblasOrder>(args.layout),
                                  static_cast<clblasUplo>(args.triangle),
                                  args.n, args.kl, args.alpha,
                                  buffers.a_mat(), args.a_offset, args.a_ld,
                                  buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                                  buffers.y_vec(), args.y_offset, args.y_inc,
                                  1, &queue_plain, 0, nullptr, &event);
  // Wait only if an event was actually produced, then release it: the caller owns the returned
  // event and it would otherwise leak on every invocation.
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
// Runs the CPU BLAS Xsbmv wrapper: the a_mat, x_vec and y_vec device buffers are read into
// host vectors, the wrapper is called on those, and the y_vec host data is written back to the
// device. Always returns 'StatusCode::kSuccess'.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  // Host-side staging copies, zero-initialised before being filled from the device
  auto host_a = std::vector<T>(args.a_size, static_cast<T>(0));
  auto host_x = std::vector<T>(args.x_size, static_cast<T>(0));
  auto host_y = std::vector<T>(args.y_size, static_cast<T>(0));
  buffers.a_mat.Read(queue, args.a_size, host_a);
  buffers.x_vec.Read(queue, args.x_size, host_x);
  buffers.y_vec.Read(queue, args.y_size, host_y);

  // Reference computation on the host
  const auto layout = convertToCBLAS(args.layout);
  const auto triangle = convertToCBLAS(args.triangle);
  cblasXsbmv(layout, triangle,
             args.n, args.kl, args.alpha,
             host_a, args.a_offset, args.a_ld,
             host_x, args.x_offset, args.x_inc, args.beta,
             host_y, args.y_offset, args.y_inc);

  // Only y_vec is copied back to the device
  buffers.y_vec.Write(queue, args.y_size, host_y);
  return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

View file

@ -19,7 +19,12 @@
#include <vector>
#include <string>
#include "wrapper_clblas.h"
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
@ -70,7 +75,7 @@ class TestXspmv {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Spmv(args.layout, args.triangle,
@ -84,19 +89,41 @@ class TestXspmv {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
// Runs the clBLAS Xspmv wrapper on the OpenCL buffers in 'buffers' (ap_mat, x_vec and y_vec)
// using the command queue wrapped by 'queue'. Returns the clBLAS status as a 'StatusCode'.
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  const auto status = clblasXspmv(static_cast<clblasOrder>(args.layout),
                                  static_cast<clblasUplo>(args.triangle),
                                  args.n, args.alpha,
                                  buffers.ap_mat(), args.ap_offset,
                                  buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                                  buffers.y_vec(), args.y_offset, args.y_inc,
                                  1, &queue_plain, 0, nullptr, &event);
  // Wait only if an event was actually produced, then release it: the caller owns the returned
  // event and it would otherwise leak on every invocation.
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#ifdef CLBLAST_REF_CLBLAS
// Runs the clBLAS Xspmv wrapper on the OpenCL buffers in 'buffers' (ap_mat, x_vec and y_vec)
// using the command queue wrapped by 'queue'. Returns the clBLAS status as a 'StatusCode'.
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  const auto status = clblasXspmv(static_cast<clblasOrder>(args.layout),
                                  static_cast<clblasUplo>(args.triangle),
                                  args.n, args.alpha,
                                  buffers.ap_mat(), args.ap_offset,
                                  buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                                  buffers.y_vec(), args.y_offset, args.y_inc,
                                  1, &queue_plain, 0, nullptr, &event);
  // Wait only if an event was actually produced, then release it: the caller owns the returned
  // event and it would otherwise leak on every invocation.
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
// Runs the CPU BLAS Xspmv wrapper: the ap_mat, x_vec and y_vec device buffers are read into
// host vectors, the wrapper is called on those, and the y_vec host data is written back to the
// device. Always returns 'StatusCode::kSuccess'.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  // Host-side staging copies, zero-initialised before being filled from the device
  auto host_ap = std::vector<T>(args.ap_size, static_cast<T>(0));
  auto host_x = std::vector<T>(args.x_size, static_cast<T>(0));
  auto host_y = std::vector<T>(args.y_size, static_cast<T>(0));
  buffers.ap_mat.Read(queue, args.ap_size, host_ap);
  buffers.x_vec.Read(queue, args.x_size, host_x);
  buffers.y_vec.Read(queue, args.y_size, host_y);

  // Reference computation on the host
  const auto layout = convertToCBLAS(args.layout);
  const auto triangle = convertToCBLAS(args.triangle);
  cblasXspmv(layout, triangle,
             args.n, args.alpha,
             host_ap, args.ap_offset,
             host_x, args.x_offset, args.x_inc, args.beta,
             host_y, args.y_offset, args.y_inc);

  // Only y_vec is copied back to the device
  buffers.y_vec.Write(queue, args.y_size, host_y);
  return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

View file

@ -19,7 +19,12 @@
#include <vector>
#include <string>
#include "wrapper_clblas.h"
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
@ -66,7 +71,7 @@ class TestXspr {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Spr(args.layout, args.triangle,
@ -79,18 +84,37 @@ class TestXspr {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
// Runs the clBLAS Xspr wrapper on the OpenCL buffers in 'buffers' (x_vec and ap_mat) using the
// command queue wrapped by 'queue'. Returns the clBLAS status converted to a 'StatusCode'.
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  const auto status = clblasXspr(static_cast<clblasOrder>(args.layout),
                                 static_cast<clblasUplo>(args.triangle),
                                 args.n, args.alpha,
                                 buffers.x_vec(), args.x_offset, args.x_inc,
                                 buffers.ap_mat(), args.ap_offset,
                                 1, &queue_plain, 0, nullptr, &event);
  // Wait only if an event was actually produced, then release it: the caller owns the returned
  // event and it would otherwise leak on every invocation.
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#ifdef CLBLAST_REF_CLBLAS
// Runs the clBLAS Xspr wrapper on the OpenCL buffers in 'buffers' (x_vec and ap_mat) using the
// command queue wrapped by 'queue'. Returns the clBLAS status converted to a 'StatusCode'.
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  const auto status = clblasXspr(static_cast<clblasOrder>(args.layout),
                                 static_cast<clblasUplo>(args.triangle),
                                 args.n, args.alpha,
                                 buffers.x_vec(), args.x_offset, args.x_inc,
                                 buffers.ap_mat(), args.ap_offset,
                                 1, &queue_plain, 0, nullptr, &event);
  // Wait only if an event was actually produced, then release it: the caller owns the returned
  // event and it would otherwise leak on every invocation.
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
// Runs the CPU BLAS Xspr wrapper: the ap_mat and x_vec device buffers are read into host
// vectors, the wrapper is called on those, and the ap_mat host data is written back to the
// device. Always returns 'StatusCode::kSuccess'.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  // Host-side staging copies, zero-initialised before being filled from the device
  auto host_ap = std::vector<T>(args.ap_size, static_cast<T>(0));
  auto host_x = std::vector<T>(args.x_size, static_cast<T>(0));
  buffers.ap_mat.Read(queue, args.ap_size, host_ap);
  buffers.x_vec.Read(queue, args.x_size, host_x);

  // Reference computation on the host
  const auto layout = convertToCBLAS(args.layout);
  const auto triangle = convertToCBLAS(args.triangle);
  cblasXspr(layout, triangle,
            args.n, args.alpha,
            host_x, args.x_offset, args.x_inc,
            host_ap, args.ap_offset);

  // Only ap_mat is copied back to the device
  buffers.ap_mat.Write(queue, args.ap_size, host_ap);
  return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

View file

@ -19,7 +19,12 @@
#include <vector>
#include <string>
#include "wrapper_clblas.h"
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
@ -70,7 +75,7 @@ class TestXspr2 {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Spr2(args.layout, args.triangle,
@ -84,19 +89,41 @@ class TestXspr2 {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
// Runs the clBLAS Xspr2 wrapper on the OpenCL buffers in 'buffers' (x_vec, y_vec and ap_mat)
// using the command queue wrapped by 'queue'. Returns the clBLAS status as a 'StatusCode'.
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  const auto status = clblasXspr2(static_cast<clblasOrder>(args.layout),
                                  static_cast<clblasUplo>(args.triangle),
                                  args.n, args.alpha,
                                  buffers.x_vec(), args.x_offset, args.x_inc,
                                  buffers.y_vec(), args.y_offset, args.y_inc,
                                  buffers.ap_mat(), args.ap_offset,
                                  1, &queue_plain, 0, nullptr, &event);
  // Wait only if an event was actually produced, then release it: the caller owns the returned
  // event and it would otherwise leak on every invocation.
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#ifdef CLBLAST_REF_CLBLAS
// Runs the clBLAS Xspr2 wrapper on the OpenCL buffers in 'buffers' (x_vec, y_vec and ap_mat)
// using the command queue wrapped by 'queue'. Returns the clBLAS status as a 'StatusCode'.
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  const auto status = clblasXspr2(static_cast<clblasOrder>(args.layout),
                                  static_cast<clblasUplo>(args.triangle),
                                  args.n, args.alpha,
                                  buffers.x_vec(), args.x_offset, args.x_inc,
                                  buffers.y_vec(), args.y_offset, args.y_inc,
                                  buffers.ap_mat(), args.ap_offset,
                                  1, &queue_plain, 0, nullptr, &event);
  // Wait only if an event was actually produced, then release it: the caller owns the returned
  // event and it would otherwise leak on every invocation.
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
// Runs the CPU BLAS Xspr2 wrapper: the ap_mat, x_vec and y_vec device buffers are read into
// host vectors, the wrapper is called on those, and the ap_mat host data is written back to the
// device. Always returns 'StatusCode::kSuccess'.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  // Host-side staging copies, zero-initialised before being filled from the device
  auto host_ap = std::vector<T>(args.ap_size, static_cast<T>(0));
  auto host_x = std::vector<T>(args.x_size, static_cast<T>(0));
  auto host_y = std::vector<T>(args.y_size, static_cast<T>(0));
  buffers.ap_mat.Read(queue, args.ap_size, host_ap);
  buffers.x_vec.Read(queue, args.x_size, host_x);
  buffers.y_vec.Read(queue, args.y_size, host_y);

  // Reference computation on the host
  const auto layout = convertToCBLAS(args.layout);
  const auto triangle = convertToCBLAS(args.triangle);
  cblasXspr2(layout, triangle,
             args.n, args.alpha,
             host_x, args.x_offset, args.x_inc,
             host_y, args.y_offset, args.y_inc,
             host_ap, args.ap_offset);

  // Only ap_mat is copied back to the device
  buffers.ap_mat.Write(queue, args.ap_size, host_ap);
  return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

View file

@ -19,7 +19,12 @@
#include <vector>
#include <string>
#include "wrapper_clblas.h"
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
@ -70,7 +75,7 @@ class TestXsymv {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Symv(args.layout, args.triangle,
@ -84,19 +89,41 @@ class TestXsymv {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
// Runs the clBLAS Xsymv wrapper on the OpenCL buffers in 'buffers' (a_mat, x_vec and y_vec)
// using the command queue wrapped by 'queue'. Returns the clBLAS status as a 'StatusCode'.
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  const auto status = clblasXsymv(static_cast<clblasOrder>(args.layout),
                                  static_cast<clblasUplo>(args.triangle),
                                  args.n, args.alpha,
                                  buffers.a_mat(), args.a_offset, args.a_ld,
                                  buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                                  buffers.y_vec(), args.y_offset, args.y_inc,
                                  1, &queue_plain, 0, nullptr, &event);
  // Wait only if an event was actually produced, then release it: the caller owns the returned
  // event and it would otherwise leak on every invocation.
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#ifdef CLBLAST_REF_CLBLAS
// Runs the clBLAS Xsymv wrapper on the OpenCL buffers in 'buffers' (a_mat, x_vec and y_vec)
// using the command queue wrapped by 'queue'. Returns the clBLAS status as a 'StatusCode'.
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  const auto status = clblasXsymv(static_cast<clblasOrder>(args.layout),
                                  static_cast<clblasUplo>(args.triangle),
                                  args.n, args.alpha,
                                  buffers.a_mat(), args.a_offset, args.a_ld,
                                  buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                                  buffers.y_vec(), args.y_offset, args.y_inc,
                                  1, &queue_plain, 0, nullptr, &event);
  // Wait only if an event was actually produced, then release it: the caller owns the returned
  // event and it would otherwise leak on every invocation.
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
// Runs the CPU BLAS Xsymv wrapper: the a_mat, x_vec and y_vec device buffers are read into
// host vectors, the wrapper is called on those, and the y_vec host data is written back to the
// device. Always returns 'StatusCode::kSuccess'.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  // Host-side staging copies, zero-initialised before being filled from the device
  auto host_a = std::vector<T>(args.a_size, static_cast<T>(0));
  auto host_x = std::vector<T>(args.x_size, static_cast<T>(0));
  auto host_y = std::vector<T>(args.y_size, static_cast<T>(0));
  buffers.a_mat.Read(queue, args.a_size, host_a);
  buffers.x_vec.Read(queue, args.x_size, host_x);
  buffers.y_vec.Read(queue, args.y_size, host_y);

  // Reference computation on the host
  const auto layout = convertToCBLAS(args.layout);
  const auto triangle = convertToCBLAS(args.triangle);
  cblasXsymv(layout, triangle,
             args.n, args.alpha,
             host_a, args.a_offset, args.a_ld,
             host_x, args.x_offset, args.x_inc, args.beta,
             host_y, args.y_offset, args.y_inc);

  // Only y_vec is copied back to the device
  buffers.y_vec.Write(queue, args.y_size, host_y);
  return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

View file

@ -19,7 +19,12 @@
#include <vector>
#include <string>
#include "wrapper_clblas.h"
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
@ -66,7 +71,7 @@ class TestXsyr {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Syr(args.layout, args.triangle,
@ -79,18 +84,37 @@ class TestXsyr {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
// Runs the clBLAS Xsyr wrapper on the OpenCL buffers in 'buffers' (x_vec and a_mat) using the
// command queue wrapped by 'queue'. Returns the clBLAS status converted to a 'StatusCode'.
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  const auto status = clblasXsyr(static_cast<clblasOrder>(args.layout),
                                 static_cast<clblasUplo>(args.triangle),
                                 args.n, args.alpha,
                                 buffers.x_vec(), args.x_offset, args.x_inc,
                                 buffers.a_mat(), args.a_offset, args.a_ld,
                                 1, &queue_plain, 0, nullptr, &event);
  // Wait only if an event was actually produced, then release it: the caller owns the returned
  // event and it would otherwise leak on every invocation.
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#ifdef CLBLAST_REF_CLBLAS
// Runs the clBLAS Xsyr wrapper on the OpenCL buffers in 'buffers' (x_vec and a_mat) using the
// command queue wrapped by 'queue'. Returns the clBLAS status converted to a 'StatusCode'.
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  const auto status = clblasXsyr(static_cast<clblasOrder>(args.layout),
                                 static_cast<clblasUplo>(args.triangle),
                                 args.n, args.alpha,
                                 buffers.x_vec(), args.x_offset, args.x_inc,
                                 buffers.a_mat(), args.a_offset, args.a_ld,
                                 1, &queue_plain, 0, nullptr, &event);
  // Wait only if an event was actually produced, then release it: the caller owns the returned
  // event and it would otherwise leak on every invocation.
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
// Runs the CPU BLAS Xsyr wrapper: the a_mat and x_vec device buffers are read into host
// vectors, the wrapper is called on those, and the a_mat host data is written back to the
// device. Always returns 'StatusCode::kSuccess'.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  // Host-side staging copies, zero-initialised before being filled from the device
  auto host_a = std::vector<T>(args.a_size, static_cast<T>(0));
  auto host_x = std::vector<T>(args.x_size, static_cast<T>(0));
  buffers.a_mat.Read(queue, args.a_size, host_a);
  buffers.x_vec.Read(queue, args.x_size, host_x);

  // Reference computation on the host
  const auto layout = convertToCBLAS(args.layout);
  const auto triangle = convertToCBLAS(args.triangle);
  cblasXsyr(layout, triangle,
            args.n, args.alpha,
            host_x, args.x_offset, args.x_inc,
            host_a, args.a_offset, args.a_ld);

  // Only a_mat is copied back to the device
  buffers.a_mat.Write(queue, args.a_size, host_a);
  return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

View file

@ -19,7 +19,12 @@
#include <vector>
#include <string>
#include "wrapper_clblas.h"
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
@ -70,7 +75,7 @@ class TestXsyr2 {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Syr2(args.layout, args.triangle,
@ -84,19 +89,41 @@ class TestXsyr2 {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
// Runs the clBLAS Xsyr2 wrapper on the OpenCL buffers in 'buffers' (x_vec, y_vec and a_mat)
// using the command queue wrapped by 'queue'. Returns the clBLAS status as a 'StatusCode'.
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  const auto status = clblasXsyr2(static_cast<clblasOrder>(args.layout),
                                  static_cast<clblasUplo>(args.triangle),
                                  args.n, args.alpha,
                                  buffers.x_vec(), args.x_offset, args.x_inc,
                                  buffers.y_vec(), args.y_offset, args.y_inc,
                                  buffers.a_mat(), args.a_offset, args.a_ld,
                                  1, &queue_plain, 0, nullptr, &event);
  // Wait only if an event was actually produced, then release it: the caller owns the returned
  // event and it would otherwise leak on every invocation.
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#ifdef CLBLAST_REF_CLBLAS
// Runs the clBLAS Xsyr2 wrapper on the OpenCL buffers in 'buffers' (x_vec, y_vec and a_mat)
// using the command queue wrapped by 'queue'. Returns the clBLAS status as a 'StatusCode'.
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  const auto status = clblasXsyr2(static_cast<clblasOrder>(args.layout),
                                  static_cast<clblasUplo>(args.triangle),
                                  args.n, args.alpha,
                                  buffers.x_vec(), args.x_offset, args.x_inc,
                                  buffers.y_vec(), args.y_offset, args.y_inc,
                                  buffers.a_mat(), args.a_offset, args.a_ld,
                                  1, &queue_plain, 0, nullptr, &event);
  // Wait only if an event was actually produced, then release it: the caller owns the returned
  // event and it would otherwise leak on every invocation.
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
// Runs the CPU BLAS Xsyr2 wrapper: the a_mat, x_vec and y_vec device buffers are read into
// host vectors, the wrapper is called on those, and the a_mat host data is written back to the
// device. Always returns 'StatusCode::kSuccess'.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  // Host-side staging copies, zero-initialised before being filled from the device
  auto host_a = std::vector<T>(args.a_size, static_cast<T>(0));
  auto host_x = std::vector<T>(args.x_size, static_cast<T>(0));
  auto host_y = std::vector<T>(args.y_size, static_cast<T>(0));
  buffers.a_mat.Read(queue, args.a_size, host_a);
  buffers.x_vec.Read(queue, args.x_size, host_x);
  buffers.y_vec.Read(queue, args.y_size, host_y);

  // Reference computation on the host
  const auto layout = convertToCBLAS(args.layout);
  const auto triangle = convertToCBLAS(args.triangle);
  cblasXsyr2(layout, triangle,
             args.n, args.alpha,
             host_x, args.x_offset, args.x_inc,
             host_y, args.y_offset, args.y_inc,
             host_a, args.a_offset, args.a_ld);

  // Only a_mat is copied back to the device
  buffers.a_mat.Write(queue, args.a_size, host_a);
  return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

View file

@ -19,7 +19,12 @@
#include <vector>
#include <string>
#include "wrapper_clblas.h"
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
@ -65,7 +70,7 @@ class TestXtbmv {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Tbmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
@ -78,20 +83,41 @@ class TestXtbmv {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
// Runs the clBLAS Xtbmv wrapper on the OpenCL buffers in 'buffers' (a_mat and x_vec) using the
// command queue wrapped by 'queue'. Returns the clBLAS status converted to a 'StatusCode'.
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  const auto status = clblasXtbmv<T>(static_cast<clblasOrder>(args.layout),
                                     static_cast<clblasUplo>(args.triangle),
                                     static_cast<clblasTranspose>(args.a_transpose),
                                     static_cast<clblasDiag>(args.diagonal),
                                     args.n, args.kl,
                                     buffers.a_mat(), args.a_offset, args.a_ld,
                                     buffers.x_vec(), args.x_offset, args.x_inc,
                                     1, &queue_plain, 0, nullptr, &event);
  // Wait only if an event was actually produced, then release it: the caller owns the returned
  // event and it would otherwise leak on every invocation.
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#ifdef CLBLAST_REF_CLBLAS
// Runs the clBLAS Xtbmv wrapper on the OpenCL buffers in 'buffers' (a_mat and x_vec) using the
// command queue wrapped by 'queue'. Returns the clBLAS status converted to a 'StatusCode'.
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  const auto status = clblasXtbmv<T>(static_cast<clblasOrder>(args.layout),
                                     static_cast<clblasUplo>(args.triangle),
                                     static_cast<clblasTranspose>(args.a_transpose),
                                     static_cast<clblasDiag>(args.diagonal),
                                     args.n, args.kl,
                                     buffers.a_mat(), args.a_offset, args.a_ld,
                                     buffers.x_vec(), args.x_offset, args.x_inc,
                                     1, &queue_plain, 0, nullptr, &event);
  // Wait only if an event was actually produced, then release it: the caller owns the returned
  // event and it would otherwise leak on every invocation.
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
// Runs the CPU BLAS Xtbmv wrapper: the a_mat and x_vec device buffers are read into host
// vectors, the wrapper is called on those, and the x_vec host data is written back to the
// device. Always returns 'StatusCode::kSuccess'.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  // Host-side staging copies, zero-initialised before being filled from the device
  auto host_a = std::vector<T>(args.a_size, static_cast<T>(0));
  auto host_x = std::vector<T>(args.x_size, static_cast<T>(0));
  buffers.a_mat.Read(queue, args.a_size, host_a);
  buffers.x_vec.Read(queue, args.x_size, host_x);

  // Reference computation on the host
  const auto layout = convertToCBLAS(args.layout);
  const auto triangle = convertToCBLAS(args.triangle);
  const auto a_transpose = convertToCBLAS(args.a_transpose);
  const auto diagonal = convertToCBLAS(args.diagonal);
  cblasXtbmv(layout, triangle, a_transpose, diagonal,
             args.n, args.kl,
             host_a, args.a_offset, args.a_ld,
             host_x, args.x_offset, args.x_inc);

  // Only x_vec is copied back to the device
  buffers.x_vec.Write(queue, args.x_size, host_x);
  return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

View file

@ -19,7 +19,12 @@
#include <vector>
#include <string>
#include "wrapper_clblas.h"
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
@ -65,7 +70,7 @@ class TestXtpmv {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Tpmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
@ -78,20 +83,41 @@ class TestXtpmv {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
// Runs the clBLAS Xtpmv wrapper on the OpenCL buffers in 'buffers' (ap_mat and x_vec) using the
// command queue wrapped by 'queue'. Returns the clBLAS status converted to a 'StatusCode'.
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  const auto status = clblasXtpmv<T>(static_cast<clblasOrder>(args.layout),
                                     static_cast<clblasUplo>(args.triangle),
                                     static_cast<clblasTranspose>(args.a_transpose),
                                     static_cast<clblasDiag>(args.diagonal),
                                     args.n,
                                     buffers.ap_mat(), args.ap_offset,
                                     buffers.x_vec(), args.x_offset, args.x_inc,
                                     1, &queue_plain, 0, nullptr, &event);
  // Wait only if an event was actually produced, then release it: the caller owns the returned
  // event and it would otherwise leak on every invocation.
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#ifdef CLBLAST_REF_CLBLAS
// Runs the clBLAS Xtpmv wrapper on the OpenCL buffers in 'buffers' (ap_mat and x_vec) using the
// command queue wrapped by 'queue'. Returns the clBLAS status converted to a 'StatusCode'.
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  const auto status = clblasXtpmv<T>(static_cast<clblasOrder>(args.layout),
                                     static_cast<clblasUplo>(args.triangle),
                                     static_cast<clblasTranspose>(args.a_transpose),
                                     static_cast<clblasDiag>(args.diagonal),
                                     args.n,
                                     buffers.ap_mat(), args.ap_offset,
                                     buffers.x_vec(), args.x_offset, args.x_inc,
                                     1, &queue_plain, 0, nullptr, &event);
  // Wait only if an event was actually produced, then release it: the caller owns the returned
  // event and it would otherwise leak on every invocation.
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
// Runs the CPU BLAS Xtpmv wrapper: the ap_mat and x_vec device buffers are read into host
// vectors, the wrapper is called on those, and the x_vec host data is written back to the
// device. Always returns 'StatusCode::kSuccess'.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  // Host-side staging copies, zero-initialised before being filled from the device
  auto host_ap = std::vector<T>(args.ap_size, static_cast<T>(0));
  auto host_x = std::vector<T>(args.x_size, static_cast<T>(0));
  buffers.ap_mat.Read(queue, args.ap_size, host_ap);
  buffers.x_vec.Read(queue, args.x_size, host_x);

  // Reference computation on the host
  const auto layout = convertToCBLAS(args.layout);
  const auto triangle = convertToCBLAS(args.triangle);
  const auto a_transpose = convertToCBLAS(args.a_transpose);
  const auto diagonal = convertToCBLAS(args.diagonal);
  cblasXtpmv(layout, triangle, a_transpose, diagonal,
             args.n,
             host_ap, args.ap_offset,
             host_x, args.x_offset, args.x_inc);

  // Only x_vec is copied back to the device
  buffers.x_vec.Write(queue, args.x_size, host_x);
  return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

View file

@ -19,7 +19,12 @@
#include <vector>
#include <string>
#include "wrapper_clblas.h"
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
@ -65,7 +70,7 @@ class TestXtrmv {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Trmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
@ -78,20 +83,41 @@ class TestXtrmv {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
// Runs the clBLAS Xtrmv wrapper on the OpenCL buffers in 'buffers' (a_mat and x_vec) using the
// command queue wrapped by 'queue'. Returns the clBLAS status converted to a 'StatusCode'.
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  const auto status = clblasXtrmv<T>(static_cast<clblasOrder>(args.layout),
                                     static_cast<clblasUplo>(args.triangle),
                                     static_cast<clblasTranspose>(args.a_transpose),
                                     static_cast<clblasDiag>(args.diagonal),
                                     args.n,
                                     buffers.a_mat(), args.a_offset, args.a_ld,
                                     buffers.x_vec(), args.x_offset, args.x_inc,
                                     1, &queue_plain, 0, nullptr, &event);
  // Wait only if an event was actually produced, then release it: the caller owns the returned
  // event and it would otherwise leak on every invocation.
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#ifdef CLBLAST_REF_CLBLAS
// Runs the clBLAS Xtrmv wrapper on the OpenCL buffers in 'buffers' (a_mat and x_vec) using the
// command queue wrapped by 'queue'. Returns the clBLAS status converted to a 'StatusCode'.
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  const auto status = clblasXtrmv<T>(static_cast<clblasOrder>(args.layout),
                                     static_cast<clblasUplo>(args.triangle),
                                     static_cast<clblasTranspose>(args.a_transpose),
                                     static_cast<clblasDiag>(args.diagonal),
                                     args.n,
                                     buffers.a_mat(), args.a_offset, args.a_ld,
                                     buffers.x_vec(), args.x_offset, args.x_inc,
                                     1, &queue_plain, 0, nullptr, &event);
  // Wait only if an event was actually produced, then release it: the caller owns the returned
  // event and it would otherwise leak on every invocation.
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
// CPU BLAS reference for TRMV: reads buffers.a_mat and buffers.x_vec into host vectors, calls
// cblasXtrmv on them and writes the X vector back to buffers.x_vec. Always returns kSuccess.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  // Zero-initialised host copies, filled by the reads below
  auto host_a = std::vector<T>(args.a_size, static_cast<T>(0));
  auto host_x = std::vector<T>(args.x_size, static_cast<T>(0));
  buffers.a_mat.Read(queue, args.a_size, host_a);
  buffers.x_vec.Read(queue, args.x_size, host_x);

  cblasXtrmv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle),
             convertToCBLAS(args.a_transpose), convertToCBLAS(args.diagonal),
             args.n,
             host_a, args.a_offset, args.a_ld,
             host_x, args.x_offset, args.x_inc);

  // Store the host-side X vector back into its buffer
  buffers.x_vec.Write(queue, args.x_size, host_x);
  return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

View file

@ -19,7 +19,12 @@
#include <vector>
#include <string>
#include "wrapper_clblas.h"
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
@ -78,7 +83,7 @@ class TestXgemm {
static Transposes GetBTransposes(const Transposes &all) { return all; }
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Gemm(args.layout, args.a_transpose, args.b_transpose,
@ -92,20 +97,43 @@ class TestXgemm {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
// Runs the clBLAS GEMM reference on buffers.a_mat, buffers.b_mat and buffers.c_mat, using the
// handle returned by `queue()` as the single command queue. Returns the clBLAS status cast to a
// StatusCode.
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXgemm(static_cast<clblasOrder>(args.layout),
                            static_cast<clblasTranspose>(args.a_transpose),
                            static_cast<clblasTranspose>(args.b_transpose),
                            args.m, args.n, args.k, args.alpha,
                            buffers.a_mat(), args.a_offset, args.a_ld,
                            buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
                            buffers.c_mat(), args.c_offset, args.c_ld,
                            1, &queue_plain, 0, nullptr, &event);
  // Only wait when clBLAS handed back an event, and release it afterwards: the event was
  // previously never released, and a null handle was passed to clWaitForEvents on failure.
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#ifdef CLBLAST_REF_CLBLAS
// Runs the clBLAS GEMM reference on buffers.a_mat, buffers.b_mat and buffers.c_mat, using the
// handle returned by `queue()` as the single command queue. Returns the clBLAS status cast to a
// StatusCode.
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXgemm(static_cast<clblasOrder>(args.layout),
                            static_cast<clblasTranspose>(args.a_transpose),
                            static_cast<clblasTranspose>(args.b_transpose),
                            args.m, args.n, args.k, args.alpha,
                            buffers.a_mat(), args.a_offset, args.a_ld,
                            buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
                            buffers.c_mat(), args.c_offset, args.c_ld,
                            1, &queue_plain, 0, nullptr, &event);
  // Only wait when clBLAS handed back an event, and release it afterwards: the event was
  // previously never released, and a null handle was passed to clWaitForEvents on failure.
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
// CPU BLAS reference for GEMM: reads the A, B and C buffers into host vectors, calls cblasXgemm
// on them and writes the C matrix back to buffers.c_mat. Always returns kSuccess.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  // Zero-initialised host copies, filled by the reads below
  auto host_a = std::vector<T>(args.a_size, static_cast<T>(0));
  auto host_b = std::vector<T>(args.b_size, static_cast<T>(0));
  auto host_c = std::vector<T>(args.c_size, static_cast<T>(0));
  buffers.a_mat.Read(queue, args.a_size, host_a);
  buffers.b_mat.Read(queue, args.b_size, host_b);
  buffers.c_mat.Read(queue, args.c_size, host_c);

  cblasXgemm(convertToCBLAS(args.layout),
             convertToCBLAS(args.a_transpose), convertToCBLAS(args.b_transpose),
             args.m, args.n, args.k, args.alpha,
             host_a, args.a_offset, args.a_ld,
             host_b, args.b_offset, args.b_ld, args.beta,
             host_c, args.c_offset, args.c_ld);

  // Store the host-side C matrix back into its buffer
  buffers.c_mat.Write(queue, args.c_size, host_c);
  return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

View file

@ -19,7 +19,12 @@
#include <vector>
#include <string>
#include "wrapper_clblas.h"
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
@ -78,7 +83,7 @@ class TestXhemm {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Hemm(args.layout, args.side, args.triangle,
@ -92,20 +97,43 @@ class TestXhemm {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
// Runs the clBLAS HEMM reference on buffers.a_mat, buffers.b_mat and buffers.c_mat, using the
// handle returned by `queue()` as the single command queue. Returns the clBLAS status cast to a
// StatusCode.
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXhemm(static_cast<clblasOrder>(args.layout),
                            static_cast<clblasSide>(args.side),
                            static_cast<clblasUplo>(args.triangle),
                            args.m, args.n, args.alpha,
                            buffers.a_mat(), args.a_offset, args.a_ld,
                            buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
                            buffers.c_mat(), args.c_offset, args.c_ld,
                            1, &queue_plain, 0, nullptr, &event);
  // Only wait when clBLAS handed back an event, and release it afterwards: the event was
  // previously never released, and a null handle was passed to clWaitForEvents on failure.
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#ifdef CLBLAST_REF_CLBLAS
// Runs the clBLAS HEMM reference on buffers.a_mat, buffers.b_mat and buffers.c_mat, using the
// handle returned by `queue()` as the single command queue. Returns the clBLAS status cast to a
// StatusCode.
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXhemm(static_cast<clblasOrder>(args.layout),
                            static_cast<clblasSide>(args.side),
                            static_cast<clblasUplo>(args.triangle),
                            args.m, args.n, args.alpha,
                            buffers.a_mat(), args.a_offset, args.a_ld,
                            buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
                            buffers.c_mat(), args.c_offset, args.c_ld,
                            1, &queue_plain, 0, nullptr, &event);
  // Only wait when clBLAS handed back an event, and release it afterwards: the event was
  // previously never released, and a null handle was passed to clWaitForEvents on failure.
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
// CPU BLAS reference for HEMM: reads the A, B and C buffers into host vectors, calls cblasXhemm
// on them and writes the C matrix back to buffers.c_mat. Always returns kSuccess.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  // Zero-initialised host copies, filled by the reads below
  auto host_a = std::vector<T>(args.a_size, static_cast<T>(0));
  auto host_b = std::vector<T>(args.b_size, static_cast<T>(0));
  auto host_c = std::vector<T>(args.c_size, static_cast<T>(0));
  buffers.a_mat.Read(queue, args.a_size, host_a);
  buffers.b_mat.Read(queue, args.b_size, host_b);
  buffers.c_mat.Read(queue, args.c_size, host_c);

  cblasXhemm(convertToCBLAS(args.layout),
             convertToCBLAS(args.side), convertToCBLAS(args.triangle),
             args.m, args.n, args.alpha,
             host_a, args.a_offset, args.a_ld,
             host_b, args.b_offset, args.b_ld, args.beta,
             host_c, args.c_offset, args.c_ld);

  // Store the host-side C matrix back into its buffer
  buffers.c_mat.Write(queue, args.c_size, host_c);
  return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

View file

@ -19,7 +19,12 @@
#include <vector>
#include <string>
#include "wrapper_clblas.h"
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
@ -76,7 +81,7 @@ class TestXher2k {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) {
static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto alpha2 = T{args.alpha, args.alpha};
@ -91,21 +96,45 @@ class TestXher2k {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
// Runs the clBLAS HER2K reference on buffers.a_mat, buffers.b_mat and buffers.c_mat, using the
// handle returned by `queue()` as the single command queue. Returns the clBLAS status cast to a
// StatusCode.
static StatusCode RunReference(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  // alpha is passed as a T built from args.alpha in both components
  auto alpha2 = T{args.alpha, args.alpha};
  auto status = clblasXher2k(static_cast<clblasOrder>(args.layout),
                             static_cast<clblasUplo>(args.triangle),
                             static_cast<clblasTranspose>(args.a_transpose),
                             args.n, args.k, alpha2,
                             buffers.a_mat(), args.a_offset, args.a_ld,
                             buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
                             buffers.c_mat(), args.c_offset, args.c_ld,
                             1, &queue_plain, 0, nullptr, &event);
  // Only wait when clBLAS handed back an event, and release it afterwards: the event was
  // previously never released, and a null handle was passed to clWaitForEvents on failure.
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#ifdef CLBLAST_REF_CLBLAS
// Runs the clBLAS HER2K reference on buffers.a_mat, buffers.b_mat and buffers.c_mat, using the
// handle returned by `queue()` as the single command queue. Returns the clBLAS status cast to a
// StatusCode.
static StatusCode RunReference1(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  // alpha is passed as a T built from args.alpha in both components
  auto alpha2 = T{args.alpha, args.alpha};
  auto status = clblasXher2k(static_cast<clblasOrder>(args.layout),
                             static_cast<clblasUplo>(args.triangle),
                             static_cast<clblasTranspose>(args.a_transpose),
                             args.n, args.k, alpha2,
                             buffers.a_mat(), args.a_offset, args.a_ld,
                             buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
                             buffers.c_mat(), args.c_offset, args.c_ld,
                             1, &queue_plain, 0, nullptr, &event);
  // Only wait when clBLAS handed back an event, and release it afterwards: the event was
  // previously never released, and a null handle was passed to clWaitForEvents on failure.
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
// CPU BLAS reference for HER2K: reads the A, B and C buffers into host vectors, calls
// cblasXher2k on them and writes the C matrix back to buffers.c_mat. Always returns kSuccess.
static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
  // Zero-initialised host copies, filled by the reads below
  auto host_a = std::vector<T>(args.a_size, static_cast<T>(0));
  auto host_b = std::vector<T>(args.b_size, static_cast<T>(0));
  auto host_c = std::vector<T>(args.c_size, static_cast<T>(0));
  buffers.a_mat.Read(queue, args.a_size, host_a);
  buffers.b_mat.Read(queue, args.b_size, host_b);
  buffers.c_mat.Read(queue, args.c_size, host_c);

  // alpha is passed as a T built from args.alpha in both components
  auto alpha_as_t = T{args.alpha, args.alpha};
  cblasXher2k(convertToCBLAS(args.layout),
              convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose),
              args.n, args.k, alpha_as_t,
              host_a, args.a_offset, args.a_ld,
              host_b, args.b_offset, args.b_ld, args.beta,
              host_c, args.c_offset, args.c_ld);

  // Store the host-side C matrix back into its buffer
  buffers.c_mat.Write(queue, args.c_size, host_c);
  return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {

View file

@ -19,7 +19,12 @@
#include <vector>
#include <string>
#include "wrapper_clblas.h"
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
@ -69,7 +74,7 @@ class TestXherk {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) {
static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Herk(args.layout, args.triangle, args.a_transpose,
@ -82,19 +87,39 @@ class TestXherk {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
// Runs the clBLAS HERK reference on buffers.a_mat and buffers.c_mat, using the handle returned
// by `queue()` as the single command queue. Returns the clBLAS status cast to a StatusCode.
static StatusCode RunReference(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXherk(static_cast<clblasOrder>(args.layout),
                            static_cast<clblasUplo>(args.triangle),
                            static_cast<clblasTranspose>(args.a_transpose),
                            args.n, args.k, args.alpha,
                            buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
                            buffers.c_mat(), args.c_offset, args.c_ld,
                            1, &queue_plain, 0, nullptr, &event);
  // Only wait when clBLAS handed back an event, and release it afterwards: the event was
  // previously never released, and a null handle was passed to clWaitForEvents on failure.
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#ifdef CLBLAST_REF_CLBLAS
// Runs the clBLAS HERK reference on buffers.a_mat and buffers.c_mat, using the handle returned
// by `queue()` as the single command queue. Returns the clBLAS status cast to a StatusCode.
static StatusCode RunReference1(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXherk(static_cast<clblasOrder>(args.layout),
                            static_cast<clblasUplo>(args.triangle),
                            static_cast<clblasTranspose>(args.a_transpose),
                            args.n, args.k, args.alpha,
                            buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
                            buffers.c_mat(), args.c_offset, args.c_ld,
                            1, &queue_plain, 0, nullptr, &event);
  // Only wait when clBLAS handed back an event, and release it afterwards: the event was
  // previously never released, and a null handle was passed to clWaitForEvents on failure.
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
// CPU BLAS reference for HERK: reads the A and C buffers into host vectors, calls cblasXherk on
// them and writes the C matrix back to buffers.c_mat. Always returns kSuccess.
static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
  // Zero-initialised host copies, filled by the reads below
  auto host_a = std::vector<T>(args.a_size, static_cast<T>(0));
  auto host_c = std::vector<T>(args.c_size, static_cast<T>(0));
  buffers.a_mat.Read(queue, args.a_size, host_a);
  buffers.c_mat.Read(queue, args.c_size, host_c);

  cblasXherk(convertToCBLAS(args.layout),
             convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose),
             args.n, args.k, args.alpha,
             host_a, args.a_offset, args.a_ld, args.beta,
             host_c, args.c_offset, args.c_ld);

  // Store the host-side C matrix back into its buffer
  buffers.c_mat.Write(queue, args.c_size, host_c);
  return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {

View file

@ -19,7 +19,12 @@
#include <vector>
#include <string>
#include "wrapper_clblas.h"
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
@ -78,7 +83,7 @@ class TestXsymm {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Symm(args.layout, args.side, args.triangle,
@ -92,20 +97,43 @@ class TestXsymm {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
// Runs the clBLAS SYMM reference on buffers.a_mat, buffers.b_mat and buffers.c_mat, using the
// handle returned by `queue()` as the single command queue. Returns the clBLAS status cast to a
// StatusCode.
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXsymm(static_cast<clblasOrder>(args.layout),
                            static_cast<clblasSide>(args.side),
                            static_cast<clblasUplo>(args.triangle),
                            args.m, args.n, args.alpha,
                            buffers.a_mat(), args.a_offset, args.a_ld,
                            buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
                            buffers.c_mat(), args.c_offset, args.c_ld,
                            1, &queue_plain, 0, nullptr, &event);
  // Only wait when clBLAS handed back an event, and release it afterwards: the event was
  // previously never released, and a null handle was passed to clWaitForEvents on failure.
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#ifdef CLBLAST_REF_CLBLAS
// Runs the clBLAS SYMM reference on buffers.a_mat, buffers.b_mat and buffers.c_mat, using the
// handle returned by `queue()` as the single command queue. Returns the clBLAS status cast to a
// StatusCode.
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXsymm(static_cast<clblasOrder>(args.layout),
                            static_cast<clblasSide>(args.side),
                            static_cast<clblasUplo>(args.triangle),
                            args.m, args.n, args.alpha,
                            buffers.a_mat(), args.a_offset, args.a_ld,
                            buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
                            buffers.c_mat(), args.c_offset, args.c_ld,
                            1, &queue_plain, 0, nullptr, &event);
  // Only wait when clBLAS handed back an event, and release it afterwards: the event was
  // previously never released, and a null handle was passed to clWaitForEvents on failure.
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
// CPU BLAS reference for SYMM: reads the A, B and C buffers into host vectors, calls cblasXsymm
// on them and writes the C matrix back to buffers.c_mat. Always returns kSuccess.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  // Zero-initialised host copies, filled by the reads below
  auto host_a = std::vector<T>(args.a_size, static_cast<T>(0));
  auto host_b = std::vector<T>(args.b_size, static_cast<T>(0));
  auto host_c = std::vector<T>(args.c_size, static_cast<T>(0));
  buffers.a_mat.Read(queue, args.a_size, host_a);
  buffers.b_mat.Read(queue, args.b_size, host_b);
  buffers.c_mat.Read(queue, args.c_size, host_c);

  cblasXsymm(convertToCBLAS(args.layout),
             convertToCBLAS(args.side), convertToCBLAS(args.triangle),
             args.m, args.n, args.alpha,
             host_a, args.a_offset, args.a_ld,
             host_b, args.b_offset, args.b_ld, args.beta,
             host_c, args.c_offset, args.c_ld);

  // Store the host-side C matrix back into its buffer
  buffers.c_mat.Write(queue, args.c_size, host_c);
  return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

View file

@ -19,7 +19,12 @@
#include <vector>
#include <string>
#include "wrapper_clblas.h"
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
@ -76,7 +81,7 @@ class TestXsyr2k {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Syr2k(args.layout, args.triangle, args.a_transpose,
@ -90,20 +95,43 @@ class TestXsyr2k {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
// Runs the clBLAS SYR2K reference on buffers.a_mat, buffers.b_mat and buffers.c_mat, using the
// handle returned by `queue()` as the single command queue. Returns the clBLAS status cast to a
// StatusCode.
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXsyr2k(static_cast<clblasOrder>(args.layout),
                             static_cast<clblasUplo>(args.triangle),
                             static_cast<clblasTranspose>(args.a_transpose),
                             args.n, args.k, args.alpha,
                             buffers.a_mat(), args.a_offset, args.a_ld,
                             buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
                             buffers.c_mat(), args.c_offset, args.c_ld,
                             1, &queue_plain, 0, nullptr, &event);
  // Only wait when clBLAS handed back an event, and release it afterwards: the event was
  // previously never released, and a null handle was passed to clWaitForEvents on failure.
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#ifdef CLBLAST_REF_CLBLAS
// Runs the clBLAS SYR2K reference on buffers.a_mat, buffers.b_mat and buffers.c_mat, using the
// handle returned by `queue()` as the single command queue. Returns the clBLAS status cast to a
// StatusCode.
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXsyr2k(static_cast<clblasOrder>(args.layout),
                             static_cast<clblasUplo>(args.triangle),
                             static_cast<clblasTranspose>(args.a_transpose),
                             args.n, args.k, args.alpha,
                             buffers.a_mat(), args.a_offset, args.a_ld,
                             buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
                             buffers.c_mat(), args.c_offset, args.c_ld,
                             1, &queue_plain, 0, nullptr, &event);
  // Only wait when clBLAS handed back an event, and release it afterwards: the event was
  // previously never released, and a null handle was passed to clWaitForEvents on failure.
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
// CPU BLAS reference for SYR2K: reads the A, B and C buffers into host vectors, calls
// cblasXsyr2k on them and writes the C matrix back to buffers.c_mat. Always returns kSuccess.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  // Zero-initialised host copies, filled by the reads below
  auto host_a = std::vector<T>(args.a_size, static_cast<T>(0));
  auto host_b = std::vector<T>(args.b_size, static_cast<T>(0));
  auto host_c = std::vector<T>(args.c_size, static_cast<T>(0));
  buffers.a_mat.Read(queue, args.a_size, host_a);
  buffers.b_mat.Read(queue, args.b_size, host_b);
  buffers.c_mat.Read(queue, args.c_size, host_c);

  cblasXsyr2k(convertToCBLAS(args.layout),
              convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose),
              args.n, args.k, args.alpha,
              host_a, args.a_offset, args.a_ld,
              host_b, args.b_offset, args.b_ld, args.beta,
              host_c, args.c_offset, args.c_ld);

  // Store the host-side C matrix back into its buffer
  buffers.c_mat.Write(queue, args.c_size, host_c);
  return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

View file

@ -19,7 +19,12 @@
#include <vector>
#include <string>
#include "wrapper_clblas.h"
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
@ -69,7 +74,7 @@ class TestXsyrk {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Syrk(args.layout, args.triangle, args.a_transpose,
@ -82,19 +87,39 @@ class TestXsyrk {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
// Runs the clBLAS SYRK reference on buffers.a_mat and buffers.c_mat, using the handle returned
// by `queue()` as the single command queue. Returns the clBLAS status cast to a StatusCode.
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXsyrk(static_cast<clblasOrder>(args.layout),
                            static_cast<clblasUplo>(args.triangle),
                            static_cast<clblasTranspose>(args.a_transpose),
                            args.n, args.k, args.alpha,
                            buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
                            buffers.c_mat(), args.c_offset, args.c_ld,
                            1, &queue_plain, 0, nullptr, &event);
  // Only wait when clBLAS handed back an event, and release it afterwards: the event was
  // previously never released, and a null handle was passed to clWaitForEvents on failure.
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#ifdef CLBLAST_REF_CLBLAS
// Runs the clBLAS SYRK reference on buffers.a_mat and buffers.c_mat, using the handle returned
// by `queue()` as the single command queue. Returns the clBLAS status cast to a StatusCode.
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXsyrk(static_cast<clblasOrder>(args.layout),
                            static_cast<clblasUplo>(args.triangle),
                            static_cast<clblasTranspose>(args.a_transpose),
                            args.n, args.k, args.alpha,
                            buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
                            buffers.c_mat(), args.c_offset, args.c_ld,
                            1, &queue_plain, 0, nullptr, &event);
  // Only wait when clBLAS handed back an event, and release it afterwards: the event was
  // previously never released, and a null handle was passed to clWaitForEvents on failure.
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
// CPU BLAS reference for SYRK: reads the A and C buffers into host vectors, calls cblasXsyrk on
// them and writes the C matrix back to buffers.c_mat. Always returns kSuccess.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  // Zero-initialised host copies, filled by the reads below
  auto host_a = std::vector<T>(args.a_size, static_cast<T>(0));
  auto host_c = std::vector<T>(args.c_size, static_cast<T>(0));
  buffers.a_mat.Read(queue, args.a_size, host_a);
  buffers.c_mat.Read(queue, args.c_size, host_c);

  cblasXsyrk(convertToCBLAS(args.layout),
             convertToCBLAS(args.triangle), convertToCBLAS(args.a_transpose),
             args.n, args.k, args.alpha,
             host_a, args.a_offset, args.a_ld, args.beta,
             host_c, args.c_offset, args.c_ld);

  // Store the host-side C matrix back into its buffer
  buffers.c_mat.Write(queue, args.c_size, host_c);
  return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

View file

@ -19,7 +19,12 @@
#include <vector>
#include <string>
#include "wrapper_clblas.h"
#ifdef CLBLAST_REF_CLBLAS
#include "wrapper_clblas.h"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "wrapper_cblas.h"
#endif
namespace clblast {
// =================================================================================================
@ -69,7 +74,7 @@ class TestXtrmm {
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Trmm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal,
@ -82,21 +87,43 @@ class TestXtrmm {
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
// Runs the clBLAS TRMM reference on buffers.a_mat and buffers.b_mat, using the handle returned
// by `queue()` as the single command queue. Returns the clBLAS status cast to a StatusCode.
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXtrmm(static_cast<clblasOrder>(args.layout),
                            static_cast<clblasSide>(args.side),
                            static_cast<clblasUplo>(args.triangle),
                            static_cast<clblasTranspose>(args.a_transpose),
                            static_cast<clblasDiag>(args.diagonal),
                            args.m, args.n, args.alpha,
                            buffers.a_mat(), args.a_offset, args.a_ld,
                            buffers.b_mat(), args.b_offset, args.b_ld,
                            1, &queue_plain, 0, nullptr, &event);
  // Only wait when clBLAS handed back an event, and release it afterwards: the event was
  // previously never released, and a null handle was passed to clWaitForEvents on failure.
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#ifdef CLBLAST_REF_CLBLAS
// Runs the clBLAS TRMM reference on buffers.a_mat and buffers.b_mat, using the handle returned
// by `queue()` as the single command queue. Returns the clBLAS status cast to a StatusCode.
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  auto queue_plain = queue();
  auto event = cl_event{};
  auto status = clblasXtrmm(static_cast<clblasOrder>(args.layout),
                            static_cast<clblasSide>(args.side),
                            static_cast<clblasUplo>(args.triangle),
                            static_cast<clblasTranspose>(args.a_transpose),
                            static_cast<clblasDiag>(args.diagonal),
                            args.m, args.n, args.alpha,
                            buffers.a_mat(), args.a_offset, args.a_ld,
                            buffers.b_mat(), args.b_offset, args.b_ld,
                            1, &queue_plain, 0, nullptr, &event);
  // Only wait when clBLAS handed back an event, and release it afterwards: the event was
  // previously never released, and a null handle was passed to clWaitForEvents on failure.
  if (event != nullptr) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
// CPU BLAS reference for TRMM: reads the A and B buffers into host vectors, calls cblasXtrmm on
// them and writes the B matrix back to buffers.b_mat. Always returns kSuccess.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  // Zero-initialised host copies, filled by the reads below
  auto host_a = std::vector<T>(args.a_size, static_cast<T>(0));
  auto host_b = std::vector<T>(args.b_size, static_cast<T>(0));
  buffers.a_mat.Read(queue, args.a_size, host_a);
  buffers.b_mat.Read(queue, args.b_size, host_b);

  cblasXtrmm(convertToCBLAS(args.layout),
             convertToCBLAS(args.side), convertToCBLAS(args.triangle),
             convertToCBLAS(args.a_transpose), convertToCBLAS(args.diagonal),
             args.m, args.n, args.alpha,
             host_a, args.a_offset, args.a_ld,
             host_b, args.b_offset, args.b_ld);

  // Store the host-side B matrix back into its buffer
  buffers.b_mat.Write(queue, args.b_size, host_b);
  return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

1674
test/wrapper_cblas.h Normal file

File diff suppressed because it is too large Load diff

View file

@ -65,7 +65,7 @@ template <typename T>
clblasStatus clblasXrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
cl_mem sd2_buffer, const size_t sd2_offset,
cl_mem sx1_buffer, const size_t sx1_offset,
cl_mem sy1_buffer, const size_t sy1_offset,
const cl_mem sy1_buffer, const size_t sy1_offset,
cl_mem sparam_buffer, const size_t sparam_offset,
cl_uint num_queues, cl_command_queue *queues,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events);
@ -73,7 +73,7 @@ template <>
clblasStatus clblasXrotmg<float>(cl_mem sd1_buffer, const size_t sd1_offset,
cl_mem sd2_buffer, const size_t sd2_offset,
cl_mem sx1_buffer, const size_t sx1_offset,
cl_mem sy1_buffer, const size_t sy1_offset,
const cl_mem sy1_buffer, const size_t sy1_offset,
cl_mem sparam_buffer, const size_t sparam_offset,
cl_uint num_queues, cl_command_queue *queues,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
@ -88,7 +88,7 @@ template <>
clblasStatus clblasXrotmg<double>(cl_mem sd1_buffer, const size_t sd1_offset,
cl_mem sd2_buffer, const size_t sd2_offset,
cl_mem sx1_buffer, const size_t sx1_offset,
cl_mem sy1_buffer, const size_t sy1_offset,
const cl_mem sy1_buffer, const size_t sy1_offset,
cl_mem sparam_buffer, const size_t sparam_offset,
cl_uint num_queues, cl_command_queue *queues,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {