mirror of
https://github.com/CNugteren/CLBlast.git
synced 2024-08-27 07:17:00 +02:00
Merge branch 'cpu_blas' into development
This commit is contained in:
commit
2981ca4d3c
|
@ -2,6 +2,7 @@
|
||||||
Development version (next release)
|
Development version (next release)
|
||||||
- Added exports to be able to create a DLL on Windows (thanks to Marco Hutter)
|
- Added exports to be able to create a DLL on Windows (thanks to Marco Hutter)
|
||||||
- Made the library thread-safe
|
- Made the library thread-safe
|
||||||
|
- Performance and correctness tests can now (on top of clBLAS) be performed against CPU BLAS libraries
|
||||||
- Fixed the use of events within the library
|
- Fixed the use of events within the library
|
||||||
- Added level-1 routines:
|
- Added level-1 routines:
|
||||||
* SNRM2/DNRM2/ScNRM2/DzNRM2
|
* SNRM2/DNRM2/ScNRM2/DzNRM2
|
||||||
|
|
|
@ -66,7 +66,7 @@ else ()
|
||||||
set(FLAGS "${FLAGS} -Wno-attributes -Wno-unused-variable")
|
set(FLAGS "${FLAGS} -Wno-attributes -Wno-unused-variable")
|
||||||
endif()
|
endif()
|
||||||
elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
|
elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
|
||||||
set(FLAGS "${FLAGS} -Weverything -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded")
|
set(FLAGS "${FLAGS} -Wall -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded")
|
||||||
set(FLAGS "${FLAGS} -Wno-missing-prototypes -Wno-float-equal -Wno-switch-enum -Wno-switch")
|
set(FLAGS "${FLAGS} -Wno-missing-prototypes -Wno-float-equal -Wno-switch-enum -Wno-switch")
|
||||||
set(FLAGS "${FLAGS} -Wno-exit-time-destructors -Wno-global-constructors -Wno-missing-noreturn")
|
set(FLAGS "${FLAGS} -Wno-exit-time-destructors -Wno-global-constructors -Wno-missing-noreturn")
|
||||||
endif()
|
endif()
|
||||||
|
@ -98,11 +98,13 @@ if(TUNERS)
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
# Locates the clBLAS library in case the tests need to be compiled. "FindclBLAS.cmake" is included.
|
# Locates the reference BLAS libraries in case the tests need to be compiled. The "FindclBLAS.cmake"
|
||||||
|
# and "FindCBLAS.cmake" are included.
|
||||||
if(TESTS)
|
if(TESTS)
|
||||||
find_package(clBLAS)
|
find_package(clBLAS)
|
||||||
if(NOT CLBLAS_FOUND)
|
find_package(CBLAS)
|
||||||
message(STATUS "Could NOT find clBLAS, disabling the compilation of the tests")
|
if(NOT CLBLAS_FOUND AND NOT CBLAS_FOUND)
|
||||||
|
message(STATUS "Could NOT find clBLAS nor a CPU BLAS, disabling the compilation of the tests")
|
||||||
set(TESTS OFF)
|
set(TESTS OFF)
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
@ -215,11 +217,33 @@ endif()
|
||||||
# ==================================================================================================
|
# ==================================================================================================
|
||||||
|
|
||||||
# Down from here is all test (performance and correctness) related. Note that these tests require
|
# Down from here is all test (performance and correctness) related. Note that these tests require
|
||||||
# the presence of the clBLAS library to act as a reference.
|
# the presence of clBLAS and/or a BLAS library to act as a reference.
|
||||||
if(TESTS)
|
if(TESTS)
|
||||||
|
|
||||||
# Adds new include directories for the reference clBLAS
|
# Sets the specifics for the reference BLAS libraries
|
||||||
include_directories(${clblast_SOURCE_DIR}/test ${CLBLAS_INCLUDE_DIRS})
|
set(REF_INCLUDES )
|
||||||
|
set(REF_LIBRARIES )
|
||||||
|
if(CLBLAS_FOUND)
|
||||||
|
set(REF_INCLUDES ${REF_INCLUDES} ${CLBLAS_INCLUDE_DIRS})
|
||||||
|
set(REF_LIBRARIES ${REF_LIBRARIES} ${CLBLAS_LIBRARIES})
|
||||||
|
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
|
||||||
|
add_definitions(" /DCLBLAST_REF_CLBLAS")
|
||||||
|
else()
|
||||||
|
add_definitions(" -DCLBLAST_REF_CLBLAS")
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
if(CBLAS_FOUND)
|
||||||
|
set(REF_INCLUDES ${REF_INCLUDES} ${CBLAS_INCLUDE_DIRS})
|
||||||
|
set(REF_LIBRARIES ${REF_LIBRARIES} ${CBLAS_LIBRARIES})
|
||||||
|
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
|
||||||
|
add_definitions(" /DCLBLAST_REF_CBLAS")
|
||||||
|
else()
|
||||||
|
add_definitions(" -DCLBLAST_REF_CBLAS")
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
|
# Sets the include directories
|
||||||
|
include_directories(${clblast_SOURCE_DIR}/test ${REF_INCLUDES})
|
||||||
|
|
||||||
# Creates the common correctness-tests objects (requires CMake 2.8.8)
|
# Creates the common correctness-tests objects (requires CMake 2.8.8)
|
||||||
add_library(test_correctness_common OBJECT
|
add_library(test_correctness_common OBJECT
|
||||||
|
@ -239,7 +263,7 @@ if(TESTS)
|
||||||
test/correctness/routines/level3/${ROUTINE}.cc)
|
test/correctness/routines/level3/${ROUTINE}.cc)
|
||||||
endforeach()
|
endforeach()
|
||||||
foreach(ROUTINE ${ROUTINES})
|
foreach(ROUTINE ${ROUTINES})
|
||||||
target_link_libraries(clblast_test_${ROUTINE} clblast ${CLBLAS_LIBRARIES} ${OPENCL_LIBRARIES})
|
target_link_libraries(clblast_test_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
|
||||||
install(TARGETS clblast_test_${ROUTINE} DESTINATION bin)
|
install(TARGETS clblast_test_${ROUTINE} DESTINATION bin)
|
||||||
endforeach()
|
endforeach()
|
||||||
|
|
||||||
|
@ -269,7 +293,7 @@ if(TESTS)
|
||||||
test/performance/routines/level3/${ROUTINE}.cc)
|
test/performance/routines/level3/${ROUTINE}.cc)
|
||||||
endforeach()
|
endforeach()
|
||||||
foreach(ROUTINE ${ROUTINES})
|
foreach(ROUTINE ${ROUTINES})
|
||||||
target_link_libraries(clblast_client_${ROUTINE} clblast ${CLBLAS_LIBRARIES} ${OPENCL_LIBRARIES})
|
target_link_libraries(clblast_client_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
|
||||||
install(TARGETS clblast_client_${ROUTINE} DESTINATION bin)
|
install(TARGETS clblast_client_${ROUTINE} DESTINATION bin)
|
||||||
endforeach()
|
endforeach()
|
||||||
|
|
||||||
|
|
13
README.md
13
README.md
|
@ -52,6 +52,14 @@ The pre-requisites for compilation of CLBlast are:
|
||||||
- Intel OpenCL
|
- Intel OpenCL
|
||||||
- Beignet
|
- Beignet
|
||||||
|
|
||||||
|
Furthermore, to build the (optional) correctness and performance tests, another BLAS library is needed to serve as a reference. This can be either:
|
||||||
|
|
||||||
|
* The OpenCL BLAS library [clBLAS](http://github.com/clMathLibraries/clBLAS) (maintained by AMD)
|
||||||
|
* A regular CPU Netlib BLAS library, e.g.:
|
||||||
|
- OpenBLAS
|
||||||
|
- BLIS
|
||||||
|
- Accelerate
|
||||||
|
|
||||||
An example of an out-of-source build (starting from the root of the CLBlast folder):
|
An example of an out-of-source build (starting from the root of the CLBlast folder):
|
||||||
|
|
||||||
mkdir build
|
mkdir build
|
||||||
|
@ -135,9 +143,9 @@ To make sure CLBlast is working correctly on your device (recommended), compile
|
||||||
|
|
||||||
cmake -DTESTS=ON ..
|
cmake -DTESTS=ON ..
|
||||||
|
|
||||||
Afterwards, executables in the form of `clblast_test_xxxxx` are available, in which `xxxxx` is the name of a routine (e.g. `xgemm`). Note that CLBlast is tested against [clBLAS](http://github.com/clMathLibraries/clBLAS) for correctness. The library clBLAS is therefore required to be installed on your system for the CLBlast tests.
|
Afterwards, executables in the form of `clblast_test_xxxxx` are available, in which `xxxxx` is the name of a routine (e.g. `xgemm`). Note that CLBlast is best tested against [clBLAS](http://github.com/clMathLibraries/clBLAS) for correctness. If the library clBLAS is not installed on your system, it will use a regular CPU BLAS library to test against.
|
||||||
|
|
||||||
With the `-DTESTS=ON` flag, additional performance tests are compiled. These come in the form of client executables named `clblast_client_xxxxx`, in which `xxxxx` is the name of a routine (e.g. `xgemm`). These clients take a bunch of configuration options and directly run both CLBlast and clBLAS in a head-to-head performance test.
|
With the `-DTESTS=ON` flag, additional performance tests are compiled. These come in the form of client executables named `clblast_client_xxxxx`, in which `xxxxx` is the name of a routine (e.g. `xgemm`). These clients take a bunch of configuration options and directly run CLBlast in a head-to-head performance test against clBLAS and/or a CPU BLAS library.
|
||||||
|
|
||||||
|
|
||||||
Performance remarks
|
Performance remarks
|
||||||
|
@ -249,4 +257,3 @@ To-do list before release of version 1.0
|
||||||
- Support all routines supported by clBLAS
|
- Support all routines supported by clBLAS
|
||||||
- Allow the user control over events and synchronization
|
- Allow the user control over events and synchronization
|
||||||
- Add half-precision routines (e.g. HGEMM)
|
- Add half-precision routines (e.g. HGEMM)
|
||||||
- Enable correctness and performance testing against a CPU-based BLAS library
|
|
||||||
|
|
75
cmake/Modules/FindCBLAS.cmake
Normal file
75
cmake/Modules/FindCBLAS.cmake
Normal file
|
@ -0,0 +1,75 @@
|
||||||
|
|
||||||
|
# ==================================================================================================
|
||||||
|
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
# project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
# width of 100 characters per line.
|
||||||
|
#
|
||||||
|
# Author(s):
|
||||||
|
# Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
#
|
||||||
|
# ==================================================================================================
|
||||||
|
#
|
||||||
|
# Defines the following variables:
|
||||||
|
# CBLAS_FOUND Boolean holding whether or not the Netlib BLAS library was found
|
||||||
|
# CBLAS_INCLUDE_DIRS The Netlib BLAS include directory
|
||||||
|
# CBLAS_LIBRARIES The Netlib BLAS library
|
||||||
|
#
|
||||||
|
# In case BLAS is not installed in the default directory, set the CBLAS_ROOT variable to point to
|
||||||
|
# the root of BLAS, such that 'cblas.h' can be found in $CBLAS_ROOT/include. This can either be
|
||||||
|
# done using an environmental variable (e.g. export CBLAS_ROOT=/path/to/BLAS) or using a CMake
|
||||||
|
# variable (e.g. cmake -DCBLAS_ROOT=/path/to/BLAS ..).
|
||||||
|
#
|
||||||
|
# ==================================================================================================
|
||||||
|
|
||||||
|
# Sets the possible install locations
|
||||||
|
set(CBLAS_HINTS
|
||||||
|
${CBLAS_ROOT}
|
||||||
|
$ENV{CBLAS_ROOT}
|
||||||
|
)
|
||||||
|
set(CBLAS_PATHS
|
||||||
|
/usr
|
||||||
|
/usr/local
|
||||||
|
/usr/local/opt
|
||||||
|
/System/Library/Frameworks
|
||||||
|
)
|
||||||
|
|
||||||
|
# Finds the include directories
|
||||||
|
find_path(CBLAS_INCLUDE_DIRS
|
||||||
|
NAMES cblas.h
|
||||||
|
HINTS ${CBLAS_HINTS}
|
||||||
|
PATH_SUFFIXES
|
||||||
|
include inc include/x86_64 include/x64
|
||||||
|
openblas/include include/blis blis/include blis/include/blis
|
||||||
|
Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Versions/Current/Headers
|
||||||
|
PATHS ${CBLAS_PATHS}
|
||||||
|
DOC "Netlib BLAS include header cblas.h"
|
||||||
|
)
|
||||||
|
mark_as_advanced(CBLAS_INCLUDE_DIRS)
|
||||||
|
|
||||||
|
# Finds the library
|
||||||
|
find_library(CBLAS_LIBRARIES
|
||||||
|
NAMES blas mkl blis openblas atlas accelerate
|
||||||
|
HINTS ${CBLAS_HINTS}
|
||||||
|
PATH_SUFFIXES
|
||||||
|
lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 lib/import lib64/import
|
||||||
|
openblas/lib blis/lib
|
||||||
|
PATHS ${CBLAS_PATHS}
|
||||||
|
DOC "Netlib BLAS library"
|
||||||
|
)
|
||||||
|
mark_as_advanced(CBLAS_LIBRARIES)
|
||||||
|
|
||||||
|
# ==================================================================================================
|
||||||
|
|
||||||
|
# Notification messages
|
||||||
|
if(NOT CBLAS_INCLUDE_DIRS)
|
||||||
|
message(STATUS "Could NOT find 'cblas.h', install a CPU Netlib BLAS or set CBLAS_ROOT")
|
||||||
|
endif()
|
||||||
|
if(NOT CBLAS_LIBRARIES)
|
||||||
|
message(STATUS "Could NOT find a CPU Netlib BLAS library, install it or set CBLAS_ROOT")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
# Determines whether or not BLAS was found
|
||||||
|
include(FindPackageHandleStandardArgs)
|
||||||
|
find_package_handle_standard_args(CBLAS DEFAULT_MSG CBLAS_INCLUDE_DIRS CBLAS_LIBRARIES)
|
||||||
|
|
||||||
|
# ==================================================================================================
|
|
@ -100,7 +100,7 @@ template <typename T>
|
||||||
StatusCode Rotmg(cl_mem sd1_buffer, const size_t sd1_offset,
|
StatusCode Rotmg(cl_mem sd1_buffer, const size_t sd1_offset,
|
||||||
cl_mem sd2_buffer, const size_t sd2_offset,
|
cl_mem sd2_buffer, const size_t sd2_offset,
|
||||||
cl_mem sx1_buffer, const size_t sx1_offset,
|
cl_mem sx1_buffer, const size_t sx1_offset,
|
||||||
cl_mem sy1_buffer, const size_t sy1_offset,
|
const cl_mem sy1_buffer, const size_t sy1_offset,
|
||||||
cl_mem sparam_buffer, const size_t sparam_offset,
|
cl_mem sparam_buffer, const size_t sparam_offset,
|
||||||
cl_command_queue* queue, cl_event* event = nullptr);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
|
|
|
@ -112,13 +112,13 @@ StatusCode PUBLIC_API CLBlastDrotg(cl_mem sa_buffer, const size_t sa_offset,
|
||||||
StatusCode PUBLIC_API CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
|
StatusCode PUBLIC_API CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
|
||||||
cl_mem sd2_buffer, const size_t sd2_offset,
|
cl_mem sd2_buffer, const size_t sd2_offset,
|
||||||
cl_mem sx1_buffer, const size_t sx1_offset,
|
cl_mem sx1_buffer, const size_t sx1_offset,
|
||||||
cl_mem sy1_buffer, const size_t sy1_offset,
|
const cl_mem sy1_buffer, const size_t sy1_offset,
|
||||||
cl_mem sparam_buffer, const size_t sparam_offset,
|
cl_mem sparam_buffer, const size_t sparam_offset,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event);
|
||||||
StatusCode PUBLIC_API CLBlastDrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
|
StatusCode PUBLIC_API CLBlastDrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
|
||||||
cl_mem sd2_buffer, const size_t sd2_offset,
|
cl_mem sd2_buffer, const size_t sd2_offset,
|
||||||
cl_mem sx1_buffer, const size_t sx1_offset,
|
cl_mem sx1_buffer, const size_t sx1_offset,
|
||||||
cl_mem sy1_buffer, const size_t sy1_offset,
|
const cl_mem sy1_buffer, const size_t sy1_offset,
|
||||||
cl_mem sparam_buffer, const size_t sparam_offset,
|
cl_mem sparam_buffer, const size_t sparam_offset,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event);
|
||||||
|
|
||||||
|
|
|
@ -465,31 +465,33 @@ class Buffer {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Copies from device to host: reading the device buffer asynchronously
|
// Copies from device to host: reading the device buffer asynchronously
|
||||||
void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) {
|
void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
|
||||||
if (access_ == BufferAccess::kWriteOnly) { Error("reading from a write-only buffer"); }
|
if (access_ == BufferAccess::kWriteOnly) { Error("reading from a write-only buffer"); }
|
||||||
CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
|
CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
|
||||||
host, 0, nullptr, nullptr));
|
host, 0, nullptr, nullptr));
|
||||||
}
|
}
|
||||||
void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host,
|
void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host,
|
||||||
const size_t offset = 0) {
|
const size_t offset = 0) const {
|
||||||
if (host.size() < size) { Error("target host buffer is too small"); }
|
if (host.size() < size) { Error("target host buffer is too small"); }
|
||||||
ReadAsync(queue, size, host.data(), offset);
|
ReadAsync(queue, size, host.data(), offset);
|
||||||
}
|
}
|
||||||
void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host,
|
void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host,
|
||||||
const size_t offset = 0) {
|
const size_t offset = 0) const {
|
||||||
if (host.size() < size) { Error("target host buffer is too small"); }
|
if (host.size() < size) { Error("target host buffer is too small"); }
|
||||||
ReadAsync(queue, size, host.data(), offset);
|
ReadAsync(queue, size, host.data(), offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Copies from device to host: reading the device buffer
|
// Copies from device to host: reading the device buffer
|
||||||
void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) {
|
void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
|
||||||
ReadAsync(queue, size, host, offset);
|
ReadAsync(queue, size, host, offset);
|
||||||
queue.Finish();
|
queue.Finish();
|
||||||
}
|
}
|
||||||
void Read(const Queue &queue, const size_t size, std::vector<T> &host, const size_t offset = 0) {
|
void Read(const Queue &queue, const size_t size, std::vector<T> &host,
|
||||||
|
const size_t offset = 0) const {
|
||||||
Read(queue, size, host.data(), offset);
|
Read(queue, size, host.data(), offset);
|
||||||
}
|
}
|
||||||
void Read(const Queue &queue, const size_t size, BufferHost<T> &host, const size_t offset = 0) {
|
void Read(const Queue &queue, const size_t size, BufferHost<T> &host,
|
||||||
|
const size_t offset = 0) const {
|
||||||
Read(queue, size, host.data(), offset);
|
Read(queue, size, host.data(), offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -35,6 +35,9 @@ using double2 = std::complex<double>;
|
||||||
const std::string kKhronosHalfPrecision = "cl_khr_fp16";
|
const std::string kKhronosHalfPrecision = "cl_khr_fp16";
|
||||||
const std::string kKhronosDoublePrecision = "cl_khr_fp64";
|
const std::string kKhronosDoublePrecision = "cl_khr_fp64";
|
||||||
|
|
||||||
|
// Caught an unknown error
|
||||||
|
constexpr auto kUnknownError = -999;
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
// The routine-specific arguments in string form
|
// The routine-specific arguments in string form
|
||||||
|
@ -70,6 +73,7 @@ constexpr auto kArgFraction = "fraction";
|
||||||
|
|
||||||
// The client-specific arguments in string form
|
// The client-specific arguments in string form
|
||||||
constexpr auto kArgCompareclblas = "clblas";
|
constexpr auto kArgCompareclblas = "clblas";
|
||||||
|
constexpr auto kArgComparecblas = "cblas";
|
||||||
constexpr auto kArgStepSize = "step";
|
constexpr auto kArgStepSize = "step";
|
||||||
constexpr auto kArgNumSteps = "num_steps";
|
constexpr auto kArgNumSteps = "num_steps";
|
||||||
constexpr auto kArgNumRuns = "runs";
|
constexpr auto kArgNumRuns = "runs";
|
||||||
|
@ -128,6 +132,7 @@ struct Arguments {
|
||||||
double fraction = 1.0;
|
double fraction = 1.0;
|
||||||
// Client-specific arguments
|
// Client-specific arguments
|
||||||
int compare_clblas = 1;
|
int compare_clblas = 1;
|
||||||
|
int compare_cblas = 1;
|
||||||
size_t step = 1;
|
size_t step = 1;
|
||||||
size_t num_steps = 0;
|
size_t num_steps = 0;
|
||||||
size_t num_runs = 10;
|
size_t num_runs = 10;
|
||||||
|
|
|
@ -58,5 +58,10 @@ class DataType():
|
||||||
return "<"+self.buffertype+","+self.beta_cpp+">, "+self.buffertype+", "+self.beta_cpp
|
return "<"+self.buffertype+","+self.beta_cpp+">, "+self.buffertype+", "+self.beta_cpp
|
||||||
return "<"+self.buffertype+">, "+self.buffertype+", "+self.beta_cpp
|
return "<"+self.buffertype+">, "+self.buffertype+", "+self.beta_cpp
|
||||||
|
|
||||||
|
# Current scalar is complex
|
||||||
|
def IsComplex(self, scalar):
|
||||||
|
return ((scalar == "alpha" and self.alpha_cpp in [FLT2, DBL2]) or
|
||||||
|
(scalar == "beta" and self.beta_cpp in [FLT2, DBL2]))
|
||||||
|
|
||||||
|
|
||||||
# ==================================================================================================
|
# ==================================================================================================
|
||||||
|
|
|
@ -8,12 +8,13 @@
|
||||||
# Cedric Nugteren <www.cedricnugteren.nl>
|
# Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
#
|
#
|
||||||
# This script automatically generates the bodies of the following files, creating the full CLBlast
|
# This script automatically generates the bodies of the following files, creating the full CLBlast
|
||||||
# API interface and implementation (C, C++, and clBLAS wrapper):
|
# API interface and implementation (C, C++, and reference BLAS wrappers):
|
||||||
# clblast.h
|
# clblast.h
|
||||||
# clblast.cc
|
# clblast.cc
|
||||||
# clblast_c.h
|
# clblast_c.h
|
||||||
# clblast_c.cc
|
# clblast_c.cc
|
||||||
# wrapper_clblas.h
|
# wrapper_clblas.h
|
||||||
|
# wrapper_cblas.h
|
||||||
# It also generates the main functions for the correctness and performance tests as found in
|
# It also generates the main functions for the correctness and performance tests as found in
|
||||||
# test/correctness/routines/levelX/xYYYY.cc
|
# test/correctness/routines/levelX/xYYYY.cc
|
||||||
# test/performance/routines/levelX/xYYYY.cc
|
# test/performance/routines/levelX/xYYYY.cc
|
||||||
|
@ -55,7 +56,7 @@ TU = DataType("TU", "typename T, typename U", "T,U", ["T", "U", "T", "U"], "T")
|
||||||
routines = [
|
routines = [
|
||||||
[ # Level 1: vector-vector
|
[ # Level 1: vector-vector
|
||||||
Routine(False, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation"),
|
Routine(False, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation"),
|
||||||
Routine(False, "1", "rotmg", T, [S,D], [], [], [], ["sd1","sd2","sx1","sy1","sparam"], [], "", "Generate modified givens plane rotation"),
|
Routine(False, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], [], "", "Generate modified givens plane rotation"),
|
||||||
Routine(False, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation"),
|
Routine(False, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation"),
|
||||||
Routine(False, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [], "", "Apply modified givens plane rotation"),
|
Routine(False, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [], "", "Apply modified givens plane rotation"),
|
||||||
Routine(True, "1", "swap", T, [S,D,C,Z], ["n"], [], [], ["x","y"], [], "", "Swap two vectors"),
|
Routine(True, "1", "swap", T, [S,D,C,Z], ["n"], [], [], ["x","y"], [], "", "Swap two vectors"),
|
||||||
|
@ -220,11 +221,11 @@ def wrapper_clblas(routines):
|
||||||
for routine in routines:
|
for routine in routines:
|
||||||
result += "\n// Forwards the clBLAS calls for %s\n" % (routine.ShortNames())
|
result += "\n// Forwards the clBLAS calls for %s\n" % (routine.ShortNames())
|
||||||
if routine.NoScalars():
|
if routine.NoScalars():
|
||||||
result += routine.RoutineHeaderWrapper(routine.template, True, 21)+";\n"
|
result += routine.RoutineHeaderWrapperCL(routine.template, True, 21)+";\n"
|
||||||
for flavour in routine.flavours:
|
for flavour in routine.flavours:
|
||||||
indent = " "*(17 + routine.Length())
|
indent = " "*(17 + routine.Length())
|
||||||
result += routine.RoutineHeaderWrapper(flavour, False, 21)+" {\n"
|
result += routine.RoutineHeaderWrapperCL(flavour, False, 21)+" {\n"
|
||||||
arguments = routine.ArgumentsWrapper(flavour)
|
arguments = routine.ArgumentsWrapperCL(flavour)
|
||||||
if routine.scratch:
|
if routine.scratch:
|
||||||
result += " auto queue = Queue(queues[0]);\n"
|
result += " auto queue = Queue(queues[0]);\n"
|
||||||
result += " auto context = queue.GetContext();\n"
|
result += " auto context = queue.GetContext();\n"
|
||||||
|
@ -236,6 +237,41 @@ def wrapper_clblas(routines):
|
||||||
result += "\n}\n"
|
result += "\n}\n"
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
# The wrapper to the reference CBLAS routines (for performance/correctness testing)
|
||||||
|
def wrapper_cblas(routines):
|
||||||
|
result = ""
|
||||||
|
for routine in routines:
|
||||||
|
result += "\n// Forwards the Netlib BLAS calls for %s\n" % (routine.ShortNames())
|
||||||
|
for flavour in routine.flavours:
|
||||||
|
indent = " "*(10 + routine.Length())
|
||||||
|
result += routine.RoutineHeaderWrapperC(flavour, False, 12)+" {\n"
|
||||||
|
arguments = routine.ArgumentsWrapperC(flavour)
|
||||||
|
|
||||||
|
# Complex scalars: emitted as a two-element (real, imaginary) array
|
||||||
|
for scalar in routine.scalars:
|
||||||
|
if flavour.IsComplex(scalar):
|
||||||
|
result += " const auto "+scalar+"_array = std::vector<"+flavour.buffertype[:-1]+">{"+scalar+".real(), "+scalar+".imag()};\n"
|
||||||
|
|
||||||
|
# Special case for scalar outputs
|
||||||
|
assignment = ""
|
||||||
|
postfix = ""
|
||||||
|
extra_argument = ""
|
||||||
|
for output_buffer in routine.outputs:
|
||||||
|
if output_buffer in routine.ScalarBuffersFirst():
|
||||||
|
if flavour in [C,Z]:
|
||||||
|
postfix += "_sub"
|
||||||
|
indent += " "
|
||||||
|
extra_argument += ",\n"+indent+"reinterpret_cast<return_pointer_"+flavour.buffertype[:-1]+">(&"+output_buffer+"_buffer["+output_buffer+"_offset])"
|
||||||
|
else:
|
||||||
|
assignment = output_buffer+"_buffer["+output_buffer+"_offset] = "
|
||||||
|
indent += " "*len(assignment)
|
||||||
|
|
||||||
|
result += " "+assignment+"cblas_"+flavour.name.lower()+routine.name+postfix+"("
|
||||||
|
result += (",\n"+indent).join([a for a in arguments])
|
||||||
|
result += extra_argument+");"
|
||||||
|
result += "\n}\n"
|
||||||
|
return result
|
||||||
|
|
||||||
# ==================================================================================================
|
# ==================================================================================================
|
||||||
|
|
||||||
# Checks for the number of command-line arguments
|
# Checks for the number of command-line arguments
|
||||||
|
@ -251,9 +287,10 @@ files = [
|
||||||
path_clblast+"/include/clblast_c.h",
|
path_clblast+"/include/clblast_c.h",
|
||||||
path_clblast+"/src/clblast_c.cc",
|
path_clblast+"/src/clblast_c.cc",
|
||||||
path_clblast+"/test/wrapper_clblas.h",
|
path_clblast+"/test/wrapper_clblas.h",
|
||||||
|
path_clblast+"/test/wrapper_cblas.h",
|
||||||
]
|
]
|
||||||
header_lines = [84, 65, 93, 22, 22]
|
header_lines = [84, 65, 93, 22, 22, 38]
|
||||||
footer_lines = [6, 3, 9, 2, 6]
|
footer_lines = [6, 3, 9, 2, 6, 6]
|
||||||
|
|
||||||
# Checks whether the command-line arguments are valid; exits otherwise
|
# Checks whether the command-line arguments are valid; exits otherwise
|
||||||
for f in files:
|
for f in files:
|
||||||
|
@ -287,6 +324,8 @@ for i in xrange(0,len(files)):
|
||||||
body += clblast_c_cc(routines[level-1])
|
body += clblast_c_cc(routines[level-1])
|
||||||
if i == 4:
|
if i == 4:
|
||||||
body += wrapper_clblas(routines[level-1])
|
body += wrapper_clblas(routines[level-1])
|
||||||
|
if i == 5:
|
||||||
|
body += wrapper_cblas(routines[level-1])
|
||||||
f.write("".join(file_header))
|
f.write("".join(file_header))
|
||||||
f.write(body)
|
f.write(body)
|
||||||
f.write("".join(file_footer))
|
f.write("".join(file_footer))
|
||||||
|
|
|
@ -28,7 +28,7 @@ def OptionToCLBlast(x):
|
||||||
}[x]
|
}[x]
|
||||||
|
|
||||||
# As above, but for clBLAS data-types
|
# As above, but for clBLAS data-types
|
||||||
def OptionToWrapper(x):
|
def OptionToWrapperCL(x):
|
||||||
return {
|
return {
|
||||||
'layout': "clblasOrder",
|
'layout': "clblasOrder",
|
||||||
'a_transpose': "clblasTranspose",
|
'a_transpose': "clblasTranspose",
|
||||||
|
@ -39,6 +39,18 @@ def OptionToWrapper(x):
|
||||||
'diagonal': "clblasDiag",
|
'diagonal': "clblasDiag",
|
||||||
}[x]
|
}[x]
|
||||||
|
|
||||||
|
# As above, but for CBLAS data-types
|
||||||
|
def OptionToWrapperC(x):
|
||||||
|
return {
|
||||||
|
'layout': "CBLAS_ORDER",
|
||||||
|
'a_transpose': "CBLAS_TRANSPOSE",
|
||||||
|
'b_transpose': "CBLAS_TRANSPOSE",
|
||||||
|
'ab_transpose': "CBLAS_TRANSPOSE",
|
||||||
|
'side': "CBLAS_SIDE",
|
||||||
|
'triangle': "CBLAS_UPLO",
|
||||||
|
'diagonal': "CBLAS_DIAG",
|
||||||
|
}[x]
|
||||||
|
|
||||||
# ==================================================================================================
|
# ==================================================================================================
|
||||||
|
|
||||||
# Class holding routine-specific information (e.g. name, which arguments, which precisions)
|
# Class holding routine-specific information (e.g. name, which arguments, which precisions)
|
||||||
|
@ -119,6 +131,16 @@ class Routine():
|
||||||
return [", ".join(a+b+c)]
|
return [", ".join(a+b+c)]
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
# As above but as vectors
|
||||||
|
def BufferDefVector(self, name, flavour):
|
||||||
|
prefix = "const " if (name in self.inputs) else ""
|
||||||
|
if (name in self.inputs) or (name in self.outputs):
|
||||||
|
a = [prefix+"std::vector<"+flavour.buffertype+">& "+name+"_buffer"]
|
||||||
|
b = ["const size_t "+name+"_offset"]
|
||||||
|
c = ["const size_t "+name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else []
|
||||||
|
return [", ".join(a+b+c)]
|
||||||
|
return []
|
||||||
|
|
||||||
# As above but with Claduc buffers
|
# As above but with Claduc buffers
|
||||||
def BufferCladuc(self, name):
|
def BufferCladuc(self, name):
|
||||||
if (name in self.inputs) or (name in self.outputs):
|
if (name in self.inputs) or (name in self.outputs):
|
||||||
|
@ -129,7 +151,7 @@ class Routine():
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# As above but with a static cast for clBLAS wrapper
|
# As above but with a static cast for clBLAS wrapper
|
||||||
def BufferWrapper(self, name):
|
def BufferWrapperCL(self, name):
|
||||||
if (name in self.inputs) or (name in self.outputs):
|
if (name in self.inputs) or (name in self.outputs):
|
||||||
a = [name+"_buffer"]
|
a = [name+"_buffer"]
|
||||||
b = [name+"_offset"]
|
b = [name+"_offset"]
|
||||||
|
@ -141,6 +163,24 @@ class Routine():
|
||||||
return [", ".join(a+b+c)]
|
return [", ".join(a+b+c)]
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
# As above but with a static cast for CBLAS wrapper
|
||||||
|
def BufferWrapperC(self, name, flavour):
|
||||||
|
prefix = "const " if (name in self.inputs) else ""
|
||||||
|
if (name in self.inputs) or (name in self.outputs):
|
||||||
|
if name == "sy1":
|
||||||
|
a = [name+"_buffer["+name+"_offset]"]
|
||||||
|
elif flavour.precision_name in ["C","Z"]:
|
||||||
|
a = ["reinterpret_cast<"+prefix+flavour.buffertype[:-1]+"*>(&"+name+"_buffer["+name+"_offset])"]
|
||||||
|
else:
|
||||||
|
a = ["&"+name+"_buffer["+name+"_offset]"]
|
||||||
|
c = []
|
||||||
|
if (name in ["x","y"]):
|
||||||
|
c = ["static_cast<int>("+name+"_"+self.Postfix(name)+")"]
|
||||||
|
elif (name in ["a","b","c"]):
|
||||||
|
c = [name+"_"+self.Postfix(name)]
|
||||||
|
return [", ".join(a+c)]
|
||||||
|
return []
|
||||||
|
|
||||||
# As above, but only data-types
|
# As above, but only data-types
|
||||||
def BufferType(self, name):
|
def BufferType(self, name):
|
||||||
prefix = "const " if (name in self.inputs) else ""
|
prefix = "const " if (name in self.inputs) else ""
|
||||||
|
@ -179,6 +219,14 @@ class Routine():
|
||||||
return [name]
|
return [name]
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
# Retrieves the use of a scalar for CBLAS (alpha/beta)
|
||||||
|
def ScalarUseWrapperC(self, name, flavour):
|
||||||
|
if name in self.scalars:
|
||||||
|
if flavour.IsComplex(name):
|
||||||
|
return [name+"_array.data()"]
|
||||||
|
return [name]
|
||||||
|
return []
|
||||||
|
|
||||||
# Retrieves the definition of a scalar (alpha/beta)
|
# Retrieves the definition of a scalar (alpha/beta)
|
||||||
def ScalarDef(self, name, flavour):
|
def ScalarDef(self, name, flavour):
|
||||||
if name in self.scalars:
|
if name in self.scalars:
|
||||||
|
@ -246,9 +294,16 @@ class Routine():
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# As above, but now using clBLAS data-types
|
# As above, but now using clBLAS data-types
|
||||||
def OptionsDefWrapper(self):
|
def OptionsDefWrapperCL(self):
|
||||||
if self.options:
|
if self.options:
|
||||||
definitions = ["const "+OptionToWrapper(o)+" "+o for o in self.options]
|
definitions = ["const "+OptionToWrapperCL(o)+" "+o for o in self.options]
|
||||||
|
return [", ".join(definitions)]
|
||||||
|
return []
|
||||||
|
|
||||||
|
# As above, but now using CBLAS data-types
|
||||||
|
def OptionsDefWrapperC(self):
|
||||||
|
if self.options:
|
||||||
|
definitions = ["const "+OptionToWrapperC(o)+" "+o for o in self.options]
|
||||||
return [", ".join(definitions)]
|
return [", ".join(definitions)]
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
@ -284,16 +339,26 @@ class Routine():
|
||||||
list(chain(*[self.ScalarUse(s, flavour) for s in self.OtherScalars()])))
|
list(chain(*[self.ScalarUse(s, flavour) for s in self.OtherScalars()])))
|
||||||
|
|
||||||
# As above, but for the clBLAS wrapper
|
# As above, but for the clBLAS wrapper
|
||||||
def ArgumentsWrapper(self, flavour):
|
def ArgumentsWrapperCL(self, flavour):
|
||||||
return (self.Options() + self.Sizes() +
|
return (self.Options() + self.Sizes() +
|
||||||
list(chain(*[self.BufferWrapper(b) for b in self.ScalarBuffersFirst()])) +
|
list(chain(*[self.BufferWrapperCL(b) for b in self.ScalarBuffersFirst()])) +
|
||||||
self.ScalarUseWrapper("alpha", flavour) +
|
self.ScalarUseWrapper("alpha", flavour) +
|
||||||
list(chain(*[self.BufferWrapper(b) for b in self.BuffersFirst()])) +
|
list(chain(*[self.BufferWrapperCL(b) for b in self.BuffersFirst()])) +
|
||||||
self.ScalarUseWrapper("beta", flavour) +
|
self.ScalarUseWrapper("beta", flavour) +
|
||||||
list(chain(*[self.BufferWrapper(b) for b in self.BuffersSecond()])) +
|
list(chain(*[self.BufferWrapperCL(b) for b in self.BuffersSecond()])) +
|
||||||
list(chain(*[self.BufferWrapper(b) for b in self.ScalarBuffersSecond()])) +
|
list(chain(*[self.BufferWrapperCL(b) for b in self.ScalarBuffersSecond()])) +
|
||||||
list(chain(*[self.ScalarUseWrapper(s, flavour) for s in self.OtherScalars()])))
|
list(chain(*[self.ScalarUseWrapper(s, flavour) for s in self.OtherScalars()])))
|
||||||
|
|
||||||
|
# As above, but for the CBLAS wrapper
|
||||||
|
def ArgumentsWrapperC(self, flavour):
|
||||||
|
return (self.Options() + self.Sizes() +
|
||||||
|
self.ScalarUseWrapperC("alpha", flavour) +
|
||||||
|
list(chain(*[self.BufferWrapperC(b, flavour) for b in self.BuffersFirst()])) +
|
||||||
|
self.ScalarUseWrapperC("beta", flavour) +
|
||||||
|
list(chain(*[self.BufferWrapperC(b, flavour) for b in self.BuffersSecond()])) +
|
||||||
|
list(chain(*[self.BufferWrapperC(b, flavour) for b in self.ScalarBuffersSecond()])) +
|
||||||
|
list(chain(*[self.ScalarUseWrapperC(s, flavour) for s in self.OtherScalars()])))
|
||||||
|
|
||||||
# Retrieves a combination of all the argument definitions
|
# Retrieves a combination of all the argument definitions
|
||||||
def ArgumentsDef(self, flavour):
|
def ArgumentsDef(self, flavour):
|
||||||
return (self.OptionsDef() + self.SizesDef() +
|
return (self.OptionsDef() + self.SizesDef() +
|
||||||
|
@ -306,8 +371,8 @@ class Routine():
|
||||||
list(chain(*[self.ScalarDef(s, flavour) for s in self.OtherScalars()])))
|
list(chain(*[self.ScalarDef(s, flavour) for s in self.OtherScalars()])))
|
||||||
|
|
||||||
# As above, but clBLAS wrapper plain datatypes
|
# As above, but clBLAS wrapper plain datatypes
|
||||||
def ArgumentsDefWrapper(self, flavour):
|
def ArgumentsDefWrapperCL(self, flavour):
|
||||||
return (self.OptionsDefWrapper() + self.SizesDef() +
|
return (self.OptionsDefWrapperCL() + self.SizesDef() +
|
||||||
list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersFirst()])) +
|
list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersFirst()])) +
|
||||||
self.ScalarDefPlain("alpha", flavour) +
|
self.ScalarDefPlain("alpha", flavour) +
|
||||||
list(chain(*[self.BufferDef(b) for b in self.BuffersFirst()])) +
|
list(chain(*[self.BufferDef(b) for b in self.BuffersFirst()])) +
|
||||||
|
@ -316,6 +381,17 @@ class Routine():
|
||||||
list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersSecond()])) +
|
list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersSecond()])) +
|
||||||
list(chain(*[self.ScalarDefPlain(s, flavour) for s in self.OtherScalars()])))
|
list(chain(*[self.ScalarDefPlain(s, flavour) for s in self.OtherScalars()])))
|
||||||
|
|
||||||
|
# As above, but CBLAS wrapper plain datatypes
|
||||||
|
def ArgumentsDefWrapperC(self, flavour):
|
||||||
|
return (self.OptionsDefWrapperC() + self.SizesDef() +
|
||||||
|
list(chain(*[self.BufferDefVector(b, flavour) for b in self.ScalarBuffersFirst()])) +
|
||||||
|
self.ScalarDefPlain("alpha", flavour) +
|
||||||
|
list(chain(*[self.BufferDefVector(b, flavour) for b in self.BuffersFirst()])) +
|
||||||
|
self.ScalarDefPlain("beta", flavour) +
|
||||||
|
list(chain(*[self.BufferDefVector(b, flavour) for b in self.BuffersSecond()])) +
|
||||||
|
list(chain(*[self.BufferDefVector(b, flavour) for b in self.ScalarBuffersSecond()])) +
|
||||||
|
list(chain(*[self.ScalarDefPlain(s, flavour) for s in self.OtherScalars()])))
|
||||||
|
|
||||||
# Retrieves a combination of all the argument types
|
# Retrieves a combination of all the argument types
|
||||||
def ArgumentsType(self, flavour):
|
def ArgumentsType(self, flavour):
|
||||||
return (self.OptionsType() + self.SizesType() +
|
return (self.OptionsType() + self.SizesType() +
|
||||||
|
@ -356,7 +432,7 @@ class Routine():
|
||||||
return result
|
return result
|
||||||
|
|
||||||
# As above, but now for the clBLAS wrapper
|
# As above, but now for the clBLAS wrapper
|
||||||
def RoutineHeaderWrapper(self, flavour, def_only, spaces):
|
def RoutineHeaderWrapperCL(self, flavour, def_only, spaces):
|
||||||
template = "<"+flavour.template+">" if self.NoScalars() and not def_only else ""
|
template = "<"+flavour.template+">" if self.NoScalars() and not def_only else ""
|
||||||
indent = " "*(spaces + self.Length() + len(template))
|
indent = " "*(spaces + self.Length() + len(template))
|
||||||
result = ""
|
result = ""
|
||||||
|
@ -366,9 +442,16 @@ class Routine():
|
||||||
result += flavour.name
|
result += flavour.name
|
||||||
result += ">\n"
|
result += ">\n"
|
||||||
result += "clblasStatus clblasX"+self.name+template+"("
|
result += "clblasStatus clblasX"+self.name+template+"("
|
||||||
result += (",\n"+indent).join([a for a in self.ArgumentsDefWrapper(flavour)])
|
result += (",\n"+indent).join([a for a in self.ArgumentsDefWrapperCL(flavour)])
|
||||||
result += ",\n"+indent+"cl_uint num_queues, cl_command_queue *queues"
|
result += ",\n"+indent+"cl_uint num_queues, cl_command_queue *queues"
|
||||||
result += ",\n"+indent+"cl_uint num_wait_events, const cl_event *wait_events, cl_event *events)"
|
result += ",\n"+indent+"cl_uint num_wait_events, const cl_event *wait_events, cl_event *events)"
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
# As above, but now for the CBLAS wrapper
|
||||||
|
def RoutineHeaderWrapperC(self, flavour, def_only, spaces):
|
||||||
|
indent = " "*(spaces + self.Length())
|
||||||
|
result = "void cblasX"+self.name+"("
|
||||||
|
result += (",\n"+indent).join([a for a in self.ArgumentsDefWrapperC(flavour)])+")"
|
||||||
|
return result
|
||||||
|
|
||||||
# ==================================================================================================
|
# ==================================================================================================
|
||||||
|
|
|
@ -93,7 +93,7 @@ template <typename T>
|
||||||
StatusCode Rotmg(cl_mem, const size_t,
|
StatusCode Rotmg(cl_mem, const size_t,
|
||||||
cl_mem, const size_t,
|
cl_mem, const size_t,
|
||||||
cl_mem, const size_t,
|
cl_mem, const size_t,
|
||||||
cl_mem, const size_t,
|
const cl_mem, const size_t,
|
||||||
cl_mem, const size_t,
|
cl_mem, const size_t,
|
||||||
cl_command_queue*, cl_event*) {
|
cl_command_queue*, cl_event*) {
|
||||||
return StatusCode::kNotImplemented;
|
return StatusCode::kNotImplemented;
|
||||||
|
@ -101,13 +101,13 @@ StatusCode Rotmg(cl_mem, const size_t,
|
||||||
template StatusCode PUBLIC_API Rotmg<float>(cl_mem, const size_t,
|
template StatusCode PUBLIC_API Rotmg<float>(cl_mem, const size_t,
|
||||||
cl_mem, const size_t,
|
cl_mem, const size_t,
|
||||||
cl_mem, const size_t,
|
cl_mem, const size_t,
|
||||||
cl_mem, const size_t,
|
const cl_mem, const size_t,
|
||||||
cl_mem, const size_t,
|
cl_mem, const size_t,
|
||||||
cl_command_queue*, cl_event*);
|
cl_command_queue*, cl_event*);
|
||||||
template StatusCode PUBLIC_API Rotmg<double>(cl_mem, const size_t,
|
template StatusCode PUBLIC_API Rotmg<double>(cl_mem, const size_t,
|
||||||
cl_mem, const size_t,
|
cl_mem, const size_t,
|
||||||
cl_mem, const size_t,
|
cl_mem, const size_t,
|
||||||
cl_mem, const size_t,
|
const cl_mem, const size_t,
|
||||||
cl_mem, const size_t,
|
cl_mem, const size_t,
|
||||||
cl_command_queue*, cl_event*);
|
cl_command_queue*, cl_event*);
|
||||||
|
|
||||||
|
|
|
@ -55,7 +55,7 @@ StatusCode CLBlastDrotg(cl_mem sa_buffer, const size_t sa_offset,
|
||||||
StatusCode CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
|
StatusCode CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
|
||||||
cl_mem sd2_buffer, const size_t sd2_offset,
|
cl_mem sd2_buffer, const size_t sd2_offset,
|
||||||
cl_mem sx1_buffer, const size_t sx1_offset,
|
cl_mem sx1_buffer, const size_t sx1_offset,
|
||||||
cl_mem sy1_buffer, const size_t sy1_offset,
|
const cl_mem sy1_buffer, const size_t sy1_offset,
|
||||||
cl_mem sparam_buffer, const size_t sparam_offset,
|
cl_mem sparam_buffer, const size_t sparam_offset,
|
||||||
cl_command_queue* queue, cl_event* event) {
|
cl_command_queue* queue, cl_event* event) {
|
||||||
auto status = clblast::Rotmg<float>(sd1_buffer, sd1_offset,
|
auto status = clblast::Rotmg<float>(sd1_buffer, sd1_offset,
|
||||||
|
@ -69,7 +69,7 @@ StatusCode CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
|
||||||
StatusCode CLBlastDrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
|
StatusCode CLBlastDrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
|
||||||
cl_mem sd2_buffer, const size_t sd2_offset,
|
cl_mem sd2_buffer, const size_t sd2_offset,
|
||||||
cl_mem sx1_buffer, const size_t sx1_offset,
|
cl_mem sx1_buffer, const size_t sx1_offset,
|
||||||
cl_mem sy1_buffer, const size_t sy1_offset,
|
const cl_mem sy1_buffer, const size_t sy1_offset,
|
||||||
cl_mem sparam_buffer, const size_t sparam_offset,
|
cl_mem sparam_buffer, const size_t sparam_offset,
|
||||||
cl_command_queue* queue, cl_event* event) {
|
cl_command_queue* queue, cl_event* event) {
|
||||||
auto status = clblast::Rotmg<double>(sd1_buffer, sd1_offset,
|
auto status = clblast::Rotmg<double>(sd1_buffer, sd1_offset,
|
||||||
|
|
|
@ -79,24 +79,6 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
|
||||||
// Iterates over all the to-be-tested combinations of arguments
|
// Iterates over all the to-be-tested combinations of arguments
|
||||||
for (auto &args: test_vector) {
|
for (auto &args: test_vector) {
|
||||||
|
|
||||||
// Runs the reference clBLAS code
|
|
||||||
auto x_vec1 = Buffer<T>(context_, args.x_size);
|
|
||||||
auto y_vec1 = Buffer<T>(context_, args.y_size);
|
|
||||||
auto a_mat1 = Buffer<T>(context_, args.a_size);
|
|
||||||
auto b_mat1 = Buffer<T>(context_, args.b_size);
|
|
||||||
auto c_mat1 = Buffer<T>(context_, args.c_size);
|
|
||||||
auto ap_mat1 = Buffer<T>(context_, args.ap_size);
|
|
||||||
auto scalar1 = Buffer<T>(context_, args.scalar_size);
|
|
||||||
x_vec1.Write(queue_, args.x_size, x_source_);
|
|
||||||
y_vec1.Write(queue_, args.y_size, y_source_);
|
|
||||||
a_mat1.Write(queue_, args.a_size, a_source_);
|
|
||||||
b_mat1.Write(queue_, args.b_size, b_source_);
|
|
||||||
c_mat1.Write(queue_, args.c_size, c_source_);
|
|
||||||
ap_mat1.Write(queue_, args.ap_size, ap_source_);
|
|
||||||
scalar1.Write(queue_, args.scalar_size, scalar_source_);
|
|
||||||
auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1};
|
|
||||||
auto status1 = run_reference_(args, buffers1, queue_);
|
|
||||||
|
|
||||||
// Runs the CLBlast code
|
// Runs the CLBlast code
|
||||||
auto x_vec2 = Buffer<T>(context_, args.x_size);
|
auto x_vec2 = Buffer<T>(context_, args.x_size);
|
||||||
auto y_vec2 = Buffer<T>(context_, args.y_size);
|
auto y_vec2 = Buffer<T>(context_, args.y_size);
|
||||||
|
@ -115,6 +97,33 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
|
||||||
auto buffers2 = Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2};
|
auto buffers2 = Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2};
|
||||||
auto status2 = run_routine_(args, buffers2, queue_);
|
auto status2 = run_routine_(args, buffers2, queue_);
|
||||||
|
|
||||||
|
#ifndef CLBLAST_REF_CLBLAS
|
||||||
|
// Don't continue with CBLAS if there are incorrect parameters
|
||||||
|
if (status2 != StatusCode::kSuccess) {
|
||||||
|
// TODO: Mark this as a skipped test instead of a succesfull test
|
||||||
|
TestErrorCodes(status2, status2, args);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Runs the reference BLAS code
|
||||||
|
auto x_vec1 = Buffer<T>(context_, args.x_size);
|
||||||
|
auto y_vec1 = Buffer<T>(context_, args.y_size);
|
||||||
|
auto a_mat1 = Buffer<T>(context_, args.a_size);
|
||||||
|
auto b_mat1 = Buffer<T>(context_, args.b_size);
|
||||||
|
auto c_mat1 = Buffer<T>(context_, args.c_size);
|
||||||
|
auto ap_mat1 = Buffer<T>(context_, args.ap_size);
|
||||||
|
auto scalar1 = Buffer<T>(context_, args.scalar_size);
|
||||||
|
x_vec1.Write(queue_, args.x_size, x_source_);
|
||||||
|
y_vec1.Write(queue_, args.y_size, y_source_);
|
||||||
|
a_mat1.Write(queue_, args.a_size, a_source_);
|
||||||
|
b_mat1.Write(queue_, args.b_size, b_source_);
|
||||||
|
c_mat1.Write(queue_, args.c_size, c_source_);
|
||||||
|
ap_mat1.Write(queue_, args.ap_size, ap_source_);
|
||||||
|
scalar1.Write(queue_, args.scalar_size, scalar_source_);
|
||||||
|
auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1};
|
||||||
|
auto status1 = run_reference_(args, buffers1, queue_);
|
||||||
|
|
||||||
// Tests for equality of the two status codes
|
// Tests for equality of the two status codes
|
||||||
if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
|
if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
|
||||||
TestErrorCodes(status1, status2, args);
|
TestErrorCodes(status1, status2, args);
|
||||||
|
|
|
@ -68,7 +68,7 @@ class TestBlas: public Tester<T,U> {
|
||||||
static const std::vector<Transpose> kTransposes; // Data-type dependent, see .cc-file
|
static const std::vector<Transpose> kTransposes; // Data-type dependent, see .cc-file
|
||||||
|
|
||||||
// Shorthand for the routine-specific functions passed to the tester
|
// Shorthand for the routine-specific functions passed to the tester
|
||||||
using Routine = std::function<StatusCode(const Arguments<U>&, const Buffers<T>&, Queue&)>;
|
using Routine = std::function<StatusCode(const Arguments<U>&, Buffers<T>&, Queue&)>;
|
||||||
using ResultGet = std::function<std::vector<T>(const Arguments<U>&, Buffers<T>&, Queue&)>;
|
using ResultGet = std::function<std::vector<T>(const Arguments<U>&, Buffers<T>&, Queue&)>;
|
||||||
using ResultIndex = std::function<size_t(const Arguments<U>&, const size_t, const size_t)>;
|
using ResultIndex = std::function<size_t(const Arguments<U>&, const size_t, const size_t)>;
|
||||||
using ResultIterator = std::function<size_t(const Arguments<U>&)>;
|
using ResultIterator = std::function<size_t(const Arguments<U>&)>;
|
||||||
|
@ -76,8 +76,9 @@ class TestBlas: public Tester<T,U> {
|
||||||
// Constructor, initializes the base class tester and input data
|
// Constructor, initializes the base class tester and input data
|
||||||
TestBlas(int argc, char *argv[], const bool silent,
|
TestBlas(int argc, char *argv[], const bool silent,
|
||||||
const std::string &name, const std::vector<std::string> &options,
|
const std::string &name, const std::vector<std::string> &options,
|
||||||
const Routine run_routine, const Routine run_reference, const ResultGet get_result,
|
const Routine run_routine, const Routine run_reference,
|
||||||
const ResultIndex get_index, const ResultIterator get_id1, const ResultIterator get_id2);
|
const ResultGet get_result, const ResultIndex get_index,
|
||||||
|
const ResultIterator get_id1, const ResultIterator get_id2);
|
||||||
|
|
||||||
// The test functions, taking no inputs
|
// The test functions, taking no inputs
|
||||||
void TestRegular(std::vector<Arguments<U>> &test_vector, const std::string &name);
|
void TestRegular(std::vector<Arguments<U>> &test_vector, const std::string &name);
|
||||||
|
@ -110,9 +111,17 @@ class TestBlas: public Tester<T,U> {
|
||||||
template <typename C, typename T, typename U>
|
template <typename C, typename T, typename U>
|
||||||
void RunTests(int argc, char *argv[], const bool silent, const std::string &name) {
|
void RunTests(int argc, char *argv[], const bool silent, const std::string &name) {
|
||||||
|
|
||||||
|
// Sets the reference to test against
|
||||||
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
const auto reference_routine = C::RunReference1; // clBLAS when available
|
||||||
|
#else
|
||||||
|
const auto reference_routine = C::RunReference2; // otherwise CBLAS
|
||||||
|
#endif
|
||||||
|
|
||||||
// Creates a tester
|
// Creates a tester
|
||||||
auto options = C::GetOptions();
|
auto options = C::GetOptions();
|
||||||
TestBlas<T,U> tester{argc, argv, silent, name, options, C::RunRoutine, C::RunReference,
|
TestBlas<T,U> tester{argc, argv, silent, name, options,
|
||||||
|
C::RunRoutine, reference_routine,
|
||||||
C::DownloadResult, C::GetResultIndex, C::ResultID1, C::ResultID2};
|
C::DownloadResult, C::GetResultIndex, C::ResultID1, C::ResultID2};
|
||||||
|
|
||||||
// This variable holds the arguments relevant for this routine
|
// This variable holds the arguments relevant for this routine
|
||||||
|
@ -250,6 +259,7 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
|
||||||
}
|
}
|
||||||
|
|
||||||
// Creates the arguments vector for the invalid-buffer tests
|
// Creates the arguments vector for the invalid-buffer tests
|
||||||
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
auto invalid_test_vector = std::vector<Arguments<U>>{};
|
auto invalid_test_vector = std::vector<Arguments<U>>{};
|
||||||
auto i_args = args;
|
auto i_args = args;
|
||||||
i_args.m = i_args.n = i_args.k = i_args.kl = i_args.ku = tester.kBufferSize;
|
i_args.m = i_args.n = i_args.k = i_args.kl = i_args.ku = tester.kBufferSize;
|
||||||
|
@ -267,6 +277,7 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Sets the name of this test-case
|
// Sets the name of this test-case
|
||||||
auto names = std::vector<std::string>{};
|
auto names = std::vector<std::string>{};
|
||||||
|
@ -287,7 +298,9 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
|
||||||
|
|
||||||
// Runs the tests
|
// Runs the tests
|
||||||
tester.TestRegular(regular_test_vector, case_name);
|
tester.TestRegular(regular_test_vector, case_name);
|
||||||
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
tester.TestInvalid(invalid_test_vector, case_name);
|
tester.TestInvalid(invalid_test_vector, case_name);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -69,10 +69,12 @@ Tester<T,U>::Tester(int argc, char *argv[], const bool silent,
|
||||||
kUnsupportedPrecision.c_str());
|
kUnsupportedPrecision.c_str());
|
||||||
|
|
||||||
// Initializes clBLAS
|
// Initializes clBLAS
|
||||||
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
auto status = clblasSetup();
|
auto status = clblasSetup();
|
||||||
if (status != CL_SUCCESS) {
|
if (status != CL_SUCCESS) {
|
||||||
throw std::runtime_error("clBLAS setup error: "+ToString(static_cast<int>(status)));
|
throw std::runtime_error("clBLAS setup error: "+ToString(static_cast<int>(status)));
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
// Destructor prints the summary of the test cases and cleans-up the clBLAS library
|
// Destructor prints the summary of the test cases and cleans-up the clBLAS library
|
||||||
|
@ -87,7 +89,11 @@ Tester<T,U>::~Tester() {
|
||||||
fprintf(stdout, " %zu test(s) failed%s\n", tests_failed_, kPrintEnd.c_str());
|
fprintf(stdout, " %zu test(s) failed%s\n", tests_failed_, kPrintEnd.c_str());
|
||||||
}
|
}
|
||||||
fprintf(stdout, "\n");
|
fprintf(stdout, "\n");
|
||||||
|
|
||||||
|
// Cleans-up clBLAS
|
||||||
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
clblasTeardown();
|
clblasTeardown();
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
|
@ -23,7 +23,9 @@
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
|
||||||
// The libraries
|
// The libraries
|
||||||
#include <clBLAS.h>
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include <clBLAS.h>
|
||||||
|
#endif
|
||||||
#include "clblast.h"
|
#include "clblast.h"
|
||||||
|
|
||||||
#include "internal/utilities.h"
|
#include "internal/utilities.h"
|
||||||
|
@ -92,7 +94,7 @@ class Tester {
|
||||||
Queue queue_;
|
Queue queue_;
|
||||||
|
|
||||||
// Whether or not to run the full test-suite or just a smoke test
|
// Whether or not to run the full test-suite or just a smoke test
|
||||||
bool full_test_;
|
const bool full_test_;
|
||||||
|
|
||||||
// Retrieves the offset values to test with
|
// Retrieves the offset values to test with
|
||||||
const std::vector<size_t> GetOffsets() const;
|
const std::vector<size_t> GetOffsets() const;
|
||||||
|
|
|
@ -24,11 +24,13 @@ namespace clblast {
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
template <typename T, typename U>
|
template <typename T, typename U>
|
||||||
Client<T,U>::Client(const Routine run_routine, const Routine run_reference,
|
Client<T,U>::Client(const Routine run_routine,
|
||||||
|
const Routine run_reference1, const Routine run_reference2,
|
||||||
const std::vector<std::string> &options,
|
const std::vector<std::string> &options,
|
||||||
const GetMetric get_flops, const GetMetric get_bytes):
|
const GetMetric get_flops, const GetMetric get_bytes):
|
||||||
run_routine_(run_routine),
|
run_routine_(run_routine),
|
||||||
run_reference_(run_reference),
|
run_reference1_(run_reference1),
|
||||||
|
run_reference2_(run_reference2),
|
||||||
options_(options),
|
options_(options),
|
||||||
get_flops_(get_flops),
|
get_flops_(get_flops),
|
||||||
get_bytes_(get_bytes) {
|
get_bytes_(get_bytes) {
|
||||||
|
@ -90,7 +92,16 @@ Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const GetMetric
|
||||||
args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0});
|
args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0});
|
||||||
args.device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0});
|
args.device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0});
|
||||||
args.precision = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle);
|
args.precision = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle);
|
||||||
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
args.compare_clblas = GetArgument(argc, argv, help, kArgCompareclblas, 1);
|
args.compare_clblas = GetArgument(argc, argv, help, kArgCompareclblas, 1);
|
||||||
|
#else
|
||||||
|
args.compare_clblas = 0;
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
args.compare_cblas = GetArgument(argc, argv, help, kArgComparecblas, 1);
|
||||||
|
#else
|
||||||
|
args.compare_cblas = 0;
|
||||||
|
#endif
|
||||||
args.step = GetArgument(argc, argv, help, kArgStepSize, size_t{1});
|
args.step = GetArgument(argc, argv, help, kArgStepSize, size_t{1});
|
||||||
args.num_steps = GetArgument(argc, argv, help, kArgNumSteps, size_t{0});
|
args.num_steps = GetArgument(argc, argv, help, kArgNumSteps, size_t{0});
|
||||||
args.num_runs = GetArgument(argc, argv, help, kArgNumRuns, size_t{10});
|
args.num_runs = GetArgument(argc, argv, help, kArgNumRuns, size_t{10});
|
||||||
|
@ -120,7 +131,9 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
|
||||||
auto device = Device(platform, args.device_id);
|
auto device = Device(platform, args.device_id);
|
||||||
auto context = Context(device);
|
auto context = Context(device);
|
||||||
auto queue = Queue(context, device);
|
auto queue = Queue(context, device);
|
||||||
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
if (args.compare_clblas) { clblasSetup(); }
|
if (args.compare_clblas) { clblasSetup(); }
|
||||||
|
#endif
|
||||||
|
|
||||||
// Iterates over all "num_step" values jumping by "step" each time
|
// Iterates over all "num_step" values jumping by "step" each time
|
||||||
auto s = size_t{0};
|
auto s = size_t{0};
|
||||||
|
@ -167,9 +180,13 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
|
||||||
auto ms_clblast = TimedExecution(args.num_runs, args, buffers, queue, run_routine_, "CLBlast");
|
auto ms_clblast = TimedExecution(args.num_runs, args, buffers, queue, run_routine_, "CLBlast");
|
||||||
timings.push_back(std::pair<std::string, double>("CLBlast", ms_clblast));
|
timings.push_back(std::pair<std::string, double>("CLBlast", ms_clblast));
|
||||||
if (args.compare_clblas) {
|
if (args.compare_clblas) {
|
||||||
auto ms_clblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference_, "clBLAS");
|
auto ms_clblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference1_, "clBLAS");
|
||||||
timings.push_back(std::pair<std::string, double>("clBLAS", ms_clblas));
|
timings.push_back(std::pair<std::string, double>("clBLAS", ms_clblas));
|
||||||
}
|
}
|
||||||
|
if (args.compare_cblas) {
|
||||||
|
auto ms_cblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference2_, "CPU BLAS");
|
||||||
|
timings.push_back(std::pair<std::string, double>("CPU BLAS", ms_cblas));
|
||||||
|
}
|
||||||
|
|
||||||
// Prints the performance of the tested libraries
|
// Prints the performance of the tested libraries
|
||||||
PrintTableRow(args, timings);
|
PrintTableRow(args, timings);
|
||||||
|
@ -186,7 +203,9 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Cleans-up and returns
|
// Cleans-up and returns
|
||||||
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
if (args.compare_clblas) { clblasTeardown(); }
|
if (args.compare_clblas) { clblasTeardown(); }
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -196,14 +215,17 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
|
||||||
// value found in the vector of timing results. The return value is in milliseconds.
|
// value found in the vector of timing results. The return value is in milliseconds.
|
||||||
template <typename T, typename U>
|
template <typename T, typename U>
|
||||||
double Client<T,U>::TimedExecution(const size_t num_runs, const Arguments<U> &args,
|
double Client<T,U>::TimedExecution(const size_t num_runs, const Arguments<U> &args,
|
||||||
const Buffers<T> &buffers, Queue &queue,
|
Buffers<T> &buffers, Queue &queue,
|
||||||
Routine run_blas, const std::string &library_name) {
|
Routine run_blas, const std::string &library_name) {
|
||||||
auto timings = std::vector<double>(num_runs);
|
auto timings = std::vector<double>(num_runs);
|
||||||
for (auto &timing: timings) {
|
for (auto &timing: timings) {
|
||||||
auto start_time = std::chrono::steady_clock::now();
|
auto start_time = std::chrono::steady_clock::now();
|
||||||
|
|
||||||
// Executes the main computation
|
// Executes the main computation
|
||||||
auto status = run_blas(args, buffers, queue);
|
auto status = StatusCode::kSuccess;
|
||||||
|
try {
|
||||||
|
status = run_blas(args, buffers, queue);
|
||||||
|
} catch (...) { status = static_cast<StatusCode>(kUnknownError); }
|
||||||
if (status != StatusCode::kSuccess) {
|
if (status != StatusCode::kSuccess) {
|
||||||
throw std::runtime_error(library_name+" error: "+ToString(static_cast<int>(status)));
|
throw std::runtime_error(library_name+" error: "+ToString(static_cast<int>(status)));
|
||||||
}
|
}
|
||||||
|
@ -226,6 +248,7 @@ void Client<T,U>::PrintTableHeader(const Arguments<U>& args) {
|
||||||
for (auto i=size_t{0}; i<options_.size(); ++i) { fprintf(stdout, "%9s ", ""); }
|
for (auto i=size_t{0}; i<options_.size(); ++i) { fprintf(stdout, "%9s ", ""); }
|
||||||
fprintf(stdout, " | <-- CLBlast -->");
|
fprintf(stdout, " | <-- CLBlast -->");
|
||||||
if (args.compare_clblas) { fprintf(stdout, " | <-- clBLAS -->"); }
|
if (args.compare_clblas) { fprintf(stdout, " | <-- clBLAS -->"); }
|
||||||
|
if (args.compare_cblas) { fprintf(stdout, " | <-- CPU BLAS -->"); }
|
||||||
fprintf(stdout, " |\n");
|
fprintf(stdout, " |\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -233,6 +256,7 @@ void Client<T,U>::PrintTableHeader(const Arguments<U>& args) {
|
||||||
for (auto &option: options_) { fprintf(stdout, "%9s;", option.c_str()); }
|
for (auto &option: options_) { fprintf(stdout, "%9s;", option.c_str()); }
|
||||||
fprintf(stdout, "%9s;%9s;%9s", "ms_1", "GFLOPS_1", "GBs_1");
|
fprintf(stdout, "%9s;%9s;%9s", "ms_1", "GFLOPS_1", "GBs_1");
|
||||||
if (args.compare_clblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_2", "GFLOPS_2", "GBs_2"); }
|
if (args.compare_clblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_2", "GFLOPS_2", "GBs_2"); }
|
||||||
|
if (args.compare_cblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_3", "GFLOPS_3", "GBs_3"); }
|
||||||
fprintf(stdout, "\n");
|
fprintf(stdout, "\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -26,7 +26,9 @@
|
||||||
#include <utility>
|
#include <utility>
|
||||||
|
|
||||||
// The libraries to test
|
// The libraries to test
|
||||||
#include <clBLAS.h>
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include <clBLAS.h>
|
||||||
|
#endif
|
||||||
#include "clblast.h"
|
#include "clblast.h"
|
||||||
|
|
||||||
#include "internal/utilities.h"
|
#include "internal/utilities.h"
|
||||||
|
@ -40,12 +42,12 @@ class Client {
|
||||||
public:
|
public:
|
||||||
|
|
||||||
// Shorthand for the routine-specific functions passed to the tester
|
// Shorthand for the routine-specific functions passed to the tester
|
||||||
using Routine = std::function<StatusCode(const Arguments<U>&, const Buffers<T>&, Queue&)>;
|
using Routine = std::function<StatusCode(const Arguments<U>&, Buffers<T>&, Queue&)>;
|
||||||
using SetMetric = std::function<void(Arguments<U>&)>;
|
using SetMetric = std::function<void(Arguments<U>&)>;
|
||||||
using GetMetric = std::function<size_t(const Arguments<U>&)>;
|
using GetMetric = std::function<size_t(const Arguments<U>&)>;
|
||||||
|
|
||||||
// The constructor
|
// The constructor
|
||||||
Client(const Routine run_routine, const Routine run_reference,
|
Client(const Routine run_routine, const Routine run_reference1, const Routine run_reference2,
|
||||||
const std::vector<std::string> &options,
|
const std::vector<std::string> &options,
|
||||||
const GetMetric get_flops, const GetMetric get_bytes);
|
const GetMetric get_flops, const GetMetric get_bytes);
|
||||||
|
|
||||||
|
@ -61,7 +63,7 @@ class Client {
|
||||||
private:
|
private:
|
||||||
|
|
||||||
// Runs a function a given number of times and returns the execution time of the shortest instance
|
// Runs a function a given number of times and returns the execution time of the shortest instance
|
||||||
double TimedExecution(const size_t num_runs, const Arguments<U> &args, const Buffers<T> &buffers,
|
double TimedExecution(const size_t num_runs, const Arguments<U> &args, Buffers<T> &buffers,
|
||||||
Queue &queue, Routine run_blas, const std::string &library_name);
|
Queue &queue, Routine run_blas, const std::string &library_name);
|
||||||
|
|
||||||
// Prints the header of a performance-data table
|
// Prints the header of a performance-data table
|
||||||
|
@ -73,7 +75,8 @@ class Client {
|
||||||
|
|
||||||
// The routine-specific functions passed to the tester
|
// The routine-specific functions passed to the tester
|
||||||
const Routine run_routine_;
|
const Routine run_routine_;
|
||||||
const Routine run_reference_;
|
const Routine run_reference1_;
|
||||||
|
const Routine run_reference2_;
|
||||||
const std::vector<std::string> options_;
|
const std::vector<std::string> options_;
|
||||||
const GetMetric get_flops_;
|
const GetMetric get_flops_;
|
||||||
const GetMetric get_bytes_;
|
const GetMetric get_bytes_;
|
||||||
|
@ -81,13 +84,31 @@ class Client {
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Bogus reference function, in case a comparison library is not available
|
||||||
|
template <typename T, typename U>
|
||||||
|
static StatusCode ReferenceNotAvailable(const Arguments<U> &, Buffers<T> &, Queue &) {
|
||||||
|
return StatusCode::kNotImplemented;
|
||||||
|
}
|
||||||
|
|
||||||
// The interface to the performance client. This is a separate function in the header such that it
|
// The interface to the performance client. This is a separate function in the header such that it
|
||||||
// is automatically compiled for each routine, templated by the parameter "C".
|
// is automatically compiled for each routine, templated by the parameter "C".
|
||||||
template <typename C, typename T, typename U>
|
template <typename C, typename T, typename U>
|
||||||
void RunClient(int argc, char *argv[]) {
|
void RunClient(int argc, char *argv[]) {
|
||||||
|
|
||||||
|
// Sets the reference to test against
|
||||||
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
const auto reference1 = C::RunReference1; // clBLAS when available
|
||||||
|
#else
|
||||||
|
const auto reference1 = ReferenceNotAvailable<T,U>;
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
const auto reference2 = C::RunReference2; // CBLAS when available
|
||||||
|
#else
|
||||||
|
const auto reference2 = ReferenceNotAvailable<T,U>;
|
||||||
|
#endif
|
||||||
|
|
||||||
// Creates a new client
|
// Creates a new client
|
||||||
auto client = Client<T,U>(C::RunRoutine, C::RunReference, C::GetOptions(),
|
auto client = Client<T,U>(C::RunRoutine, reference1, reference2, C::GetOptions(),
|
||||||
C::GetFlops, C::GetBytes);
|
C::GetFlops, C::GetBytes);
|
||||||
|
|
||||||
// Simple command line argument parser with defaults
|
// Simple command line argument parser with defaults
|
||||||
|
|
|
@ -19,7 +19,12 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include "wrapper_clblas.h"
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
#include "wrapper_cblas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -65,7 +70,7 @@ class TestXaxpy {
|
||||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||||
|
|
||||||
// Describes how to run the CLBlast routine
|
// Describes how to run the CLBlast routine
|
||||||
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = Axpy(args.n, args.alpha,
|
auto status = Axpy(args.n, args.alpha,
|
||||||
|
@ -77,7 +82,8 @@ class TestXaxpy {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||||
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = clblasXaxpy(args.n, args.alpha,
|
auto status = clblasXaxpy(args.n, args.alpha,
|
||||||
|
@ -87,6 +93,22 @@ class TestXaxpy {
|
||||||
clWaitForEvents(1, &event);
|
clWaitForEvents(1, &event);
|
||||||
return static_cast<StatusCode>(status);
|
return static_cast<StatusCode>(status);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||||
|
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||||
|
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||||
|
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||||
|
cblasXaxpy(args.n, args.alpha,
|
||||||
|
x_vec_cpu, args.x_offset, args.x_inc,
|
||||||
|
y_vec_cpu, args.y_offset, args.y_inc);
|
||||||
|
buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||||
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
|
|
@ -19,7 +19,12 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include "wrapper_clblas.h"
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
#include "wrapper_cblas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -64,7 +69,7 @@ class TestXcopy {
|
||||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||||
|
|
||||||
// Describes how to run the CLBlast routine
|
// Describes how to run the CLBlast routine
|
||||||
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = Copy<T>(args.n,
|
auto status = Copy<T>(args.n,
|
||||||
|
@ -76,7 +81,8 @@ class TestXcopy {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||||
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = clblasXcopy<T>(args.n,
|
auto status = clblasXcopy<T>(args.n,
|
||||||
|
@ -86,6 +92,22 @@ class TestXcopy {
|
||||||
clWaitForEvents(1, &event);
|
clWaitForEvents(1, &event);
|
||||||
return static_cast<StatusCode>(status);
|
return static_cast<StatusCode>(status);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||||
|
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||||
|
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||||
|
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||||
|
cblasXcopy(args.n,
|
||||||
|
x_vec_cpu, args.x_offset, args.x_inc,
|
||||||
|
y_vec_cpu, args.y_offset, args.y_inc);
|
||||||
|
buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||||
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
|
|
@ -19,7 +19,12 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include "wrapper_clblas.h"
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
#include "wrapper_cblas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -68,7 +73,7 @@ class TestXdot {
|
||||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||||
|
|
||||||
// Describes how to run the CLBlast routine
|
// Describes how to run the CLBlast routine
|
||||||
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = Dot<T>(args.n,
|
auto status = Dot<T>(args.n,
|
||||||
|
@ -81,7 +86,8 @@ class TestXdot {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||||
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = clblasXdot<T>(args.n,
|
auto status = clblasXdot<T>(args.n,
|
||||||
|
@ -92,6 +98,25 @@ class TestXdot {
|
||||||
clWaitForEvents(1, &event);
|
clWaitForEvents(1, &event);
|
||||||
return static_cast<StatusCode>(status);
|
return static_cast<StatusCode>(status);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0));
|
||||||
|
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||||
|
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||||
|
buffers.scalar.Read(queue, args.scalar_size, scalar_cpu);
|
||||||
|
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||||
|
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||||
|
cblasXdot(args.n,
|
||||||
|
scalar_cpu, args.dot_offset,
|
||||||
|
x_vec_cpu, args.x_offset, args.x_inc,
|
||||||
|
y_vec_cpu, args.y_offset, args.y_inc);
|
||||||
|
buffers.scalar.Write(queue, args.scalar_size, scalar_cpu);
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||||
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
|
|
@ -19,7 +19,12 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include "wrapper_clblas.h"
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
#include "wrapper_cblas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -68,7 +73,7 @@ class TestXdotc {
|
||||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||||
|
|
||||||
// Describes how to run the CLBlast routine
|
// Describes how to run the CLBlast routine
|
||||||
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = Dotc<T>(args.n,
|
auto status = Dotc<T>(args.n,
|
||||||
|
@ -81,7 +86,8 @@ class TestXdotc {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||||
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = clblasXdotc<T>(args.n,
|
auto status = clblasXdotc<T>(args.n,
|
||||||
|
@ -92,6 +98,25 @@ class TestXdotc {
|
||||||
clWaitForEvents(1, &event);
|
clWaitForEvents(1, &event);
|
||||||
return static_cast<StatusCode>(status);
|
return static_cast<StatusCode>(status);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0));
|
||||||
|
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||||
|
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||||
|
buffers.scalar.Read(queue, args.scalar_size, scalar_cpu);
|
||||||
|
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||||
|
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||||
|
cblasXdotc(args.n,
|
||||||
|
scalar_cpu, args.dot_offset,
|
||||||
|
x_vec_cpu, args.x_offset, args.x_inc,
|
||||||
|
y_vec_cpu, args.y_offset, args.y_inc);
|
||||||
|
buffers.scalar.Write(queue, args.scalar_size, scalar_cpu);
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||||
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
|
|
@ -19,7 +19,12 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include "wrapper_clblas.h"
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
#include "wrapper_cblas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -68,7 +73,7 @@ class TestXdotu {
|
||||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||||
|
|
||||||
// Describes how to run the CLBlast routine
|
// Describes how to run the CLBlast routine
|
||||||
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = Dotu<T>(args.n,
|
auto status = Dotu<T>(args.n,
|
||||||
|
@ -81,7 +86,8 @@ class TestXdotu {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||||
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = clblasXdotu<T>(args.n,
|
auto status = clblasXdotu<T>(args.n,
|
||||||
|
@ -92,6 +98,25 @@ class TestXdotu {
|
||||||
clWaitForEvents(1, &event);
|
clWaitForEvents(1, &event);
|
||||||
return static_cast<StatusCode>(status);
|
return static_cast<StatusCode>(status);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0));
|
||||||
|
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||||
|
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||||
|
buffers.scalar.Read(queue, args.scalar_size, scalar_cpu);
|
||||||
|
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||||
|
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||||
|
cblasXdotu(args.n,
|
||||||
|
scalar_cpu, args.dot_offset,
|
||||||
|
x_vec_cpu, args.x_offset, args.x_inc,
|
||||||
|
y_vec_cpu, args.y_offset, args.y_inc);
|
||||||
|
buffers.scalar.Write(queue, args.scalar_size, scalar_cpu);
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||||
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
|
|
@ -19,7 +19,12 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include "wrapper_clblas.h"
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
#include "wrapper_cblas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -64,7 +69,7 @@ class TestXnrm2 {
|
||||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||||
|
|
||||||
// Describes how to run the CLBlast routine
|
// Describes how to run the CLBlast routine
|
||||||
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = Nrm2<T>(args.n,
|
auto status = Nrm2<T>(args.n,
|
||||||
|
@ -76,7 +81,8 @@ class TestXnrm2 {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||||
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = clblasXnrm2<T>(args.n,
|
auto status = clblasXnrm2<T>(args.n,
|
||||||
|
@ -86,6 +92,22 @@ class TestXnrm2 {
|
||||||
clWaitForEvents(1, &event);
|
clWaitForEvents(1, &event);
|
||||||
return static_cast<StatusCode>(status);
|
return static_cast<StatusCode>(status);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
std::vector<T> scalar_cpu(args.scalar_size, static_cast<T>(0));
|
||||||
|
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||||
|
buffers.scalar.Read(queue, args.scalar_size, scalar_cpu);
|
||||||
|
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||||
|
cblasXnrm2(args.n,
|
||||||
|
scalar_cpu, args.nrm2_offset,
|
||||||
|
x_vec_cpu, args.x_offset, args.x_inc);
|
||||||
|
buffers.scalar.Write(queue, args.scalar_size, scalar_cpu);
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||||
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
|
|
@ -19,7 +19,12 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include "wrapper_clblas.h"
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
#include "wrapper_cblas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -61,7 +66,7 @@ class TestXscal {
|
||||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||||
|
|
||||||
// Describes how to run the CLBlast routine
|
// Describes how to run the CLBlast routine
|
||||||
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = Scal(args.n, args.alpha,
|
auto status = Scal(args.n, args.alpha,
|
||||||
|
@ -72,7 +77,8 @@ class TestXscal {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||||
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = clblasXscal(args.n, args.alpha,
|
auto status = clblasXscal(args.n, args.alpha,
|
||||||
|
@ -81,6 +87,19 @@ class TestXscal {
|
||||||
clWaitForEvents(1, &event);
|
clWaitForEvents(1, &event);
|
||||||
return static_cast<StatusCode>(status);
|
return static_cast<StatusCode>(status);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||||
|
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||||
|
cblasXscal(args.n, args.alpha,
|
||||||
|
x_vec_cpu, args.x_offset, args.x_inc);
|
||||||
|
buffers.x_vec.Write(queue, args.x_size, x_vec_cpu);
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||||
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
|
|
@ -19,7 +19,12 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include "wrapper_clblas.h"
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
#include "wrapper_cblas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -64,7 +69,7 @@ class TestXswap {
|
||||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||||
|
|
||||||
// Describes how to run the CLBlast routine
|
// Describes how to run the CLBlast routine
|
||||||
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = Swap<T>(args.n,
|
auto status = Swap<T>(args.n,
|
||||||
|
@ -76,7 +81,8 @@ class TestXswap {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||||
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = clblasXswap<T>(args.n,
|
auto status = clblasXswap<T>(args.n,
|
||||||
|
@ -86,6 +92,23 @@ class TestXswap {
|
||||||
clWaitForEvents(1, &event);
|
clWaitForEvents(1, &event);
|
||||||
return static_cast<StatusCode>(status);
|
return static_cast<StatusCode>(status);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||||
|
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||||
|
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||||
|
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||||
|
cblasXswap(args.n,
|
||||||
|
x_vec_cpu, args.x_offset, args.x_inc,
|
||||||
|
y_vec_cpu, args.y_offset, args.y_inc);
|
||||||
|
buffers.x_vec.Write(queue, args.x_size, x_vec_cpu);
|
||||||
|
buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||||
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
|
|
@ -19,7 +19,12 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include "wrapper_clblas.h"
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
#include "wrapper_cblas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -76,7 +81,7 @@ class TestXgbmv {
|
||||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||||
|
|
||||||
// Describes how to run the CLBlast routine
|
// Describes how to run the CLBlast routine
|
||||||
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = Gbmv(args.layout, args.a_transpose,
|
auto status = Gbmv(args.layout, args.a_transpose,
|
||||||
|
@ -90,7 +95,8 @@ class TestXgbmv {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||||
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = clblasXgbmv(static_cast<clblasOrder>(args.layout),
|
auto status = clblasXgbmv(static_cast<clblasOrder>(args.layout),
|
||||||
|
@ -103,6 +109,27 @@ class TestXgbmv {
|
||||||
clWaitForEvents(1, &event);
|
clWaitForEvents(1, &event);
|
||||||
return static_cast<StatusCode>(status);
|
return static_cast<StatusCode>(status);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||||
|
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||||
|
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||||
|
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||||
|
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||||
|
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||||
|
cblasXgbmv(convertToCBLAS(args.layout),
|
||||||
|
convertToCBLAS(args.a_transpose),
|
||||||
|
args.m, args.n, args.kl, args.ku, args.alpha,
|
||||||
|
a_mat_cpu, args.a_offset, args.a_ld,
|
||||||
|
x_vec_cpu, args.x_offset, args.x_inc, args.beta,
|
||||||
|
y_vec_cpu, args.y_offset, args.y_inc);
|
||||||
|
buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||||
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
|
|
@ -19,7 +19,12 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include "wrapper_clblas.h"
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
#include "wrapper_cblas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -76,7 +81,7 @@ class TestXgemv {
|
||||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||||
|
|
||||||
// Describes how to run the CLBlast routine
|
// Describes how to run the CLBlast routine
|
||||||
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = Gemv(args.layout, args.a_transpose,
|
auto status = Gemv(args.layout, args.a_transpose,
|
||||||
|
@ -90,7 +95,8 @@ class TestXgemv {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||||
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = clblasXgemv(static_cast<clblasOrder>(args.layout),
|
auto status = clblasXgemv(static_cast<clblasOrder>(args.layout),
|
||||||
|
@ -103,6 +109,27 @@ class TestXgemv {
|
||||||
clWaitForEvents(1, &event);
|
clWaitForEvents(1, &event);
|
||||||
return static_cast<StatusCode>(status);
|
return static_cast<StatusCode>(status);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||||
|
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||||
|
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||||
|
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||||
|
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||||
|
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||||
|
cblasXgemv(convertToCBLAS(args.layout),
|
||||||
|
convertToCBLAS(args.a_transpose),
|
||||||
|
args.m, args.n, args.alpha,
|
||||||
|
a_mat_cpu, args.a_offset, args.a_ld,
|
||||||
|
x_vec_cpu, args.x_offset, args.x_inc, args.beta,
|
||||||
|
y_vec_cpu, args.y_offset, args.y_inc);
|
||||||
|
buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||||
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
|
|
@ -19,7 +19,12 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include "wrapper_clblas.h"
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
#include "wrapper_cblas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -72,7 +77,7 @@ class TestXger {
|
||||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||||
|
|
||||||
// Describes how to run the CLBlast routine
|
// Describes how to run the CLBlast routine
|
||||||
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = Ger(args.layout,
|
auto status = Ger(args.layout,
|
||||||
|
@ -86,7 +91,8 @@ class TestXger {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||||
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = clblasXger(static_cast<clblasOrder>(args.layout),
|
auto status = clblasXger(static_cast<clblasOrder>(args.layout),
|
||||||
|
@ -98,6 +104,26 @@ class TestXger {
|
||||||
clWaitForEvents(1, &event);
|
clWaitForEvents(1, &event);
|
||||||
return static_cast<StatusCode>(status);
|
return static_cast<StatusCode>(status);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||||
|
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||||
|
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||||
|
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||||
|
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||||
|
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||||
|
cblasXger(convertToCBLAS(args.layout),
|
||||||
|
args.m, args.n, args.alpha,
|
||||||
|
x_vec_cpu, args.x_offset, args.x_inc,
|
||||||
|
y_vec_cpu, args.y_offset, args.y_inc,
|
||||||
|
a_mat_cpu, args.a_offset, args.a_ld);
|
||||||
|
buffers.a_mat.Write(queue, args.a_size, a_mat_cpu);
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||||
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
|
|
@ -19,7 +19,12 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include "wrapper_clblas.h"
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
#include "wrapper_cblas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -72,7 +77,7 @@ class TestXgerc {
|
||||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||||
|
|
||||||
// Describes how to run the CLBlast routine
|
// Describes how to run the CLBlast routine
|
||||||
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = Gerc(args.layout,
|
auto status = Gerc(args.layout,
|
||||||
|
@ -86,7 +91,8 @@ class TestXgerc {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||||
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = clblasXgerc(static_cast<clblasOrder>(args.layout),
|
auto status = clblasXgerc(static_cast<clblasOrder>(args.layout),
|
||||||
|
@ -98,6 +104,26 @@ class TestXgerc {
|
||||||
clWaitForEvents(1, &event);
|
clWaitForEvents(1, &event);
|
||||||
return static_cast<StatusCode>(status);
|
return static_cast<StatusCode>(status);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||||
|
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||||
|
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||||
|
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||||
|
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||||
|
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||||
|
cblasXgerc(convertToCBLAS(args.layout),
|
||||||
|
args.m, args.n, args.alpha,
|
||||||
|
x_vec_cpu, args.x_offset, args.x_inc,
|
||||||
|
y_vec_cpu, args.y_offset, args.y_inc,
|
||||||
|
a_mat_cpu, args.a_offset, args.a_ld);
|
||||||
|
buffers.a_mat.Write(queue, args.a_size, a_mat_cpu);
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||||
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
|
|
@ -19,7 +19,12 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include "wrapper_clblas.h"
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
#include "wrapper_cblas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -72,7 +77,7 @@ class TestXgeru {
|
||||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||||
|
|
||||||
// Describes how to run the CLBlast routine
|
// Describes how to run the CLBlast routine
|
||||||
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = Geru(args.layout,
|
auto status = Geru(args.layout,
|
||||||
|
@ -86,7 +91,8 @@ class TestXgeru {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||||
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = clblasXgeru(static_cast<clblasOrder>(args.layout),
|
auto status = clblasXgeru(static_cast<clblasOrder>(args.layout),
|
||||||
|
@ -98,6 +104,26 @@ class TestXgeru {
|
||||||
clWaitForEvents(1, &event);
|
clWaitForEvents(1, &event);
|
||||||
return static_cast<StatusCode>(status);
|
return static_cast<StatusCode>(status);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||||
|
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||||
|
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||||
|
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||||
|
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||||
|
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||||
|
cblasXgeru(convertToCBLAS(args.layout),
|
||||||
|
args.m, args.n, args.alpha,
|
||||||
|
x_vec_cpu, args.x_offset, args.x_inc,
|
||||||
|
y_vec_cpu, args.y_offset, args.y_inc,
|
||||||
|
a_mat_cpu, args.a_offset, args.a_ld);
|
||||||
|
buffers.a_mat.Write(queue, args.a_size, a_mat_cpu);
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||||
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
|
|
@ -19,7 +19,12 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include "wrapper_clblas.h"
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
#include "wrapper_cblas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -70,7 +75,7 @@ class TestXhbmv {
|
||||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||||
|
|
||||||
// Describes how to run the CLBlast routine
|
// Describes how to run the CLBlast routine
|
||||||
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = Hbmv(args.layout, args.triangle,
|
auto status = Hbmv(args.layout, args.triangle,
|
||||||
|
@ -84,7 +89,8 @@ class TestXhbmv {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||||
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = clblasXhbmv(static_cast<clblasOrder>(args.layout),
|
auto status = clblasXhbmv(static_cast<clblasOrder>(args.layout),
|
||||||
|
@ -97,6 +103,27 @@ class TestXhbmv {
|
||||||
clWaitForEvents(1, &event);
|
clWaitForEvents(1, &event);
|
||||||
return static_cast<StatusCode>(status);
|
return static_cast<StatusCode>(status);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||||
|
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||||
|
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||||
|
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||||
|
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||||
|
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||||
|
cblasXhbmv(convertToCBLAS(args.layout),
|
||||||
|
convertToCBLAS(args.triangle),
|
||||||
|
args.n, args.kl, args.alpha,
|
||||||
|
a_mat_cpu, args.a_offset, args.a_ld,
|
||||||
|
x_vec_cpu, args.x_offset, args.x_inc, args.beta,
|
||||||
|
y_vec_cpu, args.y_offset, args.y_inc);
|
||||||
|
buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||||
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
|
|
@ -19,7 +19,12 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include "wrapper_clblas.h"
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
#include "wrapper_cblas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -70,7 +75,7 @@ class TestXhemv {
|
||||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||||
|
|
||||||
// Describes how to run the CLBlast routine
|
// Describes how to run the CLBlast routine
|
||||||
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = Hemv(args.layout, args.triangle,
|
auto status = Hemv(args.layout, args.triangle,
|
||||||
|
@ -84,7 +89,8 @@ class TestXhemv {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||||
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = clblasXhemv(static_cast<clblasOrder>(args.layout),
|
auto status = clblasXhemv(static_cast<clblasOrder>(args.layout),
|
||||||
|
@ -97,6 +103,27 @@ class TestXhemv {
|
||||||
clWaitForEvents(1, &event);
|
clWaitForEvents(1, &event);
|
||||||
return static_cast<StatusCode>(status);
|
return static_cast<StatusCode>(status);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||||
|
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||||
|
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||||
|
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||||
|
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||||
|
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||||
|
cblasXhemv(convertToCBLAS(args.layout),
|
||||||
|
convertToCBLAS(args.triangle),
|
||||||
|
args.n, args.alpha,
|
||||||
|
a_mat_cpu, args.a_offset, args.a_ld,
|
||||||
|
x_vec_cpu, args.x_offset, args.x_inc, args.beta,
|
||||||
|
y_vec_cpu, args.y_offset, args.y_inc);
|
||||||
|
buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||||
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
|
|
@ -19,7 +19,12 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include "wrapper_clblas.h"
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
#include "wrapper_cblas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -66,7 +71,7 @@ class TestXher {
|
||||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||||
|
|
||||||
// Describes how to run the CLBlast routine
|
// Describes how to run the CLBlast routine
|
||||||
static StatusCode RunRoutine(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) {
|
static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = Her(args.layout, args.triangle,
|
auto status = Her(args.layout, args.triangle,
|
||||||
|
@ -79,7 +84,8 @@ class TestXher {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||||
static StatusCode RunReference(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) {
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
static StatusCode RunReference1(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = clblasXher(static_cast<clblasOrder>(args.layout),
|
auto status = clblasXher(static_cast<clblasOrder>(args.layout),
|
||||||
|
@ -91,6 +97,24 @@ class TestXher {
|
||||||
clWaitForEvents(1, &event);
|
clWaitForEvents(1, &event);
|
||||||
return static_cast<StatusCode>(status);
|
return static_cast<StatusCode>(status);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||||
|
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||||
|
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||||
|
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||||
|
cblasXher(convertToCBLAS(args.layout),
|
||||||
|
convertToCBLAS(args.triangle),
|
||||||
|
args.n, args.alpha,
|
||||||
|
x_vec_cpu, args.x_offset, args.x_inc,
|
||||||
|
a_mat_cpu, args.a_offset, args.a_ld);
|
||||||
|
buffers.a_mat.Write(queue, args.a_size, a_mat_cpu);
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||||
static std::vector<T> DownloadResult(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
|
static std::vector<T> DownloadResult(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
|
|
@ -19,7 +19,12 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include "wrapper_clblas.h"
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
#include "wrapper_cblas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -70,7 +75,7 @@ class TestXher2 {
|
||||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||||
|
|
||||||
// Describes how to run the CLBlast routine
|
// Describes how to run the CLBlast routine
|
||||||
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = Her2(args.layout, args.triangle,
|
auto status = Her2(args.layout, args.triangle,
|
||||||
|
@ -84,7 +89,8 @@ class TestXher2 {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||||
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = clblasXher2(static_cast<clblasOrder>(args.layout),
|
auto status = clblasXher2(static_cast<clblasOrder>(args.layout),
|
||||||
|
@ -97,6 +103,27 @@ class TestXher2 {
|
||||||
clWaitForEvents(1, &event);
|
clWaitForEvents(1, &event);
|
||||||
return static_cast<StatusCode>(status);
|
return static_cast<StatusCode>(status);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

  // CPU-side copy of the a_mat buffer (zero-initialised, then filled by the read)
  auto host_a = std::vector<T>(args.a_size, static_cast<T>(0));
  buffers.a_mat.Read(queue, args.a_size, host_a);

  // CPU-side copy of the x_vec buffer
  auto host_x = std::vector<T>(args.x_size, static_cast<T>(0));
  buffers.x_vec.Read(queue, args.x_size, host_x);

  // CPU-side copy of the y_vec buffer
  auto host_y = std::vector<T>(args.y_size, static_cast<T>(0));
  buffers.y_vec.Read(queue, args.y_size, host_y);

  // Calls the CPU BLAS wrapper on the CPU-side copies
  cblasXher2(convertToCBLAS(args.layout), convertToCBLAS(args.triangle),
             args.n, args.alpha,
             host_x, args.x_offset, args.x_inc,
             host_y, args.y_offset, args.y_inc,
             host_a, args.a_offset, args.a_ld);

  // Stores the CPU-side a_mat data back into the buffer; this path always reports success
  buffers.a_mat.Write(queue, args.a_size, host_a);
  return StatusCode::kSuccess;
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||||
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
|
|
@ -19,7 +19,12 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include "wrapper_clblas.h"
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
#include "wrapper_cblas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -70,7 +75,7 @@ class TestXhpmv {
|
||||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||||
|
|
||||||
// Describes how to run the CLBlast routine
|
// Describes how to run the CLBlast routine
|
||||||
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = Hpmv(args.layout, args.triangle,
|
auto status = Hpmv(args.layout, args.triangle,
|
||||||
|
@ -84,7 +89,8 @@ class TestXhpmv {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||||
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = clblasXhpmv(static_cast<clblasOrder>(args.layout),
|
auto status = clblasXhpmv(static_cast<clblasOrder>(args.layout),
|
||||||
|
@ -97,6 +103,27 @@ class TestXhpmv {
|
||||||
clWaitForEvents(1, &event);
|
clWaitForEvents(1, &event);
|
||||||
return static_cast<StatusCode>(status);
|
return static_cast<StatusCode>(status);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

  // CPU-side copy of the ap_mat buffer (zero-initialised, then filled by the read)
  auto host_ap = std::vector<T>(args.ap_size, static_cast<T>(0));
  buffers.ap_mat.Read(queue, args.ap_size, host_ap);

  // CPU-side copy of the x_vec buffer
  auto host_x = std::vector<T>(args.x_size, static_cast<T>(0));
  buffers.x_vec.Read(queue, args.x_size, host_x);

  // CPU-side copy of the y_vec buffer
  auto host_y = std::vector<T>(args.y_size, static_cast<T>(0));
  buffers.y_vec.Read(queue, args.y_size, host_y);

  // Calls the CPU BLAS wrapper on the CPU-side copies
  cblasXhpmv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle),
             args.n, args.alpha,
             host_ap, args.ap_offset,
             host_x, args.x_offset, args.x_inc, args.beta,
             host_y, args.y_offset, args.y_inc);

  // Stores the CPU-side y_vec data back into the buffer; this path always reports success
  buffers.y_vec.Write(queue, args.y_size, host_y);
  return StatusCode::kSuccess;
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||||
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
|
|
@ -19,7 +19,12 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include "wrapper_clblas.h"
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
#include "wrapper_cblas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -66,7 +71,7 @@ class TestXhpr {
|
||||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||||
|
|
||||||
// Describes how to run the CLBlast routine
|
// Describes how to run the CLBlast routine
|
||||||
static StatusCode RunRoutine(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) {
|
static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = Hpr(args.layout, args.triangle,
|
auto status = Hpr(args.layout, args.triangle,
|
||||||
|
@ -79,7 +84,8 @@ class TestXhpr {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||||
static StatusCode RunReference(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) {
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
static StatusCode RunReference1(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = clblasXhpr(static_cast<clblasOrder>(args.layout),
|
auto status = clblasXhpr(static_cast<clblasOrder>(args.layout),
|
||||||
|
@ -91,6 +97,24 @@ class TestXhpr {
|
||||||
clWaitForEvents(1, &event);
|
clWaitForEvents(1, &event);
|
||||||
return static_cast<StatusCode>(status);
|
return static_cast<StatusCode>(status);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {

  // CPU-side copy of the ap_mat buffer (zero-initialised, then filled by the read)
  auto host_ap = std::vector<T>(args.ap_size, static_cast<T>(0));
  buffers.ap_mat.Read(queue, args.ap_size, host_ap);

  // CPU-side copy of the x_vec buffer
  auto host_x = std::vector<T>(args.x_size, static_cast<T>(0));
  buffers.x_vec.Read(queue, args.x_size, host_x);

  // Calls the CPU BLAS wrapper on the CPU-side copies
  cblasXhpr(convertToCBLAS(args.layout), convertToCBLAS(args.triangle),
            args.n, args.alpha,
            host_x, args.x_offset, args.x_inc,
            host_ap, args.ap_offset);

  // Stores the CPU-side ap_mat data back into the buffer; this path always reports success
  buffers.ap_mat.Write(queue, args.ap_size, host_ap);
  return StatusCode::kSuccess;
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||||
static std::vector<T> DownloadResult(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
|
static std::vector<T> DownloadResult(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
|
|
@ -19,7 +19,12 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include "wrapper_clblas.h"
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
#include "wrapper_cblas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -70,7 +75,7 @@ class TestXhpr2 {
|
||||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||||
|
|
||||||
// Describes how to run the CLBlast routine
|
// Describes how to run the CLBlast routine
|
||||||
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = Hpr2(args.layout, args.triangle,
|
auto status = Hpr2(args.layout, args.triangle,
|
||||||
|
@ -84,7 +89,8 @@ class TestXhpr2 {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||||
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = clblasXhpr2(static_cast<clblasOrder>(args.layout),
|
auto status = clblasXhpr2(static_cast<clblasOrder>(args.layout),
|
||||||
|
@ -97,6 +103,27 @@ class TestXhpr2 {
|
||||||
clWaitForEvents(1, &event);
|
clWaitForEvents(1, &event);
|
||||||
return static_cast<StatusCode>(status);
|
return static_cast<StatusCode>(status);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

  // CPU-side copy of the ap_mat buffer (zero-initialised, then filled by the read)
  auto host_ap = std::vector<T>(args.ap_size, static_cast<T>(0));
  buffers.ap_mat.Read(queue, args.ap_size, host_ap);

  // CPU-side copy of the x_vec buffer
  auto host_x = std::vector<T>(args.x_size, static_cast<T>(0));
  buffers.x_vec.Read(queue, args.x_size, host_x);

  // CPU-side copy of the y_vec buffer
  auto host_y = std::vector<T>(args.y_size, static_cast<T>(0));
  buffers.y_vec.Read(queue, args.y_size, host_y);

  // Calls the CPU BLAS wrapper on the CPU-side copies
  cblasXhpr2(convertToCBLAS(args.layout), convertToCBLAS(args.triangle),
             args.n, args.alpha,
             host_x, args.x_offset, args.x_inc,
             host_y, args.y_offset, args.y_inc,
             host_ap, args.ap_offset);

  // Stores the CPU-side ap_mat data back into the buffer; this path always reports success
  buffers.ap_mat.Write(queue, args.ap_size, host_ap);
  return StatusCode::kSuccess;
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||||
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
|
|
@ -19,7 +19,12 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include "wrapper_clblas.h"
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
#include "wrapper_cblas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -70,7 +75,7 @@ class TestXsbmv {
|
||||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||||
|
|
||||||
// Describes how to run the CLBlast routine
|
// Describes how to run the CLBlast routine
|
||||||
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = Sbmv(args.layout, args.triangle,
|
auto status = Sbmv(args.layout, args.triangle,
|
||||||
|
@ -84,7 +89,8 @@ class TestXsbmv {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||||
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = clblasXsbmv(static_cast<clblasOrder>(args.layout),
|
auto status = clblasXsbmv(static_cast<clblasOrder>(args.layout),
|
||||||
|
@ -97,6 +103,27 @@ class TestXsbmv {
|
||||||
clWaitForEvents(1, &event);
|
clWaitForEvents(1, &event);
|
||||||
return static_cast<StatusCode>(status);
|
return static_cast<StatusCode>(status);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

  // CPU-side copy of the a_mat buffer (zero-initialised, then filled by the read)
  auto host_a = std::vector<T>(args.a_size, static_cast<T>(0));
  buffers.a_mat.Read(queue, args.a_size, host_a);

  // CPU-side copy of the x_vec buffer
  auto host_x = std::vector<T>(args.x_size, static_cast<T>(0));
  buffers.x_vec.Read(queue, args.x_size, host_x);

  // CPU-side copy of the y_vec buffer
  auto host_y = std::vector<T>(args.y_size, static_cast<T>(0));
  buffers.y_vec.Read(queue, args.y_size, host_y);

  // Calls the CPU BLAS wrapper on the CPU-side copies (args.kl is passed right after args.n)
  cblasXsbmv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle),
             args.n, args.kl, args.alpha,
             host_a, args.a_offset, args.a_ld,
             host_x, args.x_offset, args.x_inc, args.beta,
             host_y, args.y_offset, args.y_inc);

  // Stores the CPU-side y_vec data back into the buffer; this path always reports success
  buffers.y_vec.Write(queue, args.y_size, host_y);
  return StatusCode::kSuccess;
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||||
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
|
|
@ -19,7 +19,12 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include "wrapper_clblas.h"
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
#include "wrapper_cblas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -70,7 +75,7 @@ class TestXspmv {
|
||||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||||
|
|
||||||
// Describes how to run the CLBlast routine
|
// Describes how to run the CLBlast routine
|
||||||
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = Spmv(args.layout, args.triangle,
|
auto status = Spmv(args.layout, args.triangle,
|
||||||
|
@ -84,7 +89,8 @@ class TestXspmv {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||||
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = clblasXspmv(static_cast<clblasOrder>(args.layout),
|
auto status = clblasXspmv(static_cast<clblasOrder>(args.layout),
|
||||||
|
@ -97,6 +103,27 @@ class TestXspmv {
|
||||||
clWaitForEvents(1, &event);
|
clWaitForEvents(1, &event);
|
||||||
return static_cast<StatusCode>(status);
|
return static_cast<StatusCode>(status);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

  // CPU-side copy of the ap_mat buffer (zero-initialised, then filled by the read)
  auto host_ap = std::vector<T>(args.ap_size, static_cast<T>(0));
  buffers.ap_mat.Read(queue, args.ap_size, host_ap);

  // CPU-side copy of the x_vec buffer
  auto host_x = std::vector<T>(args.x_size, static_cast<T>(0));
  buffers.x_vec.Read(queue, args.x_size, host_x);

  // CPU-side copy of the y_vec buffer
  auto host_y = std::vector<T>(args.y_size, static_cast<T>(0));
  buffers.y_vec.Read(queue, args.y_size, host_y);

  // Calls the CPU BLAS wrapper on the CPU-side copies
  cblasXspmv(convertToCBLAS(args.layout), convertToCBLAS(args.triangle),
             args.n, args.alpha,
             host_ap, args.ap_offset,
             host_x, args.x_offset, args.x_inc, args.beta,
             host_y, args.y_offset, args.y_inc);

  // Stores the CPU-side y_vec data back into the buffer; this path always reports success
  buffers.y_vec.Write(queue, args.y_size, host_y);
  return StatusCode::kSuccess;
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||||
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
|
|
@ -19,7 +19,12 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include "wrapper_clblas.h"
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
#include "wrapper_cblas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -66,7 +71,7 @@ class TestXspr {
|
||||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||||
|
|
||||||
// Describes how to run the CLBlast routine
|
// Describes how to run the CLBlast routine
|
||||||
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = Spr(args.layout, args.triangle,
|
auto status = Spr(args.layout, args.triangle,
|
||||||
|
@ -79,7 +84,8 @@ class TestXspr {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||||
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = clblasXspr(static_cast<clblasOrder>(args.layout),
|
auto status = clblasXspr(static_cast<clblasOrder>(args.layout),
|
||||||
|
@ -91,6 +97,24 @@ class TestXspr {
|
||||||
clWaitForEvents(1, &event);
|
clWaitForEvents(1, &event);
|
||||||
return static_cast<StatusCode>(status);
|
return static_cast<StatusCode>(status);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

  // CPU-side copy of the ap_mat buffer (zero-initialised, then filled by the read)
  auto host_ap = std::vector<T>(args.ap_size, static_cast<T>(0));
  buffers.ap_mat.Read(queue, args.ap_size, host_ap);

  // CPU-side copy of the x_vec buffer
  auto host_x = std::vector<T>(args.x_size, static_cast<T>(0));
  buffers.x_vec.Read(queue, args.x_size, host_x);

  // Calls the CPU BLAS wrapper on the CPU-side copies
  cblasXspr(convertToCBLAS(args.layout), convertToCBLAS(args.triangle),
            args.n, args.alpha,
            host_x, args.x_offset, args.x_inc,
            host_ap, args.ap_offset);

  // Stores the CPU-side ap_mat data back into the buffer; this path always reports success
  buffers.ap_mat.Write(queue, args.ap_size, host_ap);
  return StatusCode::kSuccess;
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||||
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
|
|
@ -19,7 +19,12 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include "wrapper_clblas.h"
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
#include "wrapper_cblas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -70,7 +75,7 @@ class TestXspr2 {
|
||||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||||
|
|
||||||
// Describes how to run the CLBlast routine
|
// Describes how to run the CLBlast routine
|
||||||
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = Spr2(args.layout, args.triangle,
|
auto status = Spr2(args.layout, args.triangle,
|
||||||
|
@ -84,7 +89,8 @@ class TestXspr2 {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||||
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = clblasXspr2(static_cast<clblasOrder>(args.layout),
|
auto status = clblasXspr2(static_cast<clblasOrder>(args.layout),
|
||||||
|
@ -97,6 +103,27 @@ class TestXspr2 {
|
||||||
clWaitForEvents(1, &event);
|
clWaitForEvents(1, &event);
|
||||||
return static_cast<StatusCode>(status);
|
return static_cast<StatusCode>(status);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {

  // CPU-side copy of the ap_mat buffer (zero-initialised, then filled by the read)
  auto host_ap = std::vector<T>(args.ap_size, static_cast<T>(0));
  buffers.ap_mat.Read(queue, args.ap_size, host_ap);

  // CPU-side copy of the x_vec buffer
  auto host_x = std::vector<T>(args.x_size, static_cast<T>(0));
  buffers.x_vec.Read(queue, args.x_size, host_x);

  // CPU-side copy of the y_vec buffer
  auto host_y = std::vector<T>(args.y_size, static_cast<T>(0));
  buffers.y_vec.Read(queue, args.y_size, host_y);

  // Calls the CPU BLAS wrapper on the CPU-side copies
  cblasXspr2(convertToCBLAS(args.layout), convertToCBLAS(args.triangle),
             args.n, args.alpha,
             host_x, args.x_offset, args.x_inc,
             host_y, args.y_offset, args.y_inc,
             host_ap, args.ap_offset);

  // Stores the CPU-side ap_mat data back into the buffer; this path always reports success
  buffers.ap_mat.Write(queue, args.ap_size, host_ap);
  return StatusCode::kSuccess;
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||||
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
|
|
@ -19,7 +19,12 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include "wrapper_clblas.h"
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
#include "wrapper_cblas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -70,7 +75,7 @@ class TestXsymv {
|
||||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||||
|
|
||||||
// Describes how to run the CLBlast routine
|
// Describes how to run the CLBlast routine
|
||||||
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = Symv(args.layout, args.triangle,
|
auto status = Symv(args.layout, args.triangle,
|
||||||
|
@ -84,7 +89,8 @@ class TestXsymv {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||||
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = clblasXsymv(static_cast<clblasOrder>(args.layout),
|
auto status = clblasXsymv(static_cast<clblasOrder>(args.layout),
|
||||||
|
@ -97,6 +103,27 @@ class TestXsymv {
|
||||||
clWaitForEvents(1, &event);
|
clWaitForEvents(1, &event);
|
||||||
return static_cast<StatusCode>(status);
|
return static_cast<StatusCode>(status);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
// CPU BLAS reference for SYMV. The A, x and y buffers are read into zero-initialised host vectors of
// args.a_size / args.x_size / args.y_size elements, cblasXsymv is called on those vectors, and the
// resulting y vector is written back into buffers.y_vec. Always returns StatusCode::kSuccess: no
// status is taken from the cblasXsymv call.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||||
|
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||||
|
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||||
|
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||||
|
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||||
|
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||||
|
cblasXsymv(convertToCBLAS(args.layout),
|
||||||
|
convertToCBLAS(args.triangle),
|
||||||
|
args.n, args.alpha,
|
||||||
|
a_mat_cpu, args.a_offset, args.a_ld,
|
||||||
|
x_vec_cpu, args.x_offset, args.x_inc, args.beta,
|
||||||
|
y_vec_cpu, args.y_offset, args.y_inc);
|
||||||
|
// y is the only vector written back, i.e. it carries the routine's output
buffers.y_vec.Write(queue, args.y_size, y_vec_cpu);
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||||
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
|
|
@ -19,7 +19,12 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include "wrapper_clblas.h"
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
#include "wrapper_cblas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -66,7 +71,7 @@ class TestXsyr {
|
||||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||||
|
|
||||||
// Describes how to run the CLBlast routine
|
// Describes how to run the CLBlast routine
|
||||||
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = Syr(args.layout, args.triangle,
|
auto status = Syr(args.layout, args.triangle,
|
||||||
|
@ -79,7 +84,8 @@ class TestXsyr {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||||
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = clblasXsyr(static_cast<clblasOrder>(args.layout),
|
auto status = clblasXsyr(static_cast<clblasOrder>(args.layout),
|
||||||
|
@ -91,6 +97,24 @@ class TestXsyr {
|
||||||
clWaitForEvents(1, &event);
|
clWaitForEvents(1, &event);
|
||||||
return static_cast<StatusCode>(status);
|
return static_cast<StatusCode>(status);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
// CPU BLAS reference for SYR. The A and x buffers are read into zero-initialised host vectors,
// cblasXsyr is called on those vectors, and the resulting A matrix is written back into
// buffers.a_mat. Always returns StatusCode::kSuccess: no status is taken from the cblasXsyr call.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||||
|
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||||
|
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||||
|
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||||
|
cblasXsyr(convertToCBLAS(args.layout),
|
||||||
|
convertToCBLAS(args.triangle),
|
||||||
|
args.n, args.alpha,
|
||||||
|
x_vec_cpu, args.x_offset, args.x_inc,
|
||||||
|
a_mat_cpu, args.a_offset, args.a_ld);
|
||||||
|
// A is the only buffer written back, i.e. it carries the routine's output
buffers.a_mat.Write(queue, args.a_size, a_mat_cpu);
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||||
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
|
|
@ -19,7 +19,12 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include "wrapper_clblas.h"
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
#include "wrapper_cblas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -70,7 +75,7 @@ class TestXsyr2 {
|
||||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||||
|
|
||||||
// Describes how to run the CLBlast routine
|
// Describes how to run the CLBlast routine
|
||||||
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = Syr2(args.layout, args.triangle,
|
auto status = Syr2(args.layout, args.triangle,
|
||||||
|
@ -84,7 +89,8 @@ class TestXsyr2 {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||||
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = clblasXsyr2(static_cast<clblasOrder>(args.layout),
|
auto status = clblasXsyr2(static_cast<clblasOrder>(args.layout),
|
||||||
|
@ -97,6 +103,27 @@ class TestXsyr2 {
|
||||||
clWaitForEvents(1, &event);
|
clWaitForEvents(1, &event);
|
||||||
return static_cast<StatusCode>(status);
|
return static_cast<StatusCode>(status);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
// CPU BLAS reference for SYR2. The A, x and y buffers are read into zero-initialised host vectors,
// cblasXsyr2 is called on those vectors, and the resulting A matrix is written back into
// buffers.a_mat. Always returns StatusCode::kSuccess: no status is taken from the cblasXsyr2 call.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||||
|
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||||
|
std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0));
|
||||||
|
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||||
|
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||||
|
buffers.y_vec.Read(queue, args.y_size, y_vec_cpu);
|
||||||
|
cblasXsyr2(convertToCBLAS(args.layout),
|
||||||
|
convertToCBLAS(args.triangle),
|
||||||
|
args.n, args.alpha,
|
||||||
|
x_vec_cpu, args.x_offset, args.x_inc,
|
||||||
|
y_vec_cpu, args.y_offset, args.y_inc,
|
||||||
|
a_mat_cpu, args.a_offset, args.a_ld);
|
||||||
|
// A is the only buffer written back, i.e. it carries the routine's output
buffers.a_mat.Write(queue, args.a_size, a_mat_cpu);
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||||
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
|
|
@ -19,7 +19,12 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include "wrapper_clblas.h"
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
#include "wrapper_cblas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -65,7 +70,7 @@ class TestXtbmv {
|
||||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||||
|
|
||||||
// Describes how to run the CLBlast routine
|
// Describes how to run the CLBlast routine
|
||||||
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = Tbmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
|
auto status = Tbmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
|
||||||
|
@ -78,7 +83,8 @@ class TestXtbmv {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||||
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = clblasXtbmv<T>(static_cast<clblasOrder>(args.layout),
|
auto status = clblasXtbmv<T>(static_cast<clblasOrder>(args.layout),
|
||||||
|
@ -92,6 +98,26 @@ class TestXtbmv {
|
||||||
clWaitForEvents(1, &event);
|
clWaitForEvents(1, &event);
|
||||||
return static_cast<StatusCode>(status);
|
return static_cast<StatusCode>(status);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
// CPU BLAS reference for TBMV. The A and x buffers are read into zero-initialised host vectors,
// cblasXtbmv is called on those vectors (no alpha/beta scalars are passed), and the resulting x
// vector is written back into buffers.x_vec. Always returns StatusCode::kSuccess: no status is
// taken from the cblasXtbmv call.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||||
|
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||||
|
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||||
|
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||||
|
cblasXtbmv(convertToCBLAS(args.layout),
|
||||||
|
convertToCBLAS(args.triangle),
|
||||||
|
convertToCBLAS(args.a_transpose),
|
||||||
|
convertToCBLAS(args.diagonal),
|
||||||
|
// NOTE(review): args.kl presumably is the band-width argument of the routine; confirm in wrapper_cblas.h
args.n, args.kl,
|
||||||
|
a_mat_cpu, args.a_offset, args.a_ld,
|
||||||
|
x_vec_cpu, args.x_offset, args.x_inc);
|
||||||
|
// x is both input and output here: it is the only buffer written back
buffers.x_vec.Write(queue, args.x_size, x_vec_cpu);
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||||
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
|
|
@ -19,7 +19,12 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include "wrapper_clblas.h"
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
#include "wrapper_cblas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -65,7 +70,7 @@ class TestXtpmv {
|
||||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||||
|
|
||||||
// Describes how to run the CLBlast routine
|
// Describes how to run the CLBlast routine
|
||||||
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = Tpmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
|
auto status = Tpmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
|
||||||
|
@ -78,7 +83,8 @@ class TestXtpmv {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||||
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = clblasXtpmv<T>(static_cast<clblasOrder>(args.layout),
|
auto status = clblasXtpmv<T>(static_cast<clblasOrder>(args.layout),
|
||||||
|
@ -92,6 +98,26 @@ class TestXtpmv {
|
||||||
clWaitForEvents(1, &event);
|
clWaitForEvents(1, &event);
|
||||||
return static_cast<StatusCode>(status);
|
return static_cast<StatusCode>(status);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
// CPU BLAS reference for TPMV. The AP and x buffers are read into zero-initialised host vectors,
// cblasXtpmv is called on those vectors (the AP matrix is passed with an offset only, without a
// leading dimension), and the resulting x vector is written back into buffers.x_vec. Always returns
// StatusCode::kSuccess: no status is taken from the cblasXtpmv call.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
std::vector<T> ap_mat_cpu(args.ap_size, static_cast<T>(0));
|
||||||
|
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||||
|
buffers.ap_mat.Read(queue, args.ap_size, ap_mat_cpu);
|
||||||
|
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||||
|
cblasXtpmv(convertToCBLAS(args.layout),
|
||||||
|
convertToCBLAS(args.triangle),
|
||||||
|
convertToCBLAS(args.a_transpose),
|
||||||
|
convertToCBLAS(args.diagonal),
|
||||||
|
args.n,
|
||||||
|
ap_mat_cpu, args.ap_offset,
|
||||||
|
x_vec_cpu, args.x_offset, args.x_inc);
|
||||||
|
// x is both input and output here: it is the only buffer written back
buffers.x_vec.Write(queue, args.x_size, x_vec_cpu);
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||||
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
|
|
@ -19,7 +19,12 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include "wrapper_clblas.h"
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
#include "wrapper_cblas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -65,7 +70,7 @@ class TestXtrmv {
|
||||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||||
|
|
||||||
// Describes how to run the CLBlast routine
|
// Describes how to run the CLBlast routine
|
||||||
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = Trmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
|
auto status = Trmv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
|
||||||
|
@ -78,7 +83,8 @@ class TestXtrmv {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||||
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = clblasXtrmv<T>(static_cast<clblasOrder>(args.layout),
|
auto status = clblasXtrmv<T>(static_cast<clblasOrder>(args.layout),
|
||||||
|
@ -92,6 +98,26 @@ class TestXtrmv {
|
||||||
clWaitForEvents(1, &event);
|
clWaitForEvents(1, &event);
|
||||||
return static_cast<StatusCode>(status);
|
return static_cast<StatusCode>(status);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
// CPU BLAS reference for TRMV. The A and x buffers are read into zero-initialised host vectors,
// cblasXtrmv is called on those vectors (no alpha/beta scalars are passed), and the resulting x
// vector is written back into buffers.x_vec. Always returns StatusCode::kSuccess: no status is
// taken from the cblasXtrmv call.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||||
|
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||||
|
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||||
|
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||||
|
cblasXtrmv(convertToCBLAS(args.layout),
|
||||||
|
convertToCBLAS(args.triangle),
|
||||||
|
convertToCBLAS(args.a_transpose),
|
||||||
|
convertToCBLAS(args.diagonal),
|
||||||
|
args.n,
|
||||||
|
a_mat_cpu, args.a_offset, args.a_ld,
|
||||||
|
x_vec_cpu, args.x_offset, args.x_inc);
|
||||||
|
// x is both input and output here: it is the only buffer written back
buffers.x_vec.Write(queue, args.x_size, x_vec_cpu);
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||||
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
|
|
@ -19,7 +19,12 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include "wrapper_clblas.h"
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
#include "wrapper_cblas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -78,7 +83,7 @@ class TestXgemm {
|
||||||
static Transposes GetBTransposes(const Transposes &all) { return all; }
|
static Transposes GetBTransposes(const Transposes &all) { return all; }
|
||||||
|
|
||||||
// Describes how to run the CLBlast routine
|
// Describes how to run the CLBlast routine
|
||||||
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = Gemm(args.layout, args.a_transpose, args.b_transpose,
|
auto status = Gemm(args.layout, args.a_transpose, args.b_transpose,
|
||||||
|
@ -92,7 +97,8 @@ class TestXgemm {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||||
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = clblasXgemm(static_cast<clblasOrder>(args.layout),
|
auto status = clblasXgemm(static_cast<clblasOrder>(args.layout),
|
||||||
|
@ -106,6 +112,28 @@ class TestXgemm {
|
||||||
clWaitForEvents(1, &event);
|
clWaitForEvents(1, &event);
|
||||||
return static_cast<StatusCode>(status);
|
return static_cast<StatusCode>(status);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
// CPU BLAS reference for GEMM. The A, B and C buffers are read into zero-initialised host vectors,
// cblasXgemm is called on those vectors, and the resulting C matrix is written back into
// buffers.c_mat. Always returns StatusCode::kSuccess: no status is taken from the cblasXgemm call.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||||
|
std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
|
||||||
|
std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0));
|
||||||
|
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||||
|
buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
|
||||||
|
buffers.c_mat.Read(queue, args.c_size, c_mat_cpu);
|
||||||
|
cblasXgemm(convertToCBLAS(args.layout),
|
||||||
|
convertToCBLAS(args.a_transpose),
|
||||||
|
convertToCBLAS(args.b_transpose),
|
||||||
|
args.m, args.n, args.k, args.alpha,
|
||||||
|
a_mat_cpu, args.a_offset, args.a_ld,
|
||||||
|
b_mat_cpu, args.b_offset, args.b_ld, args.beta,
|
||||||
|
c_mat_cpu, args.c_offset, args.c_ld);
|
||||||
|
// C is the only buffer written back, i.e. it carries the routine's output
buffers.c_mat.Write(queue, args.c_size, c_mat_cpu);
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||||
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
|
|
@ -19,7 +19,12 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include "wrapper_clblas.h"
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
#include "wrapper_cblas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -78,7 +83,7 @@ class TestXhemm {
|
||||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||||
|
|
||||||
// Describes how to run the CLBlast routine
|
// Describes how to run the CLBlast routine
|
||||||
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = Hemm(args.layout, args.side, args.triangle,
|
auto status = Hemm(args.layout, args.side, args.triangle,
|
||||||
|
@ -92,7 +97,8 @@ class TestXhemm {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||||
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = clblasXhemm(static_cast<clblasOrder>(args.layout),
|
auto status = clblasXhemm(static_cast<clblasOrder>(args.layout),
|
||||||
|
@ -106,6 +112,28 @@ class TestXhemm {
|
||||||
clWaitForEvents(1, &event);
|
clWaitForEvents(1, &event);
|
||||||
return static_cast<StatusCode>(status);
|
return static_cast<StatusCode>(status);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
// CPU BLAS reference for HEMM. The A, B and C buffers are read into zero-initialised host vectors,
// cblasXhemm is called on those vectors, and the resulting C matrix is written back into
// buffers.c_mat. Always returns StatusCode::kSuccess: no status is taken from the cblasXhemm call.
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||||
|
std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
|
||||||
|
std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0));
|
||||||
|
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||||
|
buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
|
||||||
|
buffers.c_mat.Read(queue, args.c_size, c_mat_cpu);
|
||||||
|
cblasXhemm(convertToCBLAS(args.layout),
|
||||||
|
convertToCBLAS(args.side),
|
||||||
|
convertToCBLAS(args.triangle),
|
||||||
|
args.m, args.n, args.alpha,
|
||||||
|
a_mat_cpu, args.a_offset, args.a_ld,
|
||||||
|
b_mat_cpu, args.b_offset, args.b_ld, args.beta,
|
||||||
|
c_mat_cpu, args.c_offset, args.c_ld);
|
||||||
|
// C is the only buffer written back, i.e. it carries the routine's output
buffers.c_mat.Write(queue, args.c_size, c_mat_cpu);
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||||
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
|
|
@ -19,7 +19,12 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include "wrapper_clblas.h"
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
#include "wrapper_cblas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -76,7 +81,7 @@ class TestXher2k {
|
||||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||||
|
|
||||||
// Describes how to run the CLBlast routine
|
// Describes how to run the CLBlast routine
|
||||||
static StatusCode RunRoutine(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) {
|
static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto alpha2 = T{args.alpha, args.alpha};
|
auto alpha2 = T{args.alpha, args.alpha};
|
||||||
|
@ -91,7 +96,8 @@ class TestXher2k {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||||
static StatusCode RunReference(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) {
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
static StatusCode RunReference1(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto alpha2 = T{args.alpha, args.alpha};
|
auto alpha2 = T{args.alpha, args.alpha};
|
||||||
|
@ -106,6 +112,29 @@ class TestXher2k {
|
||||||
clWaitForEvents(1, &event);
|
clWaitForEvents(1, &event);
|
||||||
return static_cast<StatusCode>(status);
|
return static_cast<StatusCode>(status);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
// CPU BLAS reference for HER2K. Note the two type parameters: the arguments are Arguments<U> while
// the buffers hold elements of type T. The A, B and C buffers are read into zero-initialised host
// vectors, cblasXher2k is called on those vectors, and the resulting C matrix is written back into
// buffers.c_mat. Always returns StatusCode::kSuccess: no status is taken from the cblasXher2k call.
static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||||
|
std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
|
||||||
|
std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0));
|
||||||
|
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||||
|
buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
|
||||||
|
buffers.c_mat.Read(queue, args.c_size, c_mat_cpu);
|
||||||
|
// alpha is widened from U to T by using args.alpha for both constructor arguments
// (presumably the real and imaginary parts of a complex T; confirm), while beta is passed as-is
auto alpha2 = T{args.alpha, args.alpha};
|
||||||
|
cblasXher2k(convertToCBLAS(args.layout),
|
||||||
|
convertToCBLAS(args.triangle),
|
||||||
|
convertToCBLAS(args.a_transpose),
|
||||||
|
args.n, args.k, alpha2,
|
||||||
|
a_mat_cpu, args.a_offset, args.a_ld,
|
||||||
|
b_mat_cpu, args.b_offset, args.b_ld, args.beta,
|
||||||
|
c_mat_cpu, args.c_offset, args.c_ld);
|
||||||
|
// C is the only buffer written back, i.e. it carries the routine's output
buffers.c_mat.Write(queue, args.c_size, c_mat_cpu);
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||||
static std::vector<T> DownloadResult(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
|
static std::vector<T> DownloadResult(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
|
|
@ -19,7 +19,12 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include "wrapper_clblas.h"
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
#include "wrapper_cblas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -69,7 +74,7 @@ class TestXherk {
|
||||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||||
|
|
||||||
// Describes how to run the CLBlast routine
|
// Describes how to run the CLBlast routine
|
||||||
static StatusCode RunRoutine(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) {
|
static StatusCode RunRoutine(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = Herk(args.layout, args.triangle, args.a_transpose,
|
auto status = Herk(args.layout, args.triangle, args.a_transpose,
|
||||||
|
@ -82,7 +87,8 @@ class TestXherk {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||||
static StatusCode RunReference(const Arguments<U> &args, const Buffers<T> &buffers, Queue &queue) {
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
static StatusCode RunReference1(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = clblasXherk(static_cast<clblasOrder>(args.layout),
|
auto status = clblasXherk(static_cast<clblasOrder>(args.layout),
|
||||||
|
@ -95,6 +101,25 @@ class TestXherk {
|
||||||
clWaitForEvents(1, &event);
|
clWaitForEvents(1, &event);
|
||||||
return static_cast<StatusCode>(status);
|
return static_cast<StatusCode>(status);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
static StatusCode RunReference2(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||||
|
std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0));
|
||||||
|
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||||
|
buffers.c_mat.Read(queue, args.c_size, c_mat_cpu);
|
||||||
|
cblasXherk(convertToCBLAS(args.layout),
|
||||||
|
convertToCBLAS(args.triangle),
|
||||||
|
convertToCBLAS(args.a_transpose),
|
||||||
|
args.n, args.k, args.alpha,
|
||||||
|
a_mat_cpu, args.a_offset, args.a_ld, args.beta,
|
||||||
|
c_mat_cpu, args.c_offset, args.c_ld);
|
||||||
|
buffers.c_mat.Write(queue, args.c_size, c_mat_cpu);
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||||
static std::vector<T> DownloadResult(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
|
static std::vector<T> DownloadResult(const Arguments<U> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
|
|
@ -19,7 +19,12 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include "wrapper_clblas.h"
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
#include "wrapper_cblas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -78,7 +83,7 @@ class TestXsymm {
|
||||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||||
|
|
||||||
// Describes how to run the CLBlast routine
|
// Describes how to run the CLBlast routine
|
||||||
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = Symm(args.layout, args.side, args.triangle,
|
auto status = Symm(args.layout, args.side, args.triangle,
|
||||||
|
@ -92,7 +97,8 @@ class TestXsymm {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||||
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = clblasXsymm(static_cast<clblasOrder>(args.layout),
|
auto status = clblasXsymm(static_cast<clblasOrder>(args.layout),
|
||||||
|
@ -106,6 +112,28 @@ class TestXsymm {
|
||||||
clWaitForEvents(1, &event);
|
clWaitForEvents(1, &event);
|
||||||
return static_cast<StatusCode>(status);
|
return static_cast<StatusCode>(status);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||||
|
std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
|
||||||
|
std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0));
|
||||||
|
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||||
|
buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
|
||||||
|
buffers.c_mat.Read(queue, args.c_size, c_mat_cpu);
|
||||||
|
cblasXsymm(convertToCBLAS(args.layout),
|
||||||
|
convertToCBLAS(args.side),
|
||||||
|
convertToCBLAS(args.triangle),
|
||||||
|
args.m, args.n, args.alpha,
|
||||||
|
a_mat_cpu, args.a_offset, args.a_ld,
|
||||||
|
b_mat_cpu, args.b_offset, args.b_ld, args.beta,
|
||||||
|
c_mat_cpu, args.c_offset, args.c_ld);
|
||||||
|
buffers.c_mat.Write(queue, args.c_size, c_mat_cpu);
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||||
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
|
|
@ -19,7 +19,12 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include "wrapper_clblas.h"
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
#include "wrapper_cblas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -76,7 +81,7 @@ class TestXsyr2k {
|
||||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||||
|
|
||||||
// Describes how to run the CLBlast routine
|
// Describes how to run the CLBlast routine
|
||||||
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = Syr2k(args.layout, args.triangle, args.a_transpose,
|
auto status = Syr2k(args.layout, args.triangle, args.a_transpose,
|
||||||
|
@ -90,7 +95,8 @@ class TestXsyr2k {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||||
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = clblasXsyr2k(static_cast<clblasOrder>(args.layout),
|
auto status = clblasXsyr2k(static_cast<clblasOrder>(args.layout),
|
||||||
|
@ -104,6 +110,28 @@ class TestXsyr2k {
|
||||||
clWaitForEvents(1, &event);
|
clWaitForEvents(1, &event);
|
||||||
return static_cast<StatusCode>(status);
|
return static_cast<StatusCode>(status);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||||
|
std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
|
||||||
|
std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0));
|
||||||
|
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||||
|
buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
|
||||||
|
buffers.c_mat.Read(queue, args.c_size, c_mat_cpu);
|
||||||
|
cblasXsyr2k(convertToCBLAS(args.layout),
|
||||||
|
convertToCBLAS(args.triangle),
|
||||||
|
convertToCBLAS(args.a_transpose),
|
||||||
|
args.n, args.k, args.alpha,
|
||||||
|
a_mat_cpu, args.a_offset, args.a_ld,
|
||||||
|
b_mat_cpu, args.b_offset, args.b_ld, args.beta,
|
||||||
|
c_mat_cpu, args.c_offset, args.c_ld);
|
||||||
|
buffers.c_mat.Write(queue, args.c_size, c_mat_cpu);
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||||
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
|
|
@ -19,7 +19,12 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include "wrapper_clblas.h"
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
#include "wrapper_cblas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -69,7 +74,7 @@ class TestXsyrk {
|
||||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||||
|
|
||||||
// Describes how to run the CLBlast routine
|
// Describes how to run the CLBlast routine
|
||||||
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = Syrk(args.layout, args.triangle, args.a_transpose,
|
auto status = Syrk(args.layout, args.triangle, args.a_transpose,
|
||||||
|
@ -82,7 +87,8 @@ class TestXsyrk {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||||
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = clblasXsyrk(static_cast<clblasOrder>(args.layout),
|
auto status = clblasXsyrk(static_cast<clblasOrder>(args.layout),
|
||||||
|
@ -95,6 +101,25 @@ class TestXsyrk {
|
||||||
clWaitForEvents(1, &event);
|
clWaitForEvents(1, &event);
|
||||||
return static_cast<StatusCode>(status);
|
return static_cast<StatusCode>(status);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||||
|
std::vector<T> c_mat_cpu(args.c_size, static_cast<T>(0));
|
||||||
|
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||||
|
buffers.c_mat.Read(queue, args.c_size, c_mat_cpu);
|
||||||
|
cblasXsyrk(convertToCBLAS(args.layout),
|
||||||
|
convertToCBLAS(args.triangle),
|
||||||
|
convertToCBLAS(args.a_transpose),
|
||||||
|
args.n, args.k, args.alpha,
|
||||||
|
a_mat_cpu, args.a_offset, args.a_ld, args.beta,
|
||||||
|
c_mat_cpu, args.c_offset, args.c_ld);
|
||||||
|
buffers.c_mat.Write(queue, args.c_size, c_mat_cpu);
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||||
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
|
|
@ -19,7 +19,12 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
#include "wrapper_clblas.h"
|
||||||
|
#endif
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
#include "wrapper_cblas.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -69,7 +74,7 @@ class TestXtrmm {
|
||||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||||
|
|
||||||
// Describes how to run the CLBlast routine
|
// Describes how to run the CLBlast routine
|
||||||
static StatusCode RunRoutine(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = Trmm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal,
|
auto status = Trmm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal,
|
||||||
|
@ -82,7 +87,8 @@ class TestXtrmm {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||||
static StatusCode RunReference(const Arguments<T> &args, const Buffers<T> &buffers, Queue &queue) {
|
#ifdef CLBLAST_REF_CLBLAS
|
||||||
|
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event = cl_event{};
|
auto event = cl_event{};
|
||||||
auto status = clblasXtrmm(static_cast<clblasOrder>(args.layout),
|
auto status = clblasXtrmm(static_cast<clblasOrder>(args.layout),
|
||||||
|
@ -97,6 +103,27 @@ class TestXtrmm {
|
||||||
clWaitForEvents(1, &event);
|
clWaitForEvents(1, &event);
|
||||||
return static_cast<StatusCode>(status);
|
return static_cast<StatusCode>(status);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||||
|
#ifdef CLBLAST_REF_CBLAS
|
||||||
|
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||||
|
std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
|
||||||
|
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||||
|
buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
|
||||||
|
cblasXtrmm(convertToCBLAS(args.layout),
|
||||||
|
convertToCBLAS(args.side),
|
||||||
|
convertToCBLAS(args.triangle),
|
||||||
|
convertToCBLAS(args.a_transpose),
|
||||||
|
convertToCBLAS(args.diagonal),
|
||||||
|
args.m, args.n, args.alpha,
|
||||||
|
a_mat_cpu, args.a_offset, args.a_ld,
|
||||||
|
b_mat_cpu, args.b_offset, args.b_ld);
|
||||||
|
buffers.b_mat.Write(queue, args.b_size, b_mat_cpu);
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||||
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||||
|
|
1674
test/wrapper_cblas.h
Normal file
1674
test/wrapper_cblas.h
Normal file
File diff suppressed because it is too large
Load diff
|
@ -65,7 +65,7 @@ template <typename T>
|
||||||
clblasStatus clblasXrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
|
clblasStatus clblasXrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
|
||||||
cl_mem sd2_buffer, const size_t sd2_offset,
|
cl_mem sd2_buffer, const size_t sd2_offset,
|
||||||
cl_mem sx1_buffer, const size_t sx1_offset,
|
cl_mem sx1_buffer, const size_t sx1_offset,
|
||||||
cl_mem sy1_buffer, const size_t sy1_offset,
|
const cl_mem sy1_buffer, const size_t sy1_offset,
|
||||||
cl_mem sparam_buffer, const size_t sparam_offset,
|
cl_mem sparam_buffer, const size_t sparam_offset,
|
||||||
cl_uint num_queues, cl_command_queue *queues,
|
cl_uint num_queues, cl_command_queue *queues,
|
||||||
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events);
|
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events);
|
||||||
|
@ -73,7 +73,7 @@ template <>
|
||||||
clblasStatus clblasXrotmg<float>(cl_mem sd1_buffer, const size_t sd1_offset,
|
clblasStatus clblasXrotmg<float>(cl_mem sd1_buffer, const size_t sd1_offset,
|
||||||
cl_mem sd2_buffer, const size_t sd2_offset,
|
cl_mem sd2_buffer, const size_t sd2_offset,
|
||||||
cl_mem sx1_buffer, const size_t sx1_offset,
|
cl_mem sx1_buffer, const size_t sx1_offset,
|
||||||
cl_mem sy1_buffer, const size_t sy1_offset,
|
const cl_mem sy1_buffer, const size_t sy1_offset,
|
||||||
cl_mem sparam_buffer, const size_t sparam_offset,
|
cl_mem sparam_buffer, const size_t sparam_offset,
|
||||||
cl_uint num_queues, cl_command_queue *queues,
|
cl_uint num_queues, cl_command_queue *queues,
|
||||||
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
|
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
|
||||||
|
@ -88,7 +88,7 @@ template <>
|
||||||
clblasStatus clblasXrotmg<double>(cl_mem sd1_buffer, const size_t sd1_offset,
|
clblasStatus clblasXrotmg<double>(cl_mem sd1_buffer, const size_t sd1_offset,
|
||||||
cl_mem sd2_buffer, const size_t sd2_offset,
|
cl_mem sd2_buffer, const size_t sd2_offset,
|
||||||
cl_mem sx1_buffer, const size_t sx1_offset,
|
cl_mem sx1_buffer, const size_t sx1_offset,
|
||||||
cl_mem sy1_buffer, const size_t sy1_offset,
|
const cl_mem sy1_buffer, const size_t sy1_offset,
|
||||||
cl_mem sparam_buffer, const size_t sparam_offset,
|
cl_mem sparam_buffer, const size_t sparam_offset,
|
||||||
cl_uint num_queues, cl_command_queue *queues,
|
cl_uint num_queues, cl_command_queue *queues,
|
||||||
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
|
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
|
||||||
|
|
Loading…
Reference in a new issue