Merge pull request #7 from CNugteren/development

Update to version 0.2.0
This commit is contained in:
Cedric Nugteren 2015-06-21 09:15:41 +02:00
commit 18251df848
51 changed files with 2021 additions and 272 deletions

View file

@ -1,7 +1,18 @@
Version 0.2.0
- Added support for complex conjugate transpose
- Several host-code performance improvements
- Improved testing infrastructure and coverage
- Added level-2 routines:
* SGEMV/DGEMV/CGEMV/ZGEMV
- Added level-3 routines:
* CGEMM/ZGEMM
* CSYMM/ZSYMM
Version 0.1.0
- Initial preview version release to GitHub
- Supported level-1 routines:
SAXPY/DAXPY/CAXPY/ZAXPY
* SAXPY/DAXPY/CAXPY/ZAXPY
- Supported level-3 routines:
SGEMM/DGEMM, SSYMM/DSYMM
* SGEMM/DGEMM
* SSYMM/DSYMM

View file

@ -13,7 +13,7 @@
cmake_minimum_required(VERSION 2.8.10)
project("clblast" CXX)
set(clblast_VERSION_MAJOR 0)
set(clblast_VERSION_MINOR 1)
set(clblast_VERSION_MINOR 2)
set(clblast_VERSION_PATCH 0)
# Options and their default values
@ -93,11 +93,12 @@ include_directories(${clblast_SOURCE_DIR}/include ${OPENCL_INCLUDE_DIRS})
# ==================================================================================================
# Sets the supported routines and the used kernels. New routines and kernels should be added here.
set(KERNELS copy pad transpose padtranspose xaxpy xgemm)
set(KERNELS copy pad transpose padtranspose xaxpy xgemv xgemm)
set(SAMPLE_PROGRAMS sgemm)
set(ROUTINES_XY xaxpy)
set(ROUTINES_AXY xgemv)
set(ROUTINES_ABC xgemm xsymm)
set(ROUTINES ${ROUTINES_XY} ${ROUTINES_ABC})
set(ROUTINES ${ROUTINES_XY} ${ROUTINES_AXY} ${ROUTINES_ABC})
# ==================================================================================================
@ -169,6 +170,7 @@ if(TESTS)
# Creates the common correctness-tests objects (requires CMake 2.8.8)
add_library(test_correctness_common OBJECT test/correctness/tester.cc)
add_library(test_correctness_xy OBJECT test/correctness/testxy.cc)
add_library(test_correctness_axy OBJECT test/correctness/testaxy.cc)
add_library(test_correctness_abc OBJECT test/correctness/testabc.cc)
# Compiles the correctness-tests
@ -180,6 +182,14 @@ if(TESTS)
target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
install(TARGETS test_${ROUTINE} DESTINATION bin)
endforeach()
foreach(ROUTINE ${ROUTINES_AXY})
add_executable(test_${ROUTINE}
$<TARGET_OBJECTS:test_correctness_common>
$<TARGET_OBJECTS:test_correctness_axy>
test/correctness/routines/${ROUTINE}.cc)
target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
install(TARGETS test_${ROUTINE} DESTINATION bin)
endforeach()
foreach(ROUTINE ${ROUTINES_ABC})
add_executable(test_${ROUTINE}
$<TARGET_OBJECTS:test_correctness_common>

View file

@ -4,7 +4,7 @@ CLBlast: The tuned OpenCL BLAS library
CLBlast is a modern, lightweight, performant and tunable OpenCL BLAS library written in C++11. It is designed to leverage the full performance potential of a wide variety of OpenCL devices from different vendors, including desktop and laptop GPUs, embedded GPUs, and other accelerators. CLBlast implements BLAS routines: basic linear algebra subprograms operating on vectors and matrices.
__Note that the CLBlast library is actively being developed, and is not mature enough for production environments__. This preview-version supports only a minimal amount of routines (including `sgemm` and `dgemm`): others will be added in due time. It also lacks extensive tuning and testing on some common OpenCL platforms: __out-of-the-box performance on some devices might be poor__. See below for more details.
__Note that the CLBlast library is actively being developed, and is not mature enough for production environments__. This preview-version supports only a minimal amount of routines (including `gemm` and `gemv`): others will be added in due time. It also lacks extensive tuning and testing on some common OpenCL platforms: __out-of-the-box performance on some devices might be poor__. See below for more details.
Why CLBlast and not clBLAS or cuBLAS?
@ -147,7 +147,7 @@ CLBlast is in active development and currently does not support the full set of
| Level-2 | S | D | C | Z | Notes |
| ---------|---|---|---|---|---------|
| xGEMV | | | | | |
| xGEMV |`x`|`x`|`x`|`x`| |
| xGBMV | | | | | |
| xHEMV | - | - | | | |
| xHBMV | - | - | | | |
@ -175,8 +175,8 @@ CLBlast is in active development and currently does not support the full set of
| Level-3 | S | D | C | Z | Notes |
| ---------|---|---|---|---|---------|
| xGEMM |`x`|`x`| | | |
| xSYMM |`x`|`x`| | | |
| xGEMM |`x`|`x`|`x`|`x`| |
| xSYMM |`x`|`x`|`x`|`x`| |
| xHEMM | - | - | | | |
| xSYRK | | | | | |
| xHERK | - | - | | | |
@ -225,5 +225,4 @@ To-do list before release of version 1.0
- Further reduce the likelihood of crashes:
* Add checks for proper command-line arguments in the tuner, tester and client
* Add checks for valid database parameters
* Distinguish between short (smoke) and long tests
* Test in multi-threaded environments

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -660,7 +660,7 @@ checkMatrixSizes(
// Note: this is a hack to get the xsymm tests to work.
// TODO: Find out why "memUsed" is set to 0 in some cases!
memUsed = matrSize;
memUsed = offA + matrSize;
//printf("%lu required but found %lu\n", memUsed/tsize, memSize/tsize);
if (( memUsed > memSize ) || (offA + matrSize < offA)) {

View file

@ -85,7 +85,7 @@ enum class Precision { kHalf = 16, kSingle = 32, kDouble = 64,
// Templated-precision vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY
template <typename T>
StatusCode Axpy(const size_t m, const T alpha,
StatusCode Axpy(const size_t n, const T alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
@ -93,10 +93,21 @@ StatusCode Axpy(const size_t m, const T alpha,
// =================================================================================================
// BLAS level-2 (matrix-vector) routines
// Templated-precision generalized matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV
template <typename T>
StatusCode Gemv(const Layout layout, const Transpose transpose_a,
const size_t m, const size_t n,
const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
// =================================================================================================
// BLAS level-3 (matrix-matrix) routines
// Templated-precision generalized matrix multiplication: SGEMM/DGEMM
// Templated-precision generalized matrix-matrix multiplication: SGEMM/DGEMM
template <typename T>
StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpose transpose_b,
const size_t m, const size_t n, const size_t k,
@ -107,7 +118,7 @@ StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpos
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
// Templated-precision symmetric matrix multiplication: SSYMM/DSYMM
// Templated-precision symmetric matrix-matrix multiplication: SSYMM/DSYMM
template <typename T>
StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
const size_t m, const size_t n,

View file

@ -134,8 +134,7 @@ class Platform: public Object {
}
// Accessors to the private data-member
cl_platform_id operator()() const { return platform_; }
cl_platform_id& operator()() { return platform_; }
const cl_platform_id& operator()() const { return platform_; }
private:
cl_platform_id platform_;
};
@ -193,8 +192,7 @@ class Device: public Object {
}
// Accessors to the private data-member
cl_device_id operator()() const { return device_; }
cl_device_id& operator()() { return device_; }
const cl_device_id& operator()() const { return device_; }
private:
// Helper functions
@ -259,8 +257,7 @@ class Context: public ObjectWithState {
}
// Accessors to the private data-member
cl_context operator()() const { return context_; }
cl_context& operator()() { return context_; }
const cl_context& operator()() const { return context_; }
private:
cl_context context_;
};
@ -296,16 +293,6 @@ class Program: public ObjectWithState {
swap(*this, other);
return *this;
}
/*
TODO: Implement move construction/assignment?
Program(Program &&other) {
clRetainProgram(program_);
swap(*this, other);
}
Program& operator=(Program &&other) {
swap(*this, other);
return *this;
}*/
friend void swap(Program &first, Program &second) {
std::swap(first.length_, second.length_);
std::swap(first.source_, second.source_);
@ -327,8 +314,7 @@ class Program: public ObjectWithState {
}
// Accessors to the private data-member
cl_program operator()() const { return program_; }
cl_program& operator()() { return program_; }
const cl_program& operator()() const { return program_; }
private:
size_t length_;
std::vector<char> source_;
@ -382,8 +368,7 @@ class Kernel: public ObjectWithState {
}
// Accessors to the private data-member
cl_kernel operator()() const { return kernel_; }
cl_kernel& operator()() { return kernel_; }
const cl_kernel& operator()() const { return kernel_; }
private:
cl_kernel kernel_;
};
@ -445,8 +430,7 @@ class CommandQueue: public ObjectWithState {
}
// Accessors to the private data-member
cl_command_queue operator()() const { return queue_; }
cl_command_queue& operator()() { return queue_; }
const cl_command_queue& operator()() const { return queue_; }
private:
cl_command_queue queue_;
};
@ -511,8 +495,7 @@ class Buffer: public ObjectWithState {
}
// Accessors to the private data-member
cl_mem operator()() const { return buffer_; }
cl_mem& operator()() { return buffer_; }
const cl_mem& operator()() const { return buffer_; }
private:
cl_mem buffer_;
};

View file

@ -54,6 +54,7 @@ class Database {
// The database consists of separate database entries, stored together in a vector
static const DatabaseEntry XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble;
static const DatabaseEntry XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble;
static const DatabaseEntry XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble;
static const DatabaseEntry CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble;
static const DatabaseEntry PadSingle, PadDouble, PadComplexSingle, PadComplexDouble;

View file

@ -0,0 +1,129 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file populates the database with best-found tuning parameters for the Xgemv kernels.
//
// =================================================================================================
namespace clblast {
// =================================================================================================
// Best-found tuning parameters for the Xgemv kernel, single precision (FP32).
// Entries are grouped by OpenCL device type and vendor string, each mapping a
// device name to a parameter set; the final CL_DEVICE_TYPE_ALL/kDefault entry
// is the fall-back used when no exact device match is found.
// NOTE(review): WGS1-3/WPT1-3/VW2-3 presumably mean work-group size,
// work-per-thread and vector width for three kernel variants — confirm against
// the xgemv OpenCL kernel source.
const Database::DatabaseEntry Database::XgemvSingle = {
"Xgemv", Precision::kSingle, {
{ // NVIDIA GPUs
CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
{ "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
{ "Tesla K20m", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
{ "Tesla K40m", { {"WGS1",256}, {"WPT1",1}, {"WGS2",256}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",4} } },
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
}
},
{ // Intel GPUs
CL_DEVICE_TYPE_GPU, "Intel", {
{ "Iris", { {"WGS1",256}, {"WPT1",2}, {"WGS2",64}, {"WPT2",4}, {"VW2",4}, {"WGS3",256}, {"WPT3",2}, {"VW3",8} } },
}
},
{ // Default
CL_DEVICE_TYPE_ALL, kDefault, {
{ kDefault, { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
}
},
}
};
// =================================================================================================
// Best-found tuning parameters for the Xgemv kernel, double precision (FP64).
// Same structure as XgemvSingle: per-vendor device entries plus a kDefault
// fall-back for unmatched devices.
const Database::DatabaseEntry Database::XgemvDouble = {
"Xgemv", Precision::kDouble, {
{ // NVIDIA GPUs
CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
{ "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
{ "Tesla K20m", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
{ "Tesla K40m", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
}
},
{ // Intel GPUs
CL_DEVICE_TYPE_GPU, "Intel", {
// No device entries: the kDefault fall-back below applies.
// NOTE(review): presumably left empty because the listed Intel GPUs lack
// FP64 (cl_khr_fp64) support — confirm.
}
},
{ // Default
CL_DEVICE_TYPE_ALL, kDefault, {
{ kDefault, { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
}
},
}
};
// =================================================================================================
// Best-found tuning parameters for the Xgemv kernel, complex single precision
// (two FP32 components). Same structure as XgemvSingle, with a kDefault
// fall-back for unmatched devices.
const Database::DatabaseEntry Database::XgemvComplexSingle = {
"Xgemv", Precision::kComplexSingle, {
{ // NVIDIA GPUs
CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
{ "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
{ "Tesla K20m", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
{ "Tesla K40m", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
}
},
{ // Intel GPUs
CL_DEVICE_TYPE_GPU, "Intel", {
{ "Iris", { {"WGS1",256}, {"WPT1",1}, {"WGS2",64}, {"WPT2",4}, {"VW2",2}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
}
},
{ // Default
CL_DEVICE_TYPE_ALL, kDefault, {
{ kDefault, { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
}
},
}
};
// =================================================================================================
// Best-found tuning parameters for the Xgemv kernel, complex double precision
// (two FP64 components). Same structure as XgemvSingle, with a kDefault
// fall-back for unmatched devices.
const Database::DatabaseEntry Database::XgemvComplexDouble = {
"Xgemv", Precision::kComplexDouble, {
{ // NVIDIA GPUs
CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
{ "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
{ "Tesla K20m", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
{ "Tesla K40m", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
}
},
{ // Intel GPUs
CL_DEVICE_TYPE_GPU, "Intel", {
// No device entries: the kDefault fall-back below applies.
// NOTE(review): presumably left empty because the listed Intel GPUs lack
// FP64 (cl_khr_fp64) support — confirm.
}
},
{ // Default
CL_DEVICE_TYPE_ALL, kDefault, {
{ kDefault, { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
}
},
}
};
// =================================================================================================
} // namespace clblast

View file

@ -29,11 +29,7 @@ namespace clblast {
class Routine {
public:
// Khronos OpenCL extensions
const std::string kKhronosHalfPrecision = "cl_khr_fp16";
const std::string kKhronosDoublePrecision = "cl_khr_fp64";
// New data-type:tThe cache of compiled OpenCL programs, along with some meta-data
// The cache of compiled OpenCL programs, along with some meta-data
struct ProgramCache {
Program program;
std::string device_name;
@ -95,13 +91,13 @@ class Routine {
const size_t dest_one, const size_t dest_two,
const size_t dest_ld, const size_t dest_offset,
const Buffer &dest,
const bool do_transpose, const bool pad,
const Program &program);
const bool do_transpose, const bool do_conjugate,
const bool pad, const Program &program);
// Queries the cache and retrieve either a matching program or a boolean whether a match exists.
// The first assumes that the program is available in the cache and will throw an exception
// otherwise.
Program GetProgramFromCache() const;
const Program& GetProgramFromCache() const;
bool ProgramIsInCache() const;
// Non-static variable for the precision. Note that the same variable (but static) might exist in

View file

@ -0,0 +1,46 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xgemv routine. The precision is implemented using a template argument.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XGEMV_H_
#define CLBLAST_ROUTINES_XGEMV_H_
#include "internal/routine.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
// Routine class for the templated-precision GEMV (generalized matrix-vector
// multiplication) routine: SGEMV/DGEMV/CGEMV/ZGEMV. Derives from Routine,
// which provides kernel compilation, caching and launch infrastructure.
template <typename T>
class Xgemv: public Routine {
public:
// Binds the routine to an OpenCL command queue and a completion event.
Xgemv(CommandQueue &queue, Event &event);
// Templated-precision implementation of the routine.
// Parameters follow the BLAS xGEMV convention: layout (row/column major),
// a_transpose, the matrix dimensions m x n, scalars alpha/beta, matrix A
// (buffer, offset, leading dimension) and vectors x/y (buffer, offset,
// increment). Returns a StatusCode indicating success or the failure kind.
StatusCode DoGemv(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n,
const T alpha,
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer &y_buffer, const size_t y_offset, const size_t y_inc);
private:
// Static variable to get the precision (defined per template instantiation)
const static Precision precision_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XGEMV_H_
#endif

View file

@ -34,10 +34,20 @@ using Tuner3 = std::function<void(const Arguments<T>&,
const std::vector<T>&, const std::vector<T>&, std::vector<T>&,
cltune::Tuner&)>;
// As above, but now with an additional ID for the variation
template <typename T>
using Tuner3V = std::function<void(const Arguments<T>&, const size_t,
const std::vector<T>&, const std::vector<T>&, std::vector<T>&,
cltune::Tuner&)>;
// Tuner for vector-vector input
template <typename T>
void TunerXY(int argc, char* argv[], const Tuner2<T> &tune_function);
// Tuner for matrix-vector-vector input
template <typename T>
void TunerAXY(int argc, char* argv[], const size_t num_variations, const Tuner3V<T> &tune_function);
// Tuner for matrix-matrix input
template <typename T>
void TunerAB(int argc, char* argv[], const Tuner2<T> &tune_function);

View file

@ -31,6 +31,10 @@ namespace clblast {
using float2 = std::complex<float>;
using double2 = std::complex<double>;
// Khronos OpenCL extensions
const std::string kKhronosHalfPrecision = "cl_khr_fp16";
const std::string kKhronosDoublePrecision = "cl_khr_fp64";
// =================================================================================================
// The routine-specific arguments in string form
@ -64,6 +68,9 @@ constexpr auto kArgStepSize = "step";
constexpr auto kArgNumSteps = "num_steps";
constexpr auto kArgNumRuns = "runs";
// The client-specific arguments in string form
constexpr auto kArgFullTest = "full_test";
// The common arguments in string form
constexpr auto kArgPlatform = "platform";
constexpr auto kArgDevice = "device";
@ -105,6 +112,8 @@ struct Arguments {
size_t step = 1;
size_t num_steps = 0;
size_t num_runs = 10;
// Tester-specific arguments
bool full_test = false;
// Common arguments
size_t platform_id = 0;
size_t device_id = 0;
@ -143,6 +152,10 @@ bool CheckArgument(const int argc, char *argv[], std::string &help, const std::s
// Returns a random number to be used as a seed
unsigned int GetRandomSeed();
// Test/example data lower and upper limit
constexpr auto kTestDataLowerLimit = -2.0;
constexpr auto kTestDataUpperLimit = 2.0;
// Populates a vector with random data
template <typename T>
void PopulateVector(std::vector<T> &vector);

View file

@ -20,6 +20,9 @@
// BLAS level-1 includes
#include "internal/routines/xaxpy.h"
// BLAS level-2 includes
#include "internal/routines/xgemv.h"
// BLAS level-3 includes
#include "internal/routines/xgemm.h"
#include "internal/routines/xsymm.h"
@ -36,18 +39,18 @@ StatusCode Axpy(const size_t n, const T alpha,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = CommandQueue(*queue);
auto event_cpp = Event(*event);
auto xaxpy = Xaxpy<T>(queue_cpp, event_cpp);
auto routine = Xaxpy<T>(queue_cpp, event_cpp);
// Loads the kernel source-code as an include (C++11 raw string literal)
std::string kernel_source =
#include "kernels/xaxpy.opencl"
auto status = xaxpy.SetUp(kernel_source);
auto status = routine.SetUp(kernel_source);
if (status != StatusCode::kSuccess) { return status; }
// Runs the routine
return xaxpy.DoAxpy(n, alpha,
Buffer(x_buffer), x_offset, x_inc,
Buffer(y_buffer), y_offset, y_inc);
return routine.DoAxpy(n, alpha,
Buffer(x_buffer), x_offset, x_inc,
Buffer(y_buffer), y_offset, y_inc);
}
template StatusCode Axpy<float>(const size_t, const float,
const cl_mem, const size_t, const size_t,
@ -69,22 +72,70 @@ template StatusCode Axpy<double2>(const size_t, const double2,
// =================================================================================================
// BLAS level-2 (matrix-vector) routines
// GEMV
// Public C-API entry point for the templated-precision GEMV routine
// (SGEMV/DGEMV/CGEMV/ZGEMV). Wraps the raw cl_command_queue/cl_event handles
// into the C++ wrapper classes, compiles the Xgemv kernel source via SetUp,
// and forwards all arguments to Xgemv<T>::DoGemv. Returns the SetUp status on
// compilation failure, otherwise the status of DoGemv.
template <typename T>
StatusCode Gemv(const Layout layout, const Transpose transpose_a,
const size_t m, const size_t n, const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = CommandQueue(*queue);
auto event_cpp = Event(*event);
auto routine = Xgemv<T>(queue_cpp, event_cpp);
// Loads the kernel source-code as an include (C++11 raw string literal).
// Note: the #include must remain on its own line directly after the
// assignment — the preprocessor pastes the raw string literal here.
std::string kernel_source =
#include "kernels/xgemv.opencl"
auto status = routine.SetUp(kernel_source);
if (status != StatusCode::kSuccess) { return status; }
// Runs the routine
return routine.DoGemv(layout, transpose_a, m, n, alpha,
Buffer(a_buffer), a_offset, a_ld,
Buffer(x_buffer), x_offset, x_inc, beta,
Buffer(y_buffer), y_offset, y_inc);
}
// Explicit instantiations of Gemv for the four supported precisions:
// float (SGEMV), double (DGEMV), float2 (CGEMV) and double2 (ZGEMV).
template StatusCode Gemv<float>(const Layout, const Transpose,
const size_t, const size_t, const float,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t, const float,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Gemv<double>(const Layout, const Transpose,
const size_t, const size_t, const double,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t, const double,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Gemv<float2>(const Layout, const Transpose,
const size_t, const size_t, const float2,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t, const float2,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Gemv<double2>(const Layout, const Transpose,
const size_t, const size_t, const double2,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t, const double2,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// =================================================================================================
// BLAS level-3 (matrix-matrix) routines
// GEMM
template <typename T>
StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpose transpose_b,
const size_t m, const size_t n, const size_t k,
const T alpha,
const size_t m, const size_t n, const size_t k, const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = CommandQueue(*queue);
auto event_cpp = Event(*event);
auto xgemm = Xgemm<T>(queue_cpp, event_cpp);
auto routine = Xgemm<T>(queue_cpp, event_cpp);
// Loads the kernel source-code as an include (C++11 raw string literal)
std::string common_source1 =
@ -97,69 +148,54 @@ StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpos
#include "kernels/padtranspose.opencl"
std::string kernel_source =
#include "kernels/xgemm.opencl"
auto status = xgemm.SetUp(common_source1 + common_source2 + common_source3 + common_source4 +
kernel_source);
auto status = routine.SetUp(common_source1 + common_source2 + common_source3 + common_source4 +
kernel_source);
if (status != StatusCode::kSuccess) { return status; }
// Runs the routine
return xgemm.DoGemm(layout, transpose_a, transpose_b,
m, n, k,
alpha,
Buffer(a_buffer), a_offset, a_ld,
Buffer(b_buffer), b_offset, b_ld,
beta,
Buffer(c_buffer), c_offset, c_ld);
return routine.DoGemm(layout, transpose_a, transpose_b, m, n, k, alpha,
Buffer(a_buffer), a_offset, a_ld,
Buffer(b_buffer), b_offset, b_ld, beta,
Buffer(c_buffer), c_offset, c_ld);
}
template StatusCode Gemm<float>(const Layout, const Transpose, const Transpose,
const size_t, const size_t, const size_t,
const float,
const size_t, const size_t, const size_t, const float,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t,
const float,
const cl_mem, const size_t, const size_t, const float,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Gemm<double>(const Layout, const Transpose, const Transpose,
const size_t, const size_t, const size_t,
const double,
const size_t, const size_t, const size_t, const double,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t,
const double,
const cl_mem, const size_t, const size_t, const double,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
/*
template StatusCode Gemm<float2>(const Layout, const Transpose, const Transpose,
const size_t, const size_t, const size_t,
const float2,
const size_t, const size_t, const size_t, const float2,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t,
const float2,
const cl_mem, const size_t, const size_t, const float2,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Gemm<double2>(const Layout, const Transpose, const Transpose,
const size_t, const size_t, const size_t,
const double2,
const size_t, const size_t, const size_t, const double2,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t,
const double2,
const cl_mem, const size_t, const size_t, const double2,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
*/
// =================================================================================================
// SYMM
template <typename T>
StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
const size_t m, const size_t n,
const T alpha,
const size_t m, const size_t n, const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = CommandQueue(*queue);
auto event_cpp = Event(*event);
auto xsymm = Xsymm<T>(queue_cpp, event_cpp);
auto routine = Xsymm<T>(queue_cpp, event_cpp);
// Loads the kernel source-code as an include (C++11 raw string literal)
std::string common_source1 =
@ -172,53 +208,40 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
#include "kernels/padtranspose.opencl"
std::string kernel_source =
#include "kernels/xgemm.opencl"
auto status = xsymm.SetUp(common_source1 + common_source2 + common_source3 + common_source4 +
auto status = routine.SetUp(common_source1 + common_source2 + common_source3 + common_source4 +
kernel_source);
if (status != StatusCode::kSuccess) { return status; }
// Runs the routine
return xsymm.DoSymm(layout, side, triangle,
m, n,
alpha,
Buffer(a_buffer), a_offset, a_ld,
Buffer(b_buffer), b_offset, b_ld,
beta,
Buffer(c_buffer), c_offset, c_ld);
return routine.DoSymm(layout, side, triangle, m, n, alpha,
Buffer(a_buffer), a_offset, a_ld,
Buffer(b_buffer), b_offset, b_ld, beta,
Buffer(c_buffer), c_offset, c_ld);
}
template StatusCode Symm<float>(const Layout, const Side, const Triangle,
const size_t, const size_t,
const float,
const size_t, const size_t, const float,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t,
const float,
const cl_mem, const size_t, const size_t, const float,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Symm<double>(const Layout, const Side, const Triangle,
const size_t, const size_t,
const double,
const size_t, const size_t, const double,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t,
const double,
const cl_mem, const size_t, const size_t, const double,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
/*
template StatusCode Symm<float2>(const Layout, const Side, const Triangle,
const size_t, const size_t,
const float2,
const size_t, const size_t, const float2,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t,
const float2,
const cl_mem, const size_t, const size_t, const float2,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Symm<double2>(const Layout, const Side, const Triangle,
const size_t, const size_t,
const double2,
const size_t, const size_t, const double2,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t,
const double2,
const cl_mem, const size_t, const size_t, const double2,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
*/
// =================================================================================================
} // namespace clblast

View file

@ -13,6 +13,7 @@
#include "internal/database.h"
#include "internal/database/xaxpy.h"
#include "internal/database/xgemv.h"
#include "internal/database/xgemm.h"
#include "internal/database/copy.h"
#include "internal/database/pad.h"
@ -27,6 +28,7 @@ namespace clblast {
// Initializes the database
const std::vector<Database::DatabaseEntry> Database::database = {
XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble,
XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble,
XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble,
CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble,
PadSingle, PadDouble, PadComplexSingle, PadComplexDouble,

View file

@ -112,6 +112,13 @@ R"(
#define AXPBY(e, a, b, c, d) e = a*b + c*d
#endif
// The complex conjugate operation for complex transforms.
// For complex precisions (3232 = complex single, 6464 = complex double, per
// the PRECISION encoding) this negates the imaginary (.y) component; for real
// precisions it is a no-op so call sites compile unchanged.
// NOTE(review): the macro expands to two statements without a do/while(0)
// wrapper — call sites must use braces, e.g. `if (c) { COMPLEX_CONJUGATE(v); }`
// (the existing call sites do).
#if PRECISION == 3232 || PRECISION == 6464
#define COMPLEX_CONJUGATE(value) value.x = value.x; value.y = -value.y
#else
#define COMPLEX_CONJUGATE(value) value = value
#endif
// =================================================================================================
// End of the C++11 raw string literal

View file

@ -47,7 +47,8 @@ __kernel void PadMatrix(const int src_one, const int src_two,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest) {
__global real* dest,
const int do_conjugate) {
// Loops over the work per thread in both dimensions
#pragma unroll
@ -67,6 +68,7 @@ __kernel void PadMatrix(const int src_one, const int src_two,
}
// Stores the value in the destination matrix
if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
dest[id_two*dest_ld + id_one + dest_offset] = value;
}
}

View file

@ -40,7 +40,8 @@ __kernel void PadTransposeMatrix(const int src_one, const int src_two,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest) {
__global real* dest,
const int do_conjugate) {
// Local memory to store a tile of the matrix (for coalescing)
__local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
@ -83,12 +84,15 @@ __kernel void PadTransposeMatrix(const int src_one, const int src_two,
// Stores the transposed value in the destination matrix
if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
}
}
}
}
// =================================================================================================
// Same as UnPadCopyMatrix, but now also does the transpose
__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
__kernel void UnPadTransposeMatrix(const int src_one, const int src_two,

View file

@ -293,42 +293,43 @@ inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int
int idm = mg + get_group_id(0)*(MWG/VWM);
int idn = ng + get_group_id(1)*NWG;
int index = idn*(kSizeM/VWM) + idm;
realM cval = cgm[index];
#if VWM == 1
AXPBY(cgm[index], alpha, cpm[ni][mi], beta, cgm[index]);
AXPBY(cgm[index], alpha, cpm[ni][mi], beta, cval);
#elif VWM == 2
AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cgm[index].x);
AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cgm[index].y);
AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);
AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
#elif VWM == 4
AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cgm[index].x);
AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cgm[index].y);
AXPBY(cgm[index].z, alpha, cpm[ni][mi].z, beta, cgm[index].z);
AXPBY(cgm[index].w, alpha, cpm[ni][mi].w, beta, cgm[index].w);
AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);
AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
AXPBY(cgm[index].z, alpha, cpm[ni][mi].z, beta, cval.z);
AXPBY(cgm[index].w, alpha, cpm[ni][mi].w, beta, cval.w);
#elif VWM == 8
AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cgm[index].s0);
AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cgm[index].s1);
AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cgm[index].s2);
AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cgm[index].s3);
AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cgm[index].s4);
AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cgm[index].s5);
AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cgm[index].s6);
AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cgm[index].s7);
AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
#elif VWM == 16
AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cgm[index].s0);
AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cgm[index].s1);
AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cgm[index].s2);
AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cgm[index].s3);
AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cgm[index].s4);
AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cgm[index].s5);
AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cgm[index].s6);
AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cgm[index].s7);
AXPBY(cgm[index].s8, alpha, cpm[ni][mi].s8, beta, cgm[index].s8);
AXPBY(cgm[index].s9, alpha, cpm[ni][mi].s9, beta, cgm[index].s9);
AXPBY(cgm[index].sA, alpha, cpm[ni][mi].sA, beta, cgm[index].sA);
AXPBY(cgm[index].sB, alpha, cpm[ni][mi].sB, beta, cgm[index].sB);
AXPBY(cgm[index].sC, alpha, cpm[ni][mi].sC, beta, cgm[index].sC);
AXPBY(cgm[index].sD, alpha, cpm[ni][mi].sD, beta, cgm[index].sD);
AXPBY(cgm[index].sE, alpha, cpm[ni][mi].sE, beta, cgm[index].sE);
AXPBY(cgm[index].sF, alpha, cpm[ni][mi].sF, beta, cgm[index].sF);
AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
AXPBY(cgm[index].s8, alpha, cpm[ni][mi].s8, beta, cval.s8);
AXPBY(cgm[index].s9, alpha, cpm[ni][mi].s9, beta, cval.s9);
AXPBY(cgm[index].sA, alpha, cpm[ni][mi].sA, beta, cval.sA);
AXPBY(cgm[index].sB, alpha, cpm[ni][mi].sB, beta, cval.sB);
AXPBY(cgm[index].sC, alpha, cpm[ni][mi].sC, beta, cval.sC);
AXPBY(cgm[index].sD, alpha, cpm[ni][mi].sD, beta, cval.sD);
AXPBY(cgm[index].sE, alpha, cpm[ni][mi].sE, beta, cval.sE);
AXPBY(cgm[index].sF, alpha, cpm[ni][mi].sF, beta, cval.sF);
#endif
}
}

373
src/kernels/xgemv.opencl Normal file
View file

@ -0,0 +1,373 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains the Xgemv kernel for matrix-vector multiplication.
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// =================================================================================================
// Parameters set by the tuner or by the database. Here they are given a basic default value in
// case this kernel file is used outside of the CLBlast library.
// 1: For the full version of the kernel ('Xgemv')
#ifndef WGS1
  #define WGS1 64     // The local work-group size
#endif
#ifndef WPT1
  #define WPT1 1      // The amount of work-per-thread (output elements per work-item)
#endif
// 2: For the fast (non-rotated) version ('XgemvFast')
#ifndef WGS2
  #define WGS2 64     // The local work-group size
#endif
#ifndef WPT2
  #define WPT2 1      // The amount of work-per-thread (output elements per work-item)
#endif
#ifndef VW2
  #define VW2 1       // Vector width of matrix A loads (assumed to divide WPT2)
#endif
// 3: For the fast rotated version ('XgemvFastRot')
#ifndef WGS3
  #define WGS3 64     // The local work-group size
#endif
#ifndef WPT3
  #define WPT3 1      // The amount of work-per-thread (output elements per work-item)
#endif
#ifndef VW3
  #define VW3 1       // Vector width of matrix A loads (assumed to divide WGS3)
#endif
// =================================================================================================
// Full version of the kernel: computes y = alpha * A * x + beta * y for any m, n, offsets and
// leading dimension. Supports rotated (row-wise) access to the matrix A and optional per-element
// complex conjugation. Each work-item accumulates WPT1 output elements; the vector X is staged
// through local memory in WGS1-sized chunks, with a scalar tail loop for the remainder when 'n'
// is not divisible by WGS1.
__attribute__((reqd_work_group_size(WGS1, 1, 1)))
__kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
                    const int a_rotated,
                    const __global real* restrict agm, const int a_offset, const int a_ld,
                    const __global real* restrict xgm, const int x_offset, const int x_inc,
                    __global real* ygm, const int y_offset, const int y_inc,
                    const int do_conjugate) {

  // Local memory to stage a WGS1-sized chunk of the vector X
  __local real xlm[WGS1];

  // Initializes the accumulation registers (one per per-thread output element)
  real acc[WPT1];
  #pragma unroll
  for (int w=0; w<WPT1; ++w) {
    SetToZero(acc[w]);
  }

  // Divides the work in a main section (multiple of WGS1) and a tail section
  const int n_tail = n % WGS1;
  const int n_floor = n - n_tail;

  // Loops over work-group sized portions of the work
  for (int kwg=0; kwg<n_floor; kwg+=WGS1) {

    // Loads the vector X into local memory (one element per work-item)
    const int lid = get_local_id(0);
    xlm[lid] = xgm[(kwg + lid)*x_inc + x_offset];

    // Synchronizes all threads in a workgroup so the staged X values are visible
    barrier(CLK_LOCAL_MEM_FENCE);

    // Loops over the work per thread, and checks whether in bounds
    #pragma unroll
    for (int w=0; w<WPT1; ++w) {
      const int gid = w*get_global_size(0) + get_global_id(0);
      if (gid < m) {

        // The multiply-add function for the main part (divisible by WGS1)
        if (a_rotated == 0) { // Not rotated: A indexed as agm[gid + a_ld*k]
          #pragma unroll
          for (int kl=0; kl<WGS1; ++kl) {
            const int k = kwg + kl;
            real value = agm[gid + a_ld*k + a_offset];
            if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
            MultiplyAdd(acc[w], xlm[kl], value);
          }
        }
        else { // Transposed: A indexed as agm[k + a_ld*gid]
          #pragma unroll
          for (int kl=0; kl<WGS1; ++kl) {
            const int k = kwg + kl;
            real value = agm[k + a_ld*gid + a_offset];
            if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
            MultiplyAdd(acc[w], xlm[kl], value);
          }
        }
      }
    }

    // Synchronizes again before the next iteration overwrites the local-memory tile
    barrier(CLK_LOCAL_MEM_FENCE);
  }

  // Loops over the work per thread for the tail section, and checks whether in bounds
  #pragma unroll
  for (int w=0; w<WPT1; ++w) {
    const int gid = w*get_global_size(0) + get_global_id(0);
    if (gid < m) {

      // The multiply-add function for the remainder part (not divisible by WGS1). The tail is
      // at most WGS1-1 elements, so X is read directly from global memory here.
      if (a_rotated == 0) { // Not rotated
        #pragma unroll
        for (int k=n_floor; k<n; ++k) {
          real value = agm[gid + a_ld*k + a_offset];
          if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
          MultiplyAdd(acc[w], xgm[k*x_inc + x_offset], value);
        }
      }
      else { // Transposed
        #pragma unroll
        for (int k=n_floor; k<n; ++k) {
          real value = agm[k + a_ld*gid + a_offset];
          if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
          MultiplyAdd(acc[w], xgm[k*x_inc + x_offset], value);
        }
      }

      // Stores the final result: y[gid] = alpha * acc + beta * y[gid]
      real yval = ygm[gid*y_inc + y_offset];
      AXPBY(ygm[gid*y_inc + y_offset], alpha, acc[w], beta, yval);
    }
  }
}
// =================================================================================================
// Data-widths for the 'fast' kernel: realVF is a vector of VW2 'real' values, used to load VW2
// consecutive elements (rows) of the matrix A with a single vectorized access.
#if VW2 == 1
typedef real realVF;
#elif VW2 == 2
typedef real2 realVF;
#elif VW2 == 4
typedef real4 realVF;
#elif VW2 == 8
typedef real8 realVF;
#elif VW2 == 16
typedef real16 realVF;
#endif

// Faster version of the kernel, assuming that:
// --> 'm' and 'n' are multiples of WGS2
// --> 'a_offset' is 0
// --> 'a_ld' is a multiple of VW2
// --> 'a_rotated' is 0
// --> 'do_conjugate' is 0
// Under these assumptions the bounds checks, the scalar tail loop, and the conjugation branch
// of the full kernel can all be dropped. The unused arguments (a_rotated, a_offset,
// do_conjugate) are kept so all three Xgemv kernels share the same argument list.
__attribute__((reqd_work_group_size(WGS2, 1, 1)))
__kernel void XgemvFast(const int m, const int n, const real alpha, const real beta,
                        const int a_rotated,
                        const __global realVF* restrict agm, const int a_offset, const int a_ld,
                        const __global real* restrict xgm, const int x_offset, const int x_inc,
                        __global real* ygm, const int y_offset, const int y_inc,
                        const int do_conjugate) {

  // Local memory to stage a WGS2-sized chunk of the vector X
  __local real xlm[WGS2];

  // Initializes the accumulation registers (WPT2 output elements per work-item)
  real acc[WPT2];
  #pragma unroll
  for (int w=0; w<WPT2; ++w) {
    SetToZero(acc[w]);
  }

  // Loops over work-group sized portions of the work
  for (int kwg=0; kwg<n; kwg+=WGS2) {

    // Loads the vector X into local memory (one element per work-item)
    const int lid = get_local_id(0);
    xlm[lid] = xgm[(kwg + lid)*x_inc + x_offset];

    // Synchronizes all threads in a workgroup so the staged X values are visible
    barrier(CLK_LOCAL_MEM_FENCE);

    // The multiply-add function (not rotated). Each iteration handles one column 'k' of A with
    // WPT2/VW2 vector loads per work-item; vector lanes map to consecutive output rows, hence
    // the accumulator index VW2*w + lane.
    #pragma unroll
    for (int kl=0; kl<WGS2; ++kl) {
      const int k = kwg + kl;
      #pragma unroll
      for (int w=0; w<WPT2/VW2; ++w) {
        const int gid = (WPT2/VW2)*get_global_id(0) + w;
        #if VW2 == 1
          MultiplyAdd(acc[VW2*w+0], xlm[kl], agm[gid + (a_ld/VW2)*k]);
        #elif VW2 == 2
          MultiplyAdd(acc[VW2*w+0], xlm[kl], agm[gid + (a_ld/VW2)*k].x);
          MultiplyAdd(acc[VW2*w+1], xlm[kl], agm[gid + (a_ld/VW2)*k].y);
        #elif VW2 == 4
          MultiplyAdd(acc[VW2*w+0], xlm[kl], agm[gid + (a_ld/VW2)*k].x);
          MultiplyAdd(acc[VW2*w+1], xlm[kl], agm[gid + (a_ld/VW2)*k].y);
          MultiplyAdd(acc[VW2*w+2], xlm[kl], agm[gid + (a_ld/VW2)*k].z);
          MultiplyAdd(acc[VW2*w+3], xlm[kl], agm[gid + (a_ld/VW2)*k].w);
        #elif VW2 == 8
          MultiplyAdd(acc[VW2*w+0], xlm[kl], agm[gid + (a_ld/VW2)*k].s0);
          MultiplyAdd(acc[VW2*w+1], xlm[kl], agm[gid + (a_ld/VW2)*k].s1);
          MultiplyAdd(acc[VW2*w+2], xlm[kl], agm[gid + (a_ld/VW2)*k].s2);
          MultiplyAdd(acc[VW2*w+3], xlm[kl], agm[gid + (a_ld/VW2)*k].s3);
          MultiplyAdd(acc[VW2*w+4], xlm[kl], agm[gid + (a_ld/VW2)*k].s4);
          MultiplyAdd(acc[VW2*w+5], xlm[kl], agm[gid + (a_ld/VW2)*k].s5);
          MultiplyAdd(acc[VW2*w+6], xlm[kl], agm[gid + (a_ld/VW2)*k].s6);
          MultiplyAdd(acc[VW2*w+7], xlm[kl], agm[gid + (a_ld/VW2)*k].s7);
        #elif VW2 == 16
          MultiplyAdd(acc[VW2*w+0], xlm[kl], agm[gid + (a_ld/VW2)*k].s0);
          MultiplyAdd(acc[VW2*w+1], xlm[kl], agm[gid + (a_ld/VW2)*k].s1);
          MultiplyAdd(acc[VW2*w+2], xlm[kl], agm[gid + (a_ld/VW2)*k].s2);
          MultiplyAdd(acc[VW2*w+3], xlm[kl], agm[gid + (a_ld/VW2)*k].s3);
          MultiplyAdd(acc[VW2*w+4], xlm[kl], agm[gid + (a_ld/VW2)*k].s4);
          MultiplyAdd(acc[VW2*w+5], xlm[kl], agm[gid + (a_ld/VW2)*k].s5);
          MultiplyAdd(acc[VW2*w+6], xlm[kl], agm[gid + (a_ld/VW2)*k].s6);
          MultiplyAdd(acc[VW2*w+7], xlm[kl], agm[gid + (a_ld/VW2)*k].s7);
          MultiplyAdd(acc[VW2*w+8], xlm[kl], agm[gid + (a_ld/VW2)*k].s8);
          MultiplyAdd(acc[VW2*w+9], xlm[kl], agm[gid + (a_ld/VW2)*k].s9);
          MultiplyAdd(acc[VW2*w+10], xlm[kl], agm[gid + (a_ld/VW2)*k].sA);
          MultiplyAdd(acc[VW2*w+11], xlm[kl], agm[gid + (a_ld/VW2)*k].sB);
          MultiplyAdd(acc[VW2*w+12], xlm[kl], agm[gid + (a_ld/VW2)*k].sC);
          MultiplyAdd(acc[VW2*w+13], xlm[kl], agm[gid + (a_ld/VW2)*k].sD);
          MultiplyAdd(acc[VW2*w+14], xlm[kl], agm[gid + (a_ld/VW2)*k].sE);
          MultiplyAdd(acc[VW2*w+15], xlm[kl], agm[gid + (a_ld/VW2)*k].sF);
        #endif
      }
    }

    // Synchronizes again before the next iteration overwrites the local-memory tile
    barrier(CLK_LOCAL_MEM_FENCE);
  }

  // Stores the final result: y = alpha * acc + beta * y. No bounds check is needed since the
  // host only selects this kernel when 'm' is a multiple of WGS2*WPT2.
  #pragma unroll
  for (int w=0; w<WPT2; ++w) {
    const int gid = WPT2*get_global_id(0) + w;
    real yval = ygm[gid*y_inc + y_offset];
    AXPBY(ygm[gid*y_inc + y_offset], alpha, acc[w], beta, yval);
  }
}
// =================================================================================================
// Data-widths for the 'fast' kernel with rotated matrix: realVFR is a vector of VW3 'real'
// values, used to load VW3 consecutive elements of a matrix row with a single access.
#if VW3 == 1
typedef real realVFR;
#elif VW3 == 2
typedef real2 realVFR;
#elif VW3 == 4
typedef real4 realVFR;
#elif VW3 == 8
typedef real8 realVFR;
#elif VW3 == 16
typedef real16 realVFR;
#endif

// Faster version of the kernel, assuming that:
// --> 'm' and 'n' are multiples of WGS3
// --> 'a_offset' is 0
// --> 'a_ld' is a multiple of VW3
// --> 'a_rotated' is 1
// --> 'do_conjugate' is 0
// Under these assumptions the bounds checks, the tail loop, and the conjugation branch of the
// full kernel can all be dropped, and each work-item reads its row of A through vector loads of
// width VW3. The unused arguments are kept so all three Xgemv kernels share one argument list.
__attribute__((reqd_work_group_size(WGS3, 1, 1)))
__kernel void XgemvFastRot(const int m, const int n, const real alpha, const real beta,
                           const int a_rotated,
                           const __global realVFR* restrict agm, const int a_offset, const int a_ld,
                           const __global real* restrict xgm, const int x_offset, const int x_inc,
                           __global real* ygm, const int y_offset, const int y_inc,
                           const int do_conjugate) {

  // Local memory to stage a WGS3-sized chunk of the vector X
  __local real xlm[WGS3];

  // Initializes the accumulation registers (WPT3 output elements per work-item)
  real acc[WPT3];
  #pragma unroll
  for (int w=0; w<WPT3; ++w) {
    SetToZero(acc[w]);
  }

  // Loops over work-group sized portions of the work
  for (int kwg=0; kwg<n; kwg+=WGS3) {

    // Loads the vector X into local memory (one element per work-item)
    const int lid = get_local_id(0);
    xlm[lid] = xgm[(kwg + lid)*x_inc + x_offset];

    // Synchronizes all threads in a workgroup so the staged X values are visible
    barrier(CLK_LOCAL_MEM_FENCE);

    // The multiply-add function (rotated). 'agm' is indexed in units of realVFR, so 'k' counts
    // vector-sized steps along a row and each load is combined with VW3 consecutive X elements.
    #pragma unroll
    for (int kl=0; kl<WGS3/VW3; ++kl) {
      const int k = (kwg/VW3) + kl;
      #pragma unroll
      for (int w=0; w<WPT3; ++w) {
        const int gid = WPT3*get_global_id(0) + w;
        realVFR avec = agm[k + (a_ld/VW3)*gid];
        #if VW3 == 1
          MultiplyAdd(acc[w], xlm[VW3*kl+0], avec);
        #elif VW3 == 2
          MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.x);
          MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.y);
        #elif VW3 == 4
          MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.x);
          MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.y);
          MultiplyAdd(acc[w], xlm[VW3*kl+2], avec.z);
          MultiplyAdd(acc[w], xlm[VW3*kl+3], avec.w);
        #elif VW3 == 8
          MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.s0);
          MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.s1);
          MultiplyAdd(acc[w], xlm[VW3*kl+2], avec.s2);
          MultiplyAdd(acc[w], xlm[VW3*kl+3], avec.s3);
          MultiplyAdd(acc[w], xlm[VW3*kl+4], avec.s4);
          MultiplyAdd(acc[w], xlm[VW3*kl+5], avec.s5);
          MultiplyAdd(acc[w], xlm[VW3*kl+6], avec.s6);
          MultiplyAdd(acc[w], xlm[VW3*kl+7], avec.s7);
        #elif VW3 == 16
          MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.s0);
          MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.s1);
          MultiplyAdd(acc[w], xlm[VW3*kl+2], avec.s2);
          MultiplyAdd(acc[w], xlm[VW3*kl+3], avec.s3);
          MultiplyAdd(acc[w], xlm[VW3*kl+4], avec.s4);
          MultiplyAdd(acc[w], xlm[VW3*kl+5], avec.s5);
          MultiplyAdd(acc[w], xlm[VW3*kl+6], avec.s6);
          MultiplyAdd(acc[w], xlm[VW3*kl+7], avec.s7);
          MultiplyAdd(acc[w], xlm[VW3*kl+8], avec.s8);
          MultiplyAdd(acc[w], xlm[VW3*kl+9], avec.s9);
          MultiplyAdd(acc[w], xlm[VW3*kl+10], avec.sA);
          MultiplyAdd(acc[w], xlm[VW3*kl+11], avec.sB);
          MultiplyAdd(acc[w], xlm[VW3*kl+12], avec.sC);
          MultiplyAdd(acc[w], xlm[VW3*kl+13], avec.sD);
          MultiplyAdd(acc[w], xlm[VW3*kl+14], avec.sE);
          MultiplyAdd(acc[w], xlm[VW3*kl+15], avec.sF);
        #endif
      }
    }

    // Synchronizes again before the next iteration overwrites the local-memory tile
    barrier(CLK_LOCAL_MEM_FENCE);
  }

  // Stores the final result: y = alpha * acc + beta * y. No bounds check is needed since the
  // host only selects this kernel when 'm' is a multiple of WGS3*WPT3.
  #pragma unroll
  for (int w=0; w<WPT3; ++w) {
    const int gid = WPT3*get_global_id(0) + w;
    real yval = ygm[gid*y_inc + y_offset];
    AXPBY(ygm[gid*y_inc + y_offset], alpha, acc[w], beta, yval);
  }
}
// =================================================================================================
// End of the C++11 raw string literal
)";
// =================================================================================================

View file

@ -209,11 +209,11 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
const size_t dest_one, const size_t dest_two,
const size_t dest_ld, const size_t dest_offset,
const Buffer &dest,
const bool do_transpose, const bool pad,
const Program &program) {
const bool do_transpose, const bool do_conjugate,
const bool pad, const Program &program) {
// Determines whether or not the fast-version could potentially be used
auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) &&
auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) &&
(src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld);
// Determines the right kernel
@ -264,6 +264,9 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
kernel.SetArgument(7, static_cast<int>(dest_ld));
kernel.SetArgument(8, static_cast<int>(dest_offset));
kernel.SetArgument(9, dest());
if (pad) {
kernel.SetArgument(10, static_cast<int>(do_conjugate));
}
}
// Launches the kernel and returns the error code. Uses global and local thread sizes based on
@ -305,7 +308,7 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
// Queries the cache and retrieves a matching program. Assumes that the match is available, throws
// otherwise.
Program Routine::GetProgramFromCache() const {
const Program& Routine::GetProgramFromCache() const {
for (auto &cached_program: program_cache_) {
if (cached_program.MatchInCache(device_name_, precision_, routines_)) {
return cached_program.program;

View file

@ -60,7 +60,7 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
// Retrieves the Xaxpy kernel from the compiled binary
try {
auto program = GetProgramFromCache();
auto& program = GetProgramFromCache();
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
@ -88,8 +88,8 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
status = RunKernel(kernel, global, local);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]);
auto global = std::vector<size_t>{CeilDiv(n_ceiled, db_["WPT"])};
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, global, local);
}

View file

@ -63,6 +63,10 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
auto b_do_transpose = !b_rotated;
auto c_do_transpose = c_rotated;
// In case of complex data-types, the transpose can also become a conjugate transpose
auto a_conjugate = (a_transpose == Transpose::kConjugate);
auto b_conjugate = (b_transpose == Transpose::kConjugate);
// Computes the first and second dimensions of the 3 matrices taking into account whether the
// matrices are rotated or not
auto a_one = (a_rotated) ? k : m;
@ -98,24 +102,24 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, m_ceiled*n_ceiled*sizeof(T));
// Loads the program from the database
auto program = GetProgramFromCache();
auto& program = GetProgramFromCache();
// Runs the pre-processing kernels. This transposes the matrices, but also pads zeros to fill
// them up until they reach a certain multiple of size (kernel parameter dependent).
status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
m_ceiled, k_ceiled, m_ceiled, 0, temp_a,
a_do_transpose, true, program);
a_do_transpose, a_conjugate, true, program);
if (ErrorIn(status)) { return status; }
status = PadCopyTransposeMatrix(b_one, b_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, temp_b,
b_do_transpose, true, program);
b_do_transpose, b_conjugate, true, program);
if (ErrorIn(status)) { return status; }
// Only necessary for matrix C if it used both as input and output
if (beta != static_cast<T>(0)) {
status = PadCopyTransposeMatrix(c_one, c_two, c_ld, c_offset, c_buffer,
m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
c_do_transpose, true, program);
c_do_transpose, false, true, program);
if (ErrorIn(status)) { return status; }
}
@ -147,7 +151,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
// Runs the post-processing kernel
status = PadCopyTransposeMatrix(m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
c_one, c_two, c_ld, c_offset, c_buffer,
c_do_transpose, false, program);
c_do_transpose, false, false, program);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation

146
src/routines/xgemv.cc Normal file
View file

@ -0,0 +1,146 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xgemv class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/xgemv.h"
#include <string>
#include <vector>
namespace clblast {
// =================================================================================================
// Per-type precision identifiers: these tell the base Routine class which database entries and
// kernel compilation settings to use for each of the four supported data-types.
template <> const Precision Xgemv<float>::precision_ = Precision::kSingle;
template <> const Precision Xgemv<double>::precision_ = Precision::kDouble;
template <> const Precision Xgemv<float2>::precision_ = Precision::kComplexSingle;
template <> const Precision Xgemv<double2>::precision_ = Precision::kComplexDouble;

// =================================================================================================

// Constructor: forwards to the base class constructor, registering the "Xgemv" routine name
// together with this instantiation's precision.
template <typename T>
Xgemv<T>::Xgemv(CommandQueue &queue, Event &event):
    Routine(queue, event, {"Xgemv"}, precision_) {
}
// =================================================================================================
// The main routine: computes y = alpha * op(A) * x + beta * y, where op is a no-op, a transpose,
// or a conjugate transpose. Validates the arguments, selects between the generic "Xgemv" kernel
// and the two specialized fast kernels, and launches the chosen kernel.
template <typename T>
StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
                            const size_t m, const size_t n,
                            const T alpha,
                            const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
                            const Buffer &x_buffer, const size_t x_offset, const size_t x_inc,
                            const T beta,
                            const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) {

  // Makes sure all dimensions are larger than zero
  if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; }

  // Computes whether or not the matrix has an alternative layout (row or column-major);
  // a_one/a_two are the stored dimensions of A as laid out in memory.
  auto a_altlayout = (layout == Layout::kRowMajor);
  auto a_one = (a_altlayout) ? n : m;
  auto a_two = (a_altlayout) ? m : n;

  // Swap m and n if the matrix is transposed: m_real is the length of the output vector y,
  // n_real the length of the input vector x.
  auto a_transposed = (a_transpose != Transpose::kNo);
  auto m_real = (a_transposed) ? n : m;
  auto n_real = (a_transposed) ? m : n;

  // Determines whether the kernel needs to perform rotated access ('^' is the XOR operator):
  // transposition and row-major storage cancel each other out.
  auto a_rotated = a_transposed ^ a_altlayout;

  // In case of complex data-types, the transpose can also become a conjugate transpose
  auto a_conjugate = (a_transpose == Transpose::kConjugate);

  // Tests the matrix and the vectors for validity
  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
  if (ErrorIn(status)) { return status; }
  status = TestVectorX(n_real, x_buffer, x_offset, x_inc, sizeof(T));
  if (ErrorIn(status)) { return status; }
  status = TestVectorY(m_real, y_buffer, y_offset, y_inc, sizeof(T));
  if (ErrorIn(status)) { return status; }

  // Determines whether or not the fast-version can be used.
  // NOTE(review): the divisibility checks below use 'm' and 'n' rather than 'm_real'/'n_real';
  // confirm this is intended for the transposed cases, where the two pairs differ.
  bool use_fast_kernel = (a_offset == 0) && (a_rotated == 0) && (a_conjugate == 0) &&
                         IsMultiple(m, db_["WGS2"]*db_["WPT2"]) &&
                         IsMultiple(n, db_["WGS2"]) &&
                         IsMultiple(a_ld, db_["VW2"]);
  bool use_fast_kernel_rot = (a_offset == 0) && (a_rotated == 1) && (a_conjugate == 0) &&
                             IsMultiple(m, db_["WGS3"]*db_["WPT3"]) &&
                             IsMultiple(n, db_["WGS3"]) &&
                             IsMultiple(a_ld, db_["VW3"]);

  // If possible, run the fast-version (rotated or non-rotated) of the kernel; defaults to the
  // generic kernel with its launch configuration rounded up to a multiple of WGS1*WPT1.
  auto kernel_name = "Xgemv";
  auto m_ceiled = Ceil(m_real, db_["WGS1"]*db_["WPT1"]);
  auto global_size = m_ceiled / db_["WPT1"];
  auto local_size = db_["WGS1"];
  if (use_fast_kernel) {
    kernel_name = "XgemvFast";
    global_size = m_real / db_["WPT2"];
    local_size = db_["WGS2"];
  }
  if (use_fast_kernel_rot) {
    kernel_name = "XgemvFastRot";
    global_size = m_real / db_["WPT3"];
    local_size = db_["WGS3"];
  }

  // Retrieves the Xgemv kernel from the compiled binary
  try {
    auto& program = GetProgramFromCache();
    auto kernel = Kernel(program, kernel_name);

    // Sets the kernel arguments (all three kernel variants share the same argument list)
    kernel.SetArgument(0, static_cast<int>(m_real));
    kernel.SetArgument(1, static_cast<int>(n_real));
    kernel.SetArgument(2, alpha);
    kernel.SetArgument(3, beta);
    kernel.SetArgument(4, static_cast<int>(a_rotated));
    kernel.SetArgument(5, a_buffer());
    kernel.SetArgument(6, static_cast<int>(a_offset));
    kernel.SetArgument(7, static_cast<int>(a_ld));
    kernel.SetArgument(8, x_buffer());
    kernel.SetArgument(9, static_cast<int>(x_offset));
    kernel.SetArgument(10, static_cast<int>(x_inc));
    kernel.SetArgument(11, y_buffer());
    kernel.SetArgument(12, static_cast<int>(y_offset));
    kernel.SetArgument(13, static_cast<int>(y_inc));
    kernel.SetArgument(14, static_cast<int>(a_conjugate));

    // Launches the kernel
    auto global = std::vector<size_t>{global_size};
    auto local = std::vector<size_t>{local_size};
    status = RunKernel(kernel, global, local);
    if (ErrorIn(status)) { return status; }

    // Waits for all kernels to finish
    queue_.Finish();

    // Successfully finished the computation
    return StatusCode::kSuccess;
  } catch (...) { return StatusCode::kInvalidKernel; }
}
// =================================================================================================
// Compiles the templated class for the four supported data-types (these explicit instantiations
// are the ones the library provides to its callers).
template class Xgemv<float>;
template class Xgemv<double>;
template class Xgemv<float2>;
template class Xgemv<double2>;
// =================================================================================================
} // namespace clblast

View file

@ -61,7 +61,7 @@ StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle
// Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm
// routine afterwards
try {
auto program = GetProgramFromCache();
auto& program = GetProgramFromCache();
auto kernel = Kernel(program, kernel_name);
// Sets the arguments for the symmetric-to-squared kernel

View file

@ -63,6 +63,7 @@ void PadTune(const Arguments<T> &args,
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentScalar(0);
tuner.AddArgumentOutput(b_mat);
tuner.AddArgumentScalar(0);
}
// =================================================================================================

View file

@ -68,6 +68,7 @@ void PadTransposeTune(const Arguments<T> &args,
tuner.AddArgumentScalar(static_cast<int>(args.n));
tuner.AddArgumentScalar(0);
tuner.AddArgumentOutput(b_mat);
tuner.AddArgumentScalar(0);
}
// =================================================================================================

View file

@ -72,6 +72,69 @@ template void TunerXY<double2>(int, char**, const Tuner2<double2>&);
// =================================================================================================
// Sets-up and runs the tuner for matrix-vector-vector (A-X-Y) routines such as Xgemv. It parses
// the platform/device and routine arguments from the command-line, generates random input data,
// and invokes the kernel-specific 'tune_function' once for every variation of the kernel. For
// each variation it prints the tuning results plus the bandwidth (GB/s) and compute (GFLOPS)
// performance of the best-found configuration.
template <typename T>
void TunerAXY(int argc, char* argv[], const size_t num_variations,
              const Tuner3V<T> &tune_function) {

  // Collects the command-line options (building up the help message as a side-effect)
  auto options_help = std::string{"* Options given/available:\n"};
  auto tuner_args = Arguments<T>{};
  tuner_args.platform_id = GetArgument(argc, argv, options_help, kArgPlatform, size_t{0});
  tuner_args.device_id = GetArgument(argc, argv, options_help, kArgDevice, size_t{0});
  tuner_args.precision = GetArgument(argc, argv, options_help, kArgPrecision, Precision::kSingle);
  tuner_args.m = GetArgument(argc, argv, options_help, kArgM, size_t{2048});
  tuner_args.n = GetArgument(argc, argv, options_help, kArgN, size_t{2048});
  tuner_args.alpha = GetArgument(argc, argv, options_help, kArgAlpha, GetScalar<T>());
  tuner_args.beta = GetArgument(argc, argv, options_help, kArgBeta, GetScalar<T>());
  fprintf(stdout, "%s\n", options_help.c_str());

  // Creates the matrix and the two vectors, filled with pseudo-random data
  auto matrix_a = std::vector<T>(tuner_args.m * tuner_args.n);
  auto vector_x = std::vector<T>(tuner_args.n);
  auto vector_y = std::vector<T>(tuner_args.m);
  PopulateVector(matrix_a);
  PopulateVector(vector_x);
  PopulateVector(vector_y);

  // Tunes each variation of the kernel in turn
  for (auto variation=size_t{1}; variation<=num_variations; ++variation) {

    // Creates a fresh tuner for the chosen platform/device, exploring all parameter combinations
    cltune::Tuner tuner(tuner_args.platform_id, tuner_args.device_id);
    tuner.UseFullSearch();

    // Delegates the kernel-specific set-up to the caller and starts the tuning process
    tune_function(tuner_args, variation, matrix_a, vector_x, vector_y, tuner);
    tuner.Tune();

    // Reports the tuning results
    auto best_time_ms = tuner.PrintToScreen();
    tuner.PrintFormatted();

    // Derives and prints the bandwidth and compute performance of the best-found configuration
    const auto mega_bytes = ((tuner_args.m*tuner_args.n + 2*tuner_args.m + tuner_args.n)*GetBytes(tuner_args.precision)) * 1.0e-6;
    const auto mega_flops = (2*tuner_args.m*tuner_args.n) * 1.0e-6;
    if (best_time_ms != 0.0) {
      printf("[ -------> ] %.1lf ms or %.1lf GB/s or %.1lf GFLOPS\n",
             best_time_ms, mega_bytes/best_time_ms, mega_flops/best_time_ms);
    }
  }
}
// Compiles the above function for the four supported data-types, matching the precisions the
// library's tuners can be invoked with.
template void TunerAXY<float>(int, char**, const size_t, const Tuner3V<float>&);
template void TunerAXY<double>(int, char**, const size_t, const Tuner3V<double>&);
template void TunerAXY<float2>(int, char**, const size_t, const Tuner3V<float2>&);
template void TunerAXY<double2>(int, char**, const size_t, const Tuner3V<double2>&);
// =================================================================================================
// Function to get command-line argument, set-up the input buffers, configure the tuner, and collect
// the results. Used for matrix-matrix routines.
template <typename T>

119
src/tuning/xgemv.cc Normal file
View file

@ -0,0 +1,119 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements an auto-tuner to tune the Xgemv OpenCL kernel. It uses the CLTune library.
// Three variations of the kernel are tuned:
// 1: The full version of the kernel
// 2: The fast version for non-transposed matrices
// 3: The fast version for transposed matrices
//
// =================================================================================================
#include <string>
#include <vector>
#include <stdexcept>
#include "internal/utilities.h"
#include "internal/tuning.h"
namespace clblast {
// =================================================================================================
// The Xgemv auto-tuner
// The Xgemv auto-tuner. Configures a CLTune search for one of the three kernel variations
// (1: generic "Xgemv", 2: fast non-transposed "XgemvFast", 3: fast transposed "XgemvFastRot"),
// defines the tuning parameter space with its constraints, and binds the kernel arguments.
template <typename T>
void XgemvTune(const Arguments<T> &args, const size_t variation,
               const std::vector<T> &a_mat, const std::vector<T> &x_vec, std::vector<T> &y_vec,
               cltune::Tuner &tuner) {

  // Sets the kernel name and the layout argument (only variation 3 uses the rotated layout)
  auto kernel_name = (variation == 1) ? "Xgemv" : ((variation == 2) ? "XgemvFast" : "XgemvFastRot");
  auto a_rotated = (variation == 3) ? 1 : 0;

  // This points to the Xgemv kernel as found in the CLBlast library. NOTE(review): the #include
  // lines rely on the .opencl files expanding into C++ string literals -- presumably via a
  // raw-string wrapper in those files; verify before changing the kernel sources.
  std::string common_source =
  #include "../src/kernels/common.opencl"
  std::string kernel_source =
  #include "../src/kernels/xgemv.opencl"
  auto sources = common_source + kernel_source;
  auto id = tuner.AddKernelFromString(sources, kernel_name, {args.m}, {1});
  tuner.SetReferenceFromString(sources, "Xgemv", {args.m}, {64});

  // Helper for the constraints: requires v[0] to be a multiple of v[1]
  auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); };

  // Sets the tunable parameters, their possible values, the adjusted thread sizes, and
  // constraints. Each variation tunes a work-group size (WGS*) and a work-per-thread factor
  // (WPT*); the fast variations additionally tune a vector width (VW*).
  if (variation == 1) {
    tuner.AddParameter(id, "WGS1", {64, 128, 256, 512, 1024, 1536, 2048});
    tuner.AddParameter(id, "WPT1", {1, 2, 4, 8});
    tuner.MulLocalSize(id, {"WGS1"});
    tuner.DivGlobalSize(id, {"WPT1"});
  }
  else if (variation == 2) {
    tuner.AddParameter(id, "WGS2", {64, 128, 256, 512, 1024, 1536, 2048});
    tuner.AddParameter(id, "WPT2", {1, 2, 4, 8});
    tuner.AddParameter(id, "VW2", {1, 2, 4, 8});
    tuner.MulLocalSize(id, {"WGS2"});
    tuner.DivGlobalSize(id, {"WPT2"});
    tuner.AddConstraint(id, MultipleOfX, {"WPT2", "VW2"});
  }
  else if (variation == 3) {
    tuner.AddParameter(id, "WGS3", {64, 128, 256, 512, 1024, 1536, 2048});
    tuner.AddParameter(id, "WPT3", {1, 2, 4, 8});
    tuner.AddParameter(id, "VW3", {1, 2, 4, 8});
    tuner.MulLocalSize(id, {"WGS3"});
    tuner.DivGlobalSize(id, {"WPT3"});
    tuner.AddConstraint(id, MultipleOfX, {"WGS3", "VW3"});
  }

  // Tests for a specific precision
  tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)});
  tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision));

  // Sets the function's arguments. NOTE(review): this order must match the parameter list of
  // the Xgemv kernel in xgemv.opencl -- confirm when changing either side.
  tuner.AddArgumentScalar(static_cast<int>(args.m));
  tuner.AddArgumentScalar(static_cast<int>(args.n));
  tuner.AddArgumentScalar(args.alpha);
  tuner.AddArgumentScalar(args.beta);
  tuner.AddArgumentScalar(static_cast<int>(a_rotated));
  tuner.AddArgumentInput(a_mat);
  tuner.AddArgumentScalar(0);                         // presumably the offset of A
  tuner.AddArgumentScalar(static_cast<int>(args.m));  // presumably the leading dimension of A
  tuner.AddArgumentInput(x_vec);
  tuner.AddArgumentScalar(0);                         // presumably the offset of x
  tuner.AddArgumentScalar(1);                         // presumably the increment of x
  tuner.AddArgumentOutput(y_vec);
  tuner.AddArgumentScalar(0);                         // presumably the offset of y
  tuner.AddArgumentScalar(1);                         // presumably the increment of y
  tuner.AddArgumentScalar(0); // Conjugate transpose (disabled for tuning)
}
// =================================================================================================
// Main function which calls the common client code with the routine-specific function as argument.
void TunerXgemv(int argc, char *argv[]) {
auto num_variations = size_t{3};
switch(GetPrecision(argc, argv)) {
case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case Precision::kSingle: TunerAXY<float>(argc, argv, num_variations, XgemvTune<float>); break;
case Precision::kDouble: TunerAXY<double>(argc, argv, num_variations, XgemvTune<double>); break;
case Precision::kComplexSingle: TunerAXY<float2>(argc, argv, num_variations, XgemvTune<float2>); break;
case Precision::kComplexDouble: TunerAXY<double2>(argc, argv, num_variations, XgemvTune<double2>); break;
}
}
// =================================================================================================
} // namespace clblast
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  // Forwards the command-line arguments to the clblast-namespaced tuner entry point, which
  // selects the precision and runs the three kernel variations.
  clblast::TunerXgemv(argc, argv);
  return 0;
}
// =================================================================================================

View file

@ -159,16 +159,21 @@ Precision GetPrecision(const int argc, char *argv[]) {
bool CheckArgument(const int argc, char *argv[], std::string &help,
const std::string &option) {
// Updates the help message
help += " -"+option+"\n";
// Parses the argument. Note that this supports both the given option (e.g. -device) and one with
// an extra dash in front (e.g. --device).
auto return_value = false;
for (int c=0; c<argc; ++c) {
auto item = std::string{argv[c]};
if (item.compare("-"+option) == 0 || item.compare("--"+option) == 0) { ++c; return true; }
if (item.compare("-"+option) == 0 || item.compare("--"+option) == 0) {
++c;
return_value = true;
}
}
return false;
// Updates the help message and returns
help += " -"+option+" ";
help += (return_value) ? "[true]\n" : "[false]\n";
return return_value;
}
// =================================================================================================
@ -182,8 +187,10 @@ unsigned int GetRandomSeed() {
// Create a random number generator and populates a vector with samples from a random distribution
template <typename T>
void PopulateVector(std::vector<T> &vector) {
auto lower_limit = static_cast<T>(kTestDataLowerLimit);
auto upper_limit = static_cast<T>(kTestDataUpperLimit);
std::mt19937 mt(GetRandomSeed());
std::uniform_real_distribution<T> dist(static_cast<T>(-2.0), static_cast<T>(2.0));
std::uniform_real_distribution<T> dist(lower_limit, upper_limit);
for (auto &element: vector) { element = dist(mt); }
}
template void PopulateVector<float>(std::vector<float>&);
@ -192,14 +199,18 @@ template void PopulateVector<double>(std::vector<double>&);
// Specialized versions of the above for complex data-types
template <>
void PopulateVector(std::vector<float2> &vector) {
auto lower_limit = static_cast<float>(kTestDataLowerLimit);
auto upper_limit = static_cast<float>(kTestDataUpperLimit);
std::mt19937 mt(GetRandomSeed());
std::uniform_real_distribution<float> dist(-2.0f, 2.0f);
std::uniform_real_distribution<float> dist(lower_limit, upper_limit);
for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); }
}
template <>
void PopulateVector(std::vector<double2> &vector) {
auto lower_limit = static_cast<double>(kTestDataLowerLimit);
auto upper_limit = static_cast<double>(kTestDataUpperLimit);
std::mt19937 mt(GetRandomSeed());
std::uniform_real_distribution<double> dist(-2.0, 2.0);
std::uniform_real_distribution<double> dist(lower_limit, upper_limit);
for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); }
}

View file

@ -46,19 +46,13 @@ void XaxpyTest(int argc, char *argv[], const bool silent, const std::string &nam
return static_cast<StatusCode>(status);
};
// Selects the platform and device on which to test (command-line options)
auto help = std::string{"Options given/available:\n"};
const auto platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0});
const auto device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0});
if (!silent) { fprintf(stdout, "\n* %s\n", help.c_str()); }
// Initializes the other arguments relevant for this routine
// Initializes the arguments relevant for this routine
auto args = Arguments<T>{};
const auto options = std::vector<std::string>{kArgN, kArgXInc, kArgYInc,
kArgXOffset, kArgYOffset, kArgAlpha};
// Creates a tester
TestXY<T> tester{platform_id, device_id, name, options, clblast_lambda, clblas_lambda};
TestXY<T> tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda};
// Runs the tests
const auto case_name = "default";

View file

@ -56,13 +56,7 @@ void XgemmTest(int argc, char *argv[], const bool silent, const std::string &nam
return static_cast<StatusCode>(status);
};
// Selects the platform and device on which to test (command-line options)
auto help = std::string{"Options given/available:\n"};
const auto platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0});
const auto device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0});
if (!silent) { fprintf(stdout, "\n* %s\n", help.c_str()); }
// Initializes the other arguments relevant for this routine
// Initializes the arguments relevant for this routine
auto args = Arguments<T>{};
const auto options = std::vector<std::string>{kArgM, kArgN, kArgK, kArgLayout,
kArgATransp, kArgBTransp,
@ -70,14 +64,14 @@ void XgemmTest(int argc, char *argv[], const bool silent, const std::string &nam
kArgAOffset, kArgBOffset, kArgCOffset};
// Creates a tester
TestABC<T> tester{platform_id, device_id, name, options, clblast_lambda, clblas_lambda};
TestABC<T> tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda};
// Loops over the test-cases from a data-layout point of view
for (auto &layout: {Layout::kRowMajor, Layout::kColMajor}) {
for (auto &layout: tester.kLayouts) {
args.layout = layout;
for (auto &a_transpose: {Transpose::kNo, Transpose::kYes}) {
for (auto &a_transpose: tester.kTransposes) {
args.a_transpose = a_transpose;
for (auto &b_transpose: {Transpose::kNo, Transpose::kYes}) {
for (auto &b_transpose: tester.kTransposes) {
args.b_transpose = b_transpose;
const auto case_name = ToString(layout)+" "+ToString(a_transpose)+" "+ToString(b_transpose);
@ -96,8 +90,8 @@ void XgemmTest(int argc, char *argv[], const bool silent, const std::string &nam
int main(int argc, char *argv[]) {
clblast::XgemmTest<float>(argc, argv, false, "SGEMM");
clblast::XgemmTest<double>(argc, argv, true, "DGEMM");
//clblast::XgemmTest<float2>(argc, argv, true, "CGEMM");
//clblast::XgemmTest<double2>(argc, argv, true, "ZGEMM");
clblast::XgemmTest<clblast::float2>(argc, argv, true, "CGEMM");
clblast::XgemmTest<clblast::double2>(argc, argv, true, "ZGEMM");
return 0;
}

View file

@ -0,0 +1,88 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the tests for the Xgemv routine. It is based on the TestAXY class.
//
// =================================================================================================
#include "wrapper_clblas.h"
#include "correctness/testaxy.h"
namespace clblast {
// =================================================================================================
// The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison.
// Runs the GEMV correctness tests: wraps the CLBlast and clBLAS implementations in a common
// routine signature and hands them to the TestAXY tester, once per layout/transpose case.
template <typename T>
void XgemvTest(int argc, char *argv[], const bool silent, const std::string &name) {

  // Adapter invoking the CLBlast GEMV implementation
  auto run_clblast = [](const Arguments<T> &args,
                        const Buffer &a_mat, const Buffer &x_vec, const Buffer &y_vec,
                        CommandQueue &queue) -> StatusCode {
    auto raw_queue = queue();
    auto event = cl_event{};
    return Gemv(args.layout, args.a_transpose, args.m, args.n, args.alpha,
                a_mat(), args.a_offset, args.a_ld,
                x_vec(), args.x_offset, args.x_inc, args.beta,
                y_vec(), args.y_offset, args.y_inc,
                &raw_queue, &event);
  };

  // Adapter invoking the reference clBLAS GEMV implementation (for comparison)
  auto run_clblas = [](const Arguments<T> &args,
                       const Buffer &a_mat, const Buffer &x_vec, const Buffer &y_vec,
                       CommandQueue &queue) -> StatusCode {
    auto raw_queue = queue();
    auto event = cl_event{};
    return static_cast<StatusCode>(
        clblasXgemv(static_cast<clblasOrder>(args.layout),
                    static_cast<clblasTranspose>(args.a_transpose),
                    args.m, args.n, args.alpha,
                    a_mat(), args.a_offset, args.a_ld,
                    x_vec(), args.x_offset, args.x_inc, args.beta,
                    y_vec(), args.y_offset, args.y_inc,
                    1, &raw_queue, 0, nullptr, &event));
  };

  // The command-line options relevant for this routine
  const auto options = std::vector<std::string>{kArgM, kArgN, kArgLayout, kArgATransp,
                                                kArgALeadDim, kArgXInc, kArgYInc,
                                                kArgAOffset, kArgXOffset, kArgYOffset};
  auto args = Arguments<T>{};

  // Creates the tester and loops over the test-cases from a data-layout point of view
  TestAXY<T> tester{argc, argv, silent, name, options, run_clblast, run_clblas};
  for (auto &layout: tester.kLayouts) {
    args.layout = layout;
    for (auto &a_transpose: tester.kTransposes) {
      args.a_transpose = a_transpose;
      const auto case_name = ToString(layout)+" "+ToString(a_transpose);

      // Runs the tests
      tester.TestRegular(args, case_name);
      tester.TestInvalidBufferSizes(args, case_name);
    }
  }
}
// =================================================================================================
} // namespace clblast
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
  // Runs the GEMV correctness tests for all four precisions. Only the first invocation
  // prints the command-line help (silent == false); the rest suppress it.
  clblast::XgemvTest<float>(argc, argv, false, "SGEMV");
  clblast::XgemvTest<double>(argc, argv, true, "DGEMV");
  clblast::XgemvTest<clblast::float2>(argc, argv, true, "CGEMV");
  clblast::XgemvTest<clblast::double2>(argc, argv, true, "ZGEMV");
  return 0;
}
// =================================================================================================

View file

@ -56,13 +56,7 @@ void XsymmTest(int argc, char *argv[], const bool silent, const std::string &nam
return static_cast<StatusCode>(status);
};
// Selects the platform and device on which to test (command-line options)
auto help = std::string{"Options given/available:\n"};
const auto platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0});
const auto device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0});
if (!silent) { fprintf(stdout, "\n* %s\n", help.c_str()); }
// Initializes the other arguments relevant for this routine
// Initializes the arguments relevant for this routine
auto args = Arguments<T>{};
const auto options = std::vector<std::string>{kArgM, kArgN, kArgLayout,
kArgSide, kArgTriangle,
@ -70,10 +64,10 @@ void XsymmTest(int argc, char *argv[], const bool silent, const std::string &nam
kArgAOffset, kArgBOffset, kArgCOffset};
// Creates a tester
TestABC<T> tester{platform_id, device_id, name, options, clblast_lambda, clblas_lambda};
TestABC<T> tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda};
// Loops over the test-cases from a data-layout point of view
for (auto &layout: {Layout::kRowMajor, Layout::kColMajor}) {
for (auto &layout: tester.kLayouts) {
args.layout = layout;
for (auto &side: {Side::kLeft, Side::kRight}) {
args.side = side;
@ -96,8 +90,8 @@ void XsymmTest(int argc, char *argv[], const bool silent, const std::string &nam
int main(int argc, char *argv[]) {
clblast::XsymmTest<float>(argc, argv, false, "SSYMM");
clblast::XsymmTest<double>(argc, argv, true, "DSYMM");
//clblast::XsymmTest<float2>(argc, argv, true, "CSYMM");
//clblast::XsymmTest<double2>(argc, argv, true, "ZSYMM");
clblast::XsymmTest<clblast::float2>(argc, argv, true, "CSYMM");
clblast::XsymmTest<clblast::double2>(argc, argv, true, "ZSYMM");
return 0;
}

View file

@ -20,10 +20,10 @@ namespace clblast {
// Constructor, initializes the base class tester and input data
template <typename T>
TestABC<T>::TestABC(const size_t platform_id, const size_t device_id,
TestABC<T>::TestABC(int argc, char *argv[], const bool silent,
const std::string &name, const std::vector<std::string> &options,
const Routine clblast_lambda, const Routine clblas_lambda):
Tester<T>{platform_id, device_id, name, options},
Tester<T>{argc, argv, silent, name, options},
clblast_lambda_(clblast_lambda),
clblas_lambda_(clblas_lambda) {
@ -46,6 +46,7 @@ TestABC<T>::TestABC(const size_t platform_id, const size_t device_id,
// Tests the routine for a wide variety of parameters
template <typename T>
void TestABC<T>::TestRegular(Arguments<T> &args, const std::string &name) {
if (!PrecisionSupported()) { return; }
TestStart("regular behaviour", name);
// Computes whether or not the matrices are transposed. Note that we assume a default of
@ -133,7 +134,7 @@ void TestABC<T>::TestRegular(Arguments<T> &args, const std::string &name) {
auto index = (args.layout == Layout::kRowMajor) ?
idm*args.c_ld + idn + args.c_offset:
idn*args.c_ld + idm + args.c_offset;
if (!TestSimilarity(r_result[index], s_result[index], kErrorMargin)) {
if (!TestSimilarity(r_result[index], s_result[index])) {
errors++;
}
}
@ -161,6 +162,7 @@ void TestABC<T>::TestRegular(Arguments<T> &args, const std::string &name) {
// does not test for results (if any).
template <typename T>
void TestABC<T>::TestInvalidBufferSizes(Arguments<T> &args, const std::string &name) {
if (!PrecisionSupported()) { return; }
TestStart("invalid buffer sizes", name);
// Sets example test parameters
@ -170,9 +172,12 @@ void TestABC<T>::TestInvalidBufferSizes(Arguments<T> &args, const std::string &n
args.a_ld = kBufferSize;
args.b_ld = kBufferSize;
args.c_ld = kBufferSize;
args.a_offset = 0;
args.b_offset = 0;
args.c_offset = 0;
// Iterates over test buffer sizes
const std::vector<size_t> kBufferSizes = {0, kBufferSize - 1, kBufferSize};
const std::vector<size_t> kBufferSizes = {0, kBufferSize*kBufferSize-1, kBufferSize*kBufferSize};
for (auto &a_size: kBufferSizes) {
for (auto &b_size: kBufferSizes) {
for (auto &c_size: kBufferSizes) {

View file

@ -23,17 +23,6 @@
namespace clblast {
// =================================================================================================
// Defines the parameters that delineate individual test-cases
struct Parameters {
Layout layout;
Transpose a_transpose;
Transpose b_transpose;
std::string GetString() const {
return "Layout: "+ToString(layout)+", A: "+ToString(a_transpose)+
", B: "+ToString(b_transpose);
}
};
// See comment at top of file for a description of the class
template <typename T>
class TestABC: public Tester<T> {
@ -42,7 +31,8 @@ class TestABC: public Tester<T> {
// Uses several variables from the Tester class
using Tester<T>::context_;
using Tester<T>::queue_;
using Tester<T>::kErrorMargin;
using Tester<T>::kLayouts;
using Tester<T>::kTransposes;
// Uses several helper functions from the Tester class
using Tester<T>::TestStart;
@ -51,10 +41,12 @@ class TestABC: public Tester<T> {
using Tester<T>::TestErrorCount;
using Tester<T>::TestErrorCodes;
using Tester<T>::GetExampleScalars;
using Tester<T>::GetOffsets;
using Tester<T>::PrecisionSupported;
// Test settings for the regular test. Append to this list in case more tests are required.
const std::vector<size_t> kMatrixDims = { 7, 64 };
const std::vector<size_t> kOffsets = { 0 };
const std::vector<size_t> kOffsets = GetOffsets();
const std::vector<T> kAlphaValues = GetExampleScalars();
const std::vector<T> kBetaValues = GetExampleScalars();
@ -67,7 +59,7 @@ class TestABC: public Tester<T> {
CommandQueue&)>;
// Constructor, initializes the base class tester and input data
TestABC(const size_t platform_id, const size_t device_id,
TestABC(int argc, char *argv[], const bool silent,
const std::string &name, const std::vector<std::string> &options,
const Routine clblast_lambda, const Routine clblas_lambda);

213
test/correctness/testaxy.cc Normal file
View file

@ -0,0 +1,213 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the TestAXY class (see the header for information about the class).
//
// =================================================================================================
#include <algorithm>
#include "correctness/testaxy.h"
namespace clblast {
// =================================================================================================
// Constructor, initializes the base class tester and input data
// Constructor: initializes the base-class tester and allocates one set of host-side input
// buffers, sized for the largest combination of dimensions, increments, and offsets that
// any test-case will use.
template <typename T>
TestAXY<T>::TestAXY(int argc, char *argv[], const bool silent,
                    const std::string &name, const std::vector<std::string> &options,
                    const Routine clblast_lambda, const Routine clblas_lambda):
    Tester<T>{argc, argv, silent, name, options},
    clblast_lambda_(clblast_lambda),
    clblas_lambda_(clblas_lambda) {

  // Helper returning the largest value that a setting will take during testing
  const auto largest = [](const std::vector<size_t> &values) {
    return *std::max_element(values.begin(), values.end());
  };
  const auto max_dim = largest(kMatrixVectorDims);
  const auto max_ld = largest(kMatrixVectorDims);
  const auto max_inc = largest(kIncrements);
  const auto max_offset = largest(kOffsets);

  // Allocates the host buffers and fills them with random input data
  a_source_.resize(max_dim*max_ld + max_offset);
  x_source_.resize(max_dim*max_inc + max_offset);
  y_source_.resize(max_dim*max_inc + max_offset);
  PopulateVector(a_source_);
  PopulateVector(x_source_);
  PopulateVector(y_source_);
}
// ===============================================================================================
// Tests the routine for a wide variety of parameters
// Tests the routine for a wide variety of parameters: all combinations of matrix/vector
// dimensions, leading dimensions, offsets, increments, and alpha/beta scalars. For each case
// the CLBlast result is compared element-by-element against the clBLAS reference output.
// Bug fix: a conjugate transpose (Transpose::kConjugate, tested for complex precisions, see
// Tester<T>::kTransposes) swaps the vector dimensions just like a regular transpose -- it maps
// to clblasConjTrans -- but was previously treated as non-transposed, sizing x and y wrongly.
template <typename T>
void TestAXY<T>::TestRegular(Arguments<T> &args, const std::string &name) {
  if (!PrecisionSupported()) { return; }
  TestStart("regular behaviour", name);

  // Iterates over the dimension for the matrix and vectors
  for (auto &m: kMatrixVectorDims) {
    args.m = m;
    for (auto &n: kMatrixVectorDims) {
      args.n = n;

      // Computes the second dimension of the matrix taking the rotation into account
      auto a_two = (args.layout == Layout::kRowMajor) ? args.m : args.n;

      // Computes the vector sizes in case the matrix is (conjugate-)transposed: a transposed
      // GEMV reads a length-m input vector and writes a length-n output vector
      auto a_transposed = (args.a_transpose != Transpose::kNo);
      auto m_real = (a_transposed) ? n : m;
      auto n_real = (a_transposed) ? m : n;

      // Iterates over the leading-dimension values and the offsets of the matrix
      for (auto &a_ld: kMatrixVectorDims) {
        args.a_ld = a_ld;
        for (auto &a_offset: kOffsets) {
          args.a_offset = a_offset;

          // Iterates over the increment-values and the offsets of the vectors
          for (auto &x_inc: kIncrements) {
            args.x_inc = x_inc;
            for (auto &x_offset: kOffsets) {
              args.x_offset = x_offset;
              for (auto &y_inc: kIncrements) {
                args.y_inc = y_inc;
                for (auto &y_offset: kOffsets) {
                  args.y_offset = y_offset;

                  // Computes the buffer sizes; skips degenerate (empty) cases
                  auto a_size = a_two * a_ld + a_offset;
                  auto x_size = n_real * x_inc + x_offset;
                  auto y_size = m_real * y_inc + y_offset;
                  if (a_size < 1 || x_size < 1 || y_size < 1) { continue; }

                  // Creates the OpenCL buffers: r_vec receives the reference result,
                  // s_vec the CLBlast result
                  auto a_mat = Buffer(context_, CL_MEM_READ_WRITE, a_size*sizeof(T));
                  auto x_vec = Buffer(context_, CL_MEM_READ_WRITE, x_size*sizeof(T));
                  auto r_vec = Buffer(context_, CL_MEM_READ_WRITE, y_size*sizeof(T));
                  auto s_vec = Buffer(context_, CL_MEM_READ_WRITE, y_size*sizeof(T));

                  // Iterates over the values for alpha and beta
                  for (auto &alpha: kAlphaValues) {
                    args.alpha = alpha;
                    for (auto &beta: kBetaValues) {
                      args.beta = beta;

                      // Runs the reference clBLAS code (on freshly uploaded inputs, since
                      // y is read-write when beta != 0)
                      a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
                      x_vec.WriteBuffer(queue_, x_size*sizeof(T), x_source_);
                      r_vec.WriteBuffer(queue_, y_size*sizeof(T), y_source_);
                      auto status1 = clblas_lambda_(args, a_mat, x_vec, r_vec, queue_);

                      // Runs the CLBlast code on the same inputs
                      a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
                      x_vec.WriteBuffer(queue_, x_size*sizeof(T), x_source_);
                      s_vec.WriteBuffer(queue_, y_size*sizeof(T), y_source_);
                      auto status2 = clblast_lambda_(args, a_mat, x_vec, s_vec, queue_);

                      // If either call failed, only the status codes are compared
                      if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
                        TestErrorCodes(status1, status2, args);
                        continue;
                      }

                      // Downloads the results
                      std::vector<T> r_result(y_size, static_cast<T>(0));
                      std::vector<T> s_result(y_size, static_cast<T>(0));
                      r_vec.ReadBuffer(queue_, y_size*sizeof(T), r_result);
                      s_vec.ReadBuffer(queue_, y_size*sizeof(T), s_result);

                      // Checks for differences in the output, only at the strided/offset
                      // positions actually written by the routine
                      auto errors = size_t{0};
                      for (auto idm=size_t{0}; idm<m_real; ++idm) {
                        auto index = idm*y_inc + y_offset;
                        if (!TestSimilarity(r_result[index], s_result[index])) {
                          errors++;
                        }
                      }

                      // Tests the error count (should be zero)
                      TestErrorCount(errors, m_real, args);
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
  }
  TestEnd();
}
// =================================================================================================
// Tests the routine for cases with invalid OpenCL memory buffer sizes. Tests only on return-types,
// does not test for results (if any).
// Tests the routine for cases with invalid OpenCL memory buffer sizes. Only the returned
// status codes of CLBlast and clBLAS are compared; no output data is validated.
template <typename T>
void TestAXY<T>::TestInvalidBufferSizes(Arguments<T> &args, const std::string &name) {
  if (!PrecisionSupported()) { return; }
  TestStart("invalid buffer sizes", name);

  // Fixes the routine's dimensions and offsets for all cases below
  args.m = kBufferSize;
  args.n = kBufferSize;
  args.a_ld = kBufferSize;
  args.a_offset = 0;
  args.x_offset = 0;
  args.y_offset = 0;

  // The (partly too small) buffer sizes to test with
  const std::vector<size_t> kMatrixSizes = {0, kBufferSize*kBufferSize-1, kBufferSize*kBufferSize};
  const std::vector<size_t> kVectorSizes = {0, kBufferSize - 1, kBufferSize};

  // Iterates over all combinations of buffer sizes and (partly invalid) increments
  for (auto &a_size: kMatrixSizes) {
    for (auto &x_size: kVectorSizes) {
      for (auto &y_size: kVectorSizes) {
        for (auto &x_inc: kInvalidIncrements) {
          args.x_inc = x_inc;
          for (auto &y_inc: kInvalidIncrements) {
            args.y_inc = y_inc;

            // Creates the OpenCL buffers through the raw C API rather than the C++ wrapper,
            // since we explicitly want to be able to create invalid buffers (no error
            // checking here).
            auto a_mat = Buffer(clCreateBuffer(context_(), CL_MEM_READ_WRITE,
                                               a_size*sizeof(T), nullptr, nullptr));
            auto x_vec = Buffer(clCreateBuffer(context_(), CL_MEM_READ_WRITE,
                                               x_size*sizeof(T), nullptr, nullptr));
            auto r_vec = Buffer(clCreateBuffer(context_(), CL_MEM_READ_WRITE,
                                               y_size*sizeof(T), nullptr, nullptr));
            auto s_vec = Buffer(clCreateBuffer(context_(), CL_MEM_READ_WRITE,
                                               y_size*sizeof(T), nullptr, nullptr));

            // Runs both implementations and tests for equality of their status codes
            auto status1 = clblas_lambda_(args, a_mat, x_vec, r_vec, queue_);
            auto status2 = clblast_lambda_(args, a_mat, x_vec, s_vec, queue_);
            TestErrorCodes(status1, status2, args);
          }
        }
      }
    }
  }
  TestEnd();
}
// =================================================================================================
// Compiles the templated class
template class TestAXY<float>;
template class TestAXY<double>;
template class TestAXY<float2>;
template class TestAXY<double2>;
// =================================================================================================
} // namespace clblast

View file

@ -0,0 +1,88 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file tests any mat-vec-vec (A,X,Y) routine. It contains two types of tests: one testing
// all sorts of input combinations, and one deliberately testing with invalid values.
//
// =================================================================================================
#ifndef CLBLAST_TEST_CORRECTNESS_TESTAXY_H_
#define CLBLAST_TEST_CORRECTNESS_TESTAXY_H_
#include <vector>
#include <string>
#include "correctness/tester.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
// Tester for matrix-vector-vector (A, x, y) routines such as GEMV. Compares a CLBlast routine
// against its clBLAS reference implementation over a grid of dimensions/offsets/increments
// (TestRegular) and over deliberately invalid buffer configurations (TestInvalidBufferSizes).
template <typename T>
class TestAXY: public Tester<T> {
 public:

  // Uses several variables from the Tester class
  using Tester<T>::context_;
  using Tester<T>::queue_;
  using Tester<T>::kLayouts;
  using Tester<T>::kTransposes;

  // Uses several helper functions from the Tester class
  using Tester<T>::TestStart;
  using Tester<T>::TestEnd;
  using Tester<T>::TestSimilarity;
  using Tester<T>::TestErrorCount;
  using Tester<T>::TestErrorCodes;
  using Tester<T>::GetExampleScalars;
  using Tester<T>::GetOffsets;
  using Tester<T>::PrecisionSupported;

  // Test settings for the regular test. Append to this list in case more tests are required.
  const std::vector<size_t> kMatrixVectorDims = { 61, 512 };  // also used as leading dimensions
  const std::vector<size_t> kOffsets = GetOffsets();          // buffer offsets for A, x, and y
  const std::vector<size_t> kIncrements = { 1, 2 };           // strides for the x and y vectors
  const std::vector<T> kAlphaValues = GetExampleScalars();
  const std::vector<T> kBetaValues = GetExampleScalars();

  // Test settings for the invalid test (0 is an invalid increment; sizes derive from this)
  const std::vector<size_t> kInvalidIncrements = { 0, 1 };
  const size_t kBufferSize = 64;

  // Shorthand for a BLAS routine: takes the arguments plus the A, x, and y buffers
  using Routine = std::function<StatusCode(const Arguments<T>&,
                                           const Buffer&, const Buffer&, const Buffer&,
                                           CommandQueue&)>;

  // Constructor, initializes the base class tester and input data
  TestAXY(int argc, char *argv[], const bool silent,
          const std::string &name, const std::vector<std::string> &options,
          const Routine clblast_lambda, const Routine clblas_lambda);

  // The test functions, taking the arguments to test with and a name for the test-case
  void TestRegular(Arguments<T> &args, const std::string &name);
  void TestInvalidBufferSizes(Arguments<T> &args, const std::string &name);

 private:

  // Host-side source data to fill the device buffers with
  std::vector<T> a_source_;
  std::vector<T> x_source_;
  std::vector<T> y_source_;

  // The routines to test: the CLBlast implementation and the clBLAS reference
  Routine clblast_lambda_;
  Routine clblas_lambda_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_TEST_CORRECTNESS_TESTAXY_H_
#endif

View file

@ -17,33 +17,58 @@
#include <vector>
#include <iostream>
#include <cmath>
#include <limits>
namespace clblast {
// =================================================================================================
// The layouts and transpose-options to test with (data-type dependent)
template <typename T>
const std::vector<Layout> Tester<T>::kLayouts = {Layout::kRowMajor, Layout::kColMajor};
template <> const std::vector<Transpose> Tester<float>::kTransposes = {Transpose::kNo, Transpose::kYes};
template <> const std::vector<Transpose> Tester<double>::kTransposes = {Transpose::kNo, Transpose::kYes};
template <> const std::vector<Transpose> Tester<float2>::kTransposes = {Transpose::kNo, Transpose::kYes, Transpose::kConjugate};
template <> const std::vector<Transpose> Tester<double2>::kTransposes = {Transpose::kNo, Transpose::kYes, Transpose::kConjugate};
// =================================================================================================
// General constructor for all CLBlast testers. It prints out the test header to stdout and sets-up
// the clBLAS library for reference.
template <typename T>
Tester<T>::Tester(const size_t platform_id, const size_t device_id,
Tester<T>::Tester(int argc, char *argv[], const bool silent,
const std::string &name, const std::vector<std::string> &options):
platform_(Platform(platform_id)),
device_(Device(platform_, kDeviceType, device_id)),
help_("Options given/available:\n"),
platform_(Platform(GetArgument(argc, argv, help_, kArgPlatform, size_t{0}))),
device_(Device(platform_, kDeviceType, GetArgument(argc, argv, help_, kArgDevice, size_t{0}))),
context_(Context(device_)),
queue_(CommandQueue(context_, device_)),
full_test_(CheckArgument(argc, argv, help_, kArgFullTest)),
error_log_{},
num_passed_{0},
num_skipped_{0},
num_errors_{0},
num_failed_{0},
print_count_{0},
tests_failed_{0},
tests_passed_{0},
tests_skipped_{0},
tests_failed_{0},
options_{options} {
// Prints the help message (command-line arguments)
if (!silent) { fprintf(stdout, "\n* %s\n", help_.c_str()); }
// Prints the header
fprintf(stdout, "* Running on OpenCL device '%s'.\n", device_.Name().c_str());
fprintf(stdout, "* Starting tests for the %s'%s'%s routine. Legend:\n",
fprintf(stdout, "* Starting tests for the %s'%s'%s routine.",
kPrintMessage.c_str(), name.c_str(), kPrintEnd.c_str());
// Checks whether the precision is supported
if (!PrecisionSupported()) {
fprintf(stdout, "\n* All tests skipped: %sUnsupported precision%s\n",
kPrintWarning.c_str(), kPrintEnd.c_str());
return;
}
// Prints the legend
fprintf(stdout, " Legend:\n");
fprintf(stdout, " %s -> Test produced correct results\n", kSuccessData.c_str());
fprintf(stdout, " %s -> Test returned the correct error code\n", kSuccessStatus.c_str());
fprintf(stdout, " %s -> Test produced incorrect results\n", kErrorData.c_str());
@ -63,14 +88,13 @@ Tester<T>::Tester(const size_t platform_id, const size_t device_id,
// Destructor prints the summary of the test cases and cleans-up the clBLAS library
template <typename T>
Tester<T>::~Tester() {
fprintf(stdout, "* Completed all test-cases for this routine. Results:\n");
fprintf(stdout, " %lu test(s) succeeded\n", tests_passed_);
if (tests_failed_ != 0) {
fprintf(stdout, " %s%lu test(s) failed%s\n",
kPrintError.c_str(), tests_failed_, kPrintEnd.c_str());
}
else {
fprintf(stdout, " %lu test(s) failed\n", tests_failed_);
if (PrecisionSupported()) {
fprintf(stdout, "* Completed all test-cases for this routine. Results:\n");
fprintf(stdout, " %lu test(s) passed\n", tests_passed_);
if (tests_skipped_ > 0) { fprintf(stdout, "%s", kPrintWarning.c_str()); }
fprintf(stdout, " %lu test(s) skipped%s\n", tests_skipped_, kPrintEnd.c_str());
if (tests_failed_ > 0) { fprintf(stdout, "%s", kPrintError.c_str()); }
fprintf(stdout, " %lu test(s) failed%s\n", tests_failed_, kPrintEnd.c_str());
}
fprintf(stdout, "\n");
clblasTeardown();
@ -93,7 +117,7 @@ void Tester<T>::TestStart(const std::string &test_name, const std::string &test_
error_log_.clear();
num_passed_ = 0;
num_skipped_ = 0;
num_errors_ = 0;
num_failed_ = 0;
print_count_ = 0;
}
@ -102,7 +126,9 @@ void Tester<T>::TestStart(const std::string &test_name, const std::string &test_
template <typename T>
void Tester<T>::TestEnd() {
fprintf(stdout, "\n");
if (error_log_.size() == 0) { tests_passed_++; } else { tests_failed_++; }
tests_passed_ += num_passed_;
tests_failed_ += num_skipped_;
tests_failed_ += num_failed_;
// Prints details of all error occurences for these tests
for (auto &entry: error_log_) {
@ -136,7 +162,7 @@ void Tester<T>::TestEnd() {
}
// Prints a test summary
auto pass_rate = 100*num_passed_ / static_cast<float>(num_passed_ + num_skipped_ + num_errors_);
auto pass_rate = 100*num_passed_ / static_cast<float>(num_passed_ + num_skipped_ + num_failed_);
fprintf(stdout, " Pass rate %s%5.1lf%%%s:", kPrintMessage.c_str(), pass_rate, kPrintEnd.c_str());
fprintf(stdout, " %lu passed /", num_passed_);
if (num_skipped_ != 0) {
@ -145,11 +171,11 @@ void Tester<T>::TestEnd() {
else {
fprintf(stdout, " %lu skipped /", num_skipped_);
}
if (num_errors_ != 0) {
fprintf(stdout, " %s%lu failed%s\n", kPrintError.c_str(), num_errors_, kPrintEnd.c_str());
if (num_failed_ != 0) {
fprintf(stdout, " %s%lu failed%s\n", kPrintError.c_str(), num_failed_, kPrintEnd.c_str());
}
else {
fprintf(stdout, " %lu failed\n", num_errors_);
fprintf(stdout, " %lu failed\n", num_failed_);
}
}
@ -158,34 +184,34 @@ void Tester<T>::TestEnd() {
// Compares two floating point values and returns whether they are within an acceptable error
// margin. This replaces GTest's EXPECT_NEAR().
template <typename T>
bool Tester<T>::TestSimilarity(const T val1, const T val2, const double margin) {
bool Tester<T>::TestSimilarity(const T val1, const T val2) {
const auto difference = std::fabs(val1 - val2);
// Shortcut, handles infinities
if (val1 == val2) {
return true;
}
// The values are zero or both are extremely close to it relative error is less meaningful
else if (val1 == 0 || val2 == 0 || difference < std::numeric_limits<T>::min()) {
return difference < (static_cast<T>(margin) * std::numeric_limits<T>::min());
// The values are zero or very small: the relative error is less meaningful
else if (val1 == 0 || val2 == 0 || difference < static_cast<T>(kErrorMarginAbsolute)) {
return (difference < static_cast<T>(kErrorMarginAbsolute));
}
// Use relative error
else {
return (difference / (std::fabs(val1) + std::fabs(val2))) < static_cast<T>(margin);
return (difference / (std::fabs(val1)+std::fabs(val2))) < static_cast<T>(kErrorMarginRelative);
}
}
// Specialisations for complex data-types
template <>
bool Tester<float2>::TestSimilarity(const float2 val1, const float2 val2, const double margin) {
auto real = Tester<float>::TestSimilarity(val1.real(), val2.real(), margin);
auto imag = Tester<float>::TestSimilarity(val1.imag(), val2.imag(), margin);
bool Tester<float2>::TestSimilarity(const float2 val1, const float2 val2) {
auto real = Tester<float>::TestSimilarity(val1.real(), val2.real());
auto imag = Tester<float>::TestSimilarity(val1.imag(), val2.imag());
return (real && imag);
}
template <>
bool Tester<double2>::TestSimilarity(const double2 val1, const double2 val2, const double margin) {
auto real = Tester<double>::TestSimilarity(val1.real(), val2.real(), margin);
auto imag = Tester<double>::TestSimilarity(val1.imag(), val2.imag(), margin);
bool Tester<double2>::TestSimilarity(const double2 val1, const double2 val2) {
auto real = Tester<double>::TestSimilarity(val1.real(), val2.real());
auto imag = Tester<double>::TestSimilarity(val1.imag(), val2.imag());
return (real && imag);
}
@ -248,19 +274,43 @@ void Tester<T>::TestErrorCodes(const StatusCode clblas_status, const StatusCode
// routines. This function is specialised for the different data-types.
template <>
const std::vector<float> Tester<float>::GetExampleScalars() {
return {0.0f, 1.0f, 3.14f};
if (full_test_) { return {0.0f, 1.0f, 3.14f}; }
else { return {3.14f}; }
}
template <>
const std::vector<double> Tester<double>::GetExampleScalars() {
return {0.0, 1.0, 3.14};
if (full_test_) { return {0.0, 1.0, 3.14}; }
else { return {3.14}; }
}
template <>
const std::vector<float2> Tester<float2>::GetExampleScalars() {
return {{0.0f, 0.0f}, {1.0f, 1.3f}, {2.42f, 3.14f}};
if (full_test_) { return {{0.0f, 0.0f}, {1.0f, 1.3f}, {2.42f, 3.14f}}; }
else { return {{2.42f, 3.14f}}; }
}
template <>
const std::vector<double2> Tester<double2>::GetExampleScalars() {
return {{0.0, 0.0}, {1.0, 1.3}, {2.42, 3.14}};
if (full_test_) { return {{0.0, 0.0}, {1.0, 1.3}, {2.42, 3.14}}; }
else { return {{2.42, 3.14}}; }
}
// Retrieves the offset values to test with
template <typename T>
const std::vector<size_t> Tester<T>::GetOffsets() {
if (full_test_) { return {0, 10}; }
else { return {0}; }
}
// =================================================================================================
template <> bool Tester<float>::PrecisionSupported() const { return true; }
template <> bool Tester<float2>::PrecisionSupported() const { return true; }
template <> bool Tester<double>::PrecisionSupported() const {
auto extensions = device_.Extensions();
return (extensions.find(kKhronosDoublePrecision) == std::string::npos) ? false : true;
}
template <> bool Tester<double2>::PrecisionSupported() const {
auto extensions = device_.Extensions();
return (extensions.find(kKhronosDoublePrecision) == std::string::npos) ? false : true;
}
// =================================================================================================
@ -277,7 +327,7 @@ void Tester<T>::ReportSkipped() {
template <typename T>
void Tester<T>::ReportError(const ErrorLogEntry &error_log_entry) {
error_log_.push_back(error_log_entry);
num_errors_++;
num_failed_++;
}
// =================================================================================================

View file

@ -44,7 +44,8 @@ class Tester {
static constexpr auto kStatusError = -1.0f;
// Set the allowed error margin for floating-point comparisons
static constexpr auto kErrorMargin = 1.0e-2;
static constexpr auto kErrorMarginRelative = 1.0e-2;
static constexpr auto kErrorMarginAbsolute = 1.0e-10;
// Constants holding start and end strings for terminal-output in colour
const std::string kPrintError{"\x1b[31m"};
@ -61,6 +62,10 @@ class Tester {
const std::string kSkippedCompilation{kPrintWarning + "\\" + kPrintEnd};
const std::string kUnsupportedPrecision{kPrintWarning + "o" + kPrintEnd};
// The layouts and transpose-options to test with
static const std::vector<Layout> kLayouts;
static const std::vector<Transpose> kTransposes;
// This structure combines the above log-entry with a status code an error percentage
struct ErrorLogEntry {
StatusCode status_expect;
@ -71,7 +76,7 @@ class Tester {
// Creates an instance of the tester, running on a particular OpenCL platform and device. It
// takes the routine's names as an additional parameter.
explicit Tester(const size_t platform_id, const size_t device_id,
explicit Tester(int argc, char *argv[], const bool silent,
const std::string &name, const std::vector<std::string> &options);
~Tester();
@ -80,7 +85,7 @@ class Tester {
void TestEnd();
// Compares two floating point values for similarity. Allows for a certain relative error margin.
static bool TestSimilarity(const T val1, const T val2, const double margin);
static bool TestSimilarity(const T val1, const T val2);
// Tests either an error count (should be zero) or two error codes (must match)
void TestErrorCount(const size_t errors, const size_t size, const Arguments<T> &args);
@ -92,6 +97,15 @@ class Tester {
// Retrieves a list of example scalars of the right type
const std::vector<T> GetExampleScalars();
// Retrieves a list of offset values to test
const std::vector<size_t> GetOffsets();
// Returns false if this precision is not supported by the device
bool PrecisionSupported() const;
// The help-message
std::string help_;
// The OpenCL objects (accessible by derived classes)
Platform platform_;
Device device_;
@ -108,18 +122,22 @@ class Tester {
// Prints the error or success symbol to screen
void PrintTestResult(const std::string &message);
// Whether or not to run the full test-suite or just a smoke test
bool full_test_;
// Logging and counting occurrences of errors
std::vector<ErrorLogEntry> error_log_;
size_t num_passed_;
size_t num_skipped_;
size_t num_errors_;
size_t num_failed_;
// Counting the amount of errors printed on this row
size_t print_count_;
// Counting the number of test-cases with and without failures
size_t tests_failed_;
size_t tests_passed_;
size_t tests_skipped_;
size_t tests_failed_;
// Arguments relevant for a specific routine
std::vector<std::string> options_;

View file

@ -20,10 +20,10 @@ namespace clblast {
// Constructor, initializes the base class tester and input data
template <typename T>
TestXY<T>::TestXY(const size_t platform_id, const size_t device_id,
TestXY<T>::TestXY(int argc, char *argv[], const bool silent,
const std::string &name, const std::vector<std::string> &options,
const Routine clblast_lambda, const Routine clblas_lambda):
Tester<T>{platform_id, device_id, name, options},
Tester<T>{argc, argv, silent, name, options},
clblast_lambda_(clblast_lambda),
clblas_lambda_(clblas_lambda) {
@ -44,6 +44,7 @@ TestXY<T>::TestXY(const size_t platform_id, const size_t device_id,
// Tests the routine for a wide variety of parameters
template <typename T>
void TestXY<T>::TestRegular(Arguments<T> &args, const std::string &name) {
if (!PrecisionSupported()) { return; }
TestStart("regular behaviour", name);
// Iterates over the vector dimension
@ -100,7 +101,7 @@ void TestXY<T>::TestRegular(Arguments<T> &args, const std::string &name) {
auto errors = size_t{0};
for (auto idn=size_t{0}; idn<n; ++idn) {
auto index = idn*y_inc + y_offset;
if (!TestSimilarity(r_result[index], s_result[index], kErrorMargin)) {
if (!TestSimilarity(r_result[index], s_result[index])) {
errors++;
}
}
@ -122,10 +123,13 @@ void TestXY<T>::TestRegular(Arguments<T> &args, const std::string &name) {
// does not test for results (if any).
template <typename T>
void TestXY<T>::TestInvalidBufferSizes(Arguments<T> &args, const std::string &name) {
if (!PrecisionSupported()) { return; }
TestStart("invalid buffer sizes", name);
// Sets example test parameters
args.n = kBufferSize;
args.x_offset = 0;
args.y_offset = 0;
// Iterates over test buffer sizes
const std::vector<size_t> kBufferSizes = {0, kBufferSize - 1, kBufferSize};

View file

@ -31,7 +31,6 @@ class TestXY: public Tester<T> {
// Uses several variables from the Tester class
using Tester<T>::context_;
using Tester<T>::queue_;
using Tester<T>::kErrorMargin;
// Uses several helper functions from the Tester class
using Tester<T>::TestStart;
@ -40,11 +39,13 @@ class TestXY: public Tester<T> {
using Tester<T>::TestErrorCount;
using Tester<T>::TestErrorCodes;
using Tester<T>::GetExampleScalars;
using Tester<T>::GetOffsets;
using Tester<T>::PrecisionSupported;
// Test settings for the regular test. Append to this list in case more tests are required.
const std::vector<size_t> kVectorDims = { 7, 93, 4096 };
const std::vector<size_t> kOffsets = { 0, 10 };
const std::vector<size_t> kIncrements = { 1, 2 };
const std::vector<size_t> kOffsets = GetOffsets();
const std::vector<size_t> kIncrements = { 1, 2, 7 };
const std::vector<T> kAlphaValues = GetExampleScalars();
// Test settings for the invalid test
@ -57,7 +58,7 @@ class TestXY: public Tester<T> {
CommandQueue&)>;
// Constructor, initializes the base class tester and input data
TestXY(const size_t platform_id, const size_t device_id,
TestXY(int argc, char *argv[], const bool silent,
const std::string &name, const std::vector<std::string> &options,
const Routine clblast_lambda, const Routine clblas_lambda);

View file

@ -26,8 +26,12 @@ template <typename T>
void ClientXY(int argc, char *argv[], Routine2<T> client_routine,
const std::vector<std::string> &options) {
// Function to determine how to find the default value of the leading dimension of matrix A.
// Note: this is not relevant for this client but given anyway.
auto default_ld_a = [](const Arguments<T> args) { return args.n; };
// Simple command line argument parser with defaults
auto args = ParseArguments<T>(argc, argv, options);
auto args = ParseArguments<T>(argc, argv, options, default_ld_a);
if (args.print_help) { return; }
// Prints the header of the output table
@ -81,13 +85,94 @@ template void ClientXY<double2>(int, char **, Routine2<double2>, const std::vect
// =================================================================================================
// This is the matrix-vector-vector variant of the set-up/tear-down client routine.
template <typename T>
void ClientAXY(int argc, char *argv[], Routine3<T> client_routine,
               const std::vector<std::string> &options) {

  // Function to determine how to find the default value of the leading dimension of matrix A
  auto default_ld_a = [](const Arguments<T> args) { return args.n; };

  // Simple command line argument parser with defaults
  auto args = ParseArguments<T>(argc, argv, options, default_ld_a);
  if (args.print_help) { return; }

  // Prints the header of the output table
  PrintTableHeader(args.silent, options);

  // Initializes OpenCL and the libraries
  auto platform = Platform(args.platform_id);
  auto device = Device(platform, kDeviceType, args.device_id);
  auto context = Context(device);
  auto queue = CommandQueue(context, device);
  if (args.compare_clblas) { clblasSetup(); }

  // Iterates over all "num_step" values jumping by "step" each time
  auto s = size_t{0};
  while(true) {

    // Computes the second dimension of the matrix taking the rotation into account
    auto a_two = (args.layout == Layout::kRowMajor) ? args.m : args.n;

    // Computes the vector sizes in case the matrix is transposed
    // NOTE(review): only Transpose::kYes counts as transposed here -- confirm whether
    // Transpose::kConjugate should also swap the m and n dimensions.
    auto a_transposed = (args.a_transpose == Transpose::kYes);
    auto m_real = (a_transposed) ? args.n : args.m;
    auto n_real = (a_transposed) ? args.m : args.n;

    // Computes the data sizes (in elements), including the offsets
    auto a_size = a_two * args.a_ld + args.a_offset;
    auto x_size = n_real*args.x_inc + args.x_offset;
    auto y_size = m_real*args.y_inc + args.y_offset;

    // Populates input host vectors with random data
    std::vector<T> a_source(a_size);
    std::vector<T> x_source(x_size);
    std::vector<T> y_source(y_size);
    PopulateVector(a_source);
    PopulateVector(x_source);
    PopulateVector(y_source);

    // Creates the vectors on the device and copies the host data over
    auto a_buffer = Buffer(context, CL_MEM_READ_WRITE, a_size*sizeof(T));
    auto x_buffer = Buffer(context, CL_MEM_READ_WRITE, x_size*sizeof(T));
    auto y_buffer = Buffer(context, CL_MEM_READ_WRITE, y_size*sizeof(T));
    a_buffer.WriteBuffer(queue, a_size*sizeof(T), a_source);
    x_buffer.WriteBuffer(queue, x_size*sizeof(T), x_source);
    y_buffer.WriteBuffer(queue, y_size*sizeof(T), y_source);

    // Runs the routine-specific code (the timed CLBlast/clBLAS calls)
    client_routine(args, a_buffer, x_buffer, y_buffer, queue);

    // Makes the jump to the next step: grows both dimensions and the leading dimension
    ++s;
    if (s >= args.num_steps) { break; }
    args.m += args.step;
    args.n += args.step;
    args.a_ld += args.step;
  }

  // Cleans-up and returns
  if (args.compare_clblas) { clblasTeardown(); }
}

// Compiles the above function
template void ClientAXY<float>(int, char **, Routine3<float>, const std::vector<std::string>&);
template void ClientAXY<double>(int, char **, Routine3<double>, const std::vector<std::string>&);
template void ClientAXY<float2>(int, char **, Routine3<float2>, const std::vector<std::string>&);
template void ClientAXY<double2>(int, char **, Routine3<double2>, const std::vector<std::string>&);
// =================================================================================================
// This is the matrix-matrix-matrix variant of the set-up/tear-down client routine.
template <typename T>
void ClientABC(int argc, char *argv[], Routine3<T> client_routine,
const std::vector<std::string> &options) {
// Function to determine how to find the default value of the leading dimension of matrix A
auto default_ld_a = [](const Arguments<T> args) { return args.m; };
// Simple command line argument parser with defaults
auto args = ParseArguments<T>(argc, argv, options);
auto args = ParseArguments<T>(argc, argv, options, default_ld_a);
if (args.print_help) { return; }
// Prints the header of the output table
@ -167,7 +252,8 @@ template void ClientABC<double2>(int, char **, Routine3<double2>, const std::vec
// applicable, but are searched for anyway to be able to create one common argument parser. All
// arguments have a default value in case they are not found.
template <typename T>
Arguments<T> ParseArguments(int argc, char *argv[], const std::vector<std::string> &options) {
Arguments<T> ParseArguments(int argc, char *argv[], const std::vector<std::string> &options,
const std::function<size_t(const Arguments<T>)> default_ld_a) {
auto args = Arguments<T>{};
auto help = std::string{"Options given/available:\n"};
@ -193,7 +279,7 @@ Arguments<T> ParseArguments(int argc, char *argv[], const std::vector<std::strin
if (o == kArgYOffset) { args.y_offset = GetArgument(argc, argv, help, kArgYOffset, size_t{0}); }
// Matrix arguments
if (o == kArgALeadDim) { args.a_ld = GetArgument(argc, argv, help, kArgALeadDim, args.k); }
if (o == kArgALeadDim) { args.a_ld = GetArgument(argc, argv, help, kArgALeadDim, default_ld_a(args)); }
if (o == kArgBLeadDim) { args.b_ld = GetArgument(argc, argv, help, kArgBLeadDim, args.n); }
if (o == kArgCLeadDim) { args.c_ld = GetArgument(argc, argv, help, kArgCLeadDim, args.n); }
if (o == kArgAOffset) { args.a_offset = GetArgument(argc, argv, help, kArgAOffset, size_t{0}); }

View file

@ -49,6 +49,9 @@ template <typename T>
void ClientXY(int argc, char *argv[], Routine2<T> client_routine,
const std::vector<std::string> &options);
template <typename T>
void ClientAXY(int argc, char *argv[], Routine3<T> client_routine,
const std::vector<std::string> &options);
template <typename T>
void ClientABC(int argc, char *argv[], Routine3<T> client_routine,
const std::vector<std::string> &options);
@ -57,7 +60,8 @@ void ClientABC(int argc, char *argv[], Routine3<T> client_routine,
// Parses all command-line arguments, filling in the arguments structure. If no command-line
// argument is given for a particular argument, it is filled in with a default value.
template <typename T>
Arguments<T> ParseArguments(int argc, char *argv[], const std::vector<std::string> &options);
Arguments<T> ParseArguments(int argc, char *argv[], const std::vector<std::string> &options,
const std::function<size_t(const Arguments<T>)> default_ld_a);
// Retrieves only the precision command-line argument, since the above function is templated based
// on the precision

View file

@ -82,6 +82,7 @@ main <- function(routine_name, precision, test_names, test_values,
# Runs the client and captures the result
params_string <- paste(parameters, params_values[[command_id]], collapse=" ")
arguments <- paste(devices_string, params_string, options_string, sep=" ")
print(paste("Running", executable, arguments, sep=" "))
result_string <- system2(command=executable, args=arguments, stdout=TRUE)
# Reads the result into a dataframe

View file

@ -0,0 +1,83 @@
# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project uses a tab-size of two spaces and a max-width of 100 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
#
# This file implements the performance script for the Xgemv routine: it defines the test-cases
# and forwards them to the common plotting/benchmark driver in common.r.
#
# ==================================================================================================

# Includes the common functions (resolves this script's own path from the --file= argument so the
# script can be invoked from any working directory). common.r provides main(), num_runs and kilo.
args <- commandArgs(trailingOnly = FALSE)
thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)])))
source(file.path(dirname(thisfile), "common.r"))

# ==================================================================================================

# Settings: the routine's name and the command-line flags passed to the client executable. Each
# vector in test_values below supplies one value per flag, in this exact order.
routine_name <- "xgemv"
parameters <- c("-n","-m","-incx","-incy","-layout",
                "-num_steps","-step","-runs","-precision")
precision <- 32

# Sets the names of the test-cases
test_names <- list(
  "multiples of 256",
  "multiples of 256 (+1)",
  "around n=m=2K",
  "multiples of 256 [rotated]",
  "multiples of 256 (+1) [rotated]",
  "strides (n=2K)"
)

# Defines the test-cases: one list-entry per test above; each inner vector follows `parameters`
# (n, m, incx, incy, layout, num_steps, step, runs, precision)
test_values <- list(
  list(c(256, 256, 1, 1, 1, 16, 256, num_runs, precision)),
  list(c(256+1, 256+1, 1, 1, 1, 16, 256, num_runs, precision)),
  list(c(2*kilo, 2*kilo, 1, 1, 1, 16, 1, num_runs, precision)),
  list(c(256, 256, 1, 1, 0, 16, 256, num_runs, precision)),
  list(c(256+1, 256+1, 1, 1, 0, 16, 256, num_runs, precision)),
  list(
    c(2*kilo, 2*kilo, 1, 1, 1, 1, 0, num_runs, precision),
    c(2*kilo, 2*kilo, 2, 1, 1, 1, 0, num_runs, precision),
    c(2*kilo, 2*kilo, 4, 1, 1, 1, 0, num_runs, precision),
    c(2*kilo, 2*kilo, 8, 1, 1, 1, 0, num_runs, precision),
    c(2*kilo, 2*kilo, 1, 2, 1, 1, 0, num_runs, precision),
    c(2*kilo, 2*kilo, 1, 4, 1, 1, 0, num_runs, precision),
    c(2*kilo, 2*kilo, 1, 8, 1, 1, 0, num_runs, precision),
    c(2*kilo, 2*kilo, 2, 2, 1, 1, 0, num_runs, precision),
    c(2*kilo, 2*kilo, 4, 4, 1, 1, 0, num_runs, precision),
    c(2*kilo, 2*kilo, 8, 8, 1, 1, 0, num_runs, precision)
  )
)

# Defines the x-labels corresponding to the test-cases
test_xlabels <- list(
  "vector sizes (n)",
  "vector sizes (n)",
  "vector sizes (n)",
  "vector sizes (n)",
  "vector sizes (n)",
  "increments/strides for x and y"
)

# Defines the x-axis of the test-cases: either a column name, or explicit tick positions + labels
test_xaxis <- list(
  c("n", ""),
  c("n", ""),
  c("n", ""),
  c("n", ""),
  c("n", ""),
  list(1:10, c("x1y1", "x2y1", "x4y1", "x8y1", "x1y2", "x1y4", "x1y8", "x2y2", "x4y4", "x8y8"))
)

# ==================================================================================================

# Start the script (GEMV is bandwidth-bound, hence metric_gflops=FALSE plots GB/s)
main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values,
     test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=FALSE)

# ==================================================================================================

View file

@ -0,0 +1,107 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xgemv command-line interface tester.
//
// =================================================================================================
#include <string>
#include <vector>
#include <exception>
#include "wrapper_clblas.h"
#include "performance/client.h"
namespace clblast {
// =================================================================================================
// The client, used for performance testing. It contains the function calls to CLBlast and to other
// libraries to compare against.
template <typename T>
void PerformanceXgemv(const Arguments<T> &args,
                      const Buffer &a_mat, const Buffer &x_vec, const Buffer &y_vec,
                      CommandQueue &queue) {

  // Creates the CLBlast lambda. Everything is captured by reference so that TimedExecution can
  // invoke it repeatedly without copying the buffers or arguments.
  auto clblast_lambda = [&args, &a_mat, &x_vec, &y_vec, &queue]() {
    auto queue_plain = queue();
    auto event = cl_event{};
    auto status = Gemv(args.layout, args.a_transpose, args.m, args.n, args.alpha,
                       a_mat(), args.a_offset, args.a_ld,
                       x_vec(), args.x_offset, args.x_inc, args.beta,
                       y_vec(), args.y_offset, args.y_inc,
                       &queue_plain, &event);
    // Blocks until the kernel has completed, so the timing covers the full execution
    clWaitForEvents(1, &event);
    if (status != StatusCode::kSuccess) {
      throw std::runtime_error("CLBlast error: "+ToString(static_cast<int>(status)));
    }
  };

  // Creates the clBLAS lambda (for comparison)
  auto clblas_lambda = [&args, &a_mat, &x_vec, &y_vec, &queue]() {
    auto queue_plain = queue();
    auto event = cl_event{};
    auto status = clblasXgemv(static_cast<clblasOrder>(args.layout),
                              static_cast<clblasTranspose>(args.a_transpose),
                              args.m, args.n, args.alpha,
                              a_mat(), args.a_offset, args.a_ld,
                              x_vec(), args.x_offset, args.x_inc, args.beta,
                              y_vec(), args.y_offset, args.y_inc,
                              1, &queue_plain, 0, nullptr, &event);
    clWaitForEvents(1, &event);
    if (status != CL_SUCCESS) {
      throw std::runtime_error("clBLAS error: "+ToString(static_cast<int>(status)));
    }
  };

  // Runs the routines and collects the timings
  auto ms_clblast = TimedExecution(args.num_runs, clblast_lambda);
  auto ms_clblas = TimedExecution(args.num_runs, clblas_lambda);

  // Prints the performance of both libraries. GEMV does one multiply and one add per matrix
  // element (2*m*n flops) and transfers the matrix (m*n), the x vector (n) and y twice (2*m).
  const auto flops = 2 * args.m * args.n;
  const auto bytes = (args.m*args.n + 2*args.m + args.n) * sizeof(T);
  const auto output_ints = std::vector<size_t>{args.m, args.n,
                                               static_cast<size_t>(args.layout),
                                               static_cast<size_t>(args.a_transpose),
                                               args.a_ld, args.x_inc, args.y_inc,
                                               args.a_offset, args.x_offset, args.y_offset};
  const auto output_strings = std::vector<std::string>{ToString(args.alpha),
                                                       ToString(args.beta)};
  PrintTableRow(output_ints, output_strings, args.no_abbrv,
                ms_clblast, ms_clblas, flops, bytes);
}
// =================================================================================================
// Main function which calls the common client code with the routine-specific function as argument.
// Entry point of the Xgemv client: reads the precision from the command-line and dispatches to
// the common matrix-vector-vector client with the Xgemv performance function and its options.
void ClientXgemv(int argc, char *argv[]) {
  const auto options = std::vector<std::string>{kArgM, kArgN, kArgLayout, kArgATransp,
                                                kArgALeadDim, kArgXInc, kArgYInc,
                                                kArgAOffset, kArgXOffset, kArgYOffset,
                                                kArgAlpha, kArgBeta};
  const auto precision = GetPrecision(argc, argv);
  switch(precision) {
    case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
    case Precision::kSingle: ClientAXY<float>(argc, argv, PerformanceXgemv<float>, options); break;
    case Precision::kDouble: ClientAXY<double>(argc, argv, PerformanceXgemv<double>, options); break;
    case Precision::kComplexSingle: ClientAXY<float2>(argc, argv, PerformanceXgemv<float2>, options); break;
    case Precision::kComplexDouble: ClientAXY<double2>(argc, argv, PerformanceXgemv<double2>, options); break;
  }
}
// =================================================================================================
} // namespace clblast
// Main function (not within the clblast namespace)
// Forwards the command-line arguments unchanged to the Xgemv performance client.
int main(int argc, char *argv[]) {
  clblast::ClientXgemv(argc, argv);
  return 0;
}
// =================================================================================================

View file

@ -74,6 +74,64 @@ clblasStatus clblasXaxpy(
// =================================================================================================
// BLAS level-2 (matrix-vector) routines
// Calls {clblasSgemv, clblasDgemv, clblasCgemv, clblasZgemv} with the arguments forwarded.
// Single-precision overload: forwards directly to clblasSgemv. The only translation needed is
// casting the unsigned size_t increments to the signed int type that clBLAS expects.
clblasStatus clblasXgemv(
    clblasOrder layout, clblasTranspose tran_a, size_t m, size_t n, float alpha,
    const cl_mem a_mat, size_t a_offset, size_t a_ld,
    const cl_mem x_vec, size_t x_offset, size_t x_inc, float beta,
    const cl_mem y_vec, size_t y_offset, size_t y_inc,
    cl_uint num_queues, cl_command_queue *queues,
    cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
  const auto inc_x = static_cast<int>(x_inc);
  const auto inc_y = static_cast<int>(y_inc);
  return clblasSgemv(layout, tran_a, m, n, alpha,
                     a_mat, a_offset, a_ld,
                     x_vec, x_offset, inc_x, beta,
                     y_vec, y_offset, inc_y,
                     num_queues, queues, num_wait_events, wait_events, events);
}
// Double-precision overload: forwards directly to clblasDgemv. The only translation needed is
// casting the unsigned size_t increments to the signed int type that clBLAS expects.
clblasStatus clblasXgemv(
    clblasOrder layout, clblasTranspose tran_a, size_t m, size_t n, double alpha,
    const cl_mem a_mat, size_t a_offset, size_t a_ld,
    const cl_mem x_vec, size_t x_offset, size_t x_inc, double beta,
    const cl_mem y_vec, size_t y_offset, size_t y_inc,
    cl_uint num_queues, cl_command_queue *queues,
    cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
  const auto inc_x = static_cast<int>(x_inc);
  const auto inc_y = static_cast<int>(y_inc);
  return clblasDgemv(layout, tran_a, m, n, alpha,
                     a_mat, a_offset, a_ld,
                     x_vec, x_offset, inc_x, beta,
                     y_vec, y_offset, inc_y,
                     num_queues, queues, num_wait_events, wait_events, events);
}
// Single-precision complex overload: repacks the std::complex scalars into the cl_float2 vector
// type used by clBLAS and forwards to clblasCgemv, casting the increments to int on the way.
clblasStatus clblasXgemv(
    clblasOrder layout, clblasTranspose tran_a, size_t m, size_t n, float2 alpha,
    const cl_mem a_mat, size_t a_offset, size_t a_ld,
    const cl_mem x_vec, size_t x_offset, size_t x_inc, float2 beta,
    const cl_mem y_vec, size_t y_offset, size_t y_inc,
    cl_uint num_queues, cl_command_queue *queues,
    cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
  const auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}};
  const auto cl_beta = cl_float2{{beta.real(), beta.imag()}};
  const auto inc_x = static_cast<int>(x_inc);
  const auto inc_y = static_cast<int>(y_inc);
  return clblasCgemv(layout, tran_a, m, n, cl_alpha,
                     a_mat, a_offset, a_ld,
                     x_vec, x_offset, inc_x, cl_beta,
                     y_vec, y_offset, inc_y,
                     num_queues, queues, num_wait_events, wait_events, events);
}
// Double-precision complex overload: repacks the std::complex scalars into the cl_double2 vector
// type used by clBLAS and forwards to clblasZgemv, casting the increments to int on the way.
clblasStatus clblasXgemv(
    clblasOrder layout, clblasTranspose tran_a, size_t m, size_t n, double2 alpha,
    const cl_mem a_mat, size_t a_offset, size_t a_ld,
    const cl_mem x_vec, size_t x_offset, size_t x_inc, double2 beta,
    const cl_mem y_vec, size_t y_offset, size_t y_inc,
    cl_uint num_queues, cl_command_queue *queues,
    cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
  const auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}};
  const auto cl_beta = cl_double2{{beta.real(), beta.imag()}};
  const auto inc_x = static_cast<int>(x_inc);
  const auto inc_y = static_cast<int>(y_inc);
  return clblasZgemv(layout, tran_a, m, n, cl_alpha,
                     a_mat, a_offset, a_ld,
                     x_vec, x_offset, inc_x, cl_beta,
                     y_vec, y_offset, inc_y,
                     num_queues, queues, num_wait_events, wait_events, events);
}
// =================================================================================================
// BLAS level-3 (matrix-matrix) routines