From ea7da6a49758af50302be040ab7a97a7a8c0f692 Mon Sep 17 00:00:00 2001 From: CNugteren Date: Sun, 21 Jun 2015 11:21:03 +0200 Subject: [PATCH 01/15] Fixed support for complex data-types for GEMM and SYMM clients --- test/performance/routines/xgemm.cc | 4 ++-- test/performance/routines/xsymm.cc | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test/performance/routines/xgemm.cc b/test/performance/routines/xgemm.cc index 234e9fdb..adff85c8 100644 --- a/test/performance/routines/xgemm.cc +++ b/test/performance/routines/xgemm.cc @@ -98,8 +98,8 @@ void ClientXgemm(int argc, char *argv[]) { case Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); case Precision::kSingle: ClientABC(argc, argv, PerformanceXgemm, o); break; case Precision::kDouble: ClientABC(argc, argv, PerformanceXgemm, o); break; - case Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); - case Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); + case Precision::kComplexSingle: ClientABC(argc, argv, PerformanceXgemm, o); break; + case Precision::kComplexDouble: ClientABC(argc, argv, PerformanceXgemm, o); break; } } diff --git a/test/performance/routines/xsymm.cc b/test/performance/routines/xsymm.cc index 13ad434a..3c9feb05 100644 --- a/test/performance/routines/xsymm.cc +++ b/test/performance/routines/xsymm.cc @@ -98,8 +98,8 @@ void ClientXsymm(int argc, char *argv[]) { case Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); case Precision::kSingle: ClientABC(argc, argv, PerformanceXsymm, o); break; case Precision::kDouble: ClientABC(argc, argv, PerformanceXsymm, o); break; - case Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); - case Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); + case Precision::kComplexSingle: ClientABC(argc, argv, PerformanceXsymm, o); break; + case Precision::kComplexDouble: ClientABC(argc, argv, PerformanceXsymm, o); break; } } From e3829c1067814c0aa83ab440fa431d98837aeeda Mon Sep 17 00:00:00 2001 From: CNugteren Date: Sun, 21 Jun 2015 12:44:03 +0200 Subject: [PATCH 02/15] Added prototypes of SYRK and SYR2K --- include/clblast.h | 25 ++++++++- src/clblast.cc | 122 +++++++++++++++++++++++++++++++++++++++- test/wrapper_clblas.h | 126 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 269 insertions(+), 4 deletions(-) diff --git a/include/clblast.h b/include/clblast.h index 231348b8..da504a0b 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -107,7 +107,7 @@ StatusCode Gemv(const Layout layout, const Transpose transpose_a, // ================================================================================================= // BLAS level-3 (matrix-matrix) routines -// Templated-precision generalized matrix-matrix multiplication: SGEMM/DGEMM +// Templated-precision generalized matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM template StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpose transpose_b, const size_t m, const size_t n, const size_t k, @@ -118,7 +118,7 @@ StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpos cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); -// Templated-precision symmetric matrix-matrix multiplication: SSYMM/DSYMM +// Templated-precision symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM template StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, const size_t m, const size_t n, @@ -129,6 +129,27 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); +// Templated-precision rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK +template +StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose transpose_a, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event); + +// Templated-precision rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K +template +StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose transpose_ab, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event); + // ================================================================================================= } // namespace clblast diff --git a/src/clblast.cc b/src/clblast.cc index bb0091a3..e0d085a9 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -209,7 +209,7 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, std::string kernel_source = #include "kernels/xgemm.opencl" auto status = routine.SetUp(common_source1 + common_source2 + common_source3 + common_source4 + - kernel_source); + kernel_source); if (status != StatusCode::kSuccess) { return status; } // Runs the routine @@ -243,5 +243,125 @@ template StatusCode Symm(const Layout, const Side, const Triangle, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +// ================================================================================================= + +// SYRK +template +StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose transpose_a, + const size_t n, const size_t k, const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const T beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = CommandQueue(*queue); + auto event_cpp = Event(*event); + /* + auto routine = Xsyrk(queue_cpp, event_cpp); + + // Loads the kernel source-code as an include (C++11 raw string literal) + std::string common_source1 = + #include "kernels/copy.opencl" + std::string common_source2 = + #include "kernels/pad.opencl" + std::string common_source3 = + #include "kernels/transpose.opencl" + std::string common_source4 = + #include "kernels/padtranspose.opencl" + std::string kernel_source = + #include "kernels/xgemm.opencl" + auto status = routine.SetUp(common_source1 + common_source2 + common_source3 + common_source4 + + kernel_source); + if (status != StatusCode::kSuccess) { return status; } + + // Runs the routine + return routine.DoSyrk(layout, triangle, transpose_a, n, k, alpha, + Buffer(a_buffer), a_offset, a_ld, beta, + Buffer(c_buffer), c_offset, c_ld); + */ + return StatusCode::kSuccess; +} +template StatusCode Syrk(const Layout, const Triangle, const Transpose, + const size_t, const size_t, const float, + const cl_mem, const size_t, const size_t, const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Syrk(const Layout, const Triangle, const Transpose, + const size_t, const size_t, const double, + const cl_mem, const size_t, const size_t, const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Syrk(const Layout, const Triangle, const Transpose, + const size_t, const size_t, const float2, + const cl_mem, const size_t, const size_t, const float2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Syrk(const Layout, const Triangle, const Transpose, + const size_t, const size_t, const double2, + const cl_mem, const size_t, const size_t, const double2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// ================================================================================================= + +// SYR2K +template +StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose transpose_ab, + const size_t n, const size_t k, const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = CommandQueue(*queue); + auto event_cpp = Event(*event); + /* + auto routine = Xsyr2k(queue_cpp, event_cpp); + + // Loads the kernel source-code as an include (C++11 raw string literal) + std::string common_source1 = + #include "kernels/copy.opencl" + std::string common_source2 = + #include "kernels/pad.opencl" + std::string common_source3 = + #include "kernels/transpose.opencl" + std::string common_source4 = + #include "kernels/padtranspose.opencl" + std::string kernel_source = + #include "kernels/xgemm.opencl" + auto status = routine.SetUp(common_source1 + common_source2 + common_source3 + common_source4 + + kernel_source); + if (status != StatusCode::kSuccess) { return status; } + + // Runs the routine + return routine.DoSyr2k(layout, triangle, transpose_ab, n, k, alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(b_buffer), b_offset, b_ld, beta, + Buffer(c_buffer), c_offset, c_ld); + */ + return StatusCode::kSuccess; +} +template StatusCode Syr2k(const Layout, const Triangle, const Transpose, + const size_t, const size_t, const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Syr2k(const Layout, const Triangle, const Transpose, + const size_t, const size_t, const double, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Syr2k(const Layout, const Triangle, const Transpose, + const size_t, const size_t, const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, const float2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Syr2k(const Layout, const Triangle, const Transpose, + const size_t, const size_t, const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, const double2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + // ================================================================================================= } // namespace clblast diff --git a/test/wrapper_clblas.h b/test/wrapper_clblas.h index 093a8742..d6df0835 100644 --- a/test/wrapper_clblas.h +++ b/test/wrapper_clblas.h @@ -201,7 +201,7 @@ clblasStatus clblasXgemm( num_queues, queues, num_wait_events, wait_events, events); } -// This calls {clblasSsymm, clblasDsymm} with the arguments forwarded. +// This calls {clblasSsymm, clblasDsymm, clblasCsymm, clblasZsymm} with the arguments forwarded. clblasStatus clblasXsymm( clblasOrder layout, clblasSide side, clblasUplo triangle, size_t m, size_t n, float alpha, @@ -267,6 +267,130 @@ clblasStatus clblasXsymm( num_queues, queues, num_wait_events, wait_events, events); } +// This calls {clblasSsyrk, clblasDsyrk, clblasCsyrk, clblasZsyrk} with the arguments forwarded. +clblasStatus clblasXsyrk( + clblasOrder layout, clblasUplo triangle, clblasTranspose tran_a, + size_t n, size_t k, float alpha, + const cl_mem a_mat, size_t a_offset, size_t a_ld, float beta, + cl_mem c_mat, size_t c_offset, size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSsyrk(layout, triangle, tran_a, + n, k, alpha, + a_mat, a_offset, a_ld, beta, + c_mat, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXsyrk( + clblasOrder layout, clblasUplo triangle, clblasTranspose tran_a, + size_t n, size_t k, double alpha, + const cl_mem a_mat, size_t a_offset, size_t a_ld, double beta, + cl_mem c_mat, size_t c_offset, size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDsyrk(layout, triangle, tran_a, + n, k, alpha, + a_mat, a_offset, a_ld, beta, + c_mat, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXsyrk( + clblasOrder layout, clblasUplo triangle, clblasTranspose tran_a, + size_t n, size_t k, float2 alpha, + const cl_mem a_mat, size_t a_offset, size_t a_ld, float2 beta, + cl_mem c_mat, size_t c_offset, size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}}; + auto cl_beta = cl_float2{{beta.real(), beta.imag()}}; + return clblasCsyrk(layout, triangle, tran_a, + n, k, cl_alpha, + a_mat, a_offset, a_ld, cl_beta, + c_mat, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXsyrk( + clblasOrder layout, clblasUplo triangle, clblasTranspose tran_a, + size_t n, size_t k, double2 alpha, + const cl_mem a_mat, size_t a_offset, size_t a_ld, double2 beta, + cl_mem c_mat, size_t c_offset, size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}}; + auto cl_beta = cl_double2{{beta.real(), beta.imag()}}; + return clblasZsyrk(layout, triangle, tran_a, + n, k, cl_alpha, + a_mat, a_offset, a_ld, cl_beta, + c_mat, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); +} + +// This calls {clblasSsyr2k, clblasDsyr2k, clblasCsyr2k, clblasZsyr2k} with the arguments forwarded. +clblasStatus clblasXsyr2k( + clblasOrder layout, clblasUplo triangle, clblasTranspose tran_ab, + size_t n, size_t k, float alpha, + const cl_mem a_mat, size_t a_offset, size_t a_ld, + const cl_mem b_mat, size_t b_offset, size_t b_ld, float beta, + cl_mem c_mat, size_t c_offset, size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSsyr2k(layout, triangle, tran_ab, + n, k, alpha, + a_mat, a_offset, a_ld, + b_mat, b_offset, b_ld, beta, + c_mat, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXsyr2k( + clblasOrder layout, clblasUplo triangle, clblasTranspose tran_ab, + size_t n, size_t k, double alpha, + const cl_mem a_mat, size_t a_offset, size_t a_ld, + const cl_mem b_mat, size_t b_offset, size_t b_ld, double beta, + cl_mem c_mat, size_t c_offset, size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDsyr2k(layout, triangle, tran_ab, + n, k, alpha, + a_mat, a_offset, a_ld, + b_mat, b_offset, b_ld, beta, + c_mat, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXsyr2k( + clblasOrder layout, clblasUplo triangle, clblasTranspose tran_ab, + size_t n, size_t k, float2 alpha, + const cl_mem a_mat, size_t a_offset, size_t a_ld, + const cl_mem b_mat, size_t b_offset, size_t b_ld, float2 beta, + cl_mem c_mat, size_t c_offset, size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}}; + auto cl_beta = cl_float2{{beta.real(), beta.imag()}}; + return clblasCsyr2k(layout, triangle, tran_ab, + n, k, cl_alpha, + a_mat, a_offset, a_ld, + b_mat, b_offset, b_ld, cl_beta, + c_mat, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXsyr2k( + clblasOrder layout, clblasUplo triangle, clblasTranspose tran_ab, + size_t n, size_t k, double2 alpha, + const cl_mem a_mat, size_t a_offset, size_t a_ld, + const cl_mem b_mat, size_t b_offset, size_t b_ld, double2 beta, + cl_mem c_mat, size_t c_offset, size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}}; + auto cl_beta = cl_double2{{beta.real(), beta.imag()}}; + return clblasZsyr2k(layout, triangle, tran_ab, + n, k, cl_alpha, + a_mat, a_offset, a_ld, + b_mat, b_offset, b_ld, cl_beta, + c_mat, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); +} + // ================================================================================================= } // namespace clblast From 4c2a166bc5406b194108d3b31238e55ac6b99e3c Mon Sep 17 00:00:00 2001 From: CNugteren Date: Sun, 21 Jun 2015 12:57:38 +0200 Subject: [PATCH 03/15] Added test infrastructure for AB and AC routines --- CMakeLists.txt | 24 ++++- test/correctness/testab.cc | 192 +++++++++++++++++++++++++++++++++++++ test/correctness/testab.h | 85 ++++++++++++++++ test/correctness/testac.cc | 191 ++++++++++++++++++++++++++++++++++++ test/correctness/testac.h | 85 ++++++++++++++++ 5 files changed, 572 insertions(+), 5 deletions(-) create mode 100644 test/correctness/testab.cc create mode 100644 test/correctness/testab.h create mode 100644 test/correctness/testac.cc create mode 100644 test/correctness/testac.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 038e71ae..b84ed62b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -98,7 +98,9 @@ set(SAMPLE_PROGRAMS sgemm) set(ROUTINES_XY xaxpy) set(ROUTINES_AXY xgemv) set(ROUTINES_ABC xgemm xsymm) -set(ROUTINES ${ROUTINES_XY} ${ROUTINES_AXY} ${ROUTINES_ABC}) +set(ROUTINES_AB ) +set(ROUTINES_AC ) +set(ROUTINES ${ROUTINES_XY} ${ROUTINES_AXY} ${ROUTINES_ABC} ${ROUTINES_AB} ${ROUTINES_AC}) # ================================================================================================== @@ -172,6 +174,8 @@ if(TESTS) add_library(test_correctness_xy OBJECT test/correctness/testxy.cc) add_library(test_correctness_axy OBJECT test/correctness/testaxy.cc) add_library(test_correctness_abc OBJECT test/correctness/testabc.cc) + add_library(test_correctness_ab OBJECT test/correctness/testab.cc) + add_library(test_correctness_ac OBJECT test/correctness/testac.cc) # Compiles the correctness-tests foreach(ROUTINE ${ROUTINES_XY}) @@ -179,22 +183,32 @@ if(TESTS) $ $ test/correctness/routines/${ROUTINE}.cc) - target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES}) - install(TARGETS test_${ROUTINE} DESTINATION bin) endforeach() foreach(ROUTINE ${ROUTINES_AXY}) add_executable(test_${ROUTINE} $ $ test/correctness/routines/${ROUTINE}.cc) - target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES}) - install(TARGETS test_${ROUTINE} DESTINATION bin) endforeach() foreach(ROUTINE ${ROUTINES_ABC}) add_executable(test_${ROUTINE} $ $ test/correctness/routines/${ROUTINE}.cc) + endforeach() + foreach(ROUTINE ${ROUTINES_AB}) + add_executable(test_${ROUTINE} + $ + $ + test/correctness/routines/${ROUTINE}.cc) + endforeach() + foreach(ROUTINE ${ROUTINES_AC}) + add_executable(test_${ROUTINE} + $ + $ + test/correctness/routines/${ROUTINE}.cc) + endforeach() + foreach(ROUTINE ${ROUTINES}) target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES}) install(TARGETS test_${ROUTINE} DESTINATION bin) endforeach() diff --git a/test/correctness/testab.cc b/test/correctness/testab.cc new file mode 100644 index 00000000..ef03f32f --- /dev/null +++ b/test/correctness/testab.cc @@ -0,0 +1,192 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under the MIT license. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the TestAB class (see the header for information about the class). +// +// ================================================================================================= + +#include + +#include "correctness/testab.h" + +namespace clblast { +// ================================================================================================= + +// Constructor, initializes the base class tester and input data +template +TestAB::TestAB(int argc, char *argv[], const bool silent, + const std::string &name, const std::vector &options, + const Routine clblast_lambda, const Routine clblas_lambda): + Tester{argc, argv, silent, name, options}, + clblast_lambda_(clblast_lambda), + clblas_lambda_(clblas_lambda) { + + // Computes the maximum sizes. This allows for a single set of input/output buffers. + auto max_dim = *std::max_element(kMatrixDims.begin(), kMatrixDims.end()); + auto max_ld = *std::max_element(kMatrixDims.begin(), kMatrixDims.end()); + auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end()); + + // Creates test input data + a_source_.resize(max_dim*max_ld + max_offset); + b_source_.resize(max_dim*max_ld + max_offset); + PopulateVector(a_source_); + PopulateVector(b_source_); +} + +// =============================================================================================== + +// Tests the routine for a wide variety of parameters +template +void TestAB::TestRegular(Arguments &args, const std::string &name) { + if (!PrecisionSupported()) { return; } + TestStart("regular behaviour", name); + + // Computes whether or not the matrices are transposed. Note that we assume a default of + // column-major and no-transpose. If one of them is different (but not both), then rotated + // is considered true. + auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) || + (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo); + auto b_rotated = (args.layout == Layout::kRowMajor); + + // Iterates over the matrix dimensions + for (auto &m: kMatrixDims) { + args.m = m; + for (auto &n: kMatrixDims) { + args.n = n; + + // Computes the second dimensions of the matrices taking the rotation into account + auto a_two = (a_rotated) ? n : n; + auto b_two = (b_rotated) ? m : n; + + // Iterates over the leading-dimension values and the offsets + for (auto &a_ld: kMatrixDims) { + args.a_ld = a_ld; + for (auto &a_offset: kOffsets) { + args.a_offset = a_offset; + for (auto &b_ld: kMatrixDims) { + args.b_ld = b_ld; + for (auto &b_offset: kOffsets) { + args.b_offset = b_offset; + + // Computes the buffer sizes + auto a_size = a_two * a_ld + a_offset; + auto b_size = b_two * b_ld + b_offset; + if (a_size < 1 || b_size < 1) { continue; } + + // Creates the OpenCL buffers + auto a_mat = Buffer(context_, CL_MEM_READ_WRITE, a_size*sizeof(T)); + auto r_mat = Buffer(context_, CL_MEM_READ_WRITE, b_size*sizeof(T)); + auto s_mat = Buffer(context_, CL_MEM_READ_WRITE, b_size*sizeof(T)); + + // Iterates over the values for alpha and beta + for (auto &alpha: kAlphaValues) { + args.alpha = alpha; + for (auto &beta: kBetaValues) { + args.beta = beta; + + // Runs the reference clBLAS code + a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_); + r_mat.WriteBuffer(queue_, b_size*sizeof(T), b_source_); + auto status1 = clblas_lambda_(args, a_mat, r_mat, queue_); + + // Runs the CLBlast code + a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_); + s_mat.WriteBuffer(queue_, b_size*sizeof(T), b_source_); + auto status2 = clblast_lambda_(args, a_mat, s_mat, queue_); + + // Tests for equality of the two status codes + if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) { + TestErrorCodes(status1, status2, args); + continue; + } + + // Downloads the results + std::vector r_result(b_size, static_cast(0)); + std::vector s_result(b_size, static_cast(0)); + r_mat.ReadBuffer(queue_, b_size*sizeof(T), r_result); + s_mat.ReadBuffer(queue_, b_size*sizeof(T), s_result); + + // Checks for differences in the output + auto errors = size_t{0}; + for (auto idm=size_t{0}; idm +void TestAB::TestInvalidBufferSizes(Arguments &args, const std::string &name) { + if (!PrecisionSupported()) { return; } + TestStart("invalid buffer sizes", name); + + // Sets example test parameters + args.m = kBufferSize; + args.n = kBufferSize; + args.a_ld = kBufferSize; + args.b_ld = kBufferSize; + args.a_offset = 0; + args.b_offset = 0; + + // Iterates over test buffer sizes + const std::vector kBufferSizes = {0, kBufferSize*kBufferSize-1, kBufferSize*kBufferSize}; + for (auto &a_size: kBufferSizes) { + for (auto &b_size: kBufferSizes) { + + // Creates the OpenCL buffers. Note: we are not using the C++ version since we explicitly + // want to be able to create invalid buffers (no error checking here). + auto a = clCreateBuffer(context_(), CL_MEM_READ_WRITE, a_size*sizeof(T), nullptr, nullptr); + auto a_mat = Buffer(a); + auto r = clCreateBuffer(context_(), CL_MEM_READ_WRITE, b_size*sizeof(T), nullptr, nullptr); + auto r_mat = Buffer(r); + auto s = clCreateBuffer(context_(), CL_MEM_READ_WRITE, b_size*sizeof(T), nullptr, nullptr); + auto s_mat = Buffer(s); + + // Runs the two routines + auto status1 = clblas_lambda_(args, a_mat, r_mat, queue_); + auto status2 = clblast_lambda_(args, a_mat, s_mat, queue_); + + // Tests for equality of the two status codes + TestErrorCodes(status1, status2, args); + } + } + TestEnd(); +} + +// ================================================================================================= + +// Compiles the templated class +template class TestAB; +template class TestAB; +template class TestAB; +template class TestAB; + +// ================================================================================================= +} // namespace clblast diff --git a/test/correctness/testab.h b/test/correctness/testab.h new file mode 100644 index 00000000..24a9db7c --- /dev/null +++ b/test/correctness/testab.h @@ -0,0 +1,85 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under the MIT license. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file tests any mat-mat (A,B) routine. It contains two types of tests: one testing +// all sorts of input combinations, and one deliberatly testing with invalid values. +// +// ================================================================================================= + +#ifndef CLBLAST_TEST_CORRECTNESS_TESTAB_H_ +#define CLBLAST_TEST_CORRECTNESS_TESTAB_H_ + +#include +#include + +#include "correctness/tester.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TestAB: public Tester { + public: + + // Uses several variables from the Tester class + using Tester::context_; + using Tester::queue_; + using Tester::kLayouts; + using Tester::kTransposes; + + // Uses several helper functions from the Tester class + using Tester::TestStart; + using Tester::TestEnd; + using Tester::TestSimilarity; + using Tester::TestErrorCount; + using Tester::TestErrorCodes; + using Tester::GetExampleScalars; + using Tester::GetOffsets; + using Tester::PrecisionSupported; + + // Test settings for the regular test. Append to this list in case more tests are required. + const std::vector kMatrixDims = { 7, 64 }; + const std::vector kOffsets = GetOffsets(); + const std::vector kAlphaValues = GetExampleScalars(); + const std::vector kBetaValues = GetExampleScalars(); + + // Test settings for the invalid test + const size_t kBufferSize = 64; + + // Shorthand for a BLAS routine + using Routine = std::function&, + const Buffer&, const Buffer&, + CommandQueue&)>; + + // Constructor, initializes the base class tester and input data + TestAB(int argc, char *argv[], const bool silent, + const std::string &name, const std::vector &options, + const Routine clblast_lambda, const Routine clblas_lambda); + + // The test functions, taking no inputs + void TestRegular(Arguments &args, const std::string &name); + void TestInvalidBufferSizes(Arguments &args, const std::string &name); + + private: + + // Source data to test with + std::vector a_source_; + std::vector b_source_; + + // The routines to test + Routine clblast_lambda_; + Routine clblas_lambda_; +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_CORRECTNESS_TESTAB_H_ +#endif diff --git a/test/correctness/testac.cc b/test/correctness/testac.cc new file mode 100644 index 00000000..e16186d9 --- /dev/null +++ b/test/correctness/testac.cc @@ -0,0 +1,191 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under the MIT license. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the TestAC class (see the header for information about the class). +// +// ================================================================================================= + +#include + +#include "correctness/testac.h" + +namespace clblast { +// ================================================================================================= + +// Constructor, initializes the base class tester and input data +template +TestAC::TestAC(int argc, char *argv[], const bool silent, + const std::string &name, const std::vector &options, + const Routine clblast_lambda, const Routine clblas_lambda): + Tester{argc, argv, silent, name, options}, + clblast_lambda_(clblast_lambda), + clblas_lambda_(clblas_lambda) { + + // Computes the maximum sizes. This allows for a single set of input/output buffers. + auto max_dim = *std::max_element(kMatrixDims.begin(), kMatrixDims.end()); + auto max_ld = *std::max_element(kMatrixDims.begin(), kMatrixDims.end()); + auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end()); + + // Creates test input data + a_source_.resize(max_dim*max_ld + max_offset); + c_source_.resize(max_dim*max_ld + max_offset); + PopulateVector(a_source_); + PopulateVector(c_source_); +} + +// =============================================================================================== + +// Tests the routine for a wide variety of parameters +template +void TestAC::TestRegular(Arguments &args, const std::string &name) { + if (!PrecisionSupported()) { return; } + TestStart("regular behaviour", name); + + // Computes whether or not the matrices are transposed. Note that we assume a default of + // column-major and no-transpose. If one of them is different (but not both), then rotated + // is considered true. + auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) || + (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo); + auto c_rotated = (args.layout == Layout::kRowMajor); + + // Iterates over the matrix dimensions + for (auto &n: kMatrixDims) { + args.n = n; + for (auto &k: kMatrixDims) { + args.k = k; + + // Computes the second dimensions of the matrices taking the rotation into account + auto a_two = (a_rotated) ? n : k; + auto c_two = (c_rotated) ? n : n; + + // Iterates over the leading-dimension values and the offsets + for (auto &a_ld: kMatrixDims) { + args.a_ld = a_ld; + for (auto &a_offset: kOffsets) { + args.a_offset = a_offset; + for (auto &c_ld: kMatrixDims) { + args.c_ld = c_ld; + for (auto &c_offset: kOffsets) { + args.c_offset = c_offset; + + // Computes the buffer sizes + auto a_size = a_two * a_ld + a_offset; + auto c_size = c_two * c_ld + c_offset; + if (a_size < 1 || c_size < 1) { continue; } + + // Creates the OpenCL buffers + auto a_mat = Buffer(context_, CL_MEM_READ_WRITE, a_size*sizeof(T)); + auto r_mat = Buffer(context_, CL_MEM_READ_WRITE, c_size*sizeof(T)); + auto s_mat = Buffer(context_, CL_MEM_READ_WRITE, c_size*sizeof(T)); + + // Iterates over the values for alpha and beta + for (auto &alpha: kAlphaValues) { + args.alpha = alpha; + for (auto &beta: kBetaValues) { + args.beta = beta; + + // Runs the reference clBLAS code + a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_); + r_mat.WriteBuffer(queue_, c_size*sizeof(T), c_source_); + auto status1 = clblas_lambda_(args, a_mat, r_mat, queue_); + + // Runs the CLBlast code + a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_); + s_mat.WriteBuffer(queue_, c_size*sizeof(T), c_source_); + auto status2 = clblast_lambda_(args, a_mat, s_mat, queue_); + + // Tests for equality of the two status codes + if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) { + TestErrorCodes(status1, status2, args); + continue; + } + + // Downloads the results + std::vector r_result(c_size, static_cast(0)); + std::vector s_result(c_size, static_cast(0)); + r_mat.ReadBuffer(queue_, c_size*sizeof(T), r_result); + s_mat.ReadBuffer(queue_, c_size*sizeof(T), s_result); + + // Checks for differences in the output + auto errors = size_t{0}; + for (auto idn0=size_t{0}; idn0 +void TestAC::TestInvalidBufferSizes(Arguments &args, const std::string &name) { + if (!PrecisionSupported()) { return; } + TestStart("invalid buffer sizes", name); + + // Sets example test parameters + args.m = kBufferSize; + args.n = kBufferSize; + args.k = kBufferSize; + args.a_ld = kBufferSize; + args.c_ld = kBufferSize; + args.a_offset = 0; + args.c_offset = 0; + + // Iterates over test buffer sizes + const std::vector kBufferSizes = {0, kBufferSize*kBufferSize-1, kBufferSize*kBufferSize}; + for (auto &a_size: kBufferSizes) { + for (auto &c_size: kBufferSizes) { + + // Creates the OpenCL buffers. Note: we are not using the C++ version since we explicitly + // want to be able to create invalid buffers (no error checking here). + auto a = clCreateBuffer(context_(), CL_MEM_READ_WRITE, a_size*sizeof(T), nullptr, nullptr); + auto a_mat = Buffer(a); + auto r = clCreateBuffer(context_(), CL_MEM_READ_WRITE, c_size*sizeof(T), nullptr, nullptr); + auto r_mat = Buffer(r); + auto s = clCreateBuffer(context_(), CL_MEM_READ_WRITE, c_size*sizeof(T), nullptr, nullptr); + auto s_mat = Buffer(s); + + // Runs the two routines + auto status1 = clblas_lambda_(args, a_mat, r_mat, queue_); + auto status2 = clblast_lambda_(args, a_mat, s_mat, queue_); + + // Tests for equality of the two status codes + TestErrorCodes(status1, status2, args); + } + } + TestEnd(); +} + +// ================================================================================================= + +// Compiles the templated class +template class TestAC; +template class TestAC; +template class TestAC; +template class TestAC; + +// ================================================================================================= +} // namespace clblast diff --git a/test/correctness/testac.h b/test/correctness/testac.h new file mode 100644 index 00000000..4ab21e23 --- /dev/null +++ b/test/correctness/testac.h @@ -0,0 +1,85 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under the MIT license. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file tests any mat-mat (A,C) routine. It contains two types of tests: one testing +// all sorts of input combinations, and one deliberatly testing with invalid values. +// +// ================================================================================================= + +#ifndef CLBLAST_TEST_CORRECTNESS_TESTAC_H_ +#define CLBLAST_TEST_CORRECTNESS_TESTAC_H_ + +#include +#include + +#include "correctness/tester.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TestAC: public Tester { + public: + + // Uses several variables from the Tester class + using Tester::context_; + using Tester::queue_; + using Tester::kLayouts; + using Tester::kTransposes; + + // Uses several helper functions from the Tester class + using Tester::TestStart; + using Tester::TestEnd; + using Tester::TestSimilarity; + using Tester::TestErrorCount; + using Tester::TestErrorCodes; + using Tester::GetExampleScalars; + using Tester::GetOffsets; + using Tester::PrecisionSupported; + + // Test settings for the regular test. Append to this list in case more tests are required. + const std::vector kMatrixDims = { 7, 64 }; + const std::vector kOffsets = GetOffsets(); + const std::vector kAlphaValues = GetExampleScalars(); + const std::vector kBetaValues = GetExampleScalars(); + + // Test settings for the invalid test + const size_t kBufferSize = 64; + + // Shorthand for a BLAS routine + using Routine = std::function&, + const Buffer&, const Buffer&, + CommandQueue&)>; + + // Constructor, initializes the base class tester and input data + TestAC(int argc, char *argv[], const bool silent, + const std::string &name, const std::vector &options, + const Routine clblast_lambda, const Routine clblas_lambda); + + // The test functions, taking no inputs + void TestRegular(Arguments &args, const std::string &name); + void TestInvalidBufferSizes(Arguments &args, const std::string &name); + + private: + + // Source data to test with + std::vector a_source_; + std::vector c_source_; + + // The routines to test + Routine clblast_lambda_; + Routine clblas_lambda_; +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_CORRECTNESS_TESTAC_H_ +#endif From 20eb3506d63e21725974e16ae392cf0dd4bf4df5 Mon Sep 17 00:00:00 2001 From: CNugteren Date: Tue, 23 Jun 2015 08:09:07 +0200 Subject: [PATCH 04/15] Added a condition to update only lower/upper triangular parts in the un-pad kernels --- include/internal/routine.h | 3 ++- src/kernels/pad.opencl | 14 +++++++++++--- src/kernels/padtranspose.opencl | 18 +++++++++++++----- src/routine.cc | 10 ++++++++-- src/routines/xgemm.cc | 8 ++++---- 5 files changed, 38 insertions(+), 15 deletions(-) diff --git a/include/internal/routine.h b/include/internal/routine.h index a65ced20..d2cee52c 100644 --- a/include/internal/routine.h +++ b/include/internal/routine.h @@ -92,7 +92,8 @@ class Routine { const size_t dest_ld, const size_t dest_offset, const Buffer &dest, const bool do_transpose, const bool do_conjugate, - const bool pad, const Program &program); + const bool pad, const bool upper, const bool lower, + const Program &program); // Queries the cache and retrieve either a matching program or a boolean whether a match exists. // The first assumes that the program is available in the cache and will throw an exception diff --git a/src/kernels/pad.opencl b/src/kernels/pad.opencl index 45eaef91..cce0c746 100644 --- a/src/kernels/pad.opencl +++ b/src/kernels/pad.opencl @@ -86,7 +86,8 @@ __kernel void UnPadMatrix(const int src_one, const int src_two, __global const real* restrict src, const int dest_one, const int dest_two, const int dest_ld, const int dest_offset, - __global real* dest) { + __global real* dest, + const int upper, const int lower) { // Loops over the work per thread in both dimensions #pragma unroll @@ -95,11 +96,18 @@ __kernel void UnPadMatrix(const int src_one, const int src_two, #pragma unroll for (int w_two=0; w_two= id_one); } + else if (lower == 1) { condition = (id_two <= id_one); } + if (condition) { // Copies the value into the destination matrix. This is always within bounds of the source // matrix, as we know that the destination matrix is smaller than the source. - dest[id_two*dest_ld + id_one + dest_offset] = src[id_two*src_ld + id_one + src_offset]; + if (id_two < dest_two && id_one < dest_one) { + dest[id_two*dest_ld + id_one + dest_offset] = src[id_two*src_ld + id_one + src_offset]; + } } } } diff --git a/src/kernels/padtranspose.opencl b/src/kernels/padtranspose.opencl index 2f2aabd6..7e923392 100644 --- a/src/kernels/padtranspose.opencl +++ b/src/kernels/padtranspose.opencl @@ -100,7 +100,8 @@ __kernel void UnPadTransposeMatrix(const int src_one, const int src_two, __global const real* restrict src, const int dest_one, const int dest_two, const int dest_ld, const int dest_offset, - __global real* dest) { + __global real* dest, + const int upper, const int lower) { // Local memory to store a tile of the matrix (for coalescing) __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD]; @@ -137,10 +138,17 @@ __kernel void UnPadTransposeMatrix(const int src_one, const int src_two, const int id_dest_one = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(0); const int id_dest_two = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(1); - // Stores the transposed value in the destination matrix - if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) { - real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one]; - dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value; + // Masking in case of triangular matrices: updates only the upper or lower part + bool condition = true; + if (upper == 1) { condition = (id_dest_one >= id_dest_two); } + else if (lower == 1) { condition = (id_dest_one <= id_dest_two); } + if (condition) { + + // Stores the transposed value in the destination matrix + if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) { + real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one]; + dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value; + } } } } diff --git a/src/routine.cc b/src/routine.cc index a4e0bb37..4b7ece41 100644 --- a/src/routine.cc +++ b/src/routine.cc @@ -210,11 +210,13 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr const size_t dest_ld, const size_t dest_offset, const Buffer &dest, const bool do_transpose, const bool do_conjugate, - const bool pad, const Program &program) { + const bool pad, const bool upper, const bool lower, + const Program &program) { // Determines whether or not the fast-version could potentially be used auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) && - (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld); + (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld) && + (upper == false) && (lower == false); // Determines the right kernel auto kernel_name = std::string{}; @@ -267,6 +269,10 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr if (pad) { kernel.SetArgument(10, static_cast(do_conjugate)); } + else { + kernel.SetArgument(10, static_cast(upper)); + kernel.SetArgument(11, static_cast(lower)); + } } // Launches the kernel and returns the error code. Uses global and local thread sizes based on diff --git a/src/routines/xgemm.cc b/src/routines/xgemm.cc index 20cd2675..651ebb55 100644 --- a/src/routines/xgemm.cc +++ b/src/routines/xgemm.cc @@ -108,18 +108,18 @@ StatusCode Xgemm::DoGemm(const Layout layout, // them up until they reach a certain multiple of size (kernel parameter dependent). status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer, m_ceiled, k_ceiled, m_ceiled, 0, temp_a, - a_do_transpose, a_conjugate, true, program); + a_do_transpose, a_conjugate, true, false, false, program); if (ErrorIn(status)) { return status; } status = PadCopyTransposeMatrix(b_one, b_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, temp_b, - b_do_transpose, b_conjugate, true, program); + b_do_transpose, b_conjugate, true, false, false, program); if (ErrorIn(status)) { return status; } // Only necessary for matrix C if it used both as input and output if (beta != static_cast(0)) { status = PadCopyTransposeMatrix(c_one, c_two, c_ld, c_offset, c_buffer, m_ceiled, n_ceiled, m_ceiled, 0, temp_c, - c_do_transpose, false, true, program); + c_do_transpose, false, true, false, false, program); if (ErrorIn(status)) { return status; } } @@ -151,7 +151,7 @@ StatusCode Xgemm::DoGemm(const Layout layout, // Runs the post-processing kernel status = PadCopyTransposeMatrix(m_ceiled, n_ceiled, m_ceiled, 0, temp_c, c_one, c_two, c_ld, c_offset, c_buffer, - c_do_transpose, false, false, program); + c_do_transpose, false, false, false, false, program); if (ErrorIn(status)) { return status; } // Successfully finished the computation From 0a3831e6d1eb437a9ef9ac7570f9a554b2c35edb Mon Sep 17 00:00:00 2001 From: CNugteren Date: Tue, 23 Jun 2015 08:09:46 +0200 Subject: [PATCH 05/15] Updated bandwidth computation for GEMM and SYMM --- test/performance/routines/xgemm.cc | 2 +- test/performance/routines/xsymm.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/performance/routines/xgemm.cc b/test/performance/routines/xgemm.cc index adff85c8..97e19b44 100644 --- a/test/performance/routines/xgemm.cc +++ b/test/performance/routines/xgemm.cc @@ -72,7 +72,7 @@ void PerformanceXgemm(const Arguments &args, // Prints the performance of both libraries const auto flops = 2 * args.m * args.n * args.k; - const auto bytes = (args.m*args.k + args.k*args.n + args.m*args.n) * sizeof(T); + const auto bytes = (args.m*args.k + args.k*args.n + 2*args.m*args.n) * sizeof(T); const auto output_ints = std::vector{args.m, args.n, args.k, static_cast(args.layout), static_cast(args.a_transpose), diff --git a/test/performance/routines/xsymm.cc b/test/performance/routines/xsymm.cc index 3c9feb05..0b1d75a5 100644 --- a/test/performance/routines/xsymm.cc +++ b/test/performance/routines/xsymm.cc @@ -72,7 +72,7 @@ void PerformanceXsymm(const Arguments &args, // Prints the performance of both libraries const auto flops = 2 * args.m * args.n * args.m; - const auto bytes = (args.m*args.m + args.m*args.n + args.m*args.n) * sizeof(T); + const auto bytes = (args.m*args.m + args.m*args.n + 2*args.m*args.n) * sizeof(T); const auto output_ints = std::vector{args.m, args.n, static_cast(args.layout), static_cast(args.triangle), From 9fc38cdf5ed44ef41cf3d6cf9e7c32585447c042 Mon Sep 17 00:00:00 2001 From: CNugteren Date: Tue, 23 Jun 2015 17:58:51 +0200 Subject: [PATCH 06/15] Added a lower/upper triangular version of the GEMM kernel --- src/kernels/xgemm.opencl | 379 +++++++++++++++++++++++++-------------- 1 file changed, 244 insertions(+), 135 deletions(-) diff --git a/src/kernels/xgemm.opencl b/src/kernels/xgemm.opencl index a4f45e90..4c7ae064 100644 --- a/src/kernels/xgemm.opencl +++ b/src/kernels/xgemm.opencl @@ -127,6 +127,55 @@ R"( // ================================================================================================= +// Initializes the accumulation registers to zero +inline void InitAccRegisters(realM cpm[NWI][MWI/VWM]) { + #pragma unroll + for (int mi=0; mi get_group_id(0)*MWG) { + return; + } + + // Allocates workgroup-private memory (local memory) + #if SA == 1 + __local realM alm[KWG * MWG/VWM]; + #endif + #if SB == 1 + __local realN blm[KWG * NWG/VWN]; + #endif + + // Computes the matrix-multiplication and stores the result in register memory + realM cpm[NWI][MWI/VWM]; + #if SA == 1 && SB == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm); + #elif SA == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm); + #elif SB == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm); + #else + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm); + #endif + + // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta + StoreResults(cgm, cpm, kSizeN, alpha, beta); +} + +// ================================================================================================= + // End of the C++11 raw string literal )"; From a17297937d757d9747adde600f832d1e0c2753c1 Mon Sep 17 00:00:00 2001 From: CNugteren Date: Tue, 23 Jun 2015 22:31:27 +0200 Subject: [PATCH 07/15] Added performance-client for AC routines --- test/performance/client.cc | 81 ++++++++++++++++++++++++++++++++++++-- test/performance/client.h | 3 ++ 2 files changed, 80 insertions(+), 4 deletions(-) diff --git a/test/performance/client.cc b/test/performance/client.cc index 3b07970c..65ff3218 100644 --- a/test/performance/client.cc +++ b/test/performance/client.cc @@ -115,7 +115,7 @@ void ClientAXY(int argc, char *argv[], Routine3 client_routine, auto a_two = (args.layout == Layout::kRowMajor) ? args.m : args.n; // Computes the vector sizes in case the matrix is transposed - auto a_transposed = (args.a_transpose == Transpose::kYes); + auto a_transposed = (args.a_transpose != Transpose::kNo); auto m_real = (a_transposed) ? args.n : args.m; auto n_real = (a_transposed) ? args.m : args.n; @@ -163,10 +163,83 @@ template void ClientAXY(int, char **, Routine3, const std::vec // ================================================================================================= +// This is the matrix-matrix variant of the set-up/tear-down client routine. +template +void ClientAC(int argc, char *argv[], Routine2 client_routine, + const std::vector &options) { + + // Function to determine how to find the default value of the leading dimension of matrix A + auto default_ld_a = [](const Arguments args) { return args.k; }; + + // Simple command line argument parser with defaults + auto args = ParseArguments(argc, argv, options, default_ld_a); + if (args.print_help) { return; } + + // Prints the header of the output table + PrintTableHeader(args.silent, options); + + // Initializes OpenCL and the libraries + auto platform = Platform(args.platform_id); + auto device = Device(platform, kDeviceType, args.device_id); + auto context = Context(device); + auto queue = CommandQueue(context, device); + if (args.compare_clblas) { clblasSetup(); } + + // Computes whether or not the matrices are transposed. Note that we assume a default of + // column-major and no-transpose. If one of them is different (but not both), then rotated + // is considered true. + auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) || + (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo); + + // Iterates over all "num_step" values jumping by "step" each time + auto s = size_t{0}; + while(true) { + + // Computes the data sizes + auto a_two = (a_rotated) ? args.n : args.k; + auto a_size = a_two * args.a_ld + args.a_offset; + auto c_size = args.n * args.c_ld + args.c_offset; + + // Populates input host matrices with random data + std::vector a_source(a_size); + std::vector c_source(c_size); + PopulateVector(a_source); + PopulateVector(c_source); + + // Creates the matrices on the device + auto a_buffer = Buffer(context, CL_MEM_READ_WRITE, a_size*sizeof(T)); + auto c_buffer = Buffer(context, CL_MEM_READ_WRITE, c_size*sizeof(T)); + a_buffer.WriteBuffer(queue, a_size*sizeof(T), a_source); + c_buffer.WriteBuffer(queue, c_size*sizeof(T), c_source); + + // Runs the routine-specific code + client_routine(args, a_buffer, c_buffer, queue); + + // Makes the jump to the next step + ++s; + if (s >= args.num_steps) { break; } + args.n += args.step; + args.k += args.step; + args.a_ld += args.step; + args.c_ld += args.step; + } + + // Cleans-up and returns + if (args.compare_clblas) { clblasTeardown(); } +} + +// Compiles the above function +template void ClientAC(int, char **, Routine2, const std::vector&); +template void ClientAC(int, char **, Routine2, const std::vector&); +template void ClientAC(int, char **, Routine2, const std::vector&); +template void ClientAC(int, char **, Routine2, const std::vector&); + +// ================================================================================================= + // This is the matrix-matrix-matrix variant of the set-up/tear-down client routine. template void ClientABC(int argc, char *argv[], Routine3 client_routine, - const std::vector &options) { + const std::vector &options) { // Function to determine how to find the default value of the leading dimension of matrix A auto default_ld_a = [](const Arguments args) { return args.m; }; @@ -188,9 +261,9 @@ void ClientABC(int argc, char *argv[], Routine3 client_routine, // Computes whether or not the matrices are transposed. Note that we assume a default of // column-major and no-transpose. If one of them is different (but not both), then rotated // is considered true. - auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose == Transpose::kYes) || + auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) || (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo); - auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose == Transpose::kYes) || + auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose != Transpose::kNo) || (args.layout == Layout::kRowMajor && args.b_transpose == Transpose::kNo); auto c_rotated = (args.layout == Layout::kRowMajor); diff --git a/test/performance/client.h b/test/performance/client.h index 5125844a..edcd1b68 100644 --- a/test/performance/client.h +++ b/test/performance/client.h @@ -52,6 +52,9 @@ template void ClientAXY(int argc, char *argv[], Routine3 client_routine, const std::vector &options); template +void ClientAC(int argc, char *argv[], Routine2 client_routine, + const std::vector &options); +template void ClientABC(int argc, char *argv[], Routine3 client_routine, const std::vector &options); From 60a88aac8672d360eb05ba25b1c4ffbf53a78dff Mon Sep 17 00:00:00 2001 From: CNugteren Date: Wed, 24 Jun 2015 07:50:18 +0200 Subject: [PATCH 08/15] Added the SYRK routine, tester, and client --- CMakeLists.txt | 2 +- include/internal/routines/xsyrk.h | 49 ++++++++++ src/clblast.cc | 4 +- src/routines/xsyrk.cc | 147 +++++++++++++++++++++++++++++ test/correctness/routines/xsyrk.cc | 96 +++++++++++++++++++ test/performance/graphs/xsyrk.r | 94 ++++++++++++++++++ test/performance/routines/xsyrk.cc | 113 ++++++++++++++++++++++ 7 files changed, 501 insertions(+), 4 deletions(-) create mode 100644 include/internal/routines/xsyrk.h create mode 100644 src/routines/xsyrk.cc create mode 100644 test/correctness/routines/xsyrk.cc create mode 100644 test/performance/graphs/xsyrk.r create mode 100644 test/performance/routines/xsyrk.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index b84ed62b..a8e756e9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -99,7 +99,7 @@ set(ROUTINES_XY xaxpy) set(ROUTINES_AXY xgemv) set(ROUTINES_ABC xgemm xsymm) set(ROUTINES_AB ) -set(ROUTINES_AC ) +set(ROUTINES_AC xsyrk) set(ROUTINES ${ROUTINES_XY} ${ROUTINES_AXY} ${ROUTINES_ABC} ${ROUTINES_AB} ${ROUTINES_AC}) # ================================================================================================== diff --git a/include/internal/routines/xsyrk.h b/include/internal/routines/xsyrk.h new file mode 100644 index 00000000..3dab731f --- /dev/null +++ b/include/internal/routines/xsyrk.h @@ -0,0 +1,49 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsyrk routine. The precision is implemented using a template argument. +// The implementation is based on the regular Xgemm routine and kernel, but with two main changes: +// 1) The final unpad(transpose) kernel updates only the upper/lower triangular part. +// 2) The main Xgemm kernel masks workgroups not contributing to usefull data. This is only for +// performance reasons, as the actual masking is done later (see the first point). +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XSYRK_H_ +#define CLBLAST_ROUTINES_XSYRK_H_ + +#include "internal/routine.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xsyrk: public Routine { + public: + Xsyrk(CommandQueue &queue, Event &event); + + // Templated-precision implementation of the routine + StatusCode DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); + + private: + // Static variable to get the precision + const static Precision precision_; +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XSYRK_H_ +#endif diff --git a/src/clblast.cc b/src/clblast.cc index e0d085a9..13dfb50f 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -26,6 +26,7 @@ // BLAS level-3 includes #include "internal/routines/xgemm.h" #include "internal/routines/xsymm.h" +#include "internal/routines/xsyrk.h" namespace clblast { // ================================================================================================= @@ -254,7 +255,6 @@ StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose tr cl_command_queue* queue, cl_event* event) { auto queue_cpp = CommandQueue(*queue); auto event_cpp = Event(*event); - /* auto routine = Xsyrk(queue_cpp, event_cpp); // Loads the kernel source-code as an include (C++11 raw string literal) @@ -276,8 +276,6 @@ StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose tr return routine.DoSyrk(layout, triangle, transpose_a, n, k, alpha, Buffer(a_buffer), a_offset, a_ld, beta, Buffer(c_buffer), c_offset, c_ld); - */ - return StatusCode::kSuccess; } template StatusCode Syrk(const Layout, const Triangle, const Transpose, const size_t, const size_t, const float, diff --git a/src/routines/xsyrk.cc b/src/routines/xsyrk.cc new file mode 100644 index 00000000..1f645fd5 --- /dev/null +++ b/src/routines/xsyrk.cc @@ -0,0 +1,147 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsyrk class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/xsyrk.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Specific implementations to get the memory-type based on a template argument +template <> const Precision Xsyrk::precision_ = Precision::kSingle; +template <> const Precision Xsyrk::precision_ = Precision::kDouble; +template <> const Precision Xsyrk::precision_ = Precision::kComplexSingle; +template <> const Precision Xsyrk::precision_ = Precision::kComplexDouble; + +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xsyrk::Xsyrk(CommandQueue &queue, Event &event): + Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + + // Makes sure all dimensions are larger than zero + if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } + + // Computes whether or not the matrices are transposed in memory. This is based on their layout + // (row or column-major) and whether or not they are requested to be pre-transposed. + auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) || + (layout == Layout::kRowMajor && a_transpose == Transpose::kNo); + auto c_rotated = (layout == Layout::kRowMajor); + + // In case of complex data-types, the transpose can also become a conjugate transpose + auto a_conjugate = (a_transpose == Transpose::kConjugate); + + // Computes the first and second dimensions of the A matrix taking the layout into account + auto a_one = (a_rotated) ? k : n; + auto a_two = (a_rotated) ? n : k; + + // Tests the two matrices (A, C) for validity, first from a perspective of the OpenCL buffers and + // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the + // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage + // space. Also tests that the leading dimensions of: + // matrix A cannot be less than N when rotated, or less than K when not-rotated + // matrix C cannot be less than N + auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + + // Calculates the ceiled versions of n and k + auto n_ceiled = Ceil(n, db_["NWG"]); + auto k_ceiled = Ceil(k, db_["KWG"]); + + // Decides which kernel to run: the upper-triangular or lower-triangular version + auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; + + // Allocates space on the device for padded and/or transposed input and output matrices. + try { + auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); + auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T)); + + // Loads the program from the database + auto& program = GetProgramFromCache(); + + // Runs the pre-processing kernel. This transposes the matrices A and B, but also pads zeros to + // fill them up until they reach a certain multiple of size (kernel parameter dependent). + status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, temp_a, + a_rotated, a_conjugate, true, false, false, program); + if (ErrorIn(status)) { return status; } + status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer, + n_ceiled, n_ceiled, n_ceiled, 0, temp_c, + c_rotated, false, true, false, false, program); + if (ErrorIn(status)) { return status; } + + // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary + try { + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast(n_ceiled)); + kernel.SetArgument(1, static_cast(k_ceiled)); + kernel.SetArgument(2, alpha); + kernel.SetArgument(3, beta); + kernel.SetArgument(4, temp_a()); + kernel.SetArgument(5, temp_a()); + kernel.SetArgument(6, temp_c()); + + // Computes the global and local thread sizes + auto global = std::vector{ + (n_ceiled * db_["MDIMC"]) / db_["MWG"], + (n_ceiled * db_["NDIMC"]) / db_["NWG"] + }; + auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; + + // Launches the kernel + status = RunKernel(kernel, global, local); + if (ErrorIn(status)) { return status; } + + // Runs the post-processing kernel + auto upper = (triangle == Triangle::kUpper); + auto lower = (triangle == Triangle::kLower); + status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c, + n, n, c_ld, c_offset, c_buffer, + c_rotated, false, false, upper, lower, program); + if (ErrorIn(status)) { return status; } + + // Successfully finished the computation + return StatusCode::kSuccess; + } catch (...) { return StatusCode::kInvalidKernel; } + } catch (...) { return StatusCode::kTempBufferAllocFailure; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xsyrk; +template class Xsyrk; +template class Xsyrk; +template class Xsyrk; + +// ================================================================================================= +} // namespace clblast diff --git a/test/correctness/routines/xsyrk.cc b/test/correctness/routines/xsyrk.cc new file mode 100644 index 00000000..8d3bd82e --- /dev/null +++ b/test/correctness/routines/xsyrk.cc @@ -0,0 +1,96 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under the MIT license. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the tests for the Xsyrk routine. It is based on the TestAC class. +// +// ================================================================================================= + +#include "wrapper_clblas.h" +#include "correctness/testac.h" + +namespace clblast { +// ================================================================================================= + +// The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison. +template +void XsyrkTest(int argc, char *argv[], const bool silent, const std::string &name) { + + // Creates the CLBlast lambda + auto clblast_lambda = [](const Arguments &args, + const Buffer &a_mat, const Buffer &c_mat, + CommandQueue &queue) -> StatusCode { + auto queue_plain = queue(); + auto event = cl_event{}; + return Syrk(args.layout, args.triangle, args.a_transpose, + args.n, args.k, + args.alpha, + a_mat(), args.a_offset, args.a_ld, + args.beta, + c_mat(), args.c_offset, args.c_ld, + &queue_plain, &event); + }; + + // Creates the clBLAS lambda (for comparison) + auto clblas_lambda = [](const Arguments &args, + const Buffer &a_mat, const Buffer &c_mat, + CommandQueue &queue) -> StatusCode { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXsyrk(static_cast(args.layout), + static_cast(args.triangle), + static_cast(args.a_transpose), + args.n, args.k, + args.alpha, + a_mat(), args.a_offset, args.a_ld, + args.beta, + c_mat(), args.c_offset, args.c_ld, + 1, &queue_plain, 0, nullptr, &event); + return static_cast(status); + }; + + // Initializes the arguments relevant for this routine + auto args = Arguments{}; + const auto options = std::vector{kArgN, kArgK, kArgLayout, + kArgTriangle, kArgATransp, + kArgALeadDim, kArgCLeadDim, + kArgAOffset, kArgCOffset}; + + // Creates a tester + TestAC tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda}; + + // Loops over the test-cases from a data-layout point of view + for (auto &layout: tester.kLayouts) { + args.layout = layout; + for (auto &triangle: {Triangle::kUpper, Triangle::kLower}) { + args.triangle = triangle; + for (auto &a_transpose: {Transpose::kNo, Transpose::kYes}) { // No conjugate here since it is + args.a_transpose = a_transpose; // not supported by clBLAS + const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(a_transpose); + + // Runs the tests + tester.TestRegular(args, case_name); + tester.TestInvalidBufferSizes(args, case_name); + } + } + } +} + +// ================================================================================================= +} // namespace clblast + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::XsyrkTest(argc, argv, false, "SSYRK"); + clblast::XsyrkTest(argc, argv, true, "DSYRK"); + clblast::XsyrkTest(argc, argv, true, "CSYRK"); + clblast::XsyrkTest(argc, argv, true, "ZSYRK"); + return 0; +} + +// ================================================================================================= diff --git a/test/performance/graphs/xsyrk.r b/test/performance/graphs/xsyrk.r new file mode 100644 index 00000000..fe8598e9 --- /dev/null +++ b/test/performance/graphs/xsyrk.r @@ -0,0 +1,94 @@ + +# ================================================================================================== +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +# project uses a tab-size of two spaces and a max-width of 100 characters per line. +# +# Author(s): +# Cedric Nugteren +# +# This file implements the performance script for the Xsyrk routine +# +# ================================================================================================== + +# Includes the common functions +args <- commandArgs(trailingOnly = FALSE) +thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)]))) +source(file.path(dirname(thisfile), "common.r")) + +# ================================================================================================== + +# Settings +routine_name <- "xsyrk" +parameters <- c("-n","-k","-layout","-triangle","-transA", + "-num_steps","-step","-runs","-precision") +precision <- 32 + +# Sets the names of the test-cases +test_names <- list( + "multiples of 128", + "multiples of 128 (+1)", + "around n=k=512", + "around n=k=2048", + "layouts and transposing (n=k=1024)", + "powers of 2" +) + +# Defines the test-cases +test_values <- list( + list(c(128, 128, 0, 0, 0, 16, 128, num_runs, precision)), + list(c(129, 129, 0, 0, 0, 16, 128, num_runs, precision)), + list(c(512, 512, 0, 0, 0, 16, 1, num_runs, precision)), + list(c(2048, 2048, 0, 0, 0, 16, 1, num_runs, precision)), + list( + c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision), + c(1024, 1024, 0, 0, 1, 1, 0, num_runs, precision), + c(1024, 1024, 0, 1, 0, 1, 0, num_runs, precision), + c(1024, 1024, 0, 1, 1, 1, 0, num_runs, precision), + c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision), + c(1024, 1024, 1, 0, 1, 1, 0, num_runs, precision), + c(1024, 1024, 1, 1, 0, 1, 0, num_runs, precision), + c(1024, 1024, 1, 1, 1, 1, 0, num_runs, precision) + ), + list( + c(8, 8, 0, 0, 0, 1, 0, num_runs, precision), + c(16, 16, 0, 0, 0, 1, 0, num_runs, precision), + c(32, 32, 0, 0, 0, 1, 0, num_runs, precision), + c(64, 64, 0, 0, 0, 1, 0, num_runs, precision), + c(128, 128, 0, 0, 0, 1, 0, num_runs, precision), + c(256, 256, 0, 0, 0, 1, 0, num_runs, precision), + c(512, 512, 0, 0, 0, 1, 0, num_runs, precision), + c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision), + c(2048, 2048, 0, 0, 0, 1, 0, num_runs, precision), + c(4096, 4096, 0, 0, 0, 1, 0, num_runs, precision), + c(8192, 8192, 0, 0, 0, 1, 0, num_runs, precision) + ) +) + +# Defines the x-labels corresponding to the test-cases +test_xlabels <- list( + "matrix sizes (n=k)", + "matrix sizes (n=k)", + "matrix sizes (n=k)", + "matrix sizes (n=k)", + "layout (row/col), triangle (u/l), transA (n/y)", + "matrix sizes (n=k)" +) + +# Defines the x-axis of the test-cases +test_xaxis <- list( + c("n", ""), + c("n", ""), + c("n", ""), + c("n", ""), + list(1:8, c("row,u,n", "row,u,y", "row,l,n", "row,l,y", + "col,u,n", "col,u,y", "col,l,n", "col,l,y")), + c("n", "x") +) + +# ================================================================================================== + +# Start the script +main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values, + test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE) + +# ================================================================================================== \ No newline at end of file diff --git a/test/performance/routines/xsyrk.cc b/test/performance/routines/xsyrk.cc new file mode 100644 index 00000000..f36d665a --- /dev/null +++ b/test/performance/routines/xsyrk.cc @@ -0,0 +1,113 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsyrk command-line interface tester. +// +// ================================================================================================= + +#include +#include +#include + +#include "wrapper_clblas.h" +#include "performance/client.h" + +namespace clblast { +// ================================================================================================= + +// The client, used for performance testing. It contains the function calls to CLBlast and to other +// libraries to compare against. +template +void PerformanceXsyrk(const Arguments &args, + const Buffer &a_mat, const Buffer &c_mat, + CommandQueue &queue) { + + // Creates the CLBlast lambda + auto clblast_lambda = [&args, &a_mat, &c_mat, &queue]() { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Syrk(args.layout, args.triangle, args.a_transpose, + args.n, args.k, + args.alpha, + a_mat(), args.a_offset, args.a_ld, + args.beta, + c_mat(), args.c_offset, args.c_ld, + &queue_plain, &event); + clWaitForEvents(1, &event); + if (status != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast error: "+ToString(static_cast(status))); + } + }; + + // Creates the clBLAS lambda (for comparison) + auto clblas_lambda = [&args, &a_mat, &c_mat, &queue]() { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXsyrk(static_cast(args.layout), + static_cast(args.triangle), + static_cast(args.a_transpose), + args.n, args.k, + args.alpha, + a_mat(), args.a_offset, args.a_ld, + args.beta, + c_mat(), args.c_offset, args.c_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + if (status != CL_SUCCESS) { + throw std::runtime_error("clBLAS error: "+ToString(static_cast(status))); + } + }; + + // Runs the routines and collect the timings + auto ms_clblast = TimedExecution(args.num_runs, clblast_lambda); + auto ms_clblas = TimedExecution(args.num_runs, clblas_lambda); + + // Prints the performance of both libraries + const auto flops = args.n * args.n * args.k; + const auto bytes = (args.n*args.k + args.n*args.n) * sizeof(T); + const auto output_ints = std::vector{args.n, args.k, + static_cast(args.layout), + static_cast(args.triangle), + static_cast(args.a_transpose), + args.a_ld, args.c_ld, + args.a_offset, args.c_offset}; + const auto output_strings = std::vector{ToString(args.alpha), + ToString(args.beta)}; + PrintTableRow(output_ints, output_strings, args.no_abbrv, + ms_clblast, ms_clblas, flops, bytes); +} + +// ================================================================================================= + +// Main function which calls the common client code with the routine-specific function as argument. +void ClientXsyrk(int argc, char *argv[]) { + const auto o = std::vector{kArgN, kArgK, + kArgLayout, kArgTriangle, kArgATransp, + kArgALeadDim, kArgCLeadDim, + kArgAOffset, kArgCOffset, + kArgAlpha, kArgBeta}; + switch(GetPrecision(argc, argv)) { + case Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case Precision::kSingle: ClientAC(argc, argv, PerformanceXsyrk, o); break; + case Precision::kDouble: ClientAC(argc, argv, PerformanceXsyrk, o); break; + case Precision::kComplexSingle: ClientAC(argc, argv, PerformanceXsyrk, o); break; + case Precision::kComplexDouble: ClientAC(argc, argv, PerformanceXsyrk, o); break; + } +} + +// ================================================================================================= +} // namespace clblast + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::ClientXsyrk(argc, argv); + return 0; +} + +// ================================================================================================= From 3de4471afe9fd08e4c4f7c77781cc4ae9c59af21 Mon Sep 17 00:00:00 2001 From: CNugteren Date: Wed, 24 Jun 2015 07:52:19 +0200 Subject: [PATCH 09/15] Added the SYRK routine --- CHANGELOG | 4 ++++ README.md | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG b/CHANGELOG index 12d9322e..25268610 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,8 @@ +Development version (next release) +- Added level-3 routines: + * SSYRK/DSYRK/CSYRK/ZSYRK + Version 0.2.0 - Added support for complex conjugate transpose - Several host-code performance improvements diff --git a/README.md b/README.md index 1bed1146..72a13762 100644 --- a/README.md +++ b/README.md @@ -178,7 +178,7 @@ CLBlast is in active development and currently does not support the full set of | xGEMM |`x`|`x`|`x`|`x`| | | xSYMM |`x`|`x`|`x`|`x`| | | xHEMM | - | - | | | | -| xSYRK | | | | | | +| xSYRK |`x`|`x`|`x`|`x`| | | xHERK | - | - | | | | | xSYR2K | | | | | | | xHER2K | - | - | | | | From 96e40123490404267190d4a91ca35298189ff95b Mon Sep 17 00:00:00 2001 From: CNugteren Date: Thu, 25 Jun 2015 19:19:31 +0200 Subject: [PATCH 10/15] Added SSYRK performance graphs --- doc/performance/Iris/SSYRK.pdf | Bin 0 -> 13032 bytes doc/performance/Tesla_K40m/SSYRK.pdf | Bin 0 -> 13261 bytes 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 doc/performance/Iris/SSYRK.pdf create mode 100644 doc/performance/Tesla_K40m/SSYRK.pdf diff --git a/doc/performance/Iris/SSYRK.pdf b/doc/performance/Iris/SSYRK.pdf new file mode 100644 index 0000000000000000000000000000000000000000..26cba3852f8af001f81f7bd4f90b4125231ee329 GIT binary patch literal 13032 zcmb8W1yq|sw=NvC1qu|`LMT!^xD_Z?oZ{|;;1DDf2=2v;Q`~8x6bZ$(6ez{LxO;Jz z!XNs{Ip;g~ziZvB^=2}A_MSa6fqC|xJj19aBg?_f$%oAtxDvP;I21T$V1rbkmjAzPTDC9;q?HCx z&eh!6#un@bRCR|ryTg!XYOYXAcQC~Dp_P}Do0AtQ`|At(AY#E57srM;Sw4({=ij3N zg0#GxAwZC{In3MvY7GRbnOj5LfV@Zr4d8zn2>vgF2fYVvSBMkrpXwlKs1poXbps0h zH3w-x+@S8RU}S%M|Iy!r!oQY78RBFOvjGYT^9ut(vbGK|rkfqfm){Jw{mW&8lY9iX&uzoc-dzP^7lWdO>Hxsn`lt7!M%?XcNq6DHN87I zCX^Zy_d1(3^?Q4DwK)Xb-6s9~+TE**to!YiwIoDU2wL9m z-HG2w<=o>}(0JYH`g>mePBcZQ90A?ImR;OVXrGPS=iL-Jfm?izoW1ru5&m~U zFZ~Sr)Ld|?n(U9}cygeWUr2(VR+Kwy9_%~L?oRYU{^|#s@lZLF$dr!;u zZ>a?Bi#b(HY%#iIV%b}^2)AN&q+bH6rE3P^4GGhXuH@7-2nJb}ODQ{BI<$Dnl2!5?rFedQb& z*3uv_n9KseS-dXeN&6ZtzPa|2NiAice@84r+k0p!*=G6Qd+*agd4wnfjY@> zaeK9SkNmSUr7;;M0S)<+FZ!RWibBqt_7)r#)qcDX~1djbqr-e_XEVGL=bE&RfSq z0>)%#dQ$kZqy*QsgidD8B7ZQ;n*Hsts6DzB! zvT?Lqzy%AZMJBuT@?{;ern}0(5gV0RmV!O>?`yHnrs5kTxp50IN5R2(&#TJYXBC1; zL zFmwYpF(#EdqRUpT1)F^|u(BpU-|zWxx%KJ7(=^zV`rEVz2jte&a+C{1XCtg2>Aapo zg{IHhw()|LzQdAIDNp~%_6X(Erzyt09{Z`NT=vktm{4rcxf9xgv>;r0W}&F};6Pwl zXsn9fe0@n@YK^}fIg)SVlu9PH`)I4tq3PVoAMxS%HBD?{q$=L@7v{%`ZnS)ZFwYKX zXin&YG`1{$LDe|gtEeBU={N1cy~jg7w5K%9ym0nLut79q1>f~Mfas7T;q^OD{DG~i zt=Mf5xA-e4^2`ay#lJq~X#P)gK?nGxc zDUt(%w|@gxmZoV1uY= zf5naz^U7cTNXIqT!NK_qjOThr{Mm*x!AzZ80iFz$&3I4#D&c~Na9jWI_B4zV4L7@p zH)|KYENyK&GALQRMVup;1ki@sMf{m(tQ6p2=Kj+5Z6)jNWIsT1mJk|;snh=~vCr-Y z%IC;^d%JjGa&b6@&T9|h)zPxb^T=1HsjDcqHWcY3!Wf6f3py%^Kc;~O0YTE9A2e{Aqw;Tg{dBr6*u5|$v1aC9~l%Jnhj zzS?;5%YO0iPSw|HmFd3B_ru{q_+JABtW!vprKbduou&Eg@Gb@=TZf)@IB)}AFaASj znnDO(l8!1R#$>A!RT(5_Z ztWW@npIdSVYhUy4Yj#wfpR`e?DLe&B!MLckCJ50Yj5kjnWEEy%;xw<=Li_s&A9w25 zEHr11{EDOr=de$0*2^i+8Xn!}=BH{Da}RGR<%N;S(i9>85BN7G%#UUJ5=CoY3}&y# zUd~_rR1}FDX-=41Uh8S?emD5_z*)QpQ?*H6+2{Nl!S!a_m4naO&du1Q^C_djaD zPqdzGbf(jKc=F}GSZL(C_R?evTrXEG$aWeCdq-6wE52yZkMpQRR& z7%SVx)|dupm|VoyhnGRMAz`Si@KN0pqGo)%8+T)KBm5$oCjqdPV1pmLicE+)l($oQ z?QSS*-aKKb$D)dm)6C-K(e}F1v&Z zO$R^CzrU|zO1lw$PORcpF!Crvo-#v}_cMr#tuiBcn|pik<{zMu7s1G4`jVYglq`4z zJYxJ=RkIajuk~&A>Am5SxJGJueq>R*jh-sb*mEe~h2-iJB@IOj;Xr&q3VmKr{=_xU zdh&)i*C6b;#3fb;13}M8B}Qu8MajFqTfNktJrvay1;MTL(u)nDiLwI!(mPfxi2L0& z`%45WA!A)jjfF%t1Qk_Bl3@BxKn3O(z>vu+g8+wTI&gN8h(Tuom z_hy!jdrkJCPP3K0&w#UfG5kldbw00vUr|r~G*I{fFy$b#f~4@xquh&t0cj+}uyRJT zRQw&aV33iY-QbS|6rH&t$rswBwQjAuPlybGiF!C0ZofX z9g~~gvcE}*+naxV3d)l1`jR=_deXjOohi*dMv|Q=QwcdIeD%&2-D8j&-F_t7?MUA6a5(SByA{#uSDVaC>N-2z5@XC zn3W&-cmJtE(Xe*=NXeq1zN81}#*68V>su5@VW~8=Pv>_fb~{9`1a2(#Zch22bETV! z^9DrZ1vBZzQ3YW6v-=r+Ax10ge{mOXOWA8oT8@&2-1V)GdZB5ngI1fiAy(vEx9G->Ar955Ju zvzr&0*Y_B zuNpYpT2K?5Uf_>CmGM1HV+ojk%X{_V>o{7#(R0XCt+Dq`xR_=?X%lmHXHH4pJB+*m z6V`sN0rZm|t#wY=V#lJ6?nnC>r=>v|0uA--g3}>L@wL%ir;Hg|5kqSOs9*VtJ#`7$U zCtB+^wfVKBFK55mB^K?ij%+~f4TU^|${R0gUq7};u8j+~e>EeJYzP{3MM+M_7f*k4 zvb5FJkjw=~oR)mxCCC&m#4b$}i7}`b ztzItC3p@_URugJuYxMF)7dJdnet8L546SZI0>wV0Za4c6(yhNz1z4K*d0oXRlS>-D zz2Dq&7t7aqPnUg#L*|eZInbb3cG6B~4B3mDlTWDpHB-$(n;gd^29>XB#dFrxl7w|_#GsrF`Js9L`+F$&T-aYYq7sd_kncUno2zK($ zXDXeC|08K9Oc zDxWw|t#8eJd;i;9DZHbwOeX1vK?S8fQ3;l;+Sv+biTs9HXaBV5Df?qo#u)%vFH?Nh z{TE&aZ!DB7_0DpI?+A3^1er3zC4J>!sQi28-8W6Eyf39L8>8heelIPXQcJtG6Ql;y z%v0p3d=hmOwj-3=)F=~$rxY9)P9$+5H~0cK4{}347KfX?m1rVlJa3c3gbe3DzEgio zT6!x%^u{ff0p>Nzr|*3DHxJR1A3)9j6sN{vM~E_-D9crEZLk)zAx;QlimmP zk+#fIjcqZT8c<28vWP7$Q;rTT?kYP$r*p#S$`Ma_a2n-{tS&y`2Sl>TY?1lydJ4sv zkeZ)v_{{HTE@u2^@d+|FvI=D%Y6_VjAQPX`I`&Q_A#D7RCK3sPD#}Kh ztsIHEokD#Z9aj0W2Z05?1!pUXZs?#6+qm zZ;4Wg9vv97IQt1CJ{!chrlhZ^wR5-}8r)c*p-uds4_42EX(K~+A4 z?5Dm8MA5eT6ICt}w!|yu&;rVj)X@5=k^4M9&CkW2 zPC$f~(0S3P2&S&$fFc0XZPlfhk`keW2JJv6-%h*7IB9?iZS2uNI0A)yLz)ag#EYNh zZaGGV9J1bR_4+Y3@JOIXqBJa*@O8(-JoJ2OEoo8M?P$u=BQ?g`Nzp1ldVNi#eeE?3 zV2`}nG^b4c-opDAQG}&80cO0q{rxH_?8N{9L>H645}Bri#l4|5fGAi?VZ~7UZ+|bu z+XQbM45A+8s3dB3C5ltoTO*%~q@iMhC%?t19EzHTx@}HI(r(AZGo}&O4u|`1a|W6d zt{tN9?~u*s2kVF`yoC+Kh=tBkIZejILC@zcj1YZ{7F5U6@F0`!Nw~qX-XP9)rFrtM z2J$QswuIAg2^U_4>fQs_8#VlU?2I!xyrB7!S_8*?yaz*?@wIs&a-&+u0{T|^MSQqu za<#>EwcxKDt9HkBh63UN^)>Jr@aJpA6z6E& z5TG>e%x1bjEY+1Pe>(%EJFAhGgO{rf_1YOQ(;dONZjm3nS9^3v3Sz#dPv)I@oiyl- ziPW?fkfv%OOL3_9l$}BNq=yzk;zPMPuxapSclbIsNJ}GbZTPymOg75)nZcs55>j}p zHKt+Uz1!nE8dsO1_@KR+PCAP0|~z%Tyn^Z!~4#pk$nGG1|%E@o#EyAp~^wu zDsHi?F$Ol}B5hi{*ca495FP(%vpgEk`Fk`pJcrA+hATjFK7o z?n=(z(Tq9nUTA&s4){{&q_7%{-mPcfVS~j18lXw^uV->6-O!>_j>q;3= z2vWV0hc)2r1_KzHe!R3c+6hk^)9`DsBSsOd5v28Lj5SOx+Ke5dW`?agbn3{o;3t94 z&mIWxWGRClkotMKwtNyz)F5G`$odKP=0qt&8jnVpQt5kUv9bg+RwW2gvLL)nF{mXV z?#>`oW?Mg5hQC)|%6`;Po3Lith`wAZ|MasW;nUKAWE!o7t#Vc@P<5@M4GYp5J>iHL=S{i``mf35Oc7Wg#h};1O+QEzMR=%?~<^l5Nh%1|)>YT%;+@ zz1YWegRxwZkYNFm@^^Bu1ICI|on&;w$D1`af3+qoh1Ug%Hhk62@Gxpn_c0FJx3NR# zJG2!CDtS>x;wF^5I=hdKxttBSNCh6{myr(TmwXzG%`+LLE=!60rWkwoJa02d-C_Sb zzAJh3FWs^Xa?*wn%jFdtRJ9O@J4s;wZ;X!$9VLhRrq*-JDYP8V8vRl_&A$7hvP~5I zrrMO=0tw;q$mfrz82(y2i$_oY$mV4t%>>`pK9|nk;)BCFcZl0RM~nJoUq!6ed-LCZ z-&f8M^?Sc+7C4pETQ#84<0Wee)I<`HCW;a-g0X{ ztG+lXZiSVo#(vsd{pd?{3x?&cNK0*ZxyO`PLyDHk)mIu6CU%-&npq0UAqDCDQp~r@ z6V>MwT_k!-v#hb9b;hLFSRN_e--h4ZK2?@{-BOJW_Hq+uL&x%-?AkFz%|(qAz5g7^ zh%dY>SC3#K9T*g+m1r+iq!U2E$KQTj&^r?&_h3^kmeBAdOPi=P3xwjl4S8vAuAaCZ zqVx2qRS7R*M!{Kw(=Rag*qVhs`g=c}`qQe^++t#HENtO(!lb4)bd0AND0X&LEV<5L zH;IND2DQWY<=#5&guy>UD3=GXb)B6T0JsbAy|P;pD|}rB?Ob&v9@O0$VN*NDOlbcp zg}E5B^*{EQY-RPDV9*}6G4WZn+s~sF&Xl2(tw-!@?-529S*scuIhw&dpl zZ?nYJFU2vPImK<`eBs_LF04F3tN_?FL(&3c@sK2=c;=WSwq;B3l;keh-QQn4T;Qyb zXEAHY+|j<}RgUy5=HkK7=5VP|Q&Z=>&U!j0POqNfT`W~A1i@M6wJGLy9BSlq9uF#O zOQ13({KaH?GR^d=U~=~uo4MhLs4O=TwANkWcwxZY;N@`s$mDg&py8jd8%dk#s}77e z`-a@>e^xOZYg0F0T`j)=4b2`4#q#GjkuHwU$9cfWgq!dLJ|BB}tbS=KGn$-qd$ z*tEL;UjB`u;Af3SHvgBN^>4_y6*CX^gm-M*4_BXQzZWG&H8p=-h1dPeBR4c!l{OKkyM6@m(SUXRNwEbIIB;8?vB^t+Bw5wepf3mQXv6%JL@?;Qw+zQ zFA^nZMoubBTCXYkM{;g2@si`jYs4t|$()1y2WVo#ajz>s(4J4Mm!A6 zdRi zjjpQE=tXSnt%7`4Zn(0D$XCLT%_9kULmjf@^qf{%1auq7Q&&HNC|(e+rP54>D73Wpl_ zK081Y_|}a<4WVg?!>S0|m3(J~6{V*arCJYm{Zmb|tFhvJ-h3J) zn&_u2m70w_LV!|mG$wsiI5%JmlzorK^t*F7^u?UH(N0D8YISFi8$+4wxbCmGdaf|e zm+M18#I0o9)?)hazda!_&|P7X4z1-uKJcGvd5rY^YI!8j`_;@XMYQrgXVmh7tUeBl z5Fy`82Gq>wE(Ek}a3eefwDfN3$XP!xe4E83Nq0UXz$MW^&{6+Xb$E0;g?>AtmgnO+ zHyxFSiacm&=;h->ZUYg((5EFbdZ7ey7hf5~)99p>l$wL-q{{d_$@cFJKIz!D*VXUe z8+{VMJ62DB*`{k3lPX5ntV8>4aEwx=C?uE>q(`)YY_VDB- zL2+Fje+GWDnEJo)cOF@nJ2`)q;FXtqXJiPNy;|oCC_grTxk~R{MSZD0$D2TE-Fq&x zqhF|4{bqv){}($N-#)eV=PZVkbqBg9)U;~fRR%Ix!h2C|g(Wd%f8$|H+FqNj9B7>v zVjYL#l4hU7a1-ql=HhHHzFHrC z3-hA6C46Thdc!vKNd%9j@_fQYU)CO4D8RMc;=5#rfe^DmuRB*X@FrQ9RidG39o5mL zX?V|z1KKU2R?$;%TD;t|1evl*ePPKm4W&-(&v(plHu)*Y$~fG_{AOoOU`W_PDpo=E zk4P$Zk2Zs@+kyd6XJBzX{{W#@wY^3jHdXK5@^$i3ME^2gGSN?D#Pww*C;Qm_s;nUl z6*cjzjzuE=Jn|RHy52L(8Pe`BR&2o7aKnaUpC19+@J}aPp z`{WPd!VI^@;@c-*Mx9>D+NXZ*K>Xs~0xBZD=&kVRS{3T|23E;`F5FjpY`sSgv-r-^ zyb~))g`S^fVJh5PLk}E_J4weWp?UtBpZ<8%4*n|vxoOBr0r~%>{$C`c2hoFTLy&^Bw3N9U#1bg*pk@3o+R{VqU(=O* z;0ZZBI04B(-r9m851oSEAgM^8e-es#_=N@jkA$L!G5woRl$GM4r8!L+KGY)EqL%P9 zGHKY}MgvtIw?#glla(9msbc$kz;gi^EoT|^_~OSr%u4|U((zT#BSTTtt9H>TWX4^b ziK$W)EXyeto4pPOG8Sns{OogDhUTpgd?yXR?DX{$fY>=TtA3MC(-Sg-O%U!0|a0AgeOFlqR(w`1#-AH_hL0h zn6NcE-i(55^YLW^Pb(8R7^V;=(G6XzqVRbm0ANm-%BjMt0ib;t7K7|bJW#_rB-v?YkWijOI^mVwS zEeXJQ7e%eW=5@LeuDDqVau&-I}dzZq7&vCC8oxv~_TugXb@8>NBL# zI)AWk?s<|ZlmO$FHW75Pf)~qt|1r4YeD!1_C;XE3(VaUDDZwp6>pM`=Yni@cWuS7( zSHmAf?=U_{g-KlIq+jB9-W5(aq(sbI5)7y4p%&3mO8C(Lm{|#cSl*Uw!w0@N-EwGG z15qO%(z0%bg6@2E>y__*kTUqkg5_j*>Iu~cf2ZFXpz;9$uO-lSB&fE~h}zL>+X?Cc z_)BQ0lFaYWUCg+bf{5Rv$N@!23 zL($;E%MM^U#O{}P8WQ^BDTzin3@H7Bx;j+vJ=Gc-hBSd1b<$%-X(@SB5{cbhjgQo& zVLN$NTZUWAFv>;%u5?|lx((qq3H%W~7glkHK_!6$2Et5L=y7ZaSx4_7j}R?qP+fbn zGn&uS{Sej8)FlEv>{o#$1OZP8MDnA!2uQ!lO2vCJU=h%jy(cV0wUwB8Pg6+S5GfyD z^23lA^Yv?PHRU9npR^>QEBPsTI(Y&GrRtB=vObEBYmYPAVV@DG$lB)*j0|o8VC*6k zSy7y_y15y{&$bX-1gApRB$&Z@;Arkxa(`uNCK3cHf&xJ`7AqV|(H(7GT&4F}WRiL} z61U5>(!zne7Tf!&H=#Gi%UIuF`W2XSt&pja=$m(J-?f(voL@NWIuq4XG!rg7J&4Bd zIyvk+f?Ue_;QKv(6`MoHOcagvB}86|)tnu`2qD|dJ3X4 zqN>!4t&HmORty3xfedtxbbBwdp3gsT0Vej!TH?Ek>MJiuIWgtXExu?|&t=tPMQ1)F z)?piGhOru{dltXb<!jTL`G&Q)+lBx#{6_4VR@EEHG;Oaeadrfe(iq!Qkz3tRl9YEX+mIvt+Zld+U$=}4`)t}P9Bf_94#FcDLB*0__CLmRoWtb6LMaNr`@9>my3|zEF1)fVRt6R z4}P>f=#Y&>fWZsdh`!`0!T@ICm&Db_XnABI2Au3N4(F-0)xFflO8G3#)T z!q_{BJB`EsgjK{u$5=!Qkh@ce@#yg4aSaMO@weJb3AhOATaB8z4iJNX@~H66JHn@* z2p|NNxXZYyc)|ys^K8_G7{4*5D+JN4Aff<4Bv z?Px99I$Edhf`bc!f8-6AkB+o+k5o+_VMcXhgB`81JfvQF375YW**Kk??$H13^XrO? ztXL^sDXOo)tNY~ggmN>cVW+{>XxF&N=}DbTU5k^3hp|Vjckm4wzAt{ZTw+g1$tUa5 zP01l^lSz~36?hd>4%X+2=Pc0gvqMeay@$hBh}Jdt+!l3*1gGmJJbn%>KtmT<{A>K* z-wWJRp%|dd1#AR-lIUqq4)m2Q%*BwLki-uf2zuQiM&~SV_T&70(+Tb-&Zb9b&~U); zUf#vG=+2Z-U)oqKe4Gj_e{ypo^`|3k0gYz8K`9s&&s(1NJwK57Afu2EFCZx(%Fie; z?eXkM>X8SRTj+M1M2@{9`tIHXUM*P7SSca9W}j8R?aF23oZ@qq6#)h0aYt0|g^CrpAO7xLMz*<4VwhUsIK zz8A;7Cv7Hy1|7yDhPj57wPvuQ&7arW!=*)KaNXSsP6M&lZsYg#{at?uLXKAWi+#Z4DdSg>xrv7eLGuKtG zi>g$rRx+S4nl^^0ef_%Q>#levAET?{U2e5kJTukg)l~8O=+~G|iMFo6g;Ryu!2rDN zdJwZ9EYdg4SNMiTKvwYFf8e^YL80JN&332Xvfp;#J2c;c*nt6#bPh3&Io0ql|m8 zyENjIo;7tnMJ21=eFqn`V`4gPM(du_Sl=S@iE`d-YD?$3e6!BYX=qO0jm)=h zEqk;j@+SR)%8&0)mGKTxOngU7cuO_{i$E`7TuM3SID5u3H_{G50mc; z&?}Qm&s&Ab#1AE1uh{O5Zu-}zJ=#2PdALYk^4}Sq-i}pdn0{>gbw9cfIe`5-{i91! zVP%rqtnTl&#Ifsm-<0M-#M^LCb-?y2&2Q_9{PbwQBw97c5A7o4*VeR6czxv zxIqv=Ac)fm3bTY*0YR>R2MIh9=Z?hXTjpw>_)h&|Gq`QM)19wK++|8{-gZ`l8@K>Q!~2mU4nAWe4**x%~} z4?<+N@y6T{IV5DF@ONGQp(77*cng8qg3Yl(|5_K|!;1d*>jr<5iT?)&$e{mW;s3({ zhZNMo^8ez+(-mTc%?0G3G?#& zZy6sKGGqAuk%MrLvUArp95 z(|^feuI9E55Le`F6d*{`77jtK0T86%WCca0h<_704JZ@_L^}J&x8epfcZL03Ep8q` OZeDCg#@DK{*#95l`7qA_ literal 0 HcmV?d00001 diff --git a/doc/performance/Tesla_K40m/SSYRK.pdf b/doc/performance/Tesla_K40m/SSYRK.pdf new file mode 100644 index 0000000000000000000000000000000000000000..ada28248abbbb7fd3a7ac79334e10886f5572176 GIT binary patch literal 13261 zcmb8W1yGzpvo4&30Ko$Uw}k`?vbaldcb5g0#e=iJ;_epQEjWar!JXjlkU;R@?wY&g zlXK2@{{L3ptr}k5o}TWR*^&*`Yd_>oZOx$;0A5xeZU9iy(i#e10#MQ#zB}Tk5IYmor+GU; z;hScI3d{HgQmCQkj$OK{XpnP7*H3OMKtz*A7k_pZM8?YMrvvNoO?9`iPd5^1_deYS zQFXp^Zf~k+Ca`T+OI6h|`xv_miEy<@>dop)nz!=LZMXk9Rj?kBMLF9B=v}mLCtRL665&`ra~s9yTRbO8;z1 zV9t>P>FwrHfLH{61^FK%lOIlR$_2jd&7w1()`31cH-1RoV$!c?-9#mDL!H9`G26$^ z*^_zu1Vwm$|7gWbuziZN{>@J9t9|Scno?u?A9F*CyT`|iw+DB(R}}`Hw`bCZ`w_wh z*Ui>v6b}}2vIinFlV_eM_BSRTZZ~T{;NuP2*J}{<4mL^RmxCWj$&o)>O{Mzua;)3x zt)m}w(UQwulbH~rLaQG<7se~XgiI*rsvQ<8m-HF}9mr%*mnU>}8?5Kv*^wswem~j1 z!_Rd{G40gH5{c=Um%}o<_X$&*-@sxJuc;;Z<2JL__UQ-L{TV!TAo2MNY9MhqL6rJe zRz~;}L*PoA;|-7)c3NjaXTY(&0MX%i(<~{zM?J^VVliGbSI`I_1iLKgd>XqCv1ILz zKoax-=?))%78%P*8nc?`j27(l2>$rxVbtSiOej$bk|U|ik<5ni^&j{1ewbLP^vMD@*`E@X{5 z)2=~!P}uDv{?Ez?Rs;-+nLO7 z7E+rP|NJB(`_Jw2&@?r&Gz=k$GeL5Jl#1OIaSN~=onIHq+QB4 zxx$MNN#>9DXMb)-Dwwf=JJHEXk8M7GMhsouf7z8FrJe0ND1FfIC(2w~!%=ImPwQ zP^fa%EMhC;jOU1P#6T8AB*R987)s}IYY5gOow&42*qfcvL|qWLs8>Xp#_NEK#qU8}}oK zRl&@CvZ;zmQQPgA)VH{EX%wwf=TN5Txj5#aCbcN5@)Aw`5(|;Zehi<4jwrxv+H!%B zr*mD;jV=^SSen zxD)X{dn#RbZJQ@%{gJ3>F~V*j$ye+{R9^rLPU#XEvQ7+=L`n0U_)8uOIcze2Q}XK8 zNA`NY%T|K-%c`{l&>JUpd^^LbUKgq(r?nIPBICBqEU8G@`#^$4{yOnz-pQK5$zj8Z(Ch`C6PblMdTM__Br_#bX>87?q?akJ;YJo`T;{YQ`Ebz575I1| zd_I5-mU=q)a-b(qvyAqL(kO}%x6`^dB=)3@)DQ_Vh6LUu(@0w17pFGMx2LS{h%T{^ z#Z{rE1mS`TW}cChVFKkp6Sa1`vhkhcjL@_yKsYU4#?Xx>zh(p4Th^xWMxex__tGjY zQ%-zfvQGP)MwRw%p)NCMJazcOOFE~dJJFVTl zy;fpJ2)~B4Cq~=O5{G|BlE(_E)3D?0c4|)~a^OQGU}_ld$!|886QgEm#yIV@X8Q#g zPxANW{j0)7>@#{xM;KT%Q6zn{9dp3%lQSC5w!EAI_AFf~$Cjd@v)0DG?tp`P@TMH61M`eGpyVCu)q zqTA!4^jG+neBI&;(%4U|scu$(LUndcs^y}LCZ{yq7GetBTJ)@po~`G7#y{~P)Tdsz z(iZ5$S1G+AS_)XdsRyc83?mc2l~AhI;|^rF0_``>*Kx3M~{ztvvm|Tn+2Ri^Z99KY1Fg zUtOM!#9Ask+hBZ5k0f)||EpWX$F-MjM8z?HBbnr5jBz z?A%^tQ(Z8mCyS=mQs%FecZPTWr%;<#`0=!HY6JK%r{q=qoEf3o#Jbtg;KY}%J1hoT z?Loo1;ocuY7zXpFIP9~%NYckbtV~tc3@L43xA?x_-*svPuzmRVn=MoT5zm_A1y&Ov z9>!p|<;74yYR!zPm8L)2e6}KH!)`wJ0lm+*&@$ez+~lu24kcx}72m zi-ZSiH4d_Wz(&CKf-3!|@L1DtOvh->+* z%7Rxk0`R*+_+A2uscXf^!nLA^C{&}c2mpgSjo68 z&&Wim807B=TF=N-J|}R6otu!@Tm^=GImg-6Nn@OWzS>l0^X$_eD-T|!eh-k-np1|m z+-dp_L~oW%D%hKb1MojyfyC8NXN;bRU`7x~?BuhbxgnKMFagF%O+&Ep9Gyis^%?{= zA^nr`>Cu1i`h@ifxht?Ko}l`U_`hD|-W`oowD>GypU7zME(Ln_x@mDR(bz(S4xpCS zYqkMGo1Qz$%dc+p&kCxh+fZ%;;m`F(l-mruZ;@Q>D7T9fR+&qvAJ2^lWJ5!#vv{0cW2nXjOW*VcacFK(+angS|$%K}&Vq@c#6#NG|b= zJ?lw*V1>(b<2}JSq{iA9!dn@7@5p(j=-4H1T8J>T-UoFEl5M?kh5DjkaFpm;8V7i1 zu@q*PS#uMPLq)vi-klTyG9i6MZ0}RGlV{ewVaVnoW@C_opqY`6IM86xJxv1`dWGimt#KG>V@O<+oBuRq)3K?yz@5~U*-t4mVL zou*fJ3#7wE<^tFP}8DRE8i6jFj4Ac-L_|g9=4EuVKFrp3o zHQrj6dXJ_;Jd-CWK!=-gUl`e2rAhW#l1|=BK+YGs|P)K0f4ucvLmRml*IPwOc@>CD;@l1p)B{ zM==9i<_Q-)dP_2Sho`6`bx+8eR=tdQ9MB*W+}7aSJM{SnKT5(%^#!A#QY85i4V6D( zw@W(%yk?`UNymEJy1KPSQ zx>d)@5`*WpD7tn?k3EL?8?^N|)4nJ9Aq=UUY|UCk>Ce~I-ZrR?lOs-CbTs@BN(i89 z=qidr`(^s-t8K@*EE@uis|JPG(se$P;f_^@vDugAVcIUSDZt5#eJ~$Wam}$f4+i0o zYqA|%EIGP%{J`uYAqRyt5xbGXpu8L2+_hCl@sKmC(b_g$3zi=GJPoejzz#WB@z&~7 zae$2gxQrp+gvj2kL%gk7AXLwX(`fC)CuKBV^}ClXlKe*VP(XbM@w-KWkHtF%^cr0P zRvqeaMR!s|3>+v=i!mlasn}rXbG4hp)BcbcO@v_ctxt)KCbAlkw@C7v%|la@LMRLr}|&n00F+$C)g_&E~B;U+^rSm0%DfL zBWA-<$zqW;kwNF;kD6|zO)Y2Ol0SGUvSaN>95`PQD( zt?EBI|B5byf{5Q`Db5p?A)uyrZ2X;2Jdkl3;Lwh4-SJ&Q!ugm`?XP#8$cvw15<#=% zAB#h2t8sIw7|h}AO1-#PTEHzhT$Ut6ITnnDk@`55`&p+ZIQ;lvR(YZ&J~&+d@iCW5 z{_+U#eKqPYy!WNwGEnd|NQ-X!bE!PupuMCw@Tax23QSjywbb^HwS)!7S`EK>JEQFw zDc>CjZ#D2X32$fch7%><%^tOXz&In#nWUe$e%IWKj7FoO6D~lTqAvF-pp1RFI0J~r zJwUlkmk~2yu}Zy4x9GJ+y}lj&ItA}CArE$ynVgUZh5zJHKjoj(#1ttMk(+@K?rL@) z_I zm}|Ai4KDUzgS}@`kB0vuD#G+VokbmG%SnV7{|w%BMj4beq(E32cxK6nK~mip(|VxOZRx$JW6^QkIj;maic7xaQ>Kkk%oJq`D?GblXd1eWLc zk~aH(;f!5{jI3F~j``1ljqVen;Ij9;Ph&5ZWdI?1j2Fu^{9Wi{W8an;&O}e_i*RZ-L_w zvI`;??>0UOY{5DkQM7ssTz@ejn6v*NZRp1P<&WMXhsUVI_d*KO0r4v{ajReZyHt}4 zW+7~7X;d>)TF9A()yGq!yp1w>{T^EOi?XdU^_T1RCSSuS0(O%iHEA_Qgwbxl?#kN0 z(-U>QC1&bKir`&)9G^EWN~8hTKjq=Mi=W?gW4z<2oS3m9SBsMTQ7^A>p+=W!JHK7rf_Xeg1&PMC$cpj5vk>CQ031b=tt4Gv}uihk8_;>>C|P!CO+KMU0HHiMHzU?%iL=GgvOOTM%g$%zf__5DzMZ5ca+DGipsG){qa+0o=8y`XFtb9 za*mo5|DH)W*FoI3Vwfvqh{!+ktHsT>{p`a4Coa6B=$qHQhqvJ>ee-GZJOY-2^;?1= z9;+a`5I(N*JX^;C2a9um6&W<%MGdb+!g_~!ynT9vyKDD)HC&s@%vsui=j%+kT}kv~ zAafGOg6*uhv$A07Rbz2V=u~rl-zDkBSBzi{_`&JehBNQ$@Bz_zJN4B*TXQWkMPhGO zoNdUfIeh220dw~A@O}FF{vCGr)6eV87F`(Ww71N8C!lm97S?SUEYZA}nQkWmY^Z8L zV5rUFyl+?;nxP%55j?pNaE`1Bc-}qWr+}Wq!_-9cr{pg%Z1XajeX1218Ht^GE!9HA z5_`m4aTux<5O`-FxJ*jXDyuAkkuvC-4E$=&Db2Irg4=;E_@VJ%0J68fFzQfE-cWI_*l3X}|3H!Rm+I`&IK|#9 z^-6W|FaM&Vz7iE968g~?8mZIEi}$#7|G=O&ZMA^F2JLme?;76+syqKK09kk`4+-Te z_KSlv%|vYf71Z}?;V zgiPaT=QVUK@j5IgKCuSgJl=0ldkDYGynPjt6AwmXg9DIAkxmU#CLDlrrdr~WGT{Iu zQbgI@zlEIn#orY%GqUF(lajXL3Zl9|DY%@vZ_Fe`!;edXy%^gUT^JtJtN2MyOqwAg zvr9q|qq1_IUVnqBoY4gpgc+9|)T6jcrQ~K>rPk2DX6jl6(v21h-J!?8%qz ztSkjkReSP%no}q-vrUK5IKmGMg+?07W_|Xr3SVoTndhvn;u$P6hd9-KY{=?tIga>q zHR8-FG@9}hy#>L=9t#_?nsBi^S0j0{Q==JKx_;QM{#T+Nnz=kj_-Jtao=79Tu%}f~ z@m>*vM3*vhTCU!1S^PcJ!GqCj?Va$F?)6FkeqK;w_8}ZQT10PlFbj^1>T$iB#CFb3 zP&bSC6WRijm9F99u|9@`9Jm%{IQJ@CEJQ(~<__0tUU=$r<)kjslbhoYy$DQyvSg@c z^<=?IAr;;`)%4!Oxbp%Wy0g6;(e5g1#b+G*lSI>HFD?Cxd%bKZ%$hpbi8RV&z5yr4=l#kU#t&?w3Xq+6jK^^6^#G-Usyqu`kx;xU?djy07Dqits9pA6Wa7rU(WhXRh3MG(>;YV3fR?;aN>01t0#Lt%p zy3&_EAEnX}KAUhozVFa1aPuS=5SFmol+U*~qIx`!fRE3k<}Dv!#C&_KpPySKeWXfa zxSBzKd_O%*-9>`jxU47}!nrg{L!a5@CCZOcI*6LNC1n{MnV`5T1Wv+jD*Q6;=51_Z zo*O$!&yhvpNhCyFHi$%!mHjGP(Lt53Cw@z+x%*u~I&AeuB(>?&Bz=~E@FH2tmDB|W zP(T<~ubq*g2)}&`Yd7BI7KW7z2;Vox$4}D#BE#~+oT3+b%_0~aox zycXV@)@p&Tb-~FQJ|}pRLZa}nmg)jmi60e}ErPiUBpCLcE>mqBT z_f-g}p=UE-Y;2yH=3@lMVloE2PI)OQ!52-EnQ7z0Ht|rM$c>2sd#pT(Zf&e|kYB9q z^)s8(Lhy-#6R`%-La1r<(s>87Yn`%y*AOBhog(>b=LP2K3rk{}&iUNa!EoB&C0*is zc-F5LFk}(AezKgBm8vwOT!FEstj0x?e={`#JWFDK$`Rd-+j;Z`b;=h>r}mJz@|XKQ z%4>ouI#@V|QxlbP!$HP|3S^=M1k@@;Avz*k)2yqgfhxvxW@Pvv{vJUBMi+)re_!PS z+ybPCjWiAzO>qj5iMn;Xd$a*%vjk2m_Sh<2cu&ap?@->L+$`klrn6c}D)!07ECKuS zx&kZkg}6FXc~Y^JL;PTu={GTL-kBgf%=5MkP_c_NJe>$Q&R}w}5DI{?|7H)xOB>6X zyde5scur;hZ2IR!dcJ-MBF@gcw7pw_h=nTZ1a>u*n zO$RDB8!77&S5XHg8M|!szpdh5CFy3Brs~irC^nV2g)dD{7U!os?mon<5b} zhK2}a=zg_o`lL%5t7zPq7D(h!#eF?`k3DVL%`vug5CzAtkRV*eeJQ3v!}kVc0%DFI z;e+2xjVaR#XU#_&ybIH56`DpecI%mhLcki9?g2_)cU2`@MYoYV%jIW+qGaN))?FP3 zCQhLRak$r)(osp(k*}CGQLv9TV#|K|Nh+>vxR>30pfW#}!Q}PA&kGfMztjaNis33) zML6lB9R4G8?vN#Wta;f$+r{3beI5Fo_^nz5^6!RFFcu>M8&^-6 zH}ZZm;9x)9LP(T`aQ-^~%&)Fr`8AE@?~C`b*H&;Pf9OHhOWd;3WOn*K(4h03!a zxtAth;GzeJ`*rsVnfO zI9$nH6PxvGwbJbL?!%=^R5yylu2>1_e)BNBao47<>PL@)Z+`GNGlYgwqvXvpwsn;Z zd{Fa}9lPEfq5v)C-p!N=9~j0DE(>SA&^7Zsohh3W4)VB~qR8qLyvTiM<}vmsJ0W=l zwOCIlGu>BL01aDj5n2%i{}{G=J&cqIhsE5|f+KDm_)23o?h%uvVnN<`g_uHD1A1bD zo|RPybc0wybUz`hI8Akv>-zgNcxlDqf;Q*N2+zxq?l2=eEFp2PF2RIilCQCXt49bj z5Z?epbVsRD5gNLru;tPpQ@)TA^SHhxM=6A<9-DctYk8oVzvD0>F;%iYK)OX^7mYjL3n%);Zt~Gs69in0Gj5=Fl z&(>3i-~CJqskPs@d2S5XkgyJ9A>1SWIBXe8X&7J~ELpzSE^x%PC$Uldng?_J5~!%w zeoJ)e5wh-7>^vV3*2;**^|*uMKsUPD?=8|s)vcl!sf@yZ5P$6p_-cRt><#y{&1*B3 z-}Sxfd{yE%WY5W?r+=^X^*0r-uD4(hNw2Cwj-(0GsmE8ERmePpLO7ydxim(^j#wY( zpW=~6b9`5M;oYFt&XLp2&@a;}Lmd$HP^Ucbl7E1Evoogu=W*8=8U{M>&&ti4L42}k z_zk|mMbMpGFabXRCOw1`ONPD_k_Z2DMqY0GdHjAq!Lw3>IxRzRl8aQLt=aReq$fC; zXO-M~lPFQb0#gbe%wmZ$)u2l?;4B~)4ykx12zc_8!tWzolu|H&9t5E3QP-+fizBwp zq2}#l(N<3@wd8LWYL}Zk_UiCUlkE(>qkL3WC0{mgflZWg)!*=@X=1QsrdO~v7VZpQ?P^gX(1Z%Tib4Bx` z)@Yz4ny-4qP+IK}rg*5_@YoK}TLZsHH7*g@zvVUyZ`86S`EwjHfXD1OP>|v%S8gwg zqL`fN=ohhgN)T<0<89y&Pa#l?5b=VbaRXhxvEcQ!n^czWeaG8w3FfWyB7Gu2q325Q z<2@?D&5xYX1N&cdM$?!BTIz>8I?5g2wigZFG}}%QgCNTzwN=Eh@8OdbjGT zoXtT@5gfg;PRgyu<&Sj`^TF23Yrkc|aAfTC!u1q+3M?!ueH^=pne%pqVn!W-|2-F{ ztr1CT7F%IPD$)zFk(f81{iuXDTrG{fO*}Fa#B=PQm5B2)7m~hsc_Bcg_6qP^nd^g% zIv9i7U{CQ%Dv4I=GLY;U_#o9W#srwQpqJ2XCg?PdT_r`Ge;D+3nUYIJQleL*=Vq28 zW@O?+*XUbuu*yL|chEdDeZ}@X&jQ~u&w{DP^iK}8 zecn3nGPDh_|B-%7-WroTFf)){WdYKM0mtM#{ni5%yqVOvNBMvgwSE-TU-^J4Xppng zn|Tu2!`VXmrC8Uk)?v?se)CYIKP+s&sZSp4I-`?Z1UgpUA>M>pg5N!#g~$&|`!mFD z3AqOo678wb56c}>et5grBJQ(7I0Or@+W_tSm<-PDtbv4@u`L#$L*K_N|1ZL*{9DxUo6wk3gqK4t z-$CqWEwux=t?$Q5&*I>Rl+>2?gd5V>pcJk17_Ldqn41hdO3M4$=vs)C{#LfeH``I% zlHN(&vZRPqC2i$nG~LJ+r&L0xXFrMe5A1^Q-=ULVjNsknzeCw}JorCztY0le(=q(v zlA4bAgG^R%eGg|S{gdqTPp;Av73m*Tp?`6e{y|X!%2}E?0rUV*)Ff2^2u@+LbB0rq zSOGuc?M zQk4<~aQ;jEzlc;%q9@mSKp8PHQLvM#34rTK3-m8)*3;X+mMikag|dBe0u(oOv4ogD z83iiB=~lphlBU==dAa^SNmEaA`ZsARGs#t5Z2>=Itd+Y}Iqr3M!uUrE)fdtjtB|&X-XF7D$Ck{Gyk?u8eYy`o?2^|;@kP(oezUsO6B-p^ZiM_C{rDMz^|KZGni^63 zYsnX>FFfZXty6KY30j*l0XKK=r-UOrpZ$U7g*sHM&y-Ulw@$AY^oa-tcm%aFsOyym zQtCJqMp2|#y3HBW%YnSf(?Iq68lyQw+2W$U-1F>@^uX<-+iXl9F+6>0ZZYW}^Xz+*Y z?6hmF?#KLv#-y+x*VyAp+Aj(yi9dLgKcjn(4M6cQVHiL1LhF@6xEYCcHUe(P%eoHihG$qS2roqFLSH%< zv99>P`Se^IK=8)*TMl7@f4LEDF_Lo*Hb(%LJt5Q&vckcG%U+)c)gaoUy zd;;1a1zynF=cHWCT&}!Q6=dbiVxeh`X*w&^3v30+kDo^-Mz^0q8Tp7ZBbX(%a?;1i zwrjSr&v|a~kOFcck*v{#KJuhAcr`C-h-yftqIrXedLzMwRoY+pW=Z$LF?t*;jjdVh zP(5CI;CLYNM|%alU4z=!@fhk0D26h8*I3cDr?uCz$88{L!TI(2C=yOYJ03bQy_WRE z@_r>8olQxH8;SBYP+Igom?<}IT#`qwK@KMJ4(B3N@w@$J--cjq83AztB~t1(Y87d- zcU<&-?tD-fO>qNq784lVO?;`d(iJ zRv4{fg-BRl4{|e0TgC1IFY_@q`V^#cYQ(nkM81kcK2R)755erAVrYK<4nb;c3Yc9>rOHSFrhIuGiK`$Y4_;a)a=&ZTG-fS z+Adjm^#vQ_TBun}LC(s`Q<>)KdM|(IpXx6+mTkF=dD)uV_SOG{xK3qRQ5d&(w#_{T z1mp$u=Z=6UCpuUss%B4+B6?9FHfEWwqWo^W`}i1F8zl ziaBfZOW8|$yYD~8n!kIDhpgdlsvS5jYmIR))X%sMj{UL=TBi4@_4)LN>kr9u-RFzG zzkN$S^mQcqd5PrbAd1Y0VEK>uOLPiS+DjYtUw&#n!`MRGat-ny_Z>gTz3PhWP73m( zh(^Iet3dG~1mmi_o@n=NG8*ttLad-_r5d6-5>FME`RtyDmxuc~JNN+YcQgpxQhL`&lSnHl>!R>0OCWVEOEvaX zy#rqzEvx9ZXttnD5l!J|_9eX_O{*Y6a%_r&x-@nn${pJ`_A|DSn&F!8n*5=RgFDfg|TVh@8Z=4Y-#IBYVl_@+T*1ci15`Hqb`1Nb3}RkXsUf6l#IwkdDGJ%OX*XK#N>_0f3I0^tO3*uBhbd}>jT zFvB1HyWx!m*LK)F8w(yk=cE4l{ZvJ|VR8HXpUFehBk273gBDSRnL%=kijUI@)1J+r z<`h?4_Qs>CBL?9V@9i7H^OM7(NTqCVgsZ@_yZ;q+^Zq*wMg3=mfP^iajBjZRFA;bO zTmS1#%hFcF*2(f;0#u;5rJ0#2ye!~}_pb-w-~`%O+B!P{c)5TM&UR2!Yg02QDk}#E z(8La2%HZT=2>_Zqf?eP-E*Rqc6zW1O9U;y(X4a-K01(b4Gl7e2zz|0}TL945(ez0X zJ`xOpz$*<it>Qyy5`U8dpqjHW^l!<;lMtS5D1vR^Q-UW7 zfA{4dI?|@rE~ZdR2pARkuYCbL?dX3msrZ{r{69E=2mMbQ{~r#RMD46i{x4o&j;3a) zEC5zERF;2#09>4$?3@5Iz`tc|@Wkfn1F-$KjFk(XCH+qsFDECwaO6K^TyRVODdXmZ z*LVDf9yjm*_QA#qPwxJw9t#`qf67?dIC$V$(f`n6W#i`f-+DY8@LcU*dQeBOrM0Oe zyyOM|RI_w9h3^3XC}V482Tu|ICUmNHc2K}mCCuM%#R&>_g#O(vRyJM^HdJbA2_;F? F{{!#5*b)E$ literal 0 HcmV?d00001 From 57c705dbf22b8f942337c1217aa2366a05312250 Mon Sep 17 00:00:00 2001 From: CNugteren Date: Thu, 25 Jun 2015 20:38:34 +0200 Subject: [PATCH 11/15] Clarified comment --- src/routines/xsyrk.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/routines/xsyrk.cc b/src/routines/xsyrk.cc index 1f645fd5..d8c150fd 100644 --- a/src/routines/xsyrk.cc +++ b/src/routines/xsyrk.cc @@ -86,12 +86,15 @@ StatusCode Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const // Loads the program from the database auto& program = GetProgramFromCache(); - // Runs the pre-processing kernel. This transposes the matrices A and B, but also pads zeros to + // Runs the pre-processing kernel. This transposes the matrix A, but also pads zeros to // fill them up until they reach a certain multiple of size (kernel parameter dependent). status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, temp_a, a_rotated, a_conjugate, true, false, false, program); if (ErrorIn(status)) { return status; } + + // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to + // modify the other triangle. status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, temp_c, c_rotated, false, true, false, false, program); From ff9f9fac57b550d4a8834a5e5b344c2eefa9099f Mon Sep 17 00:00:00 2001 From: CNugteren Date: Thu, 25 Jun 2015 20:39:34 +0200 Subject: [PATCH 12/15] Added option to test only symmetric matrices (m=n) --- test/correctness/routines/xgemm.cc | 2 +- test/correctness/routines/xsymm.cc | 2 +- test/correctness/testabc.cc | 3 ++- test/correctness/testabc.h | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/test/correctness/routines/xgemm.cc b/test/correctness/routines/xgemm.cc index 4129e17c..b829aaa3 100644 --- a/test/correctness/routines/xgemm.cc +++ b/test/correctness/routines/xgemm.cc @@ -76,7 +76,7 @@ void XgemmTest(int argc, char *argv[], const bool silent, const std::string &nam const auto case_name = ToString(layout)+" "+ToString(a_transpose)+" "+ToString(b_transpose); // Runs the tests - tester.TestRegular(args, case_name); + tester.TestRegular(args, case_name, false); tester.TestInvalidBufferSizes(args, case_name); } } diff --git a/test/correctness/routines/xsymm.cc b/test/correctness/routines/xsymm.cc index d769177f..0e02e9ca 100644 --- a/test/correctness/routines/xsymm.cc +++ b/test/correctness/routines/xsymm.cc @@ -76,7 +76,7 @@ void XsymmTest(int argc, char *argv[], const bool silent, const std::string &nam const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle); // Runs the tests - tester.TestRegular(args, case_name); + tester.TestRegular(args, case_name, true); tester.TestInvalidBufferSizes(args, case_name); } } diff --git a/test/correctness/testabc.cc b/test/correctness/testabc.cc index eed17560..64e02523 100644 --- a/test/correctness/testabc.cc +++ b/test/correctness/testabc.cc @@ -45,7 +45,7 @@ TestABC::TestABC(int argc, char *argv[], const bool silent, // Tests the routine for a wide variety of parameters template -void TestABC::TestRegular(Arguments &args, const std::string &name) { +void TestABC::TestRegular(Arguments &args, const std::string &name, const bool symmetric) { if (!PrecisionSupported()) { return; } TestStart("regular behaviour", name); @@ -63,6 +63,7 @@ void TestABC::TestRegular(Arguments &args, const std::string &name) { args.m = m; for (auto &n: kMatrixDims) { args.n = n; + if (symmetric && m != n) { continue; } for (auto &k: kMatrixDims) { args.k = k; diff --git a/test/correctness/testabc.h b/test/correctness/testabc.h index 2c44d532..41100db7 100644 --- a/test/correctness/testabc.h +++ b/test/correctness/testabc.h @@ -64,7 +64,7 @@ class TestABC: public Tester { const Routine clblast_lambda, const Routine clblas_lambda); // The test functions, taking no inputs - void TestRegular(Arguments &args, const std::string &name); + void TestRegular(Arguments &args, const std::string &name, const bool symmetric); void TestInvalidBufferSizes(Arguments &args, const std::string &name); private: From 75f263ce3aa35f57115670af18719137b3020e79 Mon Sep 17 00:00:00 2001 From: CNugteren Date: Fri, 26 Jun 2015 08:10:23 +0200 Subject: [PATCH 13/15] Added symmetric matrix support for the ABC performance tester --- test/performance/client.cc | 13 +++++++------ test/performance/client.h | 2 +- test/performance/routines/xgemm.cc | 8 ++++---- test/performance/routines/xsymm.cc | 8 ++++---- 4 files changed, 16 insertions(+), 15 deletions(-) diff --git a/test/performance/client.cc b/test/performance/client.cc index 65ff3218..b089f925 100644 --- a/test/performance/client.cc +++ b/test/performance/client.cc @@ -239,14 +239,15 @@ template void ClientAC(int, char **, Routine2, const std::vect // This is the matrix-matrix-matrix variant of the set-up/tear-down client routine. template void ClientABC(int argc, char *argv[], Routine3 client_routine, - const std::vector &options) { + const std::vector &options, const bool symmetric) { // Function to determine how to find the default value of the leading dimension of matrix A - auto default_ld_a = [](const Arguments args) { return args.m; }; + auto default_ld_a = [&symmetric](const Arguments args) { return (symmetric) ? args.n : args.m; }; // Simple command line argument parser with defaults auto args = ParseArguments(argc, argv, options, default_ld_a); if (args.print_help) { return; } + if (symmetric) { args.m = args.n; } // Prints the header of the output table PrintTableHeader(args.silent, options); @@ -314,10 +315,10 @@ void ClientABC(int argc, char *argv[], Routine3 client_routine, } // Compiles the above function -template void ClientABC(int, char **, Routine3, const std::vector&); -template void ClientABC(int, char **, Routine3, const std::vector&); -template void ClientABC(int, char **, Routine3, const std::vector&); -template void ClientABC(int, char **, Routine3, const std::vector&); +template void ClientABC(int, char **, Routine3, const std::vector&, const bool); +template void ClientABC(int, char **, Routine3, const std::vector&, const bool); +template void ClientABC(int, char **, Routine3, const std::vector&, const bool); +template void ClientABC(int, char **, Routine3, const std::vector&, const bool); // ================================================================================================= diff --git a/test/performance/client.h b/test/performance/client.h index edcd1b68..097ae048 100644 --- a/test/performance/client.h +++ b/test/performance/client.h @@ -56,7 +56,7 @@ void ClientAC(int argc, char *argv[], Routine2 client_routine, const std::vector &options); template void ClientABC(int argc, char *argv[], Routine3 client_routine, - const std::vector &options); + const std::vector &options, const bool symmetric); // ================================================================================================= diff --git a/test/performance/routines/xgemm.cc b/test/performance/routines/xgemm.cc index 97e19b44..76e398e0 100644 --- a/test/performance/routines/xgemm.cc +++ b/test/performance/routines/xgemm.cc @@ -96,10 +96,10 @@ void ClientXgemm(int argc, char *argv[]) { kArgAlpha, kArgBeta}; switch(GetPrecision(argc, argv)) { case Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); - case Precision::kSingle: ClientABC(argc, argv, PerformanceXgemm, o); break; - case Precision::kDouble: ClientABC(argc, argv, PerformanceXgemm, o); break; - case Precision::kComplexSingle: ClientABC(argc, argv, PerformanceXgemm, o); break; - case Precision::kComplexDouble: ClientABC(argc, argv, PerformanceXgemm, o); break; + case Precision::kSingle: ClientABC(argc, argv, PerformanceXgemm, o, false); break; + case Precision::kDouble: ClientABC(argc, argv, PerformanceXgemm, o, false); break; + case Precision::kComplexSingle: ClientABC(argc, argv, PerformanceXgemm, o, false); break; + case Precision::kComplexDouble: ClientABC(argc, argv, PerformanceXgemm, o, false); break; } } diff --git a/test/performance/routines/xsymm.cc b/test/performance/routines/xsymm.cc index 0b1d75a5..d78d4eb8 100644 --- a/test/performance/routines/xsymm.cc +++ b/test/performance/routines/xsymm.cc @@ -96,10 +96,10 @@ void ClientXsymm(int argc, char *argv[]) { kArgAlpha, kArgBeta}; switch(GetPrecision(argc, argv)) { case Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); - case Precision::kSingle: ClientABC(argc, argv, PerformanceXsymm, o); break; - case Precision::kDouble: ClientABC(argc, argv, PerformanceXsymm, o); break; - case Precision::kComplexSingle: ClientABC(argc, argv, PerformanceXsymm, o); break; - case Precision::kComplexDouble: ClientABC(argc, argv, PerformanceXsymm, o); break; + case Precision::kSingle: ClientABC(argc, argv, PerformanceXsymm, o, false); break; + case Precision::kDouble: ClientABC(argc, argv, PerformanceXsymm, o, false); break; + case Precision::kComplexSingle: ClientABC(argc, argv, PerformanceXsymm, o, false); break; + case Precision::kComplexDouble: ClientABC(argc, argv, PerformanceXsymm, o, false); break; } } From 7c8d16147a2e66de0ba11a18cbdf062bfd7ea66e Mon Sep 17 00:00:00 2001 From: CNugteren Date: Fri, 26 Jun 2015 08:12:56 +0200 Subject: [PATCH 14/15] Added the SYR2K routine, tester, and client --- CHANGELOG | 1 + CMakeLists.txt | 2 +- README.md | 2 +- doc/performance/Iris/SSYR2K.pdf | Bin 0 -> 12435 bytes include/internal/routines/xsyr2k.h | 48 ++++++++ src/clblast.cc | 4 +- src/routines/xsyr2k.cc | 169 ++++++++++++++++++++++++++++ test/correctness/routines/xsyr2k.cc | 99 ++++++++++++++++ test/performance/graphs/xsyr2k.r | 94 ++++++++++++++++ test/performance/routines/xsyr2k.cc | 115 +++++++++++++++++++ 10 files changed, 529 insertions(+), 5 deletions(-) create mode 100644 doc/performance/Iris/SSYR2K.pdf create mode 100644 include/internal/routines/xsyr2k.h create mode 100644 src/routines/xsyr2k.cc create mode 100644 test/correctness/routines/xsyr2k.cc create mode 100644 test/performance/graphs/xsyr2k.r create mode 100644 test/performance/routines/xsyr2k.cc diff --git a/CHANGELOG b/CHANGELOG index 25268610..3f9cb377 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,7 @@ Development version (next release) - Added level-3 routines: * SSYRK/DSYRK/CSYRK/ZSYRK + * SSYR2K/DSYR2K/CSYR2K/ZSYR2K Version 0.2.0 - Added support for complex conjugate transpose diff --git a/CMakeLists.txt b/CMakeLists.txt index a8e756e9..9a1ec976 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -97,7 +97,7 @@ set(KERNELS copy pad transpose padtranspose xaxpy xgemv xgemm) set(SAMPLE_PROGRAMS sgemm) set(ROUTINES_XY xaxpy) set(ROUTINES_AXY xgemv) -set(ROUTINES_ABC xgemm xsymm) +set(ROUTINES_ABC xgemm xsymm xsyr2k) set(ROUTINES_AB ) set(ROUTINES_AC xsyrk) set(ROUTINES ${ROUTINES_XY} ${ROUTINES_AXY} ${ROUTINES_ABC} ${ROUTINES_AB} ${ROUTINES_AC}) diff --git a/README.md b/README.md index 72a13762..29add625 100644 --- a/README.md +++ b/README.md @@ -180,7 +180,7 @@ CLBlast is in active development and currently does not support the full set of | xHEMM | - | - | | | | | xSYRK |`x`|`x`|`x`|`x`| | | xHERK | - | - | | | | -| xSYR2K | | | | | | +| xSYR2K |`x`|`x`|`x`|`x`| | | xHER2K | - | - | | | | | xTRMM | | | | | | | xTRSM | | | | | | diff --git a/doc/performance/Iris/SSYR2K.pdf b/doc/performance/Iris/SSYR2K.pdf new file mode 100644 index 0000000000000000000000000000000000000000..a8e82a4e0e182c255e829e7bc61b9b2ec12870d6 GIT binary patch literal 12435 zcmb7r1yEeu(k{UzxCa{uZiBl7cM0w=41>cULvXhQ0wlP*B|z}t?!h&<21y9+67q(e zd+vYlc~bSJYR|6Kt5%Hzyw^L-1zsR`7W6w22!gH-HNOwY0|+ z6T<|`xIsWLJE*fP2nGQ#%Zl)Dar1Kt@Ca}TaS3nK*0^< zVrvI>2WWV}Ts&a#WtwhKD-STl?Qxculbe$lF8kvP`Y2+-l#sxLI9okN!SnBE06-mY z7YG0-1A>7Zp*8@ZCddZj4&a3=XaW9bf#CnP;8E{U+YRCj`rh58VMmf0hHa zAns5PH!$2D-#_d?w%g?~T%i6#q4Ttvot;2?nj>ch{qx zPl;t-ub4*fCg1o?yHuoxc{zn=_;@-jnN^y#&#m3I3AZ?mEDMw$jh<55NwALBo_$@# z2ItdoF+bg2WSdrLv2NJIByHO+gGPUHnH2Ybt~OxB)#~c?gV%{NN-#FHvI`2o(=W0IC|zgND;Z&htK8f>PogMWcw46!P!HB*59s{x)BrFzuG{~+v++oKP&)_cvN0C zE8FOEHNkLzu*do@RPj`&6Ir7jff*md=`)9!p&M`8aB@|obBfeYnQ{Ven z88Wqu_kZ@>%ypS3tQn`A@2UH@dKCV?YV+XO-5mk<53e>~oF(koeIcwMF#kmRWMU9W&mL^a>&$ep50 zb7=66_?^tzA~aQG#7~m%XyblHZp3e8%_kN5E0Oh7TIr{DJ7cExiILZJhrh?~zJ|5d z_WRV;!LrJStJBM}-@g7`zdQVVZ&u0#C2w-p+5&#v4exe{HGTTDVjlmzO-`T}81S$k zcy~2kTF~KlvD@KyGrC>Ykn;QCbP3vVAK|l7!meDhN)^dIfA?ikiLNW%Y9z=1(!2ws zqj|ZbxU#kN*4giFb`2H#+TlkCjyT292aJ{TW=)C{1(6k{mA&s_W+}X8;6x<`*UTHC zJdLb9)4DlUe@!tRM1lc2REN&{szzr&pV?S`kHGKrLUazI^cr#|yKLA|5wMt;aMuvJ zM3WbyJLv44cE3%vZ9pSy?-i$J3pdjhPvWKJcYEJI8!ICk{x(?7_(GR8Je(V=OO|4^ zjH?*U?!4_B@TrEYIE~)qh6exZuV+;LTC`QKLt#utdmq%OcqVFdamCy%L8=2x8|ra0 z;h{QA>3(}fleJvM7qYog)JkxdI)eGz?Pl7P#p{b=T>N*IqmSS$*>x@hO4&{>>rh*bglyZU>Cmb3m9_fr zXVIP9pzWm|E4|m3g)HV&$yq5lt=kFtzb-0_T>aQge^^+Mtr*Hy4^ zv09QhiZ)FWTCNxSa<$sPrv8Y(MfP{!J&t$j^8R&E+R`Tmsb3cj?xQBrR_H%ToD|c^ zU1hs`6?~ z-k@_Adz7Z`_8SXYBq00X!l+r`p~yRnac&Ea_)myUZ$yrodQ0qy7g~6!s>Ts{Nj!!q zm5>6tQ)o?jk+7Y$QSB1%A$&7!R5E$;CgwQ`8tTStu<=Q1>Kp^k%n+2!f0?!XSsT4i z^GM+Ijwo$8U92_b^YEis6bFHLRkwvO-IAjsAiG%@aWu6SEDF6WRyUc_oLJG|WSeZ_ z16khJjPT$Jyj2B~C+Ei!AFHC+4HPjfMI%d)s2wt0I9bd0RCogE5aV#X%p2ntbqu~Y z(6<_)mofLS3WdlT$idph6|Vy@nlk>P~th9YRxImbsK44fib)>XaB zwF$O8btK0J^`RM_k~_&7xJHM4N_~FP%T%dHjL!BvZvx|+duC$BRJ`rt!NSd!*Ip@`vpMwNXz z?{Y?X{*Y{16~T{G9SPi5pm5!UNi$UvcE&G6ddgJRYq>q8hQ7oS6C%b|prU6lI|oau zv~E)w@qY9yfcU~Sx=mIy@acs`*VPQT{n>pwDa0|^mTlOu*aV_2$ahQ)F#Op?99R`V z;U!1x<+2$CuE$cKZsk4jWSiJNXSwYYFOpSDqu_}76~^)~2@-L0upgj&$M5wCWHb`9 zOqfP~ZKo#;7e|VOz%pB@Q7l;vZctUksagUh29xG?K#EWc3CC-Z^UvBLx1})R&mb(g zpO&JSVfW&0N?gdp1+tAR}u%xMh{OIg4f3X04 z%)+DNmz%N(#~a&Ex0roQ91UB(UF6QZ3MpCv+Y9Vp!gBJl)NFK%bu#0l26|y$kZjPl zAiKute43DNGX6=+SyB>g_j9&KSc3W#+d@lz^Z_sDp3pksr^LGMKEL}~^FWbyue$@g zv)WfLZYvgZesf9uy3mkZs(8vhFgo%!Tp^kQ@vXp_o41&zltipeZv>AiO^D!Hk0|8_ z$>UgNv{&Y@qh(IUq`uA}MXtwyn_h%%A>F>v^7}-HJj{x`npeHNhw|oqpW4Tj#npAx z$^Hx=_$;)S0Jl8~k@)-ih}R&>@=j2!3gxCdq58l} zr%vr?5y)$+b6yasZ6l_jDO9EBxPNlL3odlEqQT|F?x9eCp`}D@#5|wzhXW0h)F8{l;%r#(+VCOzqe=EW;bTbua$CY$U!}!&qX-EH!cTeAMD){})Fn9a@ zS0PDvA;z^6hs>Pjr@xN#%(o&wgxX(ERJsQIz_fmVkMC!Q$E{l`yj?!fz+y|1-jFKK zC+;hpJN1%DM7Se;&G|MaD!8}*Fa+I_5UvNFy2(zxs{Ww8!FJ0qhbT(ph%Cn-K60N# zMb%W3sJ=O0YJJ<)3UcSW)LM&97)wTLCgEAP-$rp&%JTKn~GL&r}El0XW_ay3e7!+UIeA8%eU zl0zQ`{)hH^?4N!vKUI3dRzZ;JjL?(R#y7lnrxfxue!HaF@jbJtPRK*Y20~F4ey8xd zvlr8yh-^;6S@dR=`O|vHK)2kdy@U?-1OsJ99o3;|CAJNo&8KnUwW-039hQfoJLJZ} zu$HGC3f;b?5d+o%Q^M@ z(fd9apKFST=wF3SDBlvl)}Z)mJ!^xTyES<^@|41Q8aT>(PxfA8!MFS8Cp|F|lDmgZ zeaC>fCFJ>o>^OsYvJlZWli%O#j)XMC?p6(gAbNo{jjBqVDD0A%FIGPw5MC+j>5Dn~ z(l1%=+?Aqpz2O{Ii`DX4&R8!8WxvN2_nR{4eLoY~A74@{(dA;WO}LZ$e2l4wu9Lp` ztprFAhn_U!8>FKOAz_c_mi|;|&#>mLOb_OIsrxMk7LJ%o!h3mQi=Wr)W`<%LhM1C0 zN35MZ^vDt$H$kYOD*U5yK89{PdkB7;sn5^O!RoVyxKH%N*$C(qPuNXeUOq|gvl@6$ zjAzk08+2iana$vyk*pgOPKWc>GxH*D;kNk2$HuE=jWQGt5(%+l1n{lFIeX-|RFIsP z+F^+)5^nsfCnSAkJl_MH7zq{AY;FULBy2+r8&eSo;hOZ>$!bA)*rI8pB$gpsD5>zg zT$a0nOa|1#x3J`(y+B&b#^8dd^>JOt;|zpah}Z$!OypY`ZMbhA*jeB0wD1qdwHXszoUN$eUOMXnT$wQxic6iC1+obGHIA0n{OYq#g))Wvce1_`G`0G6t?kgOY`kYLHvwL#-Rc z0xzp3bAiN)*Kd)~N95(4QWgH})i?F`+Ah}6ER7$n-@B~ia0Ah^aD$c-KiY|3p5}_5 zeywA1JGSF)1ul~E`0AOXudW$hG6`ulu*Z(>f(LSELm`!64r5 zYsaIA@)jY2O1$$j!*7}ixSk(hxX5W{AP>E#!J=%M(7!k@di1%iOHB+ zzZXS2p$Wy)B0|ls)NW*}s=q#h)N%Tv3qD5JQNl~jEEiMcW{eE~$WWfw*5I%8e(qM+ z(g}mg2T4%p->l|=QZ zs9t?@wxlIE;GHWfgeZUyd$a;5J5o}6QRklxh=x3eBrW>wD^xaq zd9OOwNKkWO-apw_HJT%oJVYHSZbanSZ>Md)$$8T2f|F<@T&Z|$U;w6L>ac7V^+bz7 z&>J80+|8zUu2G`=umR}FzK@txVDKR6>NqEux6`(}pmWxO!;9zy>(i_4x)1Y;YG(BP zFJ+SY@&zb+PG1dtgXejACUvsQ@s~6#Lj**DE8TM4@8H%Ckq&nUhf}i@bp7Mtx!-;~ z~SjDSmB7lW0FtZl-AjXIzwW%Dl#x-CC&Sq1n zCK>~rW@qt*4D?DJ*!Dx=xCc$)9$HFZv$#Hf5i!vFk^qmf1N2zLD6#}1kt3Nd%~`3j zYLoof1NxTHM%|OK^+auCykoD3#YmDVYx>qwetiOES6kCRe_M53j1m_fJ@5G#ZGpRv zq(J+YD{Ra{Q;eioe`Jd*pl`20^pdRy+IjSO+V@3k<$ASCvZwIPnNLI3j_}~B{AZet zFRPAS;0@Nx%pvoXgM(L3UbRdYcMk8-4}ClFfYlxfH6GqpiQw(G>U#0bIO$~7@F0sIOg~aGw z4|CmbB3U~v){+`U@GT|`Kq7Ihos`)$lrgnIADPu%`==}`T5d2nlgzT)Q;%Oh>F2G( z;0k+h+lG(MW!2oBucI53)X%ZWQWAqvxcSXOGXvQVHB-P6QFRYI<({C{fx#t^>q}x` zS>L(2rvgu`sv?v;pYe478w@W9G(^$FKcZveTPV{+XUkT>b9`(MJBuv1;Ou3zP_^~D``w9j_c z*KDGK-_#{IA4{&$zSTGeb9HY7hc{f3OSbqigR zj~>X=Sf>OMj}`NkS4oD;%jq?xO<&b8Hh#8~jw{z>)1AoAr&cIWD0eAf!XV>o?u;?J z_*e-Gj;2<``UYNNHQ`>~o3HeqL~%Z6^>i-4GCA+DD*vc`P-kZzrQY@9^sTZ4$CfWn z%s55kOX7&jsk{s9`V4}BL29EML6^HlTF~X3piUNMHnb6*=CPM)U9U&sD%;Vl=DIi) z@J01LlX)JCG}@!c0&s{G zwGIlbzw_I-(WFLUZM-EH0kJ0`Vg!eQ3}^j_jDnHrgJgIpYNh5f#0gqYmZd%+G={KH z66#f#KXW&Y&?_HW_OcHpVq#8}>H}}?Z96;MM>OHGguWqNY{FExB>cR|%{_9HxnB7w zPFJL9x}{K%yj!#wg153`Mh2bNXHfz2rZL|d@t&jQq6~18!IAK00$IXZ75(`c6+v_e z(uVYDO;sjxyIN*l{M(a~igOHN3|Bda>>1ScT%mrZS^6S!3^ z-XE}99G%rOK4`T@UJ8h|xd?50QqoGA+#?qW<6#vw1EMCX?xZe1YYi0cK+4`77<~+r zj`C4F)1F#hkfxAlV>8gdvg2^O(cZ2Wz~G-T<(T3A;7y+B5x1C9=4Q9{Np!J&we;?4 z_(3+NX%GqTV0ye8P?M7cZw)`~`LE!4y}7Qd6H#vyZUUdh~2D*9sM>A5j9f=sp#kt=EZ^nMG3)Ds-0 zRx3?P^4m3gh@?~k^Np%Fn1ao+uQi%c-)e|_{5cEZ)oh#u-L5^QMidmGx&ti`5pmvb z1nV0{o(UjbNO*Iy^y}wC+mn5lB+aJtJiIRojsnhi?)ze*^Wnf7)_CxJeQ4ylp-Ijn#_x0vQ_~(VPqXWGMj+Oh}vfHr!>z1#Q z&Fl_6q|gVkcKYNai8N%bBQyV#odXS!@AbDkn;DYpiGDQXU(;h=z1Kk{*td0QIhP09 z(Sm`61G#9|OJAWKK4&)7x%a&-@2G)-$M4w*U&b%`Z%weWFia@QL1**38vU%cp6!iv zVc5pR`KX-6Wu_gqOcMpT3g&B+P$rp9l^PH9y>9Av0b_;DS&yu4ne@gmxI0^x_vbOb zV^?zMLU~g)oJ0)Pk;sO?gg6I0S?c?n-r#Rx{u~m^GKq5=2C00~7bBT7NFK`hI!ogn zq6sn%&ij^4Xb9$kL;UnL+xsYCG)^}dB4c7g$%Y_o^byP(iQQ~zC1`K-s*?E|d{sEH zEoD?Qogge@^E!oo+W1*B|AQ|gLWL9lm>1;Xp#H!Sfl5A77+SClSOU9kVopX4bSv3Y zsR#7Mcj&DnU5f9wRS$_j_Hs&214CDABVlI@|TI?lqQTIUVM%I@d6cb${PHRKe0l78`4yr&O)lQlurq;}{lc68hX zK5w$iKe3nGM5=s^q7trQd?UU43>r_U?{mRzJ4U&JSADyD?@xiw@vQL5#0*_IIzb|y zrD-=ltWC!*RjJ!d`F(ketu7^_xKlVO1gw;P)b>4iaPz={0@Fj)-lUrN^9MTI?NuT< zs8rf#i~^ERnXrZ^w;#p!rcoZHr}J&gC>~gt9%S{ix@jhYi8GEJf~K_&9~4&tj3)iH zdXjrtGcW{*fW$;0VhQQOu(Llf@Xgfcw8HL#HkR^L{tvOB3=Ot*PM>7Nl)D`nO=I&n zIaT*n{=EV@69f6Xzvk|SXMF2D{7_UaM;-`9gyP(GjIJIae&MHTeY5n2iA{AH|eP%wo?6GG7}B;TMB06|$2Kyu4o8EjbZ34&{9&L&X1XCCXVx8^#OYF? z!naoF;Ifu4h<&Hi+{vNYT^qpnoB`Px_Qo*-h7B`5{~}k%Xu2*RdA|lv8^5?fY-%U1 zhTSzp1uH60#gtIuR&(hF(Gw24fC+e#K4}#+*GAS~q6k3yAU$4q1{B}j?S8hhI=p&a zoP(7>)HZva>^Pyf?pU! zz*L`?p3_8C_AMBKF;G0(ubpADCv!kK^7a|`sWFAgU7MmRLQb(Cn8Qu5wQexwgmW^@ zFWs85W5N|*-_K6&xs_h?i*2KOJa=^*)lUjYk!7{ZM}TK9l3GQPHwM16zw-Qeo`brP z@Fm>5qbAiim={@+>4u65tiEsK@o}rv+`-&5nI~#$HCKRZr_JBOeAkhtNqWn%a$=F~ zY}E{Ll~>LFv6mCqGX@%^OdH3=*nFeU zk`(x{F|U~bN($q;&Am;+islNHh>Ov3`O0b*huy3}YMlZO<@KA{4wNE6a$1q-VM*T& zddn#YR!8V|TDrTTkQlVMxStmDakj!pH82IRcAhbvW_X;k_1JFQiU~@mm>n$d;Y4bS z6T5?$f>rZbKJRC~1Iz5dv{rb07L;-ibf-;s!zd{hS4(|;#W6ze0jc)ghGO|=^XRAA zy7C@1iC5s~Un-3GAwD*aeAQr?z_HO%P@mJDlIAz2Sk-FIbk@>=2>45vh>K8BAA)ml z$ExC$?W?$-^LuAR`1bFEgLzGOUGF$`I466zCYJsIU|J#MV~g&=7;@gqTG1|u z+v_NlK6a9btzeu6zr-Ould9OY8@84a3ocKKWe^9a1t=kW6tTsgsn`m>{1mY-%^iCt zXJTxYT*`mpF1Q>%(2yyZ+rWty|KcqCi*oVEsQ8QV@Hge+FQx@h#m>qdU<`O@q>M*W#or_fIQIjr4S@lSfpGQ- z+ylf5W{L@vhZ8aYVq!pf9su{F5iK~ef)~L3NJja??f}Yp!4$M%k31AHu}8~r?~YJ6 zZ5I&uj|Awhr63L9|6Bck_#ls>N7u$cB^eoMkUPW*An>SV`ZvSmarF0gr5-6A&W}!j zvXEDHV929UpgNp20{j>EgNI*O;Qz?|c#P@a+>fkePaW+AlF0Ek!8Xl!yr{$p2U{&f zMeH`k_nfTU75!1P zekMbM#!%{KKD99}Bu6``9Mr_}enmR*Wz~``h;x z;*k0A6wqK~ zoNySoZ*0VflYqYDH94b+c0^19Vf4oMxY}-X-v}3uV9fJP?)fxwL&^;|1ncp8Ln3b{ z@)|nPc_CpTE+-DBcIKHt)BBA0t5?ZpVS$I_vNYE{m-+!|*a%g{kotw&NGCgD1k?Q| znk~N~M=gK|Q8n!zIR38G&3ptH44v25?)*rvSJ`9r32gZUGWO*71UXJ}tzc)hU10cw~K>G25Vh@S16Zvx|ej@_zIufE3 z^J`>R3-0w0qIXYZ0i;Ah^|@q;Ar%%(rKldc_)Q7mM$x3u&676yKK&8WW+) z$#H9{CSrY|Ar9ZnPtMcJ6Dauj5=}F!RANSVhS?tT0$)wuA%ApoY!3m(E<&Ca%_*;+ zn=wJPSG$LQE_6$b8kz@w%NB#*V(-+T|z!ya{)<5+5Cd}otkhzJN`fIjc-E|`uCKr7d!bb8|f>peux43<0$HOO( zYk5E10CdUN99m|=w-}$o6s1`~9C`5*@dtLH_8Y2vp#AL-ZC>h!r zUMgDC3$O&!(>l=}FlEuL(6s>)hUBeq-NX!4SEZd9b7rI)Bmr=}j%r?wmov@D@5bdTJj~V$NhChU`Wq6BGCbaB|>Y%ovbu7eD zQCZ4$%Gt^>UtEb?f&-wIi z52q<7GUuk|PN%<2eVh_0xX{V?bdZ=h5TE;TjWk=5Ke97H}0bw4SnX8zlmN;ZfsVaq?MsDo`t^ z!d=Eq!4o-3HG;C!G8)M-L849~9gP}Ip-M2Fkk;b}i8Y$y2<5=yXkpJWBsb_YcB=i+ zaBJ%X&2nC`^&g0^B(&AGod%zkRitrz{oH@KXmV<@)>O9pYTV!1#(AJ&3G6wYZBJv_ z(bfL-AvClgbTDreG&R}DJy|_>f*ReA33js1@{|_!7Or?DvU5JS&}I13Z~lgqv{)rw zC3?8PyZ`L^?AdNi(~l-MlYP@7=cf&_4Q&wVYw^V(of{hfD-b9OPYf zzwJp5_os=)z{RS>2qXg$zQmjC2x_(%3Q0z(q-&!araO{NlU2(1DIhK&%+Dw=AMhGT z98d&TSnBtiMNPjZ?DZG`Zxw81Y?hGTvM)Eeh;>Qi32ofcZOM5@bF$AFWVT;l6ZA1; zGAIyCzTtku@#aW4FB5g_d@87@UZteb@sRV7FdGeu28}=Z0^HT0Z(>Yh5?6kuwwf)L zwwbN<{#2)vLKCIr(A&|L_lO5>S#A$7+ z@m_B-J-_-QVBa{JI7+ThP@vJVbm{dX^Sn@_-C^-ZH5wH!22zy@6*lEH=#F#yb<(JQ zlDYq4d>y1}B+qeNZ$xinZhvXHfr?q5a~w#sL|xP7js{j3icJws`Pr%o(tp_yU8Pc^ zVnlATU<%Q!um9oip?skjqp#*$VSP}%__f8mrSj*={Ip()uD;QgbEU=6s87f37-~U8 zlz*zf@Ex^)yx?Wv=xuY8Qo)D1?>zw<0pEjPBl(ZUj*fDqb3iIw$_hPBExNmg*SrF! zSNa{xpd}3{)QFMvk<@`1*9R9pQjxxKRk3ycHrJ+3X!Qk!gg;DHg=dqd7!G3hsU@Cy z)in$hm2CMAA6?N*i|e_YY^3nn@D!`#hj^kM;)C7d9q)_IycUfYF}`C~0=> zb+0s+kXF(s$@bgiZsgm7XNT844;Qg0|AWc-{d8r9d1=S|@2Nw`5p4eamp*x=wOLB* z%Ru*aj(w-!EvcS_yiG^dM{JU*0ed%O=O>3HZ#8lPkgmeuPxAiRbqoKy4aNKy(OJ$J z&MCHYhBKWX+t&XX>DxI=IlJ5aEx-iI+F4se;Oyo{`m!;Ak00n{=j`DQ5EcNsdO%?i zM~F2HlbeqZXa$86vfbV706-fz&?|V43j%vQwz^lnH0qza-$Flol=WhBROz%H!|KE-HUsUfupa7`tVF~-g{C*U|(S|z6 z2_6z0DEv8>zjPEKj;|mvJ1__n`1iR09#8Z?nc;uH#Q%o_c+>xQ@c(eYAq{o3`hR%w za)Vf7asjw`FuDHy0SNH(^YR0%0sofqz=6%<2jKi~8Mgo&c>T9bNRS7PPW~Yi;(~AN zA2MNHI8XkcG66XB`EMB)5BLA3$HgQ3PZ>A2fB+nW{nuJF5b+P7(}F@_fJZ|ApI^lt26BV_IW2A;xJOI| K200CR%>M +// +// This file implements the Xsyr2k routine. The precision is implemented using a template argument. +// The implementation is very similar to Xsyrk (see header for details), except for the fact that +// the main XgemmUpper/XgemmLower kernel is called twice: C = AB^T + C and C = BA^T + C. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XSYR2K_H_ +#define CLBLAST_ROUTINES_XSYR2K_H_ + +#include "internal/routine.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xsyr2k: public Routine { + public: + Xsyr2k(CommandQueue &queue, Event &event); + + // Templated-precision implementation of the routine + StatusCode DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); + + private: + // Static variable to get the precision + const static Precision precision_; +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XSYR2K_H_ +#endif diff --git a/src/clblast.cc b/src/clblast.cc index 13dfb50f..b8aa1e39 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -27,6 +27,7 @@ #include "internal/routines/xgemm.h" #include "internal/routines/xsymm.h" #include "internal/routines/xsyrk.h" +#include "internal/routines/xsyr2k.h" namespace clblast { // ================================================================================================= @@ -310,7 +311,6 @@ StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose t cl_command_queue* queue, cl_event* event) { auto queue_cpp = CommandQueue(*queue); auto event_cpp = Event(*event); - /* auto routine = Xsyr2k(queue_cpp, event_cpp); // Loads the kernel source-code as an include (C++11 raw string literal) @@ -333,8 +333,6 @@ StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose t Buffer(a_buffer), a_offset, a_ld, Buffer(b_buffer), b_offset, b_ld, beta, Buffer(c_buffer), c_offset, c_ld); - */ - return StatusCode::kSuccess; } template StatusCode Syr2k(const Layout, const Triangle, const Transpose, const size_t, const size_t, const float, diff --git a/src/routines/xsyr2k.cc b/src/routines/xsyr2k.cc new file mode 100644 index 00000000..a7aa6945 --- /dev/null +++ b/src/routines/xsyr2k.cc @@ -0,0 +1,169 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsyr2k class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/xsyr2k.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Specific implementations to get the memory-type based on a template argument +template <> const Precision Xsyr2k::precision_ = Precision::kSingle; +template <> const Precision Xsyr2k::precision_ = Precision::kDouble; +template <> const Precision Xsyr2k::precision_ = Precision::kComplexSingle; +template <> const Precision Xsyr2k::precision_ = Precision::kComplexDouble; + +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xsyr2k::Xsyr2k(CommandQueue &queue, Event &event): + Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + + // Makes sure all dimensions are larger than zero + if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } + + // Computes whether or not the matrices are transposed in memory. This is based on their layout + // (row or column-major) and whether or not they are requested to be pre-transposed. + auto ab_rotated = (layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || + (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo); + auto c_rotated = (layout == Layout::kRowMajor); + + // In case of complex data-types, the transpose can also become a conjugate transpose + auto ab_conjugate = (ab_transpose == Transpose::kConjugate); + + // Computes the first and second dimensions of the A and B matrices taking the layout into account + auto ab_one = (ab_rotated) ? k : n; + auto ab_two = (ab_rotated) ? n : k; + + // Tests the matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and + // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the + // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage + // space. Also tests that the leading dimensions of: + // matrix A cannot be less than N when rotated, or less than K when not-rotated + // matrix B cannot be less than N when rotated, or less than K when not-rotated + // matrix C cannot be less than N + auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + + // Calculates the ceiled versions of n and k + auto n_ceiled = Ceil(n, db_["NWG"]); + auto k_ceiled = Ceil(k, db_["KWG"]); + + // Decides which kernel to run: the upper-triangular or lower-triangular version + auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; + + // Allocates space on the device for padded and/or transposed input and output matrices. + try { + auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); + auto temp_b = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); + auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T)); + + // Loads the program from the database + auto& program = GetProgramFromCache(); + + // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to + // fill them up until they reach a certain multiple of size (kernel parameter dependent). + status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, temp_a, + ab_rotated, ab_conjugate, true, false, false, program); + if (ErrorIn(status)) { return status; } + status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, temp_b, + ab_rotated, ab_conjugate, true, false, false, program); + if (ErrorIn(status)) { return status; } + + // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to + // modify the other triangle. + status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer, + n_ceiled, n_ceiled, n_ceiled, 0, temp_c, + c_rotated, false, true, false, false, program); + if (ErrorIn(status)) { return status; } + + // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary + try { + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast(n_ceiled)); + kernel.SetArgument(1, static_cast(k_ceiled)); + kernel.SetArgument(2, alpha); + kernel.SetArgument(3, beta); + kernel.SetArgument(4, temp_a()); + kernel.SetArgument(5, temp_b()); + kernel.SetArgument(6, temp_c()); + + // Computes the global and local thread sizes + auto global = std::vector{ + (n_ceiled * db_["MDIMC"]) / db_["MWG"], + (n_ceiled * db_["NDIMC"]) / db_["NWG"] + }; + auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; + + // Launches the kernel + status = RunKernel(kernel, global, local); + if (ErrorIn(status)) { return status; } + + // Swaps the arguments for matrices A and B, and sets 'beta' to 1 + auto one = static_cast(1); + kernel.SetArgument(3, one); + kernel.SetArgument(4, temp_b()); + kernel.SetArgument(5, temp_a()); + + // Runs the kernel again + status = RunKernel(kernel, global, local); + if (ErrorIn(status)) { return status; } + + // Runs the post-processing kernel + auto upper = (triangle == Triangle::kUpper); + auto lower = (triangle == Triangle::kLower); + status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c, + n, n, c_ld, c_offset, c_buffer, + c_rotated, false, false, upper, lower, program); + if (ErrorIn(status)) { return status; } + + // Successfully finished the computation + return StatusCode::kSuccess; + } catch (...) { return StatusCode::kInvalidKernel; } + } catch (...) { return StatusCode::kTempBufferAllocFailure; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xsyr2k; +template class Xsyr2k; +template class Xsyr2k; +template class Xsyr2k; + +// ================================================================================================= +} // namespace clblast diff --git a/test/correctness/routines/xsyr2k.cc b/test/correctness/routines/xsyr2k.cc new file mode 100644 index 00000000..3365e6a8 --- /dev/null +++ b/test/correctness/routines/xsyr2k.cc @@ -0,0 +1,99 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under the MIT license. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the tests for the Xsyr2k routine. It is based on the TestABC class. +// +// ================================================================================================= + +#include "wrapper_clblas.h" +#include "correctness/testabc.h" + +namespace clblast { +// ================================================================================================= + +// The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison. +template +void Xsyr2kTest(int argc, char *argv[], const bool silent, const std::string &name) { + + // Creates the CLBlast lambda + auto clblast_lambda = [](const Arguments &args, + const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat, + CommandQueue &queue) -> StatusCode { + auto queue_plain = queue(); + auto event = cl_event{}; + return Syr2k(args.layout, args.triangle, args.a_transpose, + args.n, args.k, + args.alpha, + a_mat(), args.a_offset, args.a_ld, + b_mat(), args.b_offset, args.b_ld, + args.beta, + c_mat(), args.c_offset, args.c_ld, + &queue_plain, &event); + }; + + // Creates the clBLAS lambda (for comparison) + auto clblas_lambda = [](const Arguments &args, + const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat, + CommandQueue &queue) -> StatusCode { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXsyr2k(static_cast(args.layout), + static_cast(args.triangle), + static_cast(args.a_transpose), + args.n, args.k, + args.alpha, + a_mat(), args.a_offset, args.a_ld, + b_mat(), args.b_offset, args.b_ld, + args.beta, + c_mat(), args.c_offset, args.c_ld, + 1, &queue_plain, 0, nullptr, &event); + return static_cast(status); + }; + + // Initializes the arguments relevant for this routine + auto args = Arguments{}; + const auto options = std::vector{kArgN, kArgK, kArgLayout, + kArgTriangle, kArgATransp, + kArgALeadDim, kArgBLeadDim, kArgCLeadDim, + kArgAOffset, kArgBOffset, kArgCOffset}; + + // Creates a tester + TestABC tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda}; + + // Loops over the test-cases from a data-layout point of view + for (auto &layout: tester.kLayouts) { + args.layout = layout; + for (auto &triangle: {Triangle::kUpper, Triangle::kLower}) { + args.triangle = triangle; + for (auto &ab_transpose: {Transpose::kNo, Transpose::kYes}) { // No conjugate here since it is + args.a_transpose = ab_transpose; // not supported by clBLAS + args.b_transpose = ab_transpose; + const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(ab_transpose); + + // Runs the tests + tester.TestRegular(args, case_name, true); + tester.TestInvalidBufferSizes(args, case_name); + } + } + } +} + +// ================================================================================================= +} // namespace clblast + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::Xsyr2kTest(argc, argv, false, "SSYR2K"); + clblast::Xsyr2kTest(argc, argv, true, "DSYR2K"); + clblast::Xsyr2kTest(argc, argv, true, "CSYR2K"); + clblast::Xsyr2kTest(argc, argv, true, "ZSYR2K"); + return 0; +} + +// ================================================================================================= diff --git a/test/performance/graphs/xsyr2k.r b/test/performance/graphs/xsyr2k.r new file mode 100644 index 00000000..0c6c0de2 --- /dev/null +++ b/test/performance/graphs/xsyr2k.r @@ -0,0 +1,94 @@ + +# ================================================================================================== +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +# project uses a tab-size of two spaces and a max-width of 100 characters per line. +# +# Author(s): +# Cedric Nugteren +# +# This file implements the performance script for the Xsyr2k routine +# +# ================================================================================================== + +# Includes the common functions +args <- commandArgs(trailingOnly = FALSE) +thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)]))) +source(file.path(dirname(thisfile), "common.r")) + +# ================================================================================================== + +# Settings +routine_name <- "xsyr2k" +parameters <- c("-n","-k","-layout","-triangle","-transA", + "-num_steps","-step","-runs","-precision") +precision <- 32 + +# Sets the names of the test-cases +test_names <- list( + "multiples of 128", + "multiples of 128 (+1)", + "around n=k=512", + "around n=k=1536", + "layouts and transposing (n=k=1024)", + "powers of 2" +) + +# Defines the test-cases +test_values <- list( + list(c(128, 128, 0, 0, 0, 16, 128, num_runs, precision)), + list(c(129, 129, 0, 0, 0, 16, 128, num_runs, precision)), + list(c(512, 512, 0, 0, 0, 16, 1, num_runs, precision)), + list(c(1536, 1536, 0, 0, 0, 16, 1, num_runs, precision)), + list( + c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision), + c(1024, 1024, 0, 0, 1, 1, 0, num_runs, precision), + c(1024, 1024, 0, 1, 0, 1, 0, num_runs, precision), + c(1024, 1024, 0, 1, 1, 1, 0, num_runs, precision), + c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision), + c(1024, 1024, 1, 0, 1, 1, 0, num_runs, precision), + c(1024, 1024, 1, 1, 0, 1, 0, num_runs, precision), + c(1024, 1024, 1, 1, 1, 1, 0, num_runs, precision) + ), + list( + c(8, 8, 0, 0, 0, 1, 0, num_runs, precision), + c(16, 16, 0, 0, 0, 1, 0, num_runs, precision), + c(32, 32, 0, 0, 0, 1, 0, num_runs, precision), + c(64, 64, 0, 0, 0, 1, 0, num_runs, precision), + c(128, 128, 0, 0, 0, 1, 0, num_runs, precision), + c(256, 256, 0, 0, 0, 1, 0, num_runs, precision), + c(512, 512, 0, 0, 0, 1, 0, num_runs, precision), + c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision), + c(2048, 2048, 0, 0, 0, 1, 0, num_runs, precision), + c(4096, 4096, 0, 0, 0, 1, 0, num_runs, precision), + c(8192, 8192, 0, 0, 0, 1, 0, num_runs, precision) + ) +) + +# Defines the x-labels corresponding to the test-cases +test_xlabels <- list( + "matrix sizes (n=k)", + "matrix sizes (n=k)", + "matrix sizes (n=k)", + "matrix sizes (n=k)", + "layout (row/col), triangle (u/l), transA (n/y)", + "matrix sizes (n=k)" +) + +# Defines the x-axis of the test-cases +test_xaxis <- list( + c("n", ""), + c("n", ""), + c("n", ""), + c("n", ""), + list(1:8, c("row,u,n", "row,u,y", "row,l,n", "row,l,y", + "col,u,n", "col,u,y", "col,l,n", "col,l,y")), + c("n", "x") +) + +# ================================================================================================== + +# Start the script +main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values, + test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE) + +# ================================================================================================== \ No newline at end of file diff --git a/test/performance/routines/xsyr2k.cc b/test/performance/routines/xsyr2k.cc new file mode 100644 index 00000000..8d9871d0 --- /dev/null +++ b/test/performance/routines/xsyr2k.cc @@ -0,0 +1,115 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsyr2k command-line interface tester. +// +// ================================================================================================= + +#include +#include +#include + +#include "wrapper_clblas.h" +#include "performance/client.h" + +namespace clblast { +// ================================================================================================= + +// The client, used for performance testing. It contains the function calls to CLBlast and to other +// libraries to compare against. +template +void PerformanceXsyr2k(const Arguments &args, + const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat, + CommandQueue &queue) { + + // Creates the CLBlast lambda + auto clblast_lambda = [&args, &a_mat, &b_mat, &c_mat, &queue]() { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Syr2k(args.layout, args.triangle, args.a_transpose, + args.n, args.k, + args.alpha, + a_mat(), args.a_offset, args.a_ld, + b_mat(), args.b_offset, args.b_ld, + args.beta, + c_mat(), args.c_offset, args.c_ld, + &queue_plain, &event); + clWaitForEvents(1, &event); + if (status != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast error: "+ToString(static_cast(status))); + } + }; + + // Creates the clBLAS lambda (for comparison) + auto clblas_lambda = [&args, &a_mat, &b_mat, &c_mat, &queue]() { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXsyr2k(static_cast(args.layout), + static_cast(args.triangle), + static_cast(args.a_transpose), + args.n, args.k, + args.alpha, + a_mat(), args.a_offset, args.a_ld, + b_mat(), args.b_offset, args.b_ld, + args.beta, + c_mat(), args.c_offset, args.c_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + if (status != CL_SUCCESS) { + throw std::runtime_error("clBLAS error: "+ToString(static_cast(status))); + } + }; + + // Runs the routines and collect the timings + auto ms_clblast = TimedExecution(args.num_runs, clblast_lambda); + auto ms_clblas = TimedExecution(args.num_runs, clblas_lambda); + + // Prints the performance of both libraries + const auto flops = 2 * args.n * args.n * args.k; + const auto bytes = (args.n*args.k + args.n*args.n) * sizeof(T); + const auto output_ints = std::vector{args.n, args.k, + static_cast(args.layout), + static_cast(args.triangle), + static_cast(args.a_transpose), + args.a_ld, args.b_ld, args.c_ld, + args.a_offset, args.b_offset, args.c_offset}; + const auto output_strings = std::vector{ToString(args.alpha), + ToString(args.beta)}; + PrintTableRow(output_ints, output_strings, args.no_abbrv, + ms_clblast, ms_clblas, flops, bytes); +} + +// ================================================================================================= + +// Main function which calls the common client code with the routine-specific function as argument. +void ClientXsyr2k(int argc, char *argv[]) { + const auto o = std::vector{kArgN, kArgK, + kArgLayout, kArgTriangle, kArgATransp, + kArgALeadDim, kArgBLeadDim, kArgCLeadDim, + kArgAOffset, kArgBOffset, kArgCOffset, + kArgAlpha, kArgBeta}; + switch(GetPrecision(argc, argv)) { + case Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case Precision::kSingle: ClientABC(argc, argv, PerformanceXsyr2k, o, true); break; + case Precision::kDouble: ClientABC(argc, argv, PerformanceXsyr2k, o, true); break; + case Precision::kComplexSingle: ClientABC(argc, argv, PerformanceXsyr2k, o, true); break; + case Precision::kComplexDouble: ClientABC(argc, argv, PerformanceXsyr2k, o, true); break; + } +} + +// ================================================================================================= +} // namespace clblast + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::ClientXsyr2k(argc, argv); + return 0; +} + +// ================================================================================================= From e27e339ebfa7eb7b20c442d431dd41db17f30583 Mon Sep 17 00:00:00 2001 From: CNugteren Date: Fri, 26 Jun 2015 17:43:17 +0200 Subject: [PATCH 15/15] Replaced crosses with tickmarks --- README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 29add625..8ed05286 100644 --- a/README.md +++ b/README.md @@ -124,7 +124,7 @@ These graphs can be generated automatically on your own device. First, compile C Supported routines ------------- -CLBlast is in active development and currently does not support the full set of BLAS routines. The currently supported routines are marked with `x` in the following tables: +CLBlast is in active development and currently does not support the full set of BLAS routines. The currently supported routines are marked with '✔' in the following tables: | Level-1 | S | D | C | Z | Notes | | ---------|---|---|---|---|---------| @@ -135,7 +135,7 @@ CLBlast is in active development and currently does not support the full set of | xSWAP | | | | | | | xSCAL | | | | | +CS +ZD | | xCOPY | | | | | | -| xAXPY |`x`|`x`|`x`|`x`| | +| xAXPY | ✔ | ✔ | ✔ | ✔ | | | xDOT | | | - | - | +DS | | xDOTU | - | - | | | | | xDOTC | - | - | | | | @@ -147,7 +147,7 @@ CLBlast is in active development and currently does not support the full set of | Level-2 | S | D | C | Z | Notes | | ---------|---|---|---|---|---------| -| xGEMV |`x`|`x`|`x`|`x`| | +| xGEMV | ✔ | ✔ | ✔ | ✔ | | | xGBMV | | | | | | | xHEMV | - | - | | | | | xHBMV | - | - | | | | @@ -175,12 +175,12 @@ CLBlast is in active development and currently does not support the full set of | Level-3 | S | D | C | Z | Notes | | ---------|---|---|---|---|---------| -| xGEMM |`x`|`x`|`x`|`x`| | -| xSYMM |`x`|`x`|`x`|`x`| | +| xGEMM | ✔ | ✔ | ✔ | ✔ | | +| xSYMM | ✔ | ✔ | ✔ | ✔ | | | xHEMM | - | - | | | | -| xSYRK |`x`|`x`|`x`|`x`| | +| xSYRK | ✔ | ✔ | ✔ | ✔ | | | xHERK | - | - | | | | -| xSYR2K |`x`|`x`|`x`|`x`| | +| xSYR2K | ✔ | ✔ | ✔ | ✔ | | | xHER2K | - | - | | | | | xTRMM | | | | | | | xTRSM | | | | | |