diff --git a/CHANGELOG b/CHANGELOG index 12d9322e..3f9cb377 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,9 @@ +Development version (next release) +- Added level-3 routines: + * SSYRK/DSYRK/CSYRK/ZSYRK + * SSYR2K/DSYR2K/CSYR2K/ZSYR2K + Version 0.2.0 - Added support for complex conjugate transpose - Several host-code performance improvements diff --git a/CMakeLists.txt b/CMakeLists.txt index 038e71ae..9a1ec976 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -97,8 +97,10 @@ set(KERNELS copy pad transpose padtranspose xaxpy xgemv xgemm) set(SAMPLE_PROGRAMS sgemm) set(ROUTINES_XY xaxpy) set(ROUTINES_AXY xgemv) -set(ROUTINES_ABC xgemm xsymm) -set(ROUTINES ${ROUTINES_XY} ${ROUTINES_AXY} ${ROUTINES_ABC}) +set(ROUTINES_ABC xgemm xsymm xsyr2k) +set(ROUTINES_AB ) +set(ROUTINES_AC xsyrk) +set(ROUTINES ${ROUTINES_XY} ${ROUTINES_AXY} ${ROUTINES_ABC} ${ROUTINES_AB} ${ROUTINES_AC}) # ================================================================================================== @@ -172,6 +174,8 @@ if(TESTS) add_library(test_correctness_xy OBJECT test/correctness/testxy.cc) add_library(test_correctness_axy OBJECT test/correctness/testaxy.cc) add_library(test_correctness_abc OBJECT test/correctness/testabc.cc) + add_library(test_correctness_ab OBJECT test/correctness/testab.cc) + add_library(test_correctness_ac OBJECT test/correctness/testac.cc) # Compiles the correctness-tests foreach(ROUTINE ${ROUTINES_XY}) @@ -179,22 +183,32 @@ if(TESTS) $ $ test/correctness/routines/${ROUTINE}.cc) - target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES}) - install(TARGETS test_${ROUTINE} DESTINATION bin) endforeach() foreach(ROUTINE ${ROUTINES_AXY}) add_executable(test_${ROUTINE} $ $ test/correctness/routines/${ROUTINE}.cc) - target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES}) - install(TARGETS test_${ROUTINE} DESTINATION bin) endforeach() foreach(ROUTINE ${ROUTINES_ABC}) add_executable(test_${ROUTINE} $ $ test/correctness/routines/${ROUTINE}.cc) + endforeach() + foreach(ROUTINE ${ROUTINES_AB}) + add_executable(test_${ROUTINE} + $ + $ + test/correctness/routines/${ROUTINE}.cc) + endforeach() + foreach(ROUTINE ${ROUTINES_AC}) + add_executable(test_${ROUTINE} + $ + $ + test/correctness/routines/${ROUTINE}.cc) + endforeach() + foreach(ROUTINE ${ROUTINES}) target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES}) install(TARGETS test_${ROUTINE} DESTINATION bin) endforeach() diff --git a/README.md b/README.md index 1bed1146..8ed05286 100644 --- a/README.md +++ b/README.md @@ -124,7 +124,7 @@ These graphs can be generated automatically on your own device. First, compile C Supported routines ------------- -CLBlast is in active development and currently does not support the full set of BLAS routines. The currently supported routines are marked with `x` in the following tables: +CLBlast is in active development and currently does not support the full set of BLAS routines. The currently supported routines are marked with '✔' in the following tables: | Level-1 | S | D | C | Z | Notes | | ---------|---|---|---|---|---------| @@ -135,7 +135,7 @@ CLBlast is in active development and currently does not support the full set of | xSWAP | | | | | | | xSCAL | | | | | +CS +ZD | | xCOPY | | | | | | -| xAXPY |`x`|`x`|`x`|`x`| | +| xAXPY | ✔ | ✔ | ✔ | ✔ | | | xDOT | | | - | - | +DS | | xDOTU | - | - | | | | | xDOTC | - | - | | | | @@ -147,7 +147,7 @@ CLBlast is in active development and currently does not support the full set of | Level-2 | S | D | C | Z | Notes | | ---------|---|---|---|---|---------| -| xGEMV |`x`|`x`|`x`|`x`| | +| xGEMV | ✔ | ✔ | ✔ | ✔ | | | xGBMV | | | | | | | xHEMV | - | - | | | | | xHBMV | - | - | | | | @@ -175,12 +175,12 @@ CLBlast is in active development and currently does not support the full set of | Level-3 | S | D | C | Z | Notes | | ---------|---|---|---|---|---------| -| xGEMM |`x`|`x`|`x`|`x`| | -| xSYMM |`x`|`x`|`x`|`x`| | +| xGEMM | ✔ | ✔ | ✔ | ✔ | | +| xSYMM | ✔ | ✔ | ✔ | ✔ | | | xHEMM | - | - | | | | -| xSYRK | | | | | | +| xSYRK | ✔ | ✔ | ✔ | ✔ | | | xHERK | - | - | | | | -| xSYR2K | | | | | | +| xSYR2K | ✔ | ✔ | ✔ | ✔ | | | xHER2K | - | - | | | | | xTRMM | | | | | | | xTRSM | | | | | | diff --git a/doc/performance/Iris/SSYR2K.pdf b/doc/performance/Iris/SSYR2K.pdf new file mode 100644 index 00000000..a8e82a4e Binary files /dev/null and b/doc/performance/Iris/SSYR2K.pdf differ diff --git a/doc/performance/Iris/SSYRK.pdf b/doc/performance/Iris/SSYRK.pdf new file mode 100644 index 00000000..26cba385 Binary files /dev/null and b/doc/performance/Iris/SSYRK.pdf differ diff --git a/doc/performance/Tesla_K40m/SSYRK.pdf b/doc/performance/Tesla_K40m/SSYRK.pdf new file mode 100644 index 00000000..ada28248 Binary files /dev/null and b/doc/performance/Tesla_K40m/SSYRK.pdf differ diff --git a/include/clblast.h b/include/clblast.h index 231348b8..da504a0b 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -107,7 +107,7 @@ StatusCode Gemv(const Layout layout, const Transpose transpose_a, // ================================================================================================= // BLAS level-3 (matrix-matrix) routines -// Templated-precision generalized matrix-matrix multiplication: SGEMM/DGEMM +// Templated-precision generalized matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM template StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpose transpose_b, const size_t m, const size_t n, const size_t k, @@ -118,7 +118,7 @@ StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpos cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); -// Templated-precision symmetric matrix-matrix multiplication: SSYMM/DSYMM +// Templated-precision symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM template StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, const size_t m, const size_t n, @@ -129,6 +129,27 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event); +// Templated-precision rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK +template +StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose transpose_a, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event); + +// Templated-precision rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K +template +StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose transpose_ab, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event); + // ================================================================================================= } // namespace clblast diff --git a/include/internal/routine.h b/include/internal/routine.h index a65ced20..d2cee52c 100644 --- a/include/internal/routine.h +++ b/include/internal/routine.h @@ -92,7 +92,8 @@ class Routine { const size_t dest_ld, const size_t dest_offset, const Buffer &dest, const bool do_transpose, const bool do_conjugate, - const bool pad, const Program &program); + const bool pad, const bool upper, const bool lower, + const Program &program); // Queries the cache and retrieve either a matching program or a boolean whether a match exists. // The first assumes that the program is available in the cache and will throw an exception diff --git a/include/internal/routines/xsyr2k.h b/include/internal/routines/xsyr2k.h new file mode 100644 index 00000000..6259313c --- /dev/null +++ b/include/internal/routines/xsyr2k.h @@ -0,0 +1,48 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsyr2k routine. The precision is implemented using a template argument. +// The implementation is very similar to Xsyrk (see header for details), except for the fact that +// the main XgemmUpper/XgemmLower kernel is called twice: C = AB^T + C and C = BA^T + C. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XSYR2K_H_ +#define CLBLAST_ROUTINES_XSYR2K_H_ + +#include "internal/routine.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xsyr2k: public Routine { + public: + Xsyr2k(CommandQueue &queue, Event &event); + + // Templated-precision implementation of the routine + StatusCode DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); + + private: + // Static variable to get the precision + const static Precision precision_; +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XSYR2K_H_ +#endif diff --git a/include/internal/routines/xsyrk.h b/include/internal/routines/xsyrk.h new file mode 100644 index 00000000..3dab731f --- /dev/null +++ b/include/internal/routines/xsyrk.h @@ -0,0 +1,49 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsyrk routine. The precision is implemented using a template argument. +// The implementation is based on the regular Xgemm routine and kernel, but with two main changes: +// 1) The final unpad(transpose) kernel updates only the upper/lower triangular part. +// 2) The main Xgemm kernel masks workgroups not contributing to usefull data. This is only for +// performance reasons, as the actual masking is done later (see the first point). +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XSYRK_H_ +#define CLBLAST_ROUTINES_XSYRK_H_ + +#include "internal/routine.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xsyrk: public Routine { + public: + Xsyrk(CommandQueue &queue, Event &event); + + // Templated-precision implementation of the routine + StatusCode DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); + + private: + // Static variable to get the precision + const static Precision precision_; +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XSYRK_H_ +#endif diff --git a/src/clblast.cc b/src/clblast.cc index bb0091a3..b8aa1e39 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -26,6 +26,8 @@ // BLAS level-3 includes #include "internal/routines/xgemm.h" #include "internal/routines/xsymm.h" +#include "internal/routines/xsyrk.h" +#include "internal/routines/xsyr2k.h" namespace clblast { // ================================================================================================= @@ -209,7 +211,7 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, std::string kernel_source = #include "kernels/xgemm.opencl" auto status = routine.SetUp(common_source1 + common_source2 + common_source3 + common_source4 + - kernel_source); + kernel_source); if (status != StatusCode::kSuccess) { return status; } // Runs the routine @@ -243,5 +245,119 @@ template StatusCode Symm(const Layout, const Side, const Triangle, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +// ================================================================================================= + +// SYRK +template +StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose transpose_a, + const size_t n, const size_t k, const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const T beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = CommandQueue(*queue); + auto event_cpp = Event(*event); + auto routine = Xsyrk(queue_cpp, event_cpp); + + // Loads the kernel source-code as an include (C++11 raw string literal) + std::string common_source1 = + #include "kernels/copy.opencl" + std::string common_source2 = + #include "kernels/pad.opencl" + std::string common_source3 = + #include "kernels/transpose.opencl" + std::string common_source4 = + #include "kernels/padtranspose.opencl" + std::string kernel_source = + #include "kernels/xgemm.opencl" + auto status = routine.SetUp(common_source1 + common_source2 + common_source3 + common_source4 + + kernel_source); + if (status != StatusCode::kSuccess) { return status; } + + // Runs the routine + return routine.DoSyrk(layout, triangle, transpose_a, n, k, alpha, + Buffer(a_buffer), a_offset, a_ld, beta, + Buffer(c_buffer), c_offset, c_ld); +} +template StatusCode Syrk(const Layout, const Triangle, const Transpose, + const size_t, const size_t, const float, + const cl_mem, const size_t, const size_t, const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Syrk(const Layout, const Triangle, const Transpose, + const size_t, const size_t, const double, + const cl_mem, const size_t, const size_t, const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Syrk(const Layout, const Triangle, const Transpose, + const size_t, const size_t, const float2, + const cl_mem, const size_t, const size_t, const float2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Syrk(const Layout, const Triangle, const Transpose, + const size_t, const size_t, const double2, + const cl_mem, const size_t, const size_t, const double2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + +// ================================================================================================= + +// SYR2K +template +StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose transpose_ab, + const size_t n, const size_t k, const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event) { + auto queue_cpp = CommandQueue(*queue); + auto event_cpp = Event(*event); + auto routine = Xsyr2k(queue_cpp, event_cpp); + + // Loads the kernel source-code as an include (C++11 raw string literal) + std::string common_source1 = + #include "kernels/copy.opencl" + std::string common_source2 = + #include "kernels/pad.opencl" + std::string common_source3 = + #include "kernels/transpose.opencl" + std::string common_source4 = + #include "kernels/padtranspose.opencl" + std::string kernel_source = + #include "kernels/xgemm.opencl" + auto status = routine.SetUp(common_source1 + common_source2 + common_source3 + common_source4 + + kernel_source); + if (status != StatusCode::kSuccess) { return status; } + + // Runs the routine + return routine.DoSyr2k(layout, triangle, transpose_ab, n, k, alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(b_buffer), b_offset, b_ld, beta, + Buffer(c_buffer), c_offset, c_ld); +} +template StatusCode Syr2k(const Layout, const Triangle, const Transpose, + const size_t, const size_t, const float, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, const float, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Syr2k(const Layout, const Triangle, const Transpose, + const size_t, const size_t, const double, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, const double, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Syr2k(const Layout, const Triangle, const Transpose, + const size_t, const size_t, const float2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, const float2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); +template StatusCode Syr2k(const Layout, const Triangle, const Transpose, + const size_t, const size_t, const double2, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, const double2, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); + // ================================================================================================= } // namespace clblast diff --git a/src/kernels/pad.opencl b/src/kernels/pad.opencl index 45eaef91..cce0c746 100644 --- a/src/kernels/pad.opencl +++ b/src/kernels/pad.opencl @@ -86,7 +86,8 @@ __kernel void UnPadMatrix(const int src_one, const int src_two, __global const real* restrict src, const int dest_one, const int dest_two, const int dest_ld, const int dest_offset, - __global real* dest) { + __global real* dest, + const int upper, const int lower) { // Loops over the work per thread in both dimensions #pragma unroll @@ -95,11 +96,18 @@ __kernel void UnPadMatrix(const int src_one, const int src_two, #pragma unroll for (int w_two=0; w_two= id_one); } + else if (lower == 1) { condition = (id_two <= id_one); } + if (condition) { // Copies the value into the destination matrix. This is always within bounds of the source // matrix, as we know that the destination matrix is smaller than the source. - dest[id_two*dest_ld + id_one + dest_offset] = src[id_two*src_ld + id_one + src_offset]; + if (id_two < dest_two && id_one < dest_one) { + dest[id_two*dest_ld + id_one + dest_offset] = src[id_two*src_ld + id_one + src_offset]; + } } } } diff --git a/src/kernels/padtranspose.opencl b/src/kernels/padtranspose.opencl index 2f2aabd6..7e923392 100644 --- a/src/kernels/padtranspose.opencl +++ b/src/kernels/padtranspose.opencl @@ -100,7 +100,8 @@ __kernel void UnPadTransposeMatrix(const int src_one, const int src_two, __global const real* restrict src, const int dest_one, const int dest_two, const int dest_ld, const int dest_offset, - __global real* dest) { + __global real* dest, + const int upper, const int lower) { // Local memory to store a tile of the matrix (for coalescing) __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD]; @@ -137,10 +138,17 @@ __kernel void UnPadTransposeMatrix(const int src_one, const int src_two, const int id_dest_one = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(0); const int id_dest_two = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(1); - // Stores the transposed value in the destination matrix - if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) { - real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one]; - dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value; + // Masking in case of triangular matrices: updates only the upper or lower part + bool condition = true; + if (upper == 1) { condition = (id_dest_one >= id_dest_two); } + else if (lower == 1) { condition = (id_dest_one <= id_dest_two); } + if (condition) { + + // Stores the transposed value in the destination matrix + if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) { + real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one]; + dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value; + } } } } diff --git a/src/kernels/xgemm.opencl b/src/kernels/xgemm.opencl index a4f45e90..4c7ae064 100644 --- a/src/kernels/xgemm.opencl +++ b/src/kernels/xgemm.opencl @@ -127,6 +127,55 @@ R"( // ================================================================================================= +// Initializes the accumulation registers to zero +inline void InitAccRegisters(realM cpm[NWI][MWI/VWM]) { + #pragma unroll + for (int mi=0; mi get_group_id(0)*MWG) { + return; + } + + // Allocates workgroup-private memory (local memory) + #if SA == 1 + __local realM alm[KWG * MWG/VWM]; + #endif + #if SB == 1 + __local realN blm[KWG * NWG/VWN]; + #endif + + // Computes the matrix-multiplication and stores the result in register memory + realM cpm[NWI][MWI/VWM]; + #if SA == 1 && SB == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm); + #elif SA == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm); + #elif SB == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm); + #else + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm); + #endif + + // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta + StoreResults(cgm, cpm, kSizeN, alpha, beta); +} + +// ================================================================================================= + // End of the C++11 raw string literal )"; diff --git a/src/routine.cc b/src/routine.cc index a4e0bb37..4b7ece41 100644 --- a/src/routine.cc +++ b/src/routine.cc @@ -210,11 +210,13 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr const size_t dest_ld, const size_t dest_offset, const Buffer &dest, const bool do_transpose, const bool do_conjugate, - const bool pad, const Program &program) { + const bool pad, const bool upper, const bool lower, + const Program &program) { // Determines whether or not the fast-version could potentially be used auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) && - (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld); + (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld) && + (upper == false) && (lower == false); // Determines the right kernel auto kernel_name = std::string{}; @@ -267,6 +269,10 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr if (pad) { kernel.SetArgument(10, static_cast(do_conjugate)); } + else { + kernel.SetArgument(10, static_cast(upper)); + kernel.SetArgument(11, static_cast(lower)); + } } // Launches the kernel and returns the error code. Uses global and local thread sizes based on diff --git a/src/routines/xgemm.cc b/src/routines/xgemm.cc index 20cd2675..651ebb55 100644 --- a/src/routines/xgemm.cc +++ b/src/routines/xgemm.cc @@ -108,18 +108,18 @@ StatusCode Xgemm::DoGemm(const Layout layout, // them up until they reach a certain multiple of size (kernel parameter dependent). status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer, m_ceiled, k_ceiled, m_ceiled, 0, temp_a, - a_do_transpose, a_conjugate, true, program); + a_do_transpose, a_conjugate, true, false, false, program); if (ErrorIn(status)) { return status; } status = PadCopyTransposeMatrix(b_one, b_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, temp_b, - b_do_transpose, b_conjugate, true, program); + b_do_transpose, b_conjugate, true, false, false, program); if (ErrorIn(status)) { return status; } // Only necessary for matrix C if it used both as input and output if (beta != static_cast(0)) { status = PadCopyTransposeMatrix(c_one, c_two, c_ld, c_offset, c_buffer, m_ceiled, n_ceiled, m_ceiled, 0, temp_c, - c_do_transpose, false, true, program); + c_do_transpose, false, true, false, false, program); if (ErrorIn(status)) { return status; } } @@ -151,7 +151,7 @@ StatusCode Xgemm::DoGemm(const Layout layout, // Runs the post-processing kernel status = PadCopyTransposeMatrix(m_ceiled, n_ceiled, m_ceiled, 0, temp_c, c_one, c_two, c_ld, c_offset, c_buffer, - c_do_transpose, false, false, program); + c_do_transpose, false, false, false, false, program); if (ErrorIn(status)) { return status; } // Successfully finished the computation diff --git a/src/routines/xsyr2k.cc b/src/routines/xsyr2k.cc new file mode 100644 index 00000000..a7aa6945 --- /dev/null +++ b/src/routines/xsyr2k.cc @@ -0,0 +1,169 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsyr2k class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/xsyr2k.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Specific implementations to get the memory-type based on a template argument +template <> const Precision Xsyr2k::precision_ = Precision::kSingle; +template <> const Precision Xsyr2k::precision_ = Precision::kDouble; +template <> const Precision Xsyr2k::precision_ = Precision::kComplexSingle; +template <> const Precision Xsyr2k::precision_ = Precision::kComplexDouble; + +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xsyr2k::Xsyr2k(CommandQueue &queue, Event &event): + Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + + // Makes sure all dimensions are larger than zero + if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } + + // Computes whether or not the matrices are transposed in memory. This is based on their layout + // (row or column-major) and whether or not they are requested to be pre-transposed. + auto ab_rotated = (layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || + (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo); + auto c_rotated = (layout == Layout::kRowMajor); + + // In case of complex data-types, the transpose can also become a conjugate transpose + auto ab_conjugate = (ab_transpose == Transpose::kConjugate); + + // Computes the first and second dimensions of the A and B matrices taking the layout into account + auto ab_one = (ab_rotated) ? k : n; + auto ab_two = (ab_rotated) ? n : k; + + // Tests the matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and + // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the + // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage + // space. Also tests that the leading dimensions of: + // matrix A cannot be less than N when rotated, or less than K when not-rotated + // matrix B cannot be less than N when rotated, or less than K when not-rotated + // matrix C cannot be less than N + auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + + // Calculates the ceiled versions of n and k + auto n_ceiled = Ceil(n, db_["NWG"]); + auto k_ceiled = Ceil(k, db_["KWG"]); + + // Decides which kernel to run: the upper-triangular or lower-triangular version + auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; + + // Allocates space on the device for padded and/or transposed input and output matrices. + try { + auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); + auto temp_b = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); + auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T)); + + // Loads the program from the database + auto& program = GetProgramFromCache(); + + // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to + // fill them up until they reach a certain multiple of size (kernel parameter dependent). + status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, temp_a, + ab_rotated, ab_conjugate, true, false, false, program); + if (ErrorIn(status)) { return status; } + status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, temp_b, + ab_rotated, ab_conjugate, true, false, false, program); + if (ErrorIn(status)) { return status; } + + // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to + // modify the other triangle. + status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer, + n_ceiled, n_ceiled, n_ceiled, 0, temp_c, + c_rotated, false, true, false, false, program); + if (ErrorIn(status)) { return status; } + + // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary + try { + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast(n_ceiled)); + kernel.SetArgument(1, static_cast(k_ceiled)); + kernel.SetArgument(2, alpha); + kernel.SetArgument(3, beta); + kernel.SetArgument(4, temp_a()); + kernel.SetArgument(5, temp_b()); + kernel.SetArgument(6, temp_c()); + + // Computes the global and local thread sizes + auto global = std::vector{ + (n_ceiled * db_["MDIMC"]) / db_["MWG"], + (n_ceiled * db_["NDIMC"]) / db_["NWG"] + }; + auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; + + // Launches the kernel + status = RunKernel(kernel, global, local); + if (ErrorIn(status)) { return status; } + + // Swaps the arguments for matrices A and B, and sets 'beta' to 1 + auto one = static_cast(1); + kernel.SetArgument(3, one); + kernel.SetArgument(4, temp_b()); + kernel.SetArgument(5, temp_a()); + + // Runs the kernel again + status = RunKernel(kernel, global, local); + if (ErrorIn(status)) { return status; } + + // Runs the post-processing kernel + auto upper = (triangle == Triangle::kUpper); + auto lower = (triangle == Triangle::kLower); + status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c, + n, n, c_ld, c_offset, c_buffer, + c_rotated, false, false, upper, lower, program); + if (ErrorIn(status)) { return status; } + + // Successfully finished the computation + return StatusCode::kSuccess; + } catch (...) { return StatusCode::kInvalidKernel; } + } catch (...) { return StatusCode::kTempBufferAllocFailure; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xsyr2k; +template class Xsyr2k; +template class Xsyr2k; +template class Xsyr2k; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/xsyrk.cc b/src/routines/xsyrk.cc new file mode 100644 index 00000000..d8c150fd --- /dev/null +++ b/src/routines/xsyrk.cc @@ -0,0 +1,150 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsyrk class (see the header for information about the class). +// +// ================================================================================================= + +#include "internal/routines/xsyrk.h" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Specific implementations to get the memory-type based on a template argument +template <> const Precision Xsyrk::precision_ = Precision::kSingle; +template <> const Precision Xsyrk::precision_ = Precision::kDouble; +template <> const Precision Xsyrk::precision_ = Precision::kComplexSingle; +template <> const Precision Xsyrk::precision_ = Precision::kComplexDouble; + +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xsyrk::Xsyrk(CommandQueue &queue, Event &event): + Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) { +} + +// ================================================================================================= + +// The main routine +template +StatusCode Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + + // Makes sure all dimensions are larger than zero + if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } + + // Computes whether or not the matrices are transposed in memory. This is based on their layout + // (row or column-major) and whether or not they are requested to be pre-transposed. + auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) || + (layout == Layout::kRowMajor && a_transpose == Transpose::kNo); + auto c_rotated = (layout == Layout::kRowMajor); + + // In case of complex data-types, the transpose can also become a conjugate transpose + auto a_conjugate = (a_transpose == Transpose::kConjugate); + + // Computes the first and second dimensions of the A matrix taking the layout into account + auto a_one = (a_rotated) ? k : n; + auto a_two = (a_rotated) ? n : k; + + // Tests the two matrices (A, C) for validity, first from a perspective of the OpenCL buffers and + // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the + // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage + // space. Also tests that the leading dimensions of: + // matrix A cannot be less than N when rotated, or less than K when not-rotated + // matrix C cannot be less than N + auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T)); + if (ErrorIn(status)) { return status; } + + // Calculates the ceiled versions of n and k + auto n_ceiled = Ceil(n, db_["NWG"]); + auto k_ceiled = Ceil(k, db_["KWG"]); + + // Decides which kernel to run: the upper-triangular or lower-triangular version + auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; + + // Allocates space on the device for padded and/or transposed input and output matrices. + try { + auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); + auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T)); + + // Loads the program from the database + auto& program = GetProgramFromCache(); + + // Runs the pre-processing kernel. This transposes the matrix A, but also pads zeros to + // fill them up until they reach a certain multiple of size (kernel parameter dependent). + status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer, + n_ceiled, k_ceiled, n_ceiled, 0, temp_a, + a_rotated, a_conjugate, true, false, false, program); + if (ErrorIn(status)) { return status; } + + // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to + // modify the other triangle. + status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer, + n_ceiled, n_ceiled, n_ceiled, 0, temp_c, + c_rotated, false, true, false, false, program); + if (ErrorIn(status)) { return status; } + + // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary + try { + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast(n_ceiled)); + kernel.SetArgument(1, static_cast(k_ceiled)); + kernel.SetArgument(2, alpha); + kernel.SetArgument(3, beta); + kernel.SetArgument(4, temp_a()); + kernel.SetArgument(5, temp_a()); + kernel.SetArgument(6, temp_c()); + + // Computes the global and local thread sizes + auto global = std::vector{ + (n_ceiled * db_["MDIMC"]) / db_["MWG"], + (n_ceiled * db_["NDIMC"]) / db_["NWG"] + }; + auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; + + // Launches the kernel + status = RunKernel(kernel, global, local); + if (ErrorIn(status)) { return status; } + + // Runs the post-processing kernel + auto upper = (triangle == Triangle::kUpper); + auto lower = (triangle == Triangle::kLower); + status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c, + n, n, c_ld, c_offset, c_buffer, + c_rotated, false, false, upper, lower, program); + if (ErrorIn(status)) { return status; } + + // Successfully finished the computation + return StatusCode::kSuccess; + } catch (...) { return StatusCode::kInvalidKernel; } + } catch (...) { return StatusCode::kTempBufferAllocFailure; } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xsyrk; +template class Xsyrk; +template class Xsyrk; +template class Xsyrk; + +// ================================================================================================= +} // namespace clblast diff --git a/test/correctness/routines/xgemm.cc b/test/correctness/routines/xgemm.cc index 4129e17c..b829aaa3 100644 --- a/test/correctness/routines/xgemm.cc +++ b/test/correctness/routines/xgemm.cc @@ -76,7 +76,7 @@ void XgemmTest(int argc, char *argv[], const bool silent, const std::string &nam const auto case_name = ToString(layout)+" "+ToString(a_transpose)+" "+ToString(b_transpose); // Runs the tests - tester.TestRegular(args, case_name); + tester.TestRegular(args, case_name, false); tester.TestInvalidBufferSizes(args, case_name); } } diff --git a/test/correctness/routines/xsymm.cc b/test/correctness/routines/xsymm.cc index d769177f..0e02e9ca 100644 --- a/test/correctness/routines/xsymm.cc +++ b/test/correctness/routines/xsymm.cc @@ -76,7 +76,7 @@ void XsymmTest(int argc, char *argv[], const bool silent, const std::string &nam const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle); // Runs the tests - tester.TestRegular(args, case_name); + tester.TestRegular(args, case_name, true); tester.TestInvalidBufferSizes(args, case_name); } } diff --git a/test/correctness/routines/xsyr2k.cc b/test/correctness/routines/xsyr2k.cc new file mode 100644 index 00000000..3365e6a8 --- /dev/null +++ b/test/correctness/routines/xsyr2k.cc @@ -0,0 +1,99 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under the MIT license. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the tests for the Xsyr2k routine. It is based on the TestABC class. +// +// ================================================================================================= + +#include "wrapper_clblas.h" +#include "correctness/testabc.h" + +namespace clblast { +// ================================================================================================= + +// The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison. +template +void Xsyr2kTest(int argc, char *argv[], const bool silent, const std::string &name) { + + // Creates the CLBlast lambda + auto clblast_lambda = [](const Arguments &args, + const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat, + CommandQueue &queue) -> StatusCode { + auto queue_plain = queue(); + auto event = cl_event{}; + return Syr2k(args.layout, args.triangle, args.a_transpose, + args.n, args.k, + args.alpha, + a_mat(), args.a_offset, args.a_ld, + b_mat(), args.b_offset, args.b_ld, + args.beta, + c_mat(), args.c_offset, args.c_ld, + &queue_plain, &event); + }; + + // Creates the clBLAS lambda (for comparison) + auto clblas_lambda = [](const Arguments &args, + const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat, + CommandQueue &queue) -> StatusCode { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXsyr2k(static_cast(args.layout), + static_cast(args.triangle), + static_cast(args.a_transpose), + args.n, args.k, + args.alpha, + a_mat(), args.a_offset, args.a_ld, + b_mat(), args.b_offset, args.b_ld, + args.beta, + c_mat(), args.c_offset, args.c_ld, + 1, &queue_plain, 0, nullptr, &event); + return static_cast(status); + }; + + // Initializes the arguments relevant for this routine + auto args = Arguments{}; + const auto options = std::vector{kArgN, kArgK, kArgLayout, + kArgTriangle, kArgATransp, + kArgALeadDim, kArgBLeadDim, kArgCLeadDim, + kArgAOffset, kArgBOffset, kArgCOffset}; + + // Creates a tester + TestABC tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda}; + + // Loops over the test-cases from a data-layout point of view + for (auto &layout: tester.kLayouts) { + args.layout = layout; + for (auto &triangle: {Triangle::kUpper, Triangle::kLower}) { + args.triangle = triangle; + for (auto &ab_transpose: {Transpose::kNo, Transpose::kYes}) { // No conjugate here since it is + args.a_transpose = ab_transpose; // not supported by clBLAS + args.b_transpose = ab_transpose; + const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(ab_transpose); + + // Runs the tests + tester.TestRegular(args, case_name, true); + tester.TestInvalidBufferSizes(args, case_name); + } + } + } +} + +// ================================================================================================= +} // namespace clblast + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::Xsyr2kTest(argc, argv, false, "SSYR2K"); + clblast::Xsyr2kTest(argc, argv, true, "DSYR2K"); + clblast::Xsyr2kTest(argc, argv, true, "CSYR2K"); + clblast::Xsyr2kTest(argc, argv, true, "ZSYR2K"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/routines/xsyrk.cc b/test/correctness/routines/xsyrk.cc new file mode 100644 index 00000000..8d3bd82e --- /dev/null +++ b/test/correctness/routines/xsyrk.cc @@ -0,0 +1,96 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under the MIT license. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the tests for the Xsyrk routine. It is based on the TestAC class. +// +// ================================================================================================= + +#include "wrapper_clblas.h" +#include "correctness/testac.h" + +namespace clblast { +// ================================================================================================= + +// The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison. +template +void XsyrkTest(int argc, char *argv[], const bool silent, const std::string &name) { + + // Creates the CLBlast lambda + auto clblast_lambda = [](const Arguments &args, + const Buffer &a_mat, const Buffer &c_mat, + CommandQueue &queue) -> StatusCode { + auto queue_plain = queue(); + auto event = cl_event{}; + return Syrk(args.layout, args.triangle, args.a_transpose, + args.n, args.k, + args.alpha, + a_mat(), args.a_offset, args.a_ld, + args.beta, + c_mat(), args.c_offset, args.c_ld, + &queue_plain, &event); + }; + + // Creates the clBLAS lambda (for comparison) + auto clblas_lambda = [](const Arguments &args, + const Buffer &a_mat, const Buffer &c_mat, + CommandQueue &queue) -> StatusCode { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXsyrk(static_cast(args.layout), + static_cast(args.triangle), + static_cast(args.a_transpose), + args.n, args.k, + args.alpha, + a_mat(), args.a_offset, args.a_ld, + args.beta, + c_mat(), args.c_offset, args.c_ld, + 1, &queue_plain, 0, nullptr, &event); + return static_cast(status); + }; + + // Initializes the arguments relevant for this routine + auto args = Arguments{}; + const auto options = std::vector{kArgN, kArgK, kArgLayout, + kArgTriangle, kArgATransp, + kArgALeadDim, kArgCLeadDim, + kArgAOffset, kArgCOffset}; + + // Creates a tester + TestAC tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda}; + + // Loops over the test-cases from a data-layout point of view + for (auto &layout: tester.kLayouts) { + args.layout = layout; + for (auto &triangle: {Triangle::kUpper, Triangle::kLower}) { + args.triangle = triangle; + for (auto &a_transpose: {Transpose::kNo, Transpose::kYes}) { // No conjugate here since it is + args.a_transpose = a_transpose; // not supported by clBLAS + const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(a_transpose); + + // Runs the tests + tester.TestRegular(args, case_name); + tester.TestInvalidBufferSizes(args, case_name); + } + } + } +} + +// ================================================================================================= +} // namespace clblast + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::XsyrkTest(argc, argv, false, "SSYRK"); + clblast::XsyrkTest(argc, argv, true, "DSYRK"); + clblast::XsyrkTest(argc, argv, true, "CSYRK"); + clblast::XsyrkTest(argc, argv, true, "ZSYRK"); + return 0; +} + +// ================================================================================================= diff --git a/test/correctness/testab.cc b/test/correctness/testab.cc new file mode 100644 index 00000000..ef03f32f --- /dev/null +++ b/test/correctness/testab.cc @@ -0,0 +1,192 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under the MIT license. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the TestAB class (see the header for information about the class). +// +// ================================================================================================= + +#include + +#include "correctness/testab.h" + +namespace clblast { +// ================================================================================================= + +// Constructor, initializes the base class tester and input data +template +TestAB::TestAB(int argc, char *argv[], const bool silent, + const std::string &name, const std::vector &options, + const Routine clblast_lambda, const Routine clblas_lambda): + Tester{argc, argv, silent, name, options}, + clblast_lambda_(clblast_lambda), + clblas_lambda_(clblas_lambda) { + + // Computes the maximum sizes. This allows for a single set of input/output buffers. + auto max_dim = *std::max_element(kMatrixDims.begin(), kMatrixDims.end()); + auto max_ld = *std::max_element(kMatrixDims.begin(), kMatrixDims.end()); + auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end()); + + // Creates test input data + a_source_.resize(max_dim*max_ld + max_offset); + b_source_.resize(max_dim*max_ld + max_offset); + PopulateVector(a_source_); + PopulateVector(b_source_); +} + +// =============================================================================================== + +// Tests the routine for a wide variety of parameters +template +void TestAB::TestRegular(Arguments &args, const std::string &name) { + if (!PrecisionSupported()) { return; } + TestStart("regular behaviour", name); + + // Computes whether or not the matrices are transposed. Note that we assume a default of + // column-major and no-transpose. If one of them is different (but not both), then rotated + // is considered true. + auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) || + (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo); + auto b_rotated = (args.layout == Layout::kRowMajor); + + // Iterates over the matrix dimensions + for (auto &m: kMatrixDims) { + args.m = m; + for (auto &n: kMatrixDims) { + args.n = n; + + // Computes the second dimensions of the matrices taking the rotation into account + auto a_two = (a_rotated) ? n : n; + auto b_two = (b_rotated) ? m : n; + + // Iterates over the leading-dimension values and the offsets + for (auto &a_ld: kMatrixDims) { + args.a_ld = a_ld; + for (auto &a_offset: kOffsets) { + args.a_offset = a_offset; + for (auto &b_ld: kMatrixDims) { + args.b_ld = b_ld; + for (auto &b_offset: kOffsets) { + args.b_offset = b_offset; + + // Computes the buffer sizes + auto a_size = a_two * a_ld + a_offset; + auto b_size = b_two * b_ld + b_offset; + if (a_size < 1 || b_size < 1) { continue; } + + // Creates the OpenCL buffers + auto a_mat = Buffer(context_, CL_MEM_READ_WRITE, a_size*sizeof(T)); + auto r_mat = Buffer(context_, CL_MEM_READ_WRITE, b_size*sizeof(T)); + auto s_mat = Buffer(context_, CL_MEM_READ_WRITE, b_size*sizeof(T)); + + // Iterates over the values for alpha and beta + for (auto &alpha: kAlphaValues) { + args.alpha = alpha; + for (auto &beta: kBetaValues) { + args.beta = beta; + + // Runs the reference clBLAS code + a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_); + r_mat.WriteBuffer(queue_, b_size*sizeof(T), b_source_); + auto status1 = clblas_lambda_(args, a_mat, r_mat, queue_); + + // Runs the CLBlast code + a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_); + s_mat.WriteBuffer(queue_, b_size*sizeof(T), b_source_); + auto status2 = clblast_lambda_(args, a_mat, s_mat, queue_); + + // Tests for equality of the two status codes + if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) { + TestErrorCodes(status1, status2, args); + continue; + } + + // Downloads the results + std::vector r_result(b_size, static_cast(0)); + std::vector s_result(b_size, static_cast(0)); + r_mat.ReadBuffer(queue_, b_size*sizeof(T), r_result); + s_mat.ReadBuffer(queue_, b_size*sizeof(T), s_result); + + // Checks for differences in the output + auto errors = size_t{0}; + for (auto idm=size_t{0}; idm +void TestAB::TestInvalidBufferSizes(Arguments &args, const std::string &name) { + if (!PrecisionSupported()) { return; } + TestStart("invalid buffer sizes", name); + + // Sets example test parameters + args.m = kBufferSize; + args.n = kBufferSize; + args.a_ld = kBufferSize; + args.b_ld = kBufferSize; + args.a_offset = 0; + args.b_offset = 0; + + // Iterates over test buffer sizes + const std::vector kBufferSizes = {0, kBufferSize*kBufferSize-1, kBufferSize*kBufferSize}; + for (auto &a_size: kBufferSizes) { + for (auto &b_size: kBufferSizes) { + + // Creates the OpenCL buffers. Note: we are not using the C++ version since we explicitly + // want to be able to create invalid buffers (no error checking here). + auto a = clCreateBuffer(context_(), CL_MEM_READ_WRITE, a_size*sizeof(T), nullptr, nullptr); + auto a_mat = Buffer(a); + auto r = clCreateBuffer(context_(), CL_MEM_READ_WRITE, b_size*sizeof(T), nullptr, nullptr); + auto r_mat = Buffer(r); + auto s = clCreateBuffer(context_(), CL_MEM_READ_WRITE, b_size*sizeof(T), nullptr, nullptr); + auto s_mat = Buffer(s); + + // Runs the two routines + auto status1 = clblas_lambda_(args, a_mat, r_mat, queue_); + auto status2 = clblast_lambda_(args, a_mat, s_mat, queue_); + + // Tests for equality of the two status codes + TestErrorCodes(status1, status2, args); + } + } + TestEnd(); +} + +// ================================================================================================= + +// Compiles the templated class +template class TestAB; +template class TestAB; +template class TestAB; +template class TestAB; + +// ================================================================================================= +} // namespace clblast diff --git a/test/correctness/testab.h b/test/correctness/testab.h new file mode 100644 index 00000000..24a9db7c --- /dev/null +++ b/test/correctness/testab.h @@ -0,0 +1,85 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under the MIT license. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file tests any mat-mat (A,B) routine. It contains two types of tests: one testing +// all sorts of input combinations, and one deliberatly testing with invalid values. +// +// ================================================================================================= + +#ifndef CLBLAST_TEST_CORRECTNESS_TESTAB_H_ +#define CLBLAST_TEST_CORRECTNESS_TESTAB_H_ + +#include +#include + +#include "correctness/tester.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TestAB: public Tester { + public: + + // Uses several variables from the Tester class + using Tester::context_; + using Tester::queue_; + using Tester::kLayouts; + using Tester::kTransposes; + + // Uses several helper functions from the Tester class + using Tester::TestStart; + using Tester::TestEnd; + using Tester::TestSimilarity; + using Tester::TestErrorCount; + using Tester::TestErrorCodes; + using Tester::GetExampleScalars; + using Tester::GetOffsets; + using Tester::PrecisionSupported; + + // Test settings for the regular test. Append to this list in case more tests are required. + const std::vector kMatrixDims = { 7, 64 }; + const std::vector kOffsets = GetOffsets(); + const std::vector kAlphaValues = GetExampleScalars(); + const std::vector kBetaValues = GetExampleScalars(); + + // Test settings for the invalid test + const size_t kBufferSize = 64; + + // Shorthand for a BLAS routine + using Routine = std::function&, + const Buffer&, const Buffer&, + CommandQueue&)>; + + // Constructor, initializes the base class tester and input data + TestAB(int argc, char *argv[], const bool silent, + const std::string &name, const std::vector &options, + const Routine clblast_lambda, const Routine clblas_lambda); + + // The test functions, taking no inputs + void TestRegular(Arguments &args, const std::string &name); + void TestInvalidBufferSizes(Arguments &args, const std::string &name); + + private: + + // Source data to test with + std::vector a_source_; + std::vector b_source_; + + // The routines to test + Routine clblast_lambda_; + Routine clblas_lambda_; +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_CORRECTNESS_TESTAB_H_ +#endif diff --git a/test/correctness/testabc.cc b/test/correctness/testabc.cc index eed17560..64e02523 100644 --- a/test/correctness/testabc.cc +++ b/test/correctness/testabc.cc @@ -45,7 +45,7 @@ TestABC::TestABC(int argc, char *argv[], const bool silent, // Tests the routine for a wide variety of parameters template -void TestABC::TestRegular(Arguments &args, const std::string &name) { +void TestABC::TestRegular(Arguments &args, const std::string &name, const bool symmetric) { if (!PrecisionSupported()) { return; } TestStart("regular behaviour", name); @@ -63,6 +63,7 @@ void TestABC::TestRegular(Arguments &args, const std::string &name) { args.m = m; for (auto &n: kMatrixDims) { args.n = n; + if (symmetric && m != n) { continue; } for (auto &k: kMatrixDims) { args.k = k; diff --git a/test/correctness/testabc.h b/test/correctness/testabc.h index 2c44d532..41100db7 100644 --- a/test/correctness/testabc.h +++ b/test/correctness/testabc.h @@ -64,7 +64,7 @@ class TestABC: public Tester { const Routine clblast_lambda, const Routine clblas_lambda); // The test functions, taking no inputs - void TestRegular(Arguments &args, const std::string &name); + void TestRegular(Arguments &args, const std::string &name, const bool symmetric); void TestInvalidBufferSizes(Arguments &args, const std::string &name); private: diff --git a/test/correctness/testac.cc b/test/correctness/testac.cc new file mode 100644 index 00000000..e16186d9 --- /dev/null +++ b/test/correctness/testac.cc @@ -0,0 +1,191 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under the MIT license. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the TestAC class (see the header for information about the class). +// +// ================================================================================================= + +#include + +#include "correctness/testac.h" + +namespace clblast { +// ================================================================================================= + +// Constructor, initializes the base class tester and input data +template +TestAC::TestAC(int argc, char *argv[], const bool silent, + const std::string &name, const std::vector &options, + const Routine clblast_lambda, const Routine clblas_lambda): + Tester{argc, argv, silent, name, options}, + clblast_lambda_(clblast_lambda), + clblas_lambda_(clblas_lambda) { + + // Computes the maximum sizes. This allows for a single set of input/output buffers. + auto max_dim = *std::max_element(kMatrixDims.begin(), kMatrixDims.end()); + auto max_ld = *std::max_element(kMatrixDims.begin(), kMatrixDims.end()); + auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end()); + + // Creates test input data + a_source_.resize(max_dim*max_ld + max_offset); + c_source_.resize(max_dim*max_ld + max_offset); + PopulateVector(a_source_); + PopulateVector(c_source_); +} + +// =============================================================================================== + +// Tests the routine for a wide variety of parameters +template +void TestAC::TestRegular(Arguments &args, const std::string &name) { + if (!PrecisionSupported()) { return; } + TestStart("regular behaviour", name); + + // Computes whether or not the matrices are transposed. Note that we assume a default of + // column-major and no-transpose. If one of them is different (but not both), then rotated + // is considered true. + auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) || + (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo); + auto c_rotated = (args.layout == Layout::kRowMajor); + + // Iterates over the matrix dimensions + for (auto &n: kMatrixDims) { + args.n = n; + for (auto &k: kMatrixDims) { + args.k = k; + + // Computes the second dimensions of the matrices taking the rotation into account + auto a_two = (a_rotated) ? n : k; + auto c_two = (c_rotated) ? n : n; + + // Iterates over the leading-dimension values and the offsets + for (auto &a_ld: kMatrixDims) { + args.a_ld = a_ld; + for (auto &a_offset: kOffsets) { + args.a_offset = a_offset; + for (auto &c_ld: kMatrixDims) { + args.c_ld = c_ld; + for (auto &c_offset: kOffsets) { + args.c_offset = c_offset; + + // Computes the buffer sizes + auto a_size = a_two * a_ld + a_offset; + auto c_size = c_two * c_ld + c_offset; + if (a_size < 1 || c_size < 1) { continue; } + + // Creates the OpenCL buffers + auto a_mat = Buffer(context_, CL_MEM_READ_WRITE, a_size*sizeof(T)); + auto r_mat = Buffer(context_, CL_MEM_READ_WRITE, c_size*sizeof(T)); + auto s_mat = Buffer(context_, CL_MEM_READ_WRITE, c_size*sizeof(T)); + + // Iterates over the values for alpha and beta + for (auto &alpha: kAlphaValues) { + args.alpha = alpha; + for (auto &beta: kBetaValues) { + args.beta = beta; + + // Runs the reference clBLAS code + a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_); + r_mat.WriteBuffer(queue_, c_size*sizeof(T), c_source_); + auto status1 = clblas_lambda_(args, a_mat, r_mat, queue_); + + // Runs the CLBlast code + a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_); + s_mat.WriteBuffer(queue_, c_size*sizeof(T), c_source_); + auto status2 = clblast_lambda_(args, a_mat, s_mat, queue_); + + // Tests for equality of the two status codes + if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) { + TestErrorCodes(status1, status2, args); + continue; + } + + // Downloads the results + std::vector r_result(c_size, static_cast(0)); + std::vector s_result(c_size, static_cast(0)); + r_mat.ReadBuffer(queue_, c_size*sizeof(T), r_result); + s_mat.ReadBuffer(queue_, c_size*sizeof(T), s_result); + + // Checks for differences in the output + auto errors = size_t{0}; + for (auto idn0=size_t{0}; idn0 +void TestAC::TestInvalidBufferSizes(Arguments &args, const std::string &name) { + if (!PrecisionSupported()) { return; } + TestStart("invalid buffer sizes", name); + + // Sets example test parameters + args.m = kBufferSize; + args.n = kBufferSize; + args.k = kBufferSize; + args.a_ld = kBufferSize; + args.c_ld = kBufferSize; + args.a_offset = 0; + args.c_offset = 0; + + // Iterates over test buffer sizes + const std::vector kBufferSizes = {0, kBufferSize*kBufferSize-1, kBufferSize*kBufferSize}; + for (auto &a_size: kBufferSizes) { + for (auto &c_size: kBufferSizes) { + + // Creates the OpenCL buffers. Note: we are not using the C++ version since we explicitly + // want to be able to create invalid buffers (no error checking here). + auto a = clCreateBuffer(context_(), CL_MEM_READ_WRITE, a_size*sizeof(T), nullptr, nullptr); + auto a_mat = Buffer(a); + auto r = clCreateBuffer(context_(), CL_MEM_READ_WRITE, c_size*sizeof(T), nullptr, nullptr); + auto r_mat = Buffer(r); + auto s = clCreateBuffer(context_(), CL_MEM_READ_WRITE, c_size*sizeof(T), nullptr, nullptr); + auto s_mat = Buffer(s); + + // Runs the two routines + auto status1 = clblas_lambda_(args, a_mat, r_mat, queue_); + auto status2 = clblast_lambda_(args, a_mat, s_mat, queue_); + + // Tests for equality of the two status codes + TestErrorCodes(status1, status2, args); + } + } + TestEnd(); +} + +// ================================================================================================= + +// Compiles the templated class +template class TestAC; +template class TestAC; +template class TestAC; +template class TestAC; + +// ================================================================================================= +} // namespace clblast diff --git a/test/correctness/testac.h b/test/correctness/testac.h new file mode 100644 index 00000000..4ab21e23 --- /dev/null +++ b/test/correctness/testac.h @@ -0,0 +1,85 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under the MIT license. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file tests any mat-mat (A,C) routine. It contains two types of tests: one testing +// all sorts of input combinations, and one deliberatly testing with invalid values. +// +// ================================================================================================= + +#ifndef CLBLAST_TEST_CORRECTNESS_TESTAC_H_ +#define CLBLAST_TEST_CORRECTNESS_TESTAC_H_ + +#include +#include + +#include "correctness/tester.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TestAC: public Tester { + public: + + // Uses several variables from the Tester class + using Tester::context_; + using Tester::queue_; + using Tester::kLayouts; + using Tester::kTransposes; + + // Uses several helper functions from the Tester class + using Tester::TestStart; + using Tester::TestEnd; + using Tester::TestSimilarity; + using Tester::TestErrorCount; + using Tester::TestErrorCodes; + using Tester::GetExampleScalars; + using Tester::GetOffsets; + using Tester::PrecisionSupported; + + // Test settings for the regular test. Append to this list in case more tests are required. + const std::vector kMatrixDims = { 7, 64 }; + const std::vector kOffsets = GetOffsets(); + const std::vector kAlphaValues = GetExampleScalars(); + const std::vector kBetaValues = GetExampleScalars(); + + // Test settings for the invalid test + const size_t kBufferSize = 64; + + // Shorthand for a BLAS routine + using Routine = std::function&, + const Buffer&, const Buffer&, + CommandQueue&)>; + + // Constructor, initializes the base class tester and input data + TestAC(int argc, char *argv[], const bool silent, + const std::string &name, const std::vector &options, + const Routine clblast_lambda, const Routine clblas_lambda); + + // The test functions, taking no inputs + void TestRegular(Arguments &args, const std::string &name); + void TestInvalidBufferSizes(Arguments &args, const std::string &name); + + private: + + // Source data to test with + std::vector a_source_; + std::vector c_source_; + + // The routines to test + Routine clblast_lambda_; + Routine clblas_lambda_; +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_CORRECTNESS_TESTAC_H_ +#endif diff --git a/test/performance/client.cc b/test/performance/client.cc index 3b07970c..b089f925 100644 --- a/test/performance/client.cc +++ b/test/performance/client.cc @@ -115,7 +115,7 @@ void ClientAXY(int argc, char *argv[], Routine3 client_routine, auto a_two = (args.layout == Layout::kRowMajor) ? args.m : args.n; // Computes the vector sizes in case the matrix is transposed - auto a_transposed = (args.a_transpose == Transpose::kYes); + auto a_transposed = (args.a_transpose != Transpose::kNo); auto m_real = (a_transposed) ? args.n : args.m; auto n_real = (a_transposed) ? args.m : args.n; @@ -163,13 +163,13 @@ template void ClientAXY(int, char **, Routine3, const std::vec // ================================================================================================= -// This is the matrix-matrix-matrix variant of the set-up/tear-down client routine. +// This is the matrix-matrix variant of the set-up/tear-down client routine. template -void ClientABC(int argc, char *argv[], Routine3 client_routine, - const std::vector &options) { +void ClientAC(int argc, char *argv[], Routine2 client_routine, + const std::vector &options) { // Function to determine how to find the default value of the leading dimension of matrix A - auto default_ld_a = [](const Arguments args) { return args.m; }; + auto default_ld_a = [](const Arguments args) { return args.k; }; // Simple command line argument parser with defaults auto args = ParseArguments(argc, argv, options, default_ld_a); @@ -188,9 +188,83 @@ void ClientABC(int argc, char *argv[], Routine3 client_routine, // Computes whether or not the matrices are transposed. Note that we assume a default of // column-major and no-transpose. If one of them is different (but not both), then rotated // is considered true. - auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose == Transpose::kYes) || + auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) || (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo); - auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose == Transpose::kYes) || + + // Iterates over all "num_step" values jumping by "step" each time + auto s = size_t{0}; + while(true) { + + // Computes the data sizes + auto a_two = (a_rotated) ? args.n : args.k; + auto a_size = a_two * args.a_ld + args.a_offset; + auto c_size = args.n * args.c_ld + args.c_offset; + + // Populates input host matrices with random data + std::vector a_source(a_size); + std::vector c_source(c_size); + PopulateVector(a_source); + PopulateVector(c_source); + + // Creates the matrices on the device + auto a_buffer = Buffer(context, CL_MEM_READ_WRITE, a_size*sizeof(T)); + auto c_buffer = Buffer(context, CL_MEM_READ_WRITE, c_size*sizeof(T)); + a_buffer.WriteBuffer(queue, a_size*sizeof(T), a_source); + c_buffer.WriteBuffer(queue, c_size*sizeof(T), c_source); + + // Runs the routine-specific code + client_routine(args, a_buffer, c_buffer, queue); + + // Makes the jump to the next step + ++s; + if (s >= args.num_steps) { break; } + args.n += args.step; + args.k += args.step; + args.a_ld += args.step; + args.c_ld += args.step; + } + + // Cleans-up and returns + if (args.compare_clblas) { clblasTeardown(); } +} + +// Compiles the above function +template void ClientAC(int, char **, Routine2, const std::vector&); +template void ClientAC(int, char **, Routine2, const std::vector&); +template void ClientAC(int, char **, Routine2, const std::vector&); +template void ClientAC(int, char **, Routine2, const std::vector&); + +// ================================================================================================= + +// This is the matrix-matrix-matrix variant of the set-up/tear-down client routine. +template +void ClientABC(int argc, char *argv[], Routine3 client_routine, + const std::vector &options, const bool symmetric) { + + // Function to determine how to find the default value of the leading dimension of matrix A + auto default_ld_a = [&symmetric](const Arguments args) { return (symmetric) ? args.n : args.m; }; + + // Simple command line argument parser with defaults + auto args = ParseArguments(argc, argv, options, default_ld_a); + if (args.print_help) { return; } + if (symmetric) { args.m = args.n; } + + // Prints the header of the output table + PrintTableHeader(args.silent, options); + + // Initializes OpenCL and the libraries + auto platform = Platform(args.platform_id); + auto device = Device(platform, kDeviceType, args.device_id); + auto context = Context(device); + auto queue = CommandQueue(context, device); + if (args.compare_clblas) { clblasSetup(); } + + // Computes whether or not the matrices are transposed. Note that we assume a default of + // column-major and no-transpose. If one of them is different (but not both), then rotated + // is considered true. + auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) || + (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo); + auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose != Transpose::kNo) || (args.layout == Layout::kRowMajor && args.b_transpose == Transpose::kNo); auto c_rotated = (args.layout == Layout::kRowMajor); @@ -241,10 +315,10 @@ void ClientABC(int argc, char *argv[], Routine3 client_routine, } // Compiles the above function -template void ClientABC(int, char **, Routine3, const std::vector&); -template void ClientABC(int, char **, Routine3, const std::vector&); -template void ClientABC(int, char **, Routine3, const std::vector&); -template void ClientABC(int, char **, Routine3, const std::vector&); +template void ClientABC(int, char **, Routine3, const std::vector&, const bool); +template void ClientABC(int, char **, Routine3, const std::vector&, const bool); +template void ClientABC(int, char **, Routine3, const std::vector&, const bool); +template void ClientABC(int, char **, Routine3, const std::vector&, const bool); // ================================================================================================= diff --git a/test/performance/client.h b/test/performance/client.h index 5125844a..097ae048 100644 --- a/test/performance/client.h +++ b/test/performance/client.h @@ -52,8 +52,11 @@ template void ClientAXY(int argc, char *argv[], Routine3 client_routine, const std::vector &options); template +void ClientAC(int argc, char *argv[], Routine2 client_routine, + const std::vector &options); +template void ClientABC(int argc, char *argv[], Routine3 client_routine, - const std::vector &options); + const std::vector &options, const bool symmetric); // ================================================================================================= diff --git a/test/performance/graphs/xsyr2k.r b/test/performance/graphs/xsyr2k.r new file mode 100644 index 00000000..0c6c0de2 --- /dev/null +++ b/test/performance/graphs/xsyr2k.r @@ -0,0 +1,94 @@ + +# ================================================================================================== +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +# project uses a tab-size of two spaces and a max-width of 100 characters per line. +# +# Author(s): +# Cedric Nugteren +# +# This file implements the performance script for the Xsyr2k routine +# +# ================================================================================================== + +# Includes the common functions +args <- commandArgs(trailingOnly = FALSE) +thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)]))) +source(file.path(dirname(thisfile), "common.r")) + +# ================================================================================================== + +# Settings +routine_name <- "xsyr2k" +parameters <- c("-n","-k","-layout","-triangle","-transA", + "-num_steps","-step","-runs","-precision") +precision <- 32 + +# Sets the names of the test-cases +test_names <- list( + "multiples of 128", + "multiples of 128 (+1)", + "around n=k=512", + "around n=k=1536", + "layouts and transposing (n=k=1024)", + "powers of 2" +) + +# Defines the test-cases +test_values <- list( + list(c(128, 128, 0, 0, 0, 16, 128, num_runs, precision)), + list(c(129, 129, 0, 0, 0, 16, 128, num_runs, precision)), + list(c(512, 512, 0, 0, 0, 16, 1, num_runs, precision)), + list(c(1536, 1536, 0, 0, 0, 16, 1, num_runs, precision)), + list( + c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision), + c(1024, 1024, 0, 0, 1, 1, 0, num_runs, precision), + c(1024, 1024, 0, 1, 0, 1, 0, num_runs, precision), + c(1024, 1024, 0, 1, 1, 1, 0, num_runs, precision), + c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision), + c(1024, 1024, 1, 0, 1, 1, 0, num_runs, precision), + c(1024, 1024, 1, 1, 0, 1, 0, num_runs, precision), + c(1024, 1024, 1, 1, 1, 1, 0, num_runs, precision) + ), + list( + c(8, 8, 0, 0, 0, 1, 0, num_runs, precision), + c(16, 16, 0, 0, 0, 1, 0, num_runs, precision), + c(32, 32, 0, 0, 0, 1, 0, num_runs, precision), + c(64, 64, 0, 0, 0, 1, 0, num_runs, precision), + c(128, 128, 0, 0, 0, 1, 0, num_runs, precision), + c(256, 256, 0, 0, 0, 1, 0, num_runs, precision), + c(512, 512, 0, 0, 0, 1, 0, num_runs, precision), + c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision), + c(2048, 2048, 0, 0, 0, 1, 0, num_runs, precision), + c(4096, 4096, 0, 0, 0, 1, 0, num_runs, precision), + c(8192, 8192, 0, 0, 0, 1, 0, num_runs, precision) + ) +) + +# Defines the x-labels corresponding to the test-cases +test_xlabels <- list( + "matrix sizes (n=k)", + "matrix sizes (n=k)", + "matrix sizes (n=k)", + "matrix sizes (n=k)", + "layout (row/col), triangle (u/l), transA (n/y)", + "matrix sizes (n=k)" +) + +# Defines the x-axis of the test-cases +test_xaxis <- list( + c("n", ""), + c("n", ""), + c("n", ""), + c("n", ""), + list(1:8, c("row,u,n", "row,u,y", "row,l,n", "row,l,y", + "col,u,n", "col,u,y", "col,l,n", "col,l,y")), + c("n", "x") +) + +# ================================================================================================== + +# Start the script +main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values, + test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE) + +# ================================================================================================== \ No newline at end of file diff --git a/test/performance/graphs/xsyrk.r b/test/performance/graphs/xsyrk.r new file mode 100644 index 00000000..fe8598e9 --- /dev/null +++ b/test/performance/graphs/xsyrk.r @@ -0,0 +1,94 @@ + +# ================================================================================================== +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +# project uses a tab-size of two spaces and a max-width of 100 characters per line. +# +# Author(s): +# Cedric Nugteren +# +# This file implements the performance script for the Xsyrk routine +# +# ================================================================================================== + +# Includes the common functions +args <- commandArgs(trailingOnly = FALSE) +thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)]))) +source(file.path(dirname(thisfile), "common.r")) + +# ================================================================================================== + +# Settings +routine_name <- "xsyrk" +parameters <- c("-n","-k","-layout","-triangle","-transA", + "-num_steps","-step","-runs","-precision") +precision <- 32 + +# Sets the names of the test-cases +test_names <- list( + "multiples of 128", + "multiples of 128 (+1)", + "around n=k=512", + "around n=k=2048", + "layouts and transposing (n=k=1024)", + "powers of 2" +) + +# Defines the test-cases +test_values <- list( + list(c(128, 128, 0, 0, 0, 16, 128, num_runs, precision)), + list(c(129, 129, 0, 0, 0, 16, 128, num_runs, precision)), + list(c(512, 512, 0, 0, 0, 16, 1, num_runs, precision)), + list(c(2048, 2048, 0, 0, 0, 16, 1, num_runs, precision)), + list( + c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision), + c(1024, 1024, 0, 0, 1, 1, 0, num_runs, precision), + c(1024, 1024, 0, 1, 0, 1, 0, num_runs, precision), + c(1024, 1024, 0, 1, 1, 1, 0, num_runs, precision), + c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision), + c(1024, 1024, 1, 0, 1, 1, 0, num_runs, precision), + c(1024, 1024, 1, 1, 0, 1, 0, num_runs, precision), + c(1024, 1024, 1, 1, 1, 1, 0, num_runs, precision) + ), + list( + c(8, 8, 0, 0, 0, 1, 0, num_runs, precision), + c(16, 16, 0, 0, 0, 1, 0, num_runs, precision), + c(32, 32, 0, 0, 0, 1, 0, num_runs, precision), + c(64, 64, 0, 0, 0, 1, 0, num_runs, precision), + c(128, 128, 0, 0, 0, 1, 0, num_runs, precision), + c(256, 256, 0, 0, 0, 1, 0, num_runs, precision), + c(512, 512, 0, 0, 0, 1, 0, num_runs, precision), + c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision), + c(2048, 2048, 0, 0, 0, 1, 0, num_runs, precision), + c(4096, 4096, 0, 0, 0, 1, 0, num_runs, precision), + c(8192, 8192, 0, 0, 0, 1, 0, num_runs, precision) + ) +) + +# Defines the x-labels corresponding to the test-cases +test_xlabels <- list( + "matrix sizes (n=k)", + "matrix sizes (n=k)", + "matrix sizes (n=k)", + "matrix sizes (n=k)", + "layout (row/col), triangle (u/l), transA (n/y)", + "matrix sizes (n=k)" +) + +# Defines the x-axis of the test-cases +test_xaxis <- list( + c("n", ""), + c("n", ""), + c("n", ""), + c("n", ""), + list(1:8, c("row,u,n", "row,u,y", "row,l,n", "row,l,y", + "col,u,n", "col,u,y", "col,l,n", "col,l,y")), + c("n", "x") +) + +# ================================================================================================== + +# Start the script +main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values, + test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE) + +# ================================================================================================== \ No newline at end of file diff --git a/test/performance/routines/xgemm.cc b/test/performance/routines/xgemm.cc index 234e9fdb..76e398e0 100644 --- a/test/performance/routines/xgemm.cc +++ b/test/performance/routines/xgemm.cc @@ -72,7 +72,7 @@ void PerformanceXgemm(const Arguments &args, // Prints the performance of both libraries const auto flops = 2 * args.m * args.n * args.k; - const auto bytes = (args.m*args.k + args.k*args.n + args.m*args.n) * sizeof(T); + const auto bytes = (args.m*args.k + args.k*args.n + 2*args.m*args.n) * sizeof(T); const auto output_ints = std::vector{args.m, args.n, args.k, static_cast(args.layout), static_cast(args.a_transpose), @@ -96,10 +96,10 @@ void ClientXgemm(int argc, char *argv[]) { kArgAlpha, kArgBeta}; switch(GetPrecision(argc, argv)) { case Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); - case Precision::kSingle: ClientABC(argc, argv, PerformanceXgemm, o); break; - case Precision::kDouble: ClientABC(argc, argv, PerformanceXgemm, o); break; - case Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); - case Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); + case Precision::kSingle: ClientABC(argc, argv, PerformanceXgemm, o, false); break; + case Precision::kDouble: ClientABC(argc, argv, PerformanceXgemm, o, false); break; + case Precision::kComplexSingle: ClientABC(argc, argv, PerformanceXgemm, o, false); break; + case Precision::kComplexDouble: ClientABC(argc, argv, PerformanceXgemm, o, false); break; } } diff --git a/test/performance/routines/xsymm.cc b/test/performance/routines/xsymm.cc index 13ad434a..d78d4eb8 100644 --- a/test/performance/routines/xsymm.cc +++ b/test/performance/routines/xsymm.cc @@ -72,7 +72,7 @@ void PerformanceXsymm(const Arguments &args, // Prints the performance of both libraries const auto flops = 2 * args.m * args.n * args.m; - const auto bytes = (args.m*args.m + args.m*args.n + args.m*args.n) * sizeof(T); + const auto bytes = (args.m*args.m + args.m*args.n + 2*args.m*args.n) * sizeof(T); const auto output_ints = std::vector{args.m, args.n, static_cast(args.layout), static_cast(args.triangle), @@ -96,10 +96,10 @@ void ClientXsymm(int argc, char *argv[]) { kArgAlpha, kArgBeta}; switch(GetPrecision(argc, argv)) { case Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); - case Precision::kSingle: ClientABC(argc, argv, PerformanceXsymm, o); break; - case Precision::kDouble: ClientABC(argc, argv, PerformanceXsymm, o); break; - case Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode"); - case Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode"); + case Precision::kSingle: ClientABC(argc, argv, PerformanceXsymm, o, false); break; + case Precision::kDouble: ClientABC(argc, argv, PerformanceXsymm, o, false); break; + case Precision::kComplexSingle: ClientABC(argc, argv, PerformanceXsymm, o, false); break; + case Precision::kComplexDouble: ClientABC(argc, argv, PerformanceXsymm, o, false); break; } } diff --git a/test/performance/routines/xsyr2k.cc b/test/performance/routines/xsyr2k.cc new file mode 100644 index 00000000..8d9871d0 --- /dev/null +++ b/test/performance/routines/xsyr2k.cc @@ -0,0 +1,115 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsyr2k command-line interface tester. +// +// ================================================================================================= + +#include +#include +#include + +#include "wrapper_clblas.h" +#include "performance/client.h" + +namespace clblast { +// ================================================================================================= + +// The client, used for performance testing. It contains the function calls to CLBlast and to other +// libraries to compare against. +template +void PerformanceXsyr2k(const Arguments &args, + const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat, + CommandQueue &queue) { + + // Creates the CLBlast lambda + auto clblast_lambda = [&args, &a_mat, &b_mat, &c_mat, &queue]() { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Syr2k(args.layout, args.triangle, args.a_transpose, + args.n, args.k, + args.alpha, + a_mat(), args.a_offset, args.a_ld, + b_mat(), args.b_offset, args.b_ld, + args.beta, + c_mat(), args.c_offset, args.c_ld, + &queue_plain, &event); + clWaitForEvents(1, &event); + if (status != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast error: "+ToString(static_cast(status))); + } + }; + + // Creates the clBLAS lambda (for comparison) + auto clblas_lambda = [&args, &a_mat, &b_mat, &c_mat, &queue]() { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXsyr2k(static_cast(args.layout), + static_cast(args.triangle), + static_cast(args.a_transpose), + args.n, args.k, + args.alpha, + a_mat(), args.a_offset, args.a_ld, + b_mat(), args.b_offset, args.b_ld, + args.beta, + c_mat(), args.c_offset, args.c_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + if (status != CL_SUCCESS) { + throw std::runtime_error("clBLAS error: "+ToString(static_cast(status))); + } + }; + + // Runs the routines and collect the timings + auto ms_clblast = TimedExecution(args.num_runs, clblast_lambda); + auto ms_clblas = TimedExecution(args.num_runs, clblas_lambda); + + // Prints the performance of both libraries + const auto flops = 2 * args.n * args.n * args.k; + const auto bytes = (args.n*args.k + args.n*args.n) * sizeof(T); + const auto output_ints = std::vector{args.n, args.k, + static_cast(args.layout), + static_cast(args.triangle), + static_cast(args.a_transpose), + args.a_ld, args.b_ld, args.c_ld, + args.a_offset, args.b_offset, args.c_offset}; + const auto output_strings = std::vector{ToString(args.alpha), + ToString(args.beta)}; + PrintTableRow(output_ints, output_strings, args.no_abbrv, + ms_clblast, ms_clblas, flops, bytes); +} + +// ================================================================================================= + +// Main function which calls the common client code with the routine-specific function as argument. +void ClientXsyr2k(int argc, char *argv[]) { + const auto o = std::vector{kArgN, kArgK, + kArgLayout, kArgTriangle, kArgATransp, + kArgALeadDim, kArgBLeadDim, kArgCLeadDim, + kArgAOffset, kArgBOffset, kArgCOffset, + kArgAlpha, kArgBeta}; + switch(GetPrecision(argc, argv)) { + case Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case Precision::kSingle: ClientABC(argc, argv, PerformanceXsyr2k, o, true); break; + case Precision::kDouble: ClientABC(argc, argv, PerformanceXsyr2k, o, true); break; + case Precision::kComplexSingle: ClientABC(argc, argv, PerformanceXsyr2k, o, true); break; + case Precision::kComplexDouble: ClientABC(argc, argv, PerformanceXsyr2k, o, true); break; + } +} + +// ================================================================================================= +} // namespace clblast + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::ClientXsyr2k(argc, argv); + return 0; +} + +// ================================================================================================= diff --git a/test/performance/routines/xsyrk.cc b/test/performance/routines/xsyrk.cc new file mode 100644 index 00000000..f36d665a --- /dev/null +++ b/test/performance/routines/xsyrk.cc @@ -0,0 +1,113 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xsyrk command-line interface tester. +// +// ================================================================================================= + +#include +#include +#include + +#include "wrapper_clblas.h" +#include "performance/client.h" + +namespace clblast { +// ================================================================================================= + +// The client, used for performance testing. It contains the function calls to CLBlast and to other +// libraries to compare against. +template +void PerformanceXsyrk(const Arguments &args, + const Buffer &a_mat, const Buffer &c_mat, + CommandQueue &queue) { + + // Creates the CLBlast lambda + auto clblast_lambda = [&args, &a_mat, &c_mat, &queue]() { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Syrk(args.layout, args.triangle, args.a_transpose, + args.n, args.k, + args.alpha, + a_mat(), args.a_offset, args.a_ld, + args.beta, + c_mat(), args.c_offset, args.c_ld, + &queue_plain, &event); + clWaitForEvents(1, &event); + if (status != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast error: "+ToString(static_cast(status))); + } + }; + + // Creates the clBLAS lambda (for comparison) + auto clblas_lambda = [&args, &a_mat, &c_mat, &queue]() { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXsyrk(static_cast(args.layout), + static_cast(args.triangle), + static_cast(args.a_transpose), + args.n, args.k, + args.alpha, + a_mat(), args.a_offset, args.a_ld, + args.beta, + c_mat(), args.c_offset, args.c_ld, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + if (status != CL_SUCCESS) { + throw std::runtime_error("clBLAS error: "+ToString(static_cast(status))); + } + }; + + // Runs the routines and collect the timings + auto ms_clblast = TimedExecution(args.num_runs, clblast_lambda); + auto ms_clblas = TimedExecution(args.num_runs, clblas_lambda); + + // Prints the performance of both libraries + const auto flops = args.n * args.n * args.k; + const auto bytes = (args.n*args.k + args.n*args.n) * sizeof(T); + const auto output_ints = std::vector{args.n, args.k, + static_cast(args.layout), + static_cast(args.triangle), + static_cast(args.a_transpose), + args.a_ld, args.c_ld, + args.a_offset, args.c_offset}; + const auto output_strings = std::vector{ToString(args.alpha), + ToString(args.beta)}; + PrintTableRow(output_ints, output_strings, args.no_abbrv, + ms_clblast, ms_clblas, flops, bytes); +} + +// ================================================================================================= + +// Main function which calls the common client code with the routine-specific function as argument. +void ClientXsyrk(int argc, char *argv[]) { + const auto o = std::vector{kArgN, kArgK, + kArgLayout, kArgTriangle, kArgATransp, + kArgALeadDim, kArgCLeadDim, + kArgAOffset, kArgCOffset, + kArgAlpha, kArgBeta}; + switch(GetPrecision(argc, argv)) { + case Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case Precision::kSingle: ClientAC(argc, argv, PerformanceXsyrk, o); break; + case Precision::kDouble: ClientAC(argc, argv, PerformanceXsyrk, o); break; + case Precision::kComplexSingle: ClientAC(argc, argv, PerformanceXsyrk, o); break; + case Precision::kComplexDouble: ClientAC(argc, argv, PerformanceXsyrk, o); break; + } +} + +// ================================================================================================= +} // namespace clblast + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + clblast::ClientXsyrk(argc, argv); + return 0; +} + +// ================================================================================================= diff --git a/test/wrapper_clblas.h b/test/wrapper_clblas.h index 093a8742..d6df0835 100644 --- a/test/wrapper_clblas.h +++ b/test/wrapper_clblas.h @@ -201,7 +201,7 @@ clblasStatus clblasXgemm( num_queues, queues, num_wait_events, wait_events, events); } -// This calls {clblasSsymm, clblasDsymm} with the arguments forwarded. +// This calls {clblasSsymm, clblasDsymm, clblasCsymm, clblasZsymm} with the arguments forwarded. clblasStatus clblasXsymm( clblasOrder layout, clblasSide side, clblasUplo triangle, size_t m, size_t n, float alpha, @@ -267,6 +267,130 @@ clblasStatus clblasXsymm( num_queues, queues, num_wait_events, wait_events, events); } +// This calls {clblasSsyrk, clblasDsyrk, clblasCsyrk, clblasZsyrk} with the arguments forwarded. +clblasStatus clblasXsyrk( + clblasOrder layout, clblasUplo triangle, clblasTranspose tran_a, + size_t n, size_t k, float alpha, + const cl_mem a_mat, size_t a_offset, size_t a_ld, float beta, + cl_mem c_mat, size_t c_offset, size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSsyrk(layout, triangle, tran_a, + n, k, alpha, + a_mat, a_offset, a_ld, beta, + c_mat, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXsyrk( + clblasOrder layout, clblasUplo triangle, clblasTranspose tran_a, + size_t n, size_t k, double alpha, + const cl_mem a_mat, size_t a_offset, size_t a_ld, double beta, + cl_mem c_mat, size_t c_offset, size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDsyrk(layout, triangle, tran_a, + n, k, alpha, + a_mat, a_offset, a_ld, beta, + c_mat, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXsyrk( + clblasOrder layout, clblasUplo triangle, clblasTranspose tran_a, + size_t n, size_t k, float2 alpha, + const cl_mem a_mat, size_t a_offset, size_t a_ld, float2 beta, + cl_mem c_mat, size_t c_offset, size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}}; + auto cl_beta = cl_float2{{beta.real(), beta.imag()}}; + return clblasCsyrk(layout, triangle, tran_a, + n, k, cl_alpha, + a_mat, a_offset, a_ld, cl_beta, + c_mat, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXsyrk( + clblasOrder layout, clblasUplo triangle, clblasTranspose tran_a, + size_t n, size_t k, double2 alpha, + const cl_mem a_mat, size_t a_offset, size_t a_ld, double2 beta, + cl_mem c_mat, size_t c_offset, size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}}; + auto cl_beta = cl_double2{{beta.real(), beta.imag()}}; + return clblasZsyrk(layout, triangle, tran_a, + n, k, cl_alpha, + a_mat, a_offset, a_ld, cl_beta, + c_mat, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); +} + +// This calls {clblasSsyr2k, clblasDsyr2k, clblasCsyr2k, clblasZsyr2k} with the arguments forwarded. +clblasStatus clblasXsyr2k( + clblasOrder layout, clblasUplo triangle, clblasTranspose tran_ab, + size_t n, size_t k, float alpha, + const cl_mem a_mat, size_t a_offset, size_t a_ld, + const cl_mem b_mat, size_t b_offset, size_t b_ld, float beta, + cl_mem c_mat, size_t c_offset, size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasSsyr2k(layout, triangle, tran_ab, + n, k, alpha, + a_mat, a_offset, a_ld, + b_mat, b_offset, b_ld, beta, + c_mat, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXsyr2k( + clblasOrder layout, clblasUplo triangle, clblasTranspose tran_ab, + size_t n, size_t k, double alpha, + const cl_mem a_mat, size_t a_offset, size_t a_ld, + const cl_mem b_mat, size_t b_offset, size_t b_ld, double beta, + cl_mem c_mat, size_t c_offset, size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + return clblasDsyr2k(layout, triangle, tran_ab, + n, k, alpha, + a_mat, a_offset, a_ld, + b_mat, b_offset, b_ld, beta, + c_mat, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXsyr2k( + clblasOrder layout, clblasUplo triangle, clblasTranspose tran_ab, + size_t n, size_t k, float2 alpha, + const cl_mem a_mat, size_t a_offset, size_t a_ld, + const cl_mem b_mat, size_t b_offset, size_t b_ld, float2 beta, + cl_mem c_mat, size_t c_offset, size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}}; + auto cl_beta = cl_float2{{beta.real(), beta.imag()}}; + return clblasCsyr2k(layout, triangle, tran_ab, + n, k, cl_alpha, + a_mat, a_offset, a_ld, + b_mat, b_offset, b_ld, cl_beta, + c_mat, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); +} +clblasStatus clblasXsyr2k( + clblasOrder layout, clblasUplo triangle, clblasTranspose tran_ab, + size_t n, size_t k, double2 alpha, + const cl_mem a_mat, size_t a_offset, size_t a_ld, + const cl_mem b_mat, size_t b_offset, size_t b_ld, double2 beta, + cl_mem c_mat, size_t c_offset, size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}}; + auto cl_beta = cl_double2{{beta.real(), beta.imag()}}; + return clblasZsyr2k(layout, triangle, tran_ab, + n, k, cl_alpha, + a_mat, a_offset, a_ld, + b_mat, b_offset, b_ld, cl_beta, + c_mat, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); +} + // ================================================================================================= } // namespace clblast