mirror of
https://github.com/CNugteren/CLBlast.git
synced 2024-08-21 04:22:27 +02:00
Merge pull request #9 from CNugteren/level3_routines
Added SYRK and SYR2K level-3 routines
This commit is contained in:
commit
77e2157485
|
@ -1,4 +1,9 @@
|
|||
|
||||
Development version (next release)
|
||||
- Added level-3 routines:
|
||||
* SSYRK/DSYRK/CSYRK/ZSYRK
|
||||
* SSYR2K/DSYR2K/CSYR2K/ZSYR2K
|
||||
|
||||
Version 0.2.0
|
||||
- Added support for complex conjugate transpose
|
||||
- Several host-code performance improvements
|
||||
|
|
|
@ -97,8 +97,10 @@ set(KERNELS copy pad transpose padtranspose xaxpy xgemv xgemm)
|
|||
set(SAMPLE_PROGRAMS sgemm)
|
||||
set(ROUTINES_XY xaxpy)
|
||||
set(ROUTINES_AXY xgemv)
|
||||
set(ROUTINES_ABC xgemm xsymm)
|
||||
set(ROUTINES ${ROUTINES_XY} ${ROUTINES_AXY} ${ROUTINES_ABC})
|
||||
set(ROUTINES_ABC xgemm xsymm xsyr2k)
|
||||
set(ROUTINES_AB )
|
||||
set(ROUTINES_AC xsyrk)
|
||||
set(ROUTINES ${ROUTINES_XY} ${ROUTINES_AXY} ${ROUTINES_ABC} ${ROUTINES_AB} ${ROUTINES_AC})
|
||||
|
||||
# ==================================================================================================
|
||||
|
||||
|
@ -172,6 +174,8 @@ if(TESTS)
|
|||
add_library(test_correctness_xy OBJECT test/correctness/testxy.cc)
|
||||
add_library(test_correctness_axy OBJECT test/correctness/testaxy.cc)
|
||||
add_library(test_correctness_abc OBJECT test/correctness/testabc.cc)
|
||||
add_library(test_correctness_ab OBJECT test/correctness/testab.cc)
|
||||
add_library(test_correctness_ac OBJECT test/correctness/testac.cc)
|
||||
|
||||
# Compiles the correctness-tests
|
||||
foreach(ROUTINE ${ROUTINES_XY})
|
||||
|
@ -179,22 +183,32 @@ if(TESTS)
|
|||
$<TARGET_OBJECTS:test_correctness_common>
|
||||
$<TARGET_OBJECTS:test_correctness_xy>
|
||||
test/correctness/routines/${ROUTINE}.cc)
|
||||
target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
|
||||
install(TARGETS test_${ROUTINE} DESTINATION bin)
|
||||
endforeach()
|
||||
foreach(ROUTINE ${ROUTINES_AXY})
|
||||
add_executable(test_${ROUTINE}
|
||||
$<TARGET_OBJECTS:test_correctness_common>
|
||||
$<TARGET_OBJECTS:test_correctness_axy>
|
||||
test/correctness/routines/${ROUTINE}.cc)
|
||||
target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
|
||||
install(TARGETS test_${ROUTINE} DESTINATION bin)
|
||||
endforeach()
|
||||
foreach(ROUTINE ${ROUTINES_ABC})
|
||||
add_executable(test_${ROUTINE}
|
||||
$<TARGET_OBJECTS:test_correctness_common>
|
||||
$<TARGET_OBJECTS:test_correctness_abc>
|
||||
test/correctness/routines/${ROUTINE}.cc)
|
||||
endforeach()
|
||||
foreach(ROUTINE ${ROUTINES_AB})
|
||||
add_executable(test_${ROUTINE}
|
||||
$<TARGET_OBJECTS:test_correctness_common>
|
||||
$<TARGET_OBJECTS:test_correctness_ab>
|
||||
test/correctness/routines/${ROUTINE}.cc)
|
||||
endforeach()
|
||||
foreach(ROUTINE ${ROUTINES_AC})
|
||||
add_executable(test_${ROUTINE}
|
||||
$<TARGET_OBJECTS:test_correctness_common>
|
||||
$<TARGET_OBJECTS:test_correctness_ac>
|
||||
test/correctness/routines/${ROUTINE}.cc)
|
||||
endforeach()
|
||||
foreach(ROUTINE ${ROUTINES})
|
||||
target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
|
||||
install(TARGETS test_${ROUTINE} DESTINATION bin)
|
||||
endforeach()
|
||||
|
|
14
README.md
14
README.md
|
@ -124,7 +124,7 @@ These graphs can be generated automatically on your own device. First, compile C
|
|||
Supported routines
|
||||
-------------
|
||||
|
||||
CLBlast is in active development and currently does not support the full set of BLAS routines. The currently supported routines are marked with `x` in the following tables:
|
||||
CLBlast is in active development and currently does not support the full set of BLAS routines. The currently supported routines are marked with '✔' in the following tables:
|
||||
|
||||
| Level-1 | S | D | C | Z | Notes |
|
||||
| ---------|---|---|---|---|---------|
|
||||
|
@ -135,7 +135,7 @@ CLBlast is in active development and currently does not support the full set of
|
|||
| xSWAP | | | | | |
|
||||
| xSCAL | | | | | +CS +ZD |
|
||||
| xCOPY | | | | | |
|
||||
| xAXPY |`x`|`x`|`x`|`x`| |
|
||||
| xAXPY | ✔ | ✔ | ✔ | ✔ | |
|
||||
| xDOT | | | - | - | +DS |
|
||||
| xDOTU | - | - | | | |
|
||||
| xDOTC | - | - | | | |
|
||||
|
@ -147,7 +147,7 @@ CLBlast is in active development and currently does not support the full set of
|
|||
|
||||
| Level-2 | S | D | C | Z | Notes |
|
||||
| ---------|---|---|---|---|---------|
|
||||
| xGEMV |`x`|`x`|`x`|`x`| |
|
||||
| xGEMV | ✔ | ✔ | ✔ | ✔ | |
|
||||
| xGBMV | | | | | |
|
||||
| xHEMV | - | - | | | |
|
||||
| xHBMV | - | - | | | |
|
||||
|
@ -175,12 +175,12 @@ CLBlast is in active development and currently does not support the full set of
|
|||
|
||||
| Level-3 | S | D | C | Z | Notes |
|
||||
| ---------|---|---|---|---|---------|
|
||||
| xGEMM |`x`|`x`|`x`|`x`| |
|
||||
| xSYMM |`x`|`x`|`x`|`x`| |
|
||||
| xGEMM | ✔ | ✔ | ✔ | ✔ | |
|
||||
| xSYMM | ✔ | ✔ | ✔ | ✔ | |
|
||||
| xHEMM | - | - | | | |
|
||||
| xSYRK | | | | | |
|
||||
| xSYRK | ✔ | ✔ | ✔ | ✔ | |
|
||||
| xHERK | - | - | | | |
|
||||
| xSYR2K | | | | | |
|
||||
| xSYR2K | ✔ | ✔ | ✔ | ✔ | |
|
||||
| xHER2K | - | - | | | |
|
||||
| xTRMM | | | | | |
|
||||
| xTRSM | | | | | |
|
||||
|
|
BIN
doc/performance/Iris/SSYR2K.pdf
Normal file
BIN
doc/performance/Iris/SSYR2K.pdf
Normal file
Binary file not shown.
BIN
doc/performance/Iris/SSYRK.pdf
Normal file
BIN
doc/performance/Iris/SSYRK.pdf
Normal file
Binary file not shown.
BIN
doc/performance/Tesla_K40m/SSYRK.pdf
Normal file
BIN
doc/performance/Tesla_K40m/SSYRK.pdf
Normal file
Binary file not shown.
|
@ -107,7 +107,7 @@ StatusCode Gemv(const Layout layout, const Transpose transpose_a,
|
|||
// =================================================================================================
|
||||
// BLAS level-3 (matrix-matrix) routines
|
||||
|
||||
// Templated-precision generalized matrix-matrix multiplication: SGEMM/DGEMM
|
||||
// Templated-precision generalized matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM
|
||||
template <typename T>
|
||||
StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpose transpose_b,
|
||||
const size_t m, const size_t n, const size_t k,
|
||||
|
@ -118,7 +118,7 @@ StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpos
|
|||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// Templated-precision symmetric matrix-matrix multiplication: SSYMM/DSYMM
|
||||
// Templated-precision symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM
|
||||
template <typename T>
|
||||
StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
|
||||
const size_t m, const size_t n,
|
||||
|
@ -129,6 +129,27 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
|
|||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// Templated-precision rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK
|
||||
template <typename T>
|
||||
StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose transpose_a,
|
||||
const size_t n, const size_t k,
|
||||
const T alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const T beta,
|
||||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// Templated-precision rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K
|
||||
template <typename T>
|
||||
StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose transpose_ab,
|
||||
const size_t n, const size_t k,
|
||||
const T alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
const T beta,
|
||||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
||||
|
||||
|
|
|
@ -92,7 +92,8 @@ class Routine {
|
|||
const size_t dest_ld, const size_t dest_offset,
|
||||
const Buffer &dest,
|
||||
const bool do_transpose, const bool do_conjugate,
|
||||
const bool pad, const Program &program);
|
||||
const bool pad, const bool upper, const bool lower,
|
||||
const Program &program);
|
||||
|
||||
// Queries the cache and retrieve either a matching program or a boolean whether a match exists.
|
||||
// The first assumes that the program is available in the cache and will throw an exception
|
||||
|
|
48
include/internal/routines/xsyr2k.h
Normal file
48
include/internal/routines/xsyr2k.h
Normal file
|
@ -0,0 +1,48 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file implements the Xsyr2k routine. The precision is implemented using a template argument.
|
||||
// The implementation is very similar to Xsyrk (see header for details), except for the fact that
|
||||
// the main XgemmUpper/XgemmLower kernel is called twice: C = AB^T + C and C = BA^T + C.
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#ifndef CLBLAST_ROUTINES_XSYR2K_H_
|
||||
#define CLBLAST_ROUTINES_XSYR2K_H_
|
||||
|
||||
#include "internal/routine.h"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// See comment at top of file for a description of the class
|
||||
template <typename T>
|
||||
class Xsyr2k: public Routine {
|
||||
public:
|
||||
Xsyr2k(CommandQueue &queue, Event &event);
|
||||
|
||||
// Templated-precision implementation of the routine
|
||||
StatusCode DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
|
||||
const size_t n, const size_t k,
|
||||
const T alpha,
|
||||
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
const T beta,
|
||||
const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
|
||||
|
||||
private:
|
||||
// Static variable to get the precision
|
||||
const static Precision precision_;
|
||||
};
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
||||
|
||||
// CLBLAST_ROUTINES_XSYR2K_H_
|
||||
#endif
|
49
include/internal/routines/xsyrk.h
Normal file
49
include/internal/routines/xsyrk.h
Normal file
|
@ -0,0 +1,49 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file implements the Xsyrk routine. The precision is implemented using a template argument.
|
||||
// The implementation is based on the regular Xgemm routine and kernel, but with two main changes:
|
||||
// 1) The final unpad(transpose) kernel updates only the upper/lower triangular part.
|
||||
// 2) The main Xgemm kernel masks workgroups not contributing to usefull data. This is only for
|
||||
// performance reasons, as the actual masking is done later (see the first point).
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#ifndef CLBLAST_ROUTINES_XSYRK_H_
|
||||
#define CLBLAST_ROUTINES_XSYRK_H_
|
||||
|
||||
#include "internal/routine.h"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// See comment at top of file for a description of the class
|
||||
template <typename T>
|
||||
class Xsyrk: public Routine {
|
||||
public:
|
||||
Xsyrk(CommandQueue &queue, Event &event);
|
||||
|
||||
// Templated-precision implementation of the routine
|
||||
StatusCode DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
|
||||
const size_t n, const size_t k,
|
||||
const T alpha,
|
||||
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const T beta,
|
||||
const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
|
||||
|
||||
private:
|
||||
// Static variable to get the precision
|
||||
const static Precision precision_;
|
||||
};
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
||||
|
||||
// CLBLAST_ROUTINES_XSYRK_H_
|
||||
#endif
|
118
src/clblast.cc
118
src/clblast.cc
|
@ -26,6 +26,8 @@
|
|||
// BLAS level-3 includes
|
||||
#include "internal/routines/xgemm.h"
|
||||
#include "internal/routines/xsymm.h"
|
||||
#include "internal/routines/xsyrk.h"
|
||||
#include "internal/routines/xsyr2k.h"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
@ -209,7 +211,7 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
|
|||
std::string kernel_source =
|
||||
#include "kernels/xgemm.opencl"
|
||||
auto status = routine.SetUp(common_source1 + common_source2 + common_source3 + common_source4 +
|
||||
kernel_source);
|
||||
kernel_source);
|
||||
if (status != StatusCode::kSuccess) { return status; }
|
||||
|
||||
// Runs the routine
|
||||
|
@ -243,5 +245,119 @@ template StatusCode Symm<double2>(const Layout, const Side, const Triangle,
|
|||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// SYRK
|
||||
template <typename T>
|
||||
StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose transpose_a,
|
||||
const size_t n, const size_t k, const T alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const T beta,
|
||||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
auto queue_cpp = CommandQueue(*queue);
|
||||
auto event_cpp = Event(*event);
|
||||
auto routine = Xsyrk<T>(queue_cpp, event_cpp);
|
||||
|
||||
// Loads the kernel source-code as an include (C++11 raw string literal)
|
||||
std::string common_source1 =
|
||||
#include "kernels/copy.opencl"
|
||||
std::string common_source2 =
|
||||
#include "kernels/pad.opencl"
|
||||
std::string common_source3 =
|
||||
#include "kernels/transpose.opencl"
|
||||
std::string common_source4 =
|
||||
#include "kernels/padtranspose.opencl"
|
||||
std::string kernel_source =
|
||||
#include "kernels/xgemm.opencl"
|
||||
auto status = routine.SetUp(common_source1 + common_source2 + common_source3 + common_source4 +
|
||||
kernel_source);
|
||||
if (status != StatusCode::kSuccess) { return status; }
|
||||
|
||||
// Runs the routine
|
||||
return routine.DoSyrk(layout, triangle, transpose_a, n, k, alpha,
|
||||
Buffer(a_buffer), a_offset, a_ld, beta,
|
||||
Buffer(c_buffer), c_offset, c_ld);
|
||||
}
|
||||
template StatusCode Syrk<float>(const Layout, const Triangle, const Transpose,
|
||||
const size_t, const size_t, const float,
|
||||
const cl_mem, const size_t, const size_t, const float,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
template StatusCode Syrk<double>(const Layout, const Triangle, const Transpose,
|
||||
const size_t, const size_t, const double,
|
||||
const cl_mem, const size_t, const size_t, const double,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
template StatusCode Syrk<float2>(const Layout, const Triangle, const Transpose,
|
||||
const size_t, const size_t, const float2,
|
||||
const cl_mem, const size_t, const size_t, const float2,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
template StatusCode Syrk<double2>(const Layout, const Triangle, const Transpose,
|
||||
const size_t, const size_t, const double2,
|
||||
const cl_mem, const size_t, const size_t, const double2,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// SYR2K
|
||||
template <typename T>
|
||||
StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose transpose_ab,
|
||||
const size_t n, const size_t k, const T alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta,
|
||||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
auto queue_cpp = CommandQueue(*queue);
|
||||
auto event_cpp = Event(*event);
|
||||
auto routine = Xsyr2k<T>(queue_cpp, event_cpp);
|
||||
|
||||
// Loads the kernel source-code as an include (C++11 raw string literal)
|
||||
std::string common_source1 =
|
||||
#include "kernels/copy.opencl"
|
||||
std::string common_source2 =
|
||||
#include "kernels/pad.opencl"
|
||||
std::string common_source3 =
|
||||
#include "kernels/transpose.opencl"
|
||||
std::string common_source4 =
|
||||
#include "kernels/padtranspose.opencl"
|
||||
std::string kernel_source =
|
||||
#include "kernels/xgemm.opencl"
|
||||
auto status = routine.SetUp(common_source1 + common_source2 + common_source3 + common_source4 +
|
||||
kernel_source);
|
||||
if (status != StatusCode::kSuccess) { return status; }
|
||||
|
||||
// Runs the routine
|
||||
return routine.DoSyr2k(layout, triangle, transpose_ab, n, k, alpha,
|
||||
Buffer(a_buffer), a_offset, a_ld,
|
||||
Buffer(b_buffer), b_offset, b_ld, beta,
|
||||
Buffer(c_buffer), c_offset, c_ld);
|
||||
}
|
||||
template StatusCode Syr2k<float>(const Layout, const Triangle, const Transpose,
|
||||
const size_t, const size_t, const float,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
const cl_mem, const size_t, const size_t, const float,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
template StatusCode Syr2k<double>(const Layout, const Triangle, const Transpose,
|
||||
const size_t, const size_t, const double,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
const cl_mem, const size_t, const size_t, const double,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
template StatusCode Syr2k<float2>(const Layout, const Triangle, const Transpose,
|
||||
const size_t, const size_t, const float2,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
const cl_mem, const size_t, const size_t, const float2,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
template StatusCode Syr2k<double2>(const Layout, const Triangle, const Transpose,
|
||||
const size_t, const size_t, const double2,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
const cl_mem, const size_t, const size_t, const double2,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
||||
|
|
|
@ -86,7 +86,8 @@ __kernel void UnPadMatrix(const int src_one, const int src_two,
|
|||
__global const real* restrict src,
|
||||
const int dest_one, const int dest_two,
|
||||
const int dest_ld, const int dest_offset,
|
||||
__global real* dest) {
|
||||
__global real* dest,
|
||||
const int upper, const int lower) {
|
||||
|
||||
// Loops over the work per thread in both dimensions
|
||||
#pragma unroll
|
||||
|
@ -95,11 +96,18 @@ __kernel void UnPadMatrix(const int src_one, const int src_two,
|
|||
#pragma unroll
|
||||
for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
|
||||
const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
|
||||
if (id_two < dest_two && id_one < dest_one) {
|
||||
|
||||
// Masking in case of triangular matrices: updates only the upper or lower part
|
||||
bool condition = true;
|
||||
if (upper == 1) { condition = (id_two >= id_one); }
|
||||
else if (lower == 1) { condition = (id_two <= id_one); }
|
||||
if (condition) {
|
||||
|
||||
// Copies the value into the destination matrix. This is always within bounds of the source
|
||||
// matrix, as we know that the destination matrix is smaller than the source.
|
||||
dest[id_two*dest_ld + id_one + dest_offset] = src[id_two*src_ld + id_one + src_offset];
|
||||
if (id_two < dest_two && id_one < dest_one) {
|
||||
dest[id_two*dest_ld + id_one + dest_offset] = src[id_two*src_ld + id_one + src_offset];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -100,7 +100,8 @@ __kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
|
|||
__global const real* restrict src,
|
||||
const int dest_one, const int dest_two,
|
||||
const int dest_ld, const int dest_offset,
|
||||
__global real* dest) {
|
||||
__global real* dest,
|
||||
const int upper, const int lower) {
|
||||
|
||||
// Local memory to store a tile of the matrix (for coalescing)
|
||||
__local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
|
||||
|
@ -137,10 +138,17 @@ __kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
|
|||
const int id_dest_one = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(0);
|
||||
const int id_dest_two = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(1);
|
||||
|
||||
// Stores the transposed value in the destination matrix
|
||||
if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
|
||||
real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
|
||||
dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
|
||||
// Masking in case of triangular matrices: updates only the upper or lower part
|
||||
bool condition = true;
|
||||
if (upper == 1) { condition = (id_dest_one >= id_dest_two); }
|
||||
else if (lower == 1) { condition = (id_dest_one <= id_dest_two); }
|
||||
if (condition) {
|
||||
|
||||
// Stores the transposed value in the destination matrix
|
||||
if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
|
||||
real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
|
||||
dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -127,6 +127,55 @@ R"(
|
|||
|
||||
// =================================================================================================
|
||||
|
||||
// Initializes the accumulation registers to zero
|
||||
inline void InitAccRegisters(realM cpm[NWI][MWI/VWM]) {
|
||||
#pragma unroll
|
||||
for (int mi=0; mi<MWI/VWM; ++mi) {
|
||||
#pragma unroll
|
||||
for (int ni=0; ni<NWI; ++ni) {
|
||||
#if VWM == 1
|
||||
SetToZero(cpm[ni][mi]);
|
||||
#elif VWM == 2
|
||||
SetToZero(cpm[ni][mi].x);
|
||||
SetToZero(cpm[ni][mi].y);
|
||||
#elif VWM == 4
|
||||
SetToZero(cpm[ni][mi].x);
|
||||
SetToZero(cpm[ni][mi].y);
|
||||
SetToZero(cpm[ni][mi].z);
|
||||
SetToZero(cpm[ni][mi].w);
|
||||
#elif VWM == 8
|
||||
SetToZero(cpm[ni][mi].s0);
|
||||
SetToZero(cpm[ni][mi].s1);
|
||||
SetToZero(cpm[ni][mi].s2);
|
||||
SetToZero(cpm[ni][mi].s3);
|
||||
SetToZero(cpm[ni][mi].s4);
|
||||
SetToZero(cpm[ni][mi].s5);
|
||||
SetToZero(cpm[ni][mi].s6);
|
||||
SetToZero(cpm[ni][mi].s7);
|
||||
#elif VWM == 16
|
||||
SetToZero(cpm[ni][mi].s0);
|
||||
SetToZero(cpm[ni][mi].s1);
|
||||
SetToZero(cpm[ni][mi].s2);
|
||||
SetToZero(cpm[ni][mi].s3);
|
||||
SetToZero(cpm[ni][mi].s4);
|
||||
SetToZero(cpm[ni][mi].s5);
|
||||
SetToZero(cpm[ni][mi].s6);
|
||||
SetToZero(cpm[ni][mi].s7);
|
||||
SetToZero(cpm[ni][mi].s8);
|
||||
SetToZero(cpm[ni][mi].s9);
|
||||
SetToZero(cpm[ni][mi].sA);
|
||||
SetToZero(cpm[ni][mi].sB);
|
||||
SetToZero(cpm[ni][mi].sC);
|
||||
SetToZero(cpm[ni][mi].sD);
|
||||
SetToZero(cpm[ni][mi].sE);
|
||||
SetToZero(cpm[ni][mi].sF);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
|
||||
// caching the A input matrix.
|
||||
#if SA == 1
|
||||
|
@ -272,71 +321,6 @@ inline void LocalToPrivateB(__local realN* blm, realN bpm[NWI/VWN], const int kg
|
|||
|
||||
// =================================================================================================
|
||||
|
||||
// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication
|
||||
// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
|
||||
inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int kSizeM,
|
||||
const real alpha, const real beta) {
|
||||
#pragma unroll
|
||||
for (int ni=0; ni<NWI; ++ni) {
|
||||
#pragma unroll
|
||||
for (int mi=0; mi<MWI/VWM; ++mi) {
|
||||
#if STRM == 0
|
||||
int mg = mi + get_local_id(0)*(MWI/VWM);
|
||||
#elif STRM == 1
|
||||
int mg = get_local_id(0) + mi*MDIMC;
|
||||
#endif
|
||||
#if STRN == 0
|
||||
int ng = ni + get_local_id(1)*NWI;
|
||||
#elif STRN == 1
|
||||
int ng = ni%VWN + get_local_id(1)*VWN + (ni/VWN)*VWN*NDIMC;
|
||||
#endif
|
||||
int idm = mg + get_group_id(0)*(MWG/VWM);
|
||||
int idn = ng + get_group_id(1)*NWG;
|
||||
int index = idn*(kSizeM/VWM) + idm;
|
||||
realM cval = cgm[index];
|
||||
#if VWM == 1
|
||||
AXPBY(cgm[index], alpha, cpm[ni][mi], beta, cval);
|
||||
#elif VWM == 2
|
||||
AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);
|
||||
AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
|
||||
#elif VWM == 4
|
||||
AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);
|
||||
AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
|
||||
AXPBY(cgm[index].z, alpha, cpm[ni][mi].z, beta, cval.z);
|
||||
AXPBY(cgm[index].w, alpha, cpm[ni][mi].w, beta, cval.w);
|
||||
#elif VWM == 8
|
||||
AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
|
||||
AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
|
||||
AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
|
||||
AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
|
||||
AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
|
||||
AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
|
||||
AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
|
||||
AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
|
||||
#elif VWM == 16
|
||||
AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
|
||||
AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
|
||||
AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
|
||||
AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
|
||||
AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
|
||||
AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
|
||||
AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
|
||||
AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
|
||||
AXPBY(cgm[index].s8, alpha, cpm[ni][mi].s8, beta, cval.s8);
|
||||
AXPBY(cgm[index].s9, alpha, cpm[ni][mi].s9, beta, cval.s9);
|
||||
AXPBY(cgm[index].sA, alpha, cpm[ni][mi].sA, beta, cval.sA);
|
||||
AXPBY(cgm[index].sB, alpha, cpm[ni][mi].sB, beta, cval.sB);
|
||||
AXPBY(cgm[index].sC, alpha, cpm[ni][mi].sC, beta, cval.sC);
|
||||
AXPBY(cgm[index].sD, alpha, cpm[ni][mi].sD, beta, cval.sD);
|
||||
AXPBY(cgm[index].sE, alpha, cpm[ni][mi].sE, beta, cval.sE);
|
||||
AXPBY(cgm[index].sF, alpha, cpm[ni][mi].sF, beta, cval.sF);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// The vectorised multiply-add function
|
||||
inline realM MultiplyAddVector(realM cvec, const realM avec, const real bval) {
|
||||
#if USE_VECTOR_MAD == 1
|
||||
|
@ -432,77 +416,97 @@ inline void MultiplyAccumulate(realM cpm[NWI][MWI/VWM], realM apm[MWI/VWM], real
|
|||
|
||||
// =================================================================================================
|
||||
|
||||
// Main entry of the kernel. This function contains the basic skeleton, the functionality is
|
||||
// provided by the inlined functions above
|
||||
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
|
||||
__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||
const real alpha, const real beta,
|
||||
const __global realM* restrict agm,
|
||||
const __global realN* restrict bgm,
|
||||
__global realM* cgm) {
|
||||
// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication
|
||||
// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
|
||||
inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int kSizeM,
|
||||
const real alpha, const real beta) {
|
||||
#pragma unroll
|
||||
for (int ni=0; ni<NWI; ++ni) {
|
||||
#pragma unroll
|
||||
for (int mi=0; mi<MWI/VWM; ++mi) {
|
||||
#if STRM == 0
|
||||
int mg = mi + get_local_id(0)*(MWI/VWM);
|
||||
#elif STRM == 1
|
||||
int mg = get_local_id(0) + mi*MDIMC;
|
||||
#endif
|
||||
#if STRN == 0
|
||||
int ng = ni + get_local_id(1)*NWI;
|
||||
#elif STRN == 1
|
||||
int ng = ni%VWN + get_local_id(1)*VWN + (ni/VWN)*VWN*NDIMC;
|
||||
#endif
|
||||
int idm = mg + get_group_id(0)*(MWG/VWM);
|
||||
int idn = ng + get_group_id(1)*NWG;
|
||||
|
||||
// Combined thread identifier
|
||||
// The final multiplication with alpha and the addition with beta*C
|
||||
int index = idn*(kSizeM/VWM) + idm;
|
||||
realM cval = cgm[index];
|
||||
#if VWM == 1
|
||||
AXPBY(cgm[index], alpha, cpm[ni][mi], beta, cval);
|
||||
#elif VWM == 2
|
||||
AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);
|
||||
AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
|
||||
#elif VWM == 4
|
||||
AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);
|
||||
AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
|
||||
AXPBY(cgm[index].z, alpha, cpm[ni][mi].z, beta, cval.z);
|
||||
AXPBY(cgm[index].w, alpha, cpm[ni][mi].w, beta, cval.w);
|
||||
#elif VWM == 8
|
||||
AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
|
||||
AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
|
||||
AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
|
||||
AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
|
||||
AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
|
||||
AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
|
||||
AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
|
||||
AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
|
||||
#elif VWM == 16
|
||||
AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
|
||||
AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
|
||||
AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
|
||||
AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
|
||||
AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
|
||||
AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
|
||||
AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
|
||||
AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
|
||||
AXPBY(cgm[index].s8, alpha, cpm[ni][mi].s8, beta, cval.s8);
|
||||
AXPBY(cgm[index].s9, alpha, cpm[ni][mi].s9, beta, cval.s9);
|
||||
AXPBY(cgm[index].sA, alpha, cpm[ni][mi].sA, beta, cval.sA);
|
||||
AXPBY(cgm[index].sB, alpha, cpm[ni][mi].sB, beta, cval.sB);
|
||||
AXPBY(cgm[index].sC, alpha, cpm[ni][mi].sC, beta, cval.sC);
|
||||
AXPBY(cgm[index].sD, alpha, cpm[ni][mi].sD, beta, cval.sD);
|
||||
AXPBY(cgm[index].sE, alpha, cpm[ni][mi].sE, beta, cval.sE);
|
||||
AXPBY(cgm[index].sF, alpha, cpm[ni][mi].sF, beta, cval.sF);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Main body of the matrix-multiplication algorithm. It calls the (inlined) functions above.
|
||||
inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||
const __global realM* restrict agm, const __global realN* restrict bgm,
|
||||
__global realM* cgm, realM cpm[NWI][MWI/VWM],
|
||||
#if SA == 1 && SB == 1
|
||||
__local realM* alm, __local realN* blm
|
||||
#elif SA == 1
|
||||
__local realM* alm
|
||||
#elif SB == 1
|
||||
__local realN* blm
|
||||
#endif
|
||||
) {
|
||||
|
||||
// Allocates workitem-private memory (registers)
|
||||
realM apm[MWI/VWM];
|
||||
realN bpm[NWI/VWN];
|
||||
|
||||
// Combined thread identifier (volatile to disable caching)
|
||||
#if SA == 1 || SB == 1
|
||||
volatile int tid = get_local_id(0) + MDIMC*get_local_id(1);
|
||||
#endif
|
||||
|
||||
// Allocates workgroup-private memory (local memory)
|
||||
#if SA == 1
|
||||
__local realM alm[KWG * MWG/VWM];
|
||||
#endif
|
||||
#if SB == 1
|
||||
__local realN blm[KWG * NWG/VWN];
|
||||
#endif
|
||||
|
||||
// Allocates workitem-private memory (registers)
|
||||
realM apm[MWI/VWM];
|
||||
realN bpm[NWI/VWN];
|
||||
realM cpm[NWI][MWI/VWM];
|
||||
|
||||
// Initializes the accumulation registers
|
||||
#pragma unroll
|
||||
for (int mi=0; mi<MWI/VWM; ++mi) {
|
||||
#pragma unroll
|
||||
for (int ni=0; ni<NWI; ++ni) {
|
||||
#if VWM == 1
|
||||
SetToZero(cpm[ni][mi]);
|
||||
#elif VWM == 2
|
||||
SetToZero(cpm[ni][mi].x);
|
||||
SetToZero(cpm[ni][mi].y);
|
||||
#elif VWM == 4
|
||||
SetToZero(cpm[ni][mi].x);
|
||||
SetToZero(cpm[ni][mi].y);
|
||||
SetToZero(cpm[ni][mi].z);
|
||||
SetToZero(cpm[ni][mi].w);
|
||||
#elif VWM == 8
|
||||
SetToZero(cpm[ni][mi].s0);
|
||||
SetToZero(cpm[ni][mi].s1);
|
||||
SetToZero(cpm[ni][mi].s2);
|
||||
SetToZero(cpm[ni][mi].s3);
|
||||
SetToZero(cpm[ni][mi].s4);
|
||||
SetToZero(cpm[ni][mi].s5);
|
||||
SetToZero(cpm[ni][mi].s6);
|
||||
SetToZero(cpm[ni][mi].s7);
|
||||
#elif VWM == 16
|
||||
SetToZero(cpm[ni][mi].s0);
|
||||
SetToZero(cpm[ni][mi].s1);
|
||||
SetToZero(cpm[ni][mi].s2);
|
||||
SetToZero(cpm[ni][mi].s3);
|
||||
SetToZero(cpm[ni][mi].s4);
|
||||
SetToZero(cpm[ni][mi].s5);
|
||||
SetToZero(cpm[ni][mi].s6);
|
||||
SetToZero(cpm[ni][mi].s7);
|
||||
SetToZero(cpm[ni][mi].s8);
|
||||
SetToZero(cpm[ni][mi].s9);
|
||||
SetToZero(cpm[ni][mi].sA);
|
||||
SetToZero(cpm[ni][mi].sB);
|
||||
SetToZero(cpm[ni][mi].sC);
|
||||
SetToZero(cpm[ni][mi].sD);
|
||||
SetToZero(cpm[ni][mi].sE);
|
||||
SetToZero(cpm[ni][mi].sF);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
InitAccRegisters(cpm);
|
||||
|
||||
// Loops over all workgroup tiles
|
||||
for (int kwg=0; kwg<kSizeK; kwg+=KWG) {
|
||||
|
@ -515,8 +519,6 @@ __kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
|
|||
#if SB == 1
|
||||
GlobalToLocalB(bgm, blm, kSizeN, tid, kwg);
|
||||
#endif
|
||||
|
||||
// Synchronizes all threads in a workgroup
|
||||
#if SA == 1 || SB == 1
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
#endif
|
||||
|
@ -552,19 +554,126 @@ __kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
|
|||
MultiplyAccumulate(cpm, apm, bpm);
|
||||
}
|
||||
}
|
||||
|
||||
// Synchronizes all threads in a workgroup
|
||||
#if SA == 1 || SB == 1
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
// Stores an MWG * NWG tile of results and perform the multiplication with alpha and beta
|
||||
// =================================================================================================
|
||||
|
||||
// Main entry point of the kernel. This is the regular full version.
|
||||
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
|
||||
__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||
const real alpha, const real beta,
|
||||
const __global realM* restrict agm,
|
||||
const __global realN* restrict bgm,
|
||||
__global realM* cgm) {
|
||||
|
||||
// Allocates workgroup-private memory (local memory)
|
||||
#if SA == 1
|
||||
__local realM alm[KWG * MWG/VWM];
|
||||
#endif
|
||||
#if SB == 1
|
||||
__local realN blm[KWG * NWG/VWN];
|
||||
#endif
|
||||
|
||||
// Computes the matrix-multiplication and stores the result in register memory
|
||||
realM cpm[NWI][MWI/VWM];
|
||||
#if SA == 1 && SB == 1
|
||||
XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
|
||||
#elif SA == 1
|
||||
XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
|
||||
#elif SB == 1
|
||||
XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
|
||||
#else
|
||||
XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm);
|
||||
#endif
|
||||
|
||||
// Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
|
||||
StoreResults(cgm, cpm, kSizeM, alpha, beta);
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Main entry point of the kernel. This is the upper-triangular version.
|
||||
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
|
||||
__kernel void XgemmUpper(const int kSizeN, const int kSizeK,
|
||||
const real alpha, const real beta,
|
||||
const __global realM* restrict agm,
|
||||
const __global realN* restrict bgm,
|
||||
__global realM* cgm) {
|
||||
|
||||
// Skip these threads if they do not contain threads contributing to the upper-triangle
|
||||
if (get_group_id(1)*NWG < get_group_id(0)*MWG) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Allocates workgroup-private memory (local memory)
|
||||
#if SA == 1
|
||||
__local realM alm[KWG * MWG/VWM];
|
||||
#endif
|
||||
#if SB == 1
|
||||
__local realN blm[KWG * NWG/VWN];
|
||||
#endif
|
||||
|
||||
// Computes the matrix-multiplication and stores the result in register memory
|
||||
realM cpm[NWI][MWI/VWM];
|
||||
#if SA == 1 && SB == 1
|
||||
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
|
||||
#elif SA == 1
|
||||
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
|
||||
#elif SB == 1
|
||||
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
|
||||
#else
|
||||
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
|
||||
#endif
|
||||
|
||||
// Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
|
||||
StoreResults(cgm, cpm, kSizeN, alpha, beta);
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Main entry point of the kernel. This is the lower-triangular version.
|
||||
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
|
||||
__kernel void XgemmLower(const int kSizeN, const int kSizeK,
|
||||
const real alpha, const real beta,
|
||||
const __global realM* restrict agm,
|
||||
const __global realN* restrict bgm,
|
||||
__global realM* cgm) {
|
||||
|
||||
// Skip these threads if they do not contain threads contributing to the lower-triangle
|
||||
if (get_group_id(1)*NWG > get_group_id(0)*MWG) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Allocates workgroup-private memory (local memory)
|
||||
#if SA == 1
|
||||
__local realM alm[KWG * MWG/VWM];
|
||||
#endif
|
||||
#if SB == 1
|
||||
__local realN blm[KWG * NWG/VWN];
|
||||
#endif
|
||||
|
||||
// Computes the matrix-multiplication and stores the result in register memory
|
||||
realM cpm[NWI][MWI/VWM];
|
||||
#if SA == 1 && SB == 1
|
||||
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
|
||||
#elif SA == 1
|
||||
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
|
||||
#elif SB == 1
|
||||
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
|
||||
#else
|
||||
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
|
||||
#endif
|
||||
|
||||
// Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
|
||||
StoreResults(cgm, cpm, kSizeN, alpha, beta);
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// End of the C++11 raw string literal
|
||||
)";
|
||||
|
||||
|
|
|
@ -210,11 +210,13 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
|
|||
const size_t dest_ld, const size_t dest_offset,
|
||||
const Buffer &dest,
|
||||
const bool do_transpose, const bool do_conjugate,
|
||||
const bool pad, const Program &program) {
|
||||
const bool pad, const bool upper, const bool lower,
|
||||
const Program &program) {
|
||||
|
||||
// Determines whether or not the fast-version could potentially be used
|
||||
auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) &&
|
||||
(src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld);
|
||||
(src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld) &&
|
||||
(upper == false) && (lower == false);
|
||||
|
||||
// Determines the right kernel
|
||||
auto kernel_name = std::string{};
|
||||
|
@ -267,6 +269,10 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
|
|||
if (pad) {
|
||||
kernel.SetArgument(10, static_cast<int>(do_conjugate));
|
||||
}
|
||||
else {
|
||||
kernel.SetArgument(10, static_cast<int>(upper));
|
||||
kernel.SetArgument(11, static_cast<int>(lower));
|
||||
}
|
||||
}
|
||||
|
||||
// Launches the kernel and returns the error code. Uses global and local thread sizes based on
|
||||
|
|
|
@ -108,18 +108,18 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
|
|||
// them up until they reach a certain multiple of size (kernel parameter dependent).
|
||||
status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
|
||||
m_ceiled, k_ceiled, m_ceiled, 0, temp_a,
|
||||
a_do_transpose, a_conjugate, true, program);
|
||||
a_do_transpose, a_conjugate, true, false, false, program);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
status = PadCopyTransposeMatrix(b_one, b_two, b_ld, b_offset, b_buffer,
|
||||
n_ceiled, k_ceiled, n_ceiled, 0, temp_b,
|
||||
b_do_transpose, b_conjugate, true, program);
|
||||
b_do_transpose, b_conjugate, true, false, false, program);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Only necessary for matrix C if it used both as input and output
|
||||
if (beta != static_cast<T>(0)) {
|
||||
status = PadCopyTransposeMatrix(c_one, c_two, c_ld, c_offset, c_buffer,
|
||||
m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
|
||||
c_do_transpose, false, true, program);
|
||||
c_do_transpose, false, true, false, false, program);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
}
|
||||
|
||||
|
@ -151,7 +151,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
|
|||
// Runs the post-processing kernel
|
||||
status = PadCopyTransposeMatrix(m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
|
||||
c_one, c_two, c_ld, c_offset, c_buffer,
|
||||
c_do_transpose, false, false, program);
|
||||
c_do_transpose, false, false, false, false, program);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Successfully finished the computation
|
||||
|
|
169
src/routines/xsyr2k.cc
Normal file
169
src/routines/xsyr2k.cc
Normal file
|
@ -0,0 +1,169 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file implements the Xsyr2k class (see the header for information about the class).
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "internal/routines/xsyr2k.h"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Specific implementations to get the memory-type based on a template argument
|
||||
template <> const Precision Xsyr2k<float>::precision_ = Precision::kSingle;
|
||||
template <> const Precision Xsyr2k<double>::precision_ = Precision::kDouble;
|
||||
template <> const Precision Xsyr2k<float2>::precision_ = Precision::kComplexSingle;
|
||||
template <> const Precision Xsyr2k<double2>::precision_ = Precision::kComplexDouble;
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Constructor: forwards to base class constructor
|
||||
template <typename T>
|
||||
Xsyr2k<T>::Xsyr2k(CommandQueue &queue, Event &event):
|
||||
Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) {
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// The main routine
|
||||
template <typename T>
|
||||
StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
|
||||
const size_t n, const size_t k,
|
||||
const T alpha,
|
||||
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
const T beta,
|
||||
const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
|
||||
|
||||
// Makes sure all dimensions are larger than zero
|
||||
if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
|
||||
|
||||
// Computes whether or not the matrices are transposed in memory. This is based on their layout
|
||||
// (row or column-major) and whether or not they are requested to be pre-transposed.
|
||||
auto ab_rotated = (layout == Layout::kColMajor && ab_transpose != Transpose::kNo) ||
|
||||
(layout == Layout::kRowMajor && ab_transpose == Transpose::kNo);
|
||||
auto c_rotated = (layout == Layout::kRowMajor);
|
||||
|
||||
// In case of complex data-types, the transpose can also become a conjugate transpose
|
||||
auto ab_conjugate = (ab_transpose == Transpose::kConjugate);
|
||||
|
||||
// Computes the first and second dimensions of the A and B matrices taking the layout into account
|
||||
auto ab_one = (ab_rotated) ? k : n;
|
||||
auto ab_two = (ab_rotated) ? n : k;
|
||||
|
||||
// Tests the matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and
|
||||
// their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
|
||||
// OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
|
||||
// space. Also tests that the leading dimensions of:
|
||||
// matrix A cannot be less than N when rotated, or less than K when not-rotated
|
||||
// matrix B cannot be less than N when rotated, or less than K when not-rotated
|
||||
// matrix C cannot be less than N
|
||||
auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld, sizeof(T));
|
||||
if (ErrorIn(status)) { return status; }
|
||||
status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld, sizeof(T));
|
||||
if (ErrorIn(status)) { return status; }
|
||||
status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Calculates the ceiled versions of n and k
|
||||
auto n_ceiled = Ceil(n, db_["NWG"]);
|
||||
auto k_ceiled = Ceil(k, db_["KWG"]);
|
||||
|
||||
// Decides which kernel to run: the upper-triangular or lower-triangular version
|
||||
auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
|
||||
|
||||
// Allocates space on the device for padded and/or transposed input and output matrices.
|
||||
try {
|
||||
auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
|
||||
auto temp_b = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
|
||||
auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
|
||||
|
||||
// Loads the program from the database
|
||||
auto& program = GetProgramFromCache();
|
||||
|
||||
// Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to
|
||||
// fill them up until they reach a certain multiple of size (kernel parameter dependent).
|
||||
status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer,
|
||||
n_ceiled, k_ceiled, n_ceiled, 0, temp_a,
|
||||
ab_rotated, ab_conjugate, true, false, false, program);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer,
|
||||
n_ceiled, k_ceiled, n_ceiled, 0, temp_b,
|
||||
ab_rotated, ab_conjugate, true, false, false, program);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
|
||||
// modify the other triangle.
|
||||
status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
|
||||
n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
|
||||
c_rotated, false, true, false, false, program);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
|
||||
try {
|
||||
auto kernel = Kernel(program, kernel_name);
|
||||
|
||||
// Sets the kernel arguments
|
||||
kernel.SetArgument(0, static_cast<int>(n_ceiled));
|
||||
kernel.SetArgument(1, static_cast<int>(k_ceiled));
|
||||
kernel.SetArgument(2, alpha);
|
||||
kernel.SetArgument(3, beta);
|
||||
kernel.SetArgument(4, temp_a());
|
||||
kernel.SetArgument(5, temp_b());
|
||||
kernel.SetArgument(6, temp_c());
|
||||
|
||||
// Computes the global and local thread sizes
|
||||
auto global = std::vector<size_t>{
|
||||
(n_ceiled * db_["MDIMC"]) / db_["MWG"],
|
||||
(n_ceiled * db_["NDIMC"]) / db_["NWG"]
|
||||
};
|
||||
auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
|
||||
|
||||
// Launches the kernel
|
||||
status = RunKernel(kernel, global, local);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Swaps the arguments for matrices A and B, and sets 'beta' to 1
|
||||
auto one = static_cast<T>(1);
|
||||
kernel.SetArgument(3, one);
|
||||
kernel.SetArgument(4, temp_b());
|
||||
kernel.SetArgument(5, temp_a());
|
||||
|
||||
// Runs the kernel again
|
||||
status = RunKernel(kernel, global, local);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Runs the post-processing kernel
|
||||
auto upper = (triangle == Triangle::kUpper);
|
||||
auto lower = (triangle == Triangle::kLower);
|
||||
status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
|
||||
n, n, c_ld, c_offset, c_buffer,
|
||||
c_rotated, false, false, upper, lower, program);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Successfully finished the computation
|
||||
return StatusCode::kSuccess;
|
||||
} catch (...) { return StatusCode::kInvalidKernel; }
|
||||
} catch (...) { return StatusCode::kTempBufferAllocFailure; }
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Compiles the templated class
|
||||
template class Xsyr2k<float>;
|
||||
template class Xsyr2k<double>;
|
||||
template class Xsyr2k<float2>;
|
||||
template class Xsyr2k<double2>;
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
150
src/routines/xsyrk.cc
Normal file
150
src/routines/xsyrk.cc
Normal file
|
@ -0,0 +1,150 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file implements the Xsyrk class (see the header for information about the class).
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "internal/routines/xsyrk.h"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Specific implementations to get the memory-type based on a template argument
|
||||
template <> const Precision Xsyrk<float>::precision_ = Precision::kSingle;
|
||||
template <> const Precision Xsyrk<double>::precision_ = Precision::kDouble;
|
||||
template <> const Precision Xsyrk<float2>::precision_ = Precision::kComplexSingle;
|
||||
template <> const Precision Xsyrk<double2>::precision_ = Precision::kComplexDouble;
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Constructor: forwards to base class constructor
|
||||
template <typename T>
|
||||
Xsyrk<T>::Xsyrk(CommandQueue &queue, Event &event):
|
||||
Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) {
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// The main routine
|
||||
template <typename T>
|
||||
StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
|
||||
const size_t n, const size_t k,
|
||||
const T alpha,
|
||||
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const T beta,
|
||||
const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
|
||||
|
||||
// Makes sure all dimensions are larger than zero
|
||||
if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
|
||||
|
||||
// Computes whether or not the matrices are transposed in memory. This is based on their layout
|
||||
// (row or column-major) and whether or not they are requested to be pre-transposed.
|
||||
auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) ||
|
||||
(layout == Layout::kRowMajor && a_transpose == Transpose::kNo);
|
||||
auto c_rotated = (layout == Layout::kRowMajor);
|
||||
|
||||
// In case of complex data-types, the transpose can also become a conjugate transpose
|
||||
auto a_conjugate = (a_transpose == Transpose::kConjugate);
|
||||
|
||||
// Computes the first and second dimensions of the A matrix taking the layout into account
|
||||
auto a_one = (a_rotated) ? k : n;
|
||||
auto a_two = (a_rotated) ? n : k;
|
||||
|
||||
// Tests the two matrices (A, C) for validity, first from a perspective of the OpenCL buffers and
|
||||
// their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
|
||||
// OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
|
||||
// space. Also tests that the leading dimensions of:
|
||||
// matrix A cannot be less than N when rotated, or less than K when not-rotated
|
||||
// matrix C cannot be less than N
|
||||
auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
|
||||
if (ErrorIn(status)) { return status; }
|
||||
status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Calculates the ceiled versions of n and k
|
||||
auto n_ceiled = Ceil(n, db_["NWG"]);
|
||||
auto k_ceiled = Ceil(k, db_["KWG"]);
|
||||
|
||||
// Decides which kernel to run: the upper-triangular or lower-triangular version
|
||||
auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
|
||||
|
||||
// Allocates space on the device for padded and/or transposed input and output matrices.
|
||||
try {
|
||||
auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
|
||||
auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
|
||||
|
||||
// Loads the program from the database
|
||||
auto& program = GetProgramFromCache();
|
||||
|
||||
// Runs the pre-processing kernel. This transposes the matrix A, but also pads zeros to
|
||||
// fill them up until they reach a certain multiple of size (kernel parameter dependent).
|
||||
status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
|
||||
n_ceiled, k_ceiled, n_ceiled, 0, temp_a,
|
||||
a_rotated, a_conjugate, true, false, false, program);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
|
||||
// modify the other triangle.
|
||||
status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
|
||||
n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
|
||||
c_rotated, false, true, false, false, program);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
|
||||
try {
|
||||
auto kernel = Kernel(program, kernel_name);
|
||||
|
||||
// Sets the kernel arguments
|
||||
kernel.SetArgument(0, static_cast<int>(n_ceiled));
|
||||
kernel.SetArgument(1, static_cast<int>(k_ceiled));
|
||||
kernel.SetArgument(2, alpha);
|
||||
kernel.SetArgument(3, beta);
|
||||
kernel.SetArgument(4, temp_a());
|
||||
kernel.SetArgument(5, temp_a());
|
||||
kernel.SetArgument(6, temp_c());
|
||||
|
||||
// Computes the global and local thread sizes
|
||||
auto global = std::vector<size_t>{
|
||||
(n_ceiled * db_["MDIMC"]) / db_["MWG"],
|
||||
(n_ceiled * db_["NDIMC"]) / db_["NWG"]
|
||||
};
|
||||
auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
|
||||
|
||||
// Launches the kernel
|
||||
status = RunKernel(kernel, global, local);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Runs the post-processing kernel
|
||||
auto upper = (triangle == Triangle::kUpper);
|
||||
auto lower = (triangle == Triangle::kLower);
|
||||
status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
|
||||
n, n, c_ld, c_offset, c_buffer,
|
||||
c_rotated, false, false, upper, lower, program);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Successfully finished the computation
|
||||
return StatusCode::kSuccess;
|
||||
} catch (...) { return StatusCode::kInvalidKernel; }
|
||||
} catch (...) { return StatusCode::kTempBufferAllocFailure; }
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Compiles the templated class
|
||||
template class Xsyrk<float>;
|
||||
template class Xsyrk<double>;
|
||||
template class Xsyrk<float2>;
|
||||
template class Xsyrk<double2>;
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
|
@ -76,7 +76,7 @@ void XgemmTest(int argc, char *argv[], const bool silent, const std::string &nam
|
|||
const auto case_name = ToString(layout)+" "+ToString(a_transpose)+" "+ToString(b_transpose);
|
||||
|
||||
// Runs the tests
|
||||
tester.TestRegular(args, case_name);
|
||||
tester.TestRegular(args, case_name, false);
|
||||
tester.TestInvalidBufferSizes(args, case_name);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -76,7 +76,7 @@ void XsymmTest(int argc, char *argv[], const bool silent, const std::string &nam
|
|||
const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle);
|
||||
|
||||
// Runs the tests
|
||||
tester.TestRegular(args, case_name);
|
||||
tester.TestRegular(args, case_name, true);
|
||||
tester.TestInvalidBufferSizes(args, case_name);
|
||||
}
|
||||
}
|
||||
|
|
99
test/correctness/routines/xsyr2k.cc
Normal file
99
test/correctness/routines/xsyr2k.cc
Normal file
|
@ -0,0 +1,99 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file implements the tests for the Xsyr2k routine. It is based on the TestABC class.
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "wrapper_clblas.h"
|
||||
#include "correctness/testabc.h"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison.
|
||||
template <typename T>
|
||||
void Xsyr2kTest(int argc, char *argv[], const bool silent, const std::string &name) {
|
||||
|
||||
// Creates the CLBlast lambda
|
||||
auto clblast_lambda = [](const Arguments<T> &args,
|
||||
const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
|
||||
CommandQueue &queue) -> StatusCode {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
return Syr2k(args.layout, args.triangle, args.a_transpose,
|
||||
args.n, args.k,
|
||||
args.alpha,
|
||||
a_mat(), args.a_offset, args.a_ld,
|
||||
b_mat(), args.b_offset, args.b_ld,
|
||||
args.beta,
|
||||
c_mat(), args.c_offset, args.c_ld,
|
||||
&queue_plain, &event);
|
||||
};
|
||||
|
||||
// Creates the clBLAS lambda (for comparison)
|
||||
auto clblas_lambda = [](const Arguments<T> &args,
|
||||
const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
|
||||
CommandQueue &queue) -> StatusCode {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXsyr2k(static_cast<clblasOrder>(args.layout),
|
||||
static_cast<clblasUplo>(args.triangle),
|
||||
static_cast<clblasTranspose>(args.a_transpose),
|
||||
args.n, args.k,
|
||||
args.alpha,
|
||||
a_mat(), args.a_offset, args.a_ld,
|
||||
b_mat(), args.b_offset, args.b_ld,
|
||||
args.beta,
|
||||
c_mat(), args.c_offset, args.c_ld,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
};
|
||||
|
||||
// Initializes the arguments relevant for this routine
|
||||
auto args = Arguments<T>{};
|
||||
const auto options = std::vector<std::string>{kArgN, kArgK, kArgLayout,
|
||||
kArgTriangle, kArgATransp,
|
||||
kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
|
||||
kArgAOffset, kArgBOffset, kArgCOffset};
|
||||
|
||||
// Creates a tester
|
||||
TestABC<T> tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda};
|
||||
|
||||
// Loops over the test-cases from a data-layout point of view
|
||||
for (auto &layout: tester.kLayouts) {
|
||||
args.layout = layout;
|
||||
for (auto &triangle: {Triangle::kUpper, Triangle::kLower}) {
|
||||
args.triangle = triangle;
|
||||
for (auto &ab_transpose: {Transpose::kNo, Transpose::kYes}) { // No conjugate here since it is
|
||||
args.a_transpose = ab_transpose; // not supported by clBLAS
|
||||
args.b_transpose = ab_transpose;
|
||||
const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(ab_transpose);
|
||||
|
||||
// Runs the tests
|
||||
tester.TestRegular(args, case_name, true);
|
||||
tester.TestInvalidBufferSizes(args, case_name);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
||||
|
||||
// Main function (not within the clblast namespace)
|
||||
int main(int argc, char *argv[]) {
|
||||
clblast::Xsyr2kTest<float>(argc, argv, false, "SSYR2K");
|
||||
clblast::Xsyr2kTest<double>(argc, argv, true, "DSYR2K");
|
||||
clblast::Xsyr2kTest<clblast::float2>(argc, argv, true, "CSYR2K");
|
||||
clblast::Xsyr2kTest<clblast::double2>(argc, argv, true, "ZSYR2K");
|
||||
return 0;
|
||||
}
|
||||
|
||||
// =================================================================================================
|
96
test/correctness/routines/xsyrk.cc
Normal file
96
test/correctness/routines/xsyrk.cc
Normal file
|
@ -0,0 +1,96 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file implements the tests for the Xsyrk routine. It is based on the TestAC class.
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "wrapper_clblas.h"
|
||||
#include "correctness/testac.h"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison.
|
||||
template <typename T>
|
||||
void XsyrkTest(int argc, char *argv[], const bool silent, const std::string &name) {
|
||||
|
||||
// Creates the CLBlast lambda
|
||||
auto clblast_lambda = [](const Arguments<T> &args,
|
||||
const Buffer &a_mat, const Buffer &c_mat,
|
||||
CommandQueue &queue) -> StatusCode {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
return Syrk(args.layout, args.triangle, args.a_transpose,
|
||||
args.n, args.k,
|
||||
args.alpha,
|
||||
a_mat(), args.a_offset, args.a_ld,
|
||||
args.beta,
|
||||
c_mat(), args.c_offset, args.c_ld,
|
||||
&queue_plain, &event);
|
||||
};
|
||||
|
||||
// Creates the clBLAS lambda (for comparison)
|
||||
auto clblas_lambda = [](const Arguments<T> &args,
|
||||
const Buffer &a_mat, const Buffer &c_mat,
|
||||
CommandQueue &queue) -> StatusCode {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXsyrk(static_cast<clblasOrder>(args.layout),
|
||||
static_cast<clblasUplo>(args.triangle),
|
||||
static_cast<clblasTranspose>(args.a_transpose),
|
||||
args.n, args.k,
|
||||
args.alpha,
|
||||
a_mat(), args.a_offset, args.a_ld,
|
||||
args.beta,
|
||||
c_mat(), args.c_offset, args.c_ld,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
};
|
||||
|
||||
// Initializes the arguments relevant for this routine
|
||||
auto args = Arguments<T>{};
|
||||
const auto options = std::vector<std::string>{kArgN, kArgK, kArgLayout,
|
||||
kArgTriangle, kArgATransp,
|
||||
kArgALeadDim, kArgCLeadDim,
|
||||
kArgAOffset, kArgCOffset};
|
||||
|
||||
// Creates a tester
|
||||
TestAC<T> tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda};
|
||||
|
||||
// Loops over the test-cases from a data-layout point of view
|
||||
for (auto &layout: tester.kLayouts) {
|
||||
args.layout = layout;
|
||||
for (auto &triangle: {Triangle::kUpper, Triangle::kLower}) {
|
||||
args.triangle = triangle;
|
||||
for (auto &a_transpose: {Transpose::kNo, Transpose::kYes}) { // No conjugate here since it is
|
||||
args.a_transpose = a_transpose; // not supported by clBLAS
|
||||
const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(a_transpose);
|
||||
|
||||
// Runs the tests
|
||||
tester.TestRegular(args, case_name);
|
||||
tester.TestInvalidBufferSizes(args, case_name);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
||||
|
||||
// Main function (not within the clblast namespace)
|
||||
int main(int argc, char *argv[]) {
|
||||
clblast::XsyrkTest<float>(argc, argv, false, "SSYRK");
|
||||
clblast::XsyrkTest<double>(argc, argv, true, "DSYRK");
|
||||
clblast::XsyrkTest<clblast::float2>(argc, argv, true, "CSYRK");
|
||||
clblast::XsyrkTest<clblast::double2>(argc, argv, true, "ZSYRK");
|
||||
return 0;
|
||||
}
|
||||
|
||||
// =================================================================================================
|
192
test/correctness/testab.cc
Normal file
192
test/correctness/testab.cc
Normal file
|
@ -0,0 +1,192 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file implements the TestAB class (see the header for information about the class).
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
#include "correctness/testab.h"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Constructor, initializes the base class tester and input data
|
||||
template <typename T>
|
||||
TestAB<T>::TestAB(int argc, char *argv[], const bool silent,
|
||||
const std::string &name, const std::vector<std::string> &options,
|
||||
const Routine clblast_lambda, const Routine clblas_lambda):
|
||||
Tester<T>{argc, argv, silent, name, options},
|
||||
clblast_lambda_(clblast_lambda),
|
||||
clblas_lambda_(clblas_lambda) {
|
||||
|
||||
// Computes the maximum sizes. This allows for a single set of input/output buffers.
|
||||
auto max_dim = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
|
||||
auto max_ld = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
|
||||
auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
|
||||
|
||||
// Creates test input data
|
||||
a_source_.resize(max_dim*max_ld + max_offset);
|
||||
b_source_.resize(max_dim*max_ld + max_offset);
|
||||
PopulateVector(a_source_);
|
||||
PopulateVector(b_source_);
|
||||
}
|
||||
|
||||
// ===============================================================================================
|
||||
|
||||
// Tests the routine for a wide variety of parameters
|
||||
template <typename T>
|
||||
void TestAB<T>::TestRegular(Arguments<T> &args, const std::string &name) {
|
||||
if (!PrecisionSupported()) { return; }
|
||||
TestStart("regular behaviour", name);
|
||||
|
||||
// Computes whether or not the matrices are transposed. Note that we assume a default of
|
||||
// column-major and no-transpose. If one of them is different (but not both), then rotated
|
||||
// is considered true.
|
||||
auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
|
||||
(args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
|
||||
auto b_rotated = (args.layout == Layout::kRowMajor);
|
||||
|
||||
// Iterates over the matrix dimensions
|
||||
for (auto &m: kMatrixDims) {
|
||||
args.m = m;
|
||||
for (auto &n: kMatrixDims) {
|
||||
args.n = n;
|
||||
|
||||
// Computes the second dimensions of the matrices taking the rotation into account
|
||||
auto a_two = (a_rotated) ? n : n;
|
||||
auto b_two = (b_rotated) ? m : n;
|
||||
|
||||
// Iterates over the leading-dimension values and the offsets
|
||||
for (auto &a_ld: kMatrixDims) {
|
||||
args.a_ld = a_ld;
|
||||
for (auto &a_offset: kOffsets) {
|
||||
args.a_offset = a_offset;
|
||||
for (auto &b_ld: kMatrixDims) {
|
||||
args.b_ld = b_ld;
|
||||
for (auto &b_offset: kOffsets) {
|
||||
args.b_offset = b_offset;
|
||||
|
||||
// Computes the buffer sizes
|
||||
auto a_size = a_two * a_ld + a_offset;
|
||||
auto b_size = b_two * b_ld + b_offset;
|
||||
if (a_size < 1 || b_size < 1) { continue; }
|
||||
|
||||
// Creates the OpenCL buffers
|
||||
auto a_mat = Buffer(context_, CL_MEM_READ_WRITE, a_size*sizeof(T));
|
||||
auto r_mat = Buffer(context_, CL_MEM_READ_WRITE, b_size*sizeof(T));
|
||||
auto s_mat = Buffer(context_, CL_MEM_READ_WRITE, b_size*sizeof(T));
|
||||
|
||||
// Iterates over the values for alpha and beta
|
||||
for (auto &alpha: kAlphaValues) {
|
||||
args.alpha = alpha;
|
||||
for (auto &beta: kBetaValues) {
|
||||
args.beta = beta;
|
||||
|
||||
// Runs the reference clBLAS code
|
||||
a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
|
||||
r_mat.WriteBuffer(queue_, b_size*sizeof(T), b_source_);
|
||||
auto status1 = clblas_lambda_(args, a_mat, r_mat, queue_);
|
||||
|
||||
// Runs the CLBlast code
|
||||
a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
|
||||
s_mat.WriteBuffer(queue_, b_size*sizeof(T), b_source_);
|
||||
auto status2 = clblast_lambda_(args, a_mat, s_mat, queue_);
|
||||
|
||||
// Tests for equality of the two status codes
|
||||
if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
|
||||
TestErrorCodes(status1, status2, args);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Downloads the results
|
||||
std::vector<T> r_result(b_size, static_cast<T>(0));
|
||||
std::vector<T> s_result(b_size, static_cast<T>(0));
|
||||
r_mat.ReadBuffer(queue_, b_size*sizeof(T), r_result);
|
||||
s_mat.ReadBuffer(queue_, b_size*sizeof(T), s_result);
|
||||
|
||||
// Checks for differences in the output
|
||||
auto errors = size_t{0};
|
||||
for (auto idm=size_t{0}; idm<m; ++idm) {
|
||||
for (auto idn=size_t{0}; idn<n; ++idn) {
|
||||
auto index = (args.layout == Layout::kRowMajor) ?
|
||||
idm*args.b_ld + idn + args.b_offset:
|
||||
idn*args.b_ld + idm + args.b_offset;
|
||||
if (!TestSimilarity(r_result[index], s_result[index])) {
|
||||
errors++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Tests the error count (should be zero)
|
||||
TestErrorCount(errors, m*n, args);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
TestEnd();
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Tests the routine for cases with invalid OpenCL memory buffer sizes. Tests only on return-types,
|
||||
// does not test for results (if any).
|
||||
template <typename T>
|
||||
void TestAB<T>::TestInvalidBufferSizes(Arguments<T> &args, const std::string &name) {
|
||||
if (!PrecisionSupported()) { return; }
|
||||
TestStart("invalid buffer sizes", name);
|
||||
|
||||
// Sets example test parameters
|
||||
args.m = kBufferSize;
|
||||
args.n = kBufferSize;
|
||||
args.a_ld = kBufferSize;
|
||||
args.b_ld = kBufferSize;
|
||||
args.a_offset = 0;
|
||||
args.b_offset = 0;
|
||||
|
||||
// Iterates over test buffer sizes
|
||||
const std::vector<size_t> kBufferSizes = {0, kBufferSize*kBufferSize-1, kBufferSize*kBufferSize};
|
||||
for (auto &a_size: kBufferSizes) {
|
||||
for (auto &b_size: kBufferSizes) {
|
||||
|
||||
// Creates the OpenCL buffers. Note: we are not using the C++ version since we explicitly
|
||||
// want to be able to create invalid buffers (no error checking here).
|
||||
auto a = clCreateBuffer(context_(), CL_MEM_READ_WRITE, a_size*sizeof(T), nullptr, nullptr);
|
||||
auto a_mat = Buffer(a);
|
||||
auto r = clCreateBuffer(context_(), CL_MEM_READ_WRITE, b_size*sizeof(T), nullptr, nullptr);
|
||||
auto r_mat = Buffer(r);
|
||||
auto s = clCreateBuffer(context_(), CL_MEM_READ_WRITE, b_size*sizeof(T), nullptr, nullptr);
|
||||
auto s_mat = Buffer(s);
|
||||
|
||||
// Runs the two routines
|
||||
auto status1 = clblas_lambda_(args, a_mat, r_mat, queue_);
|
||||
auto status2 = clblast_lambda_(args, a_mat, s_mat, queue_);
|
||||
|
||||
// Tests for equality of the two status codes
|
||||
TestErrorCodes(status1, status2, args);
|
||||
}
|
||||
}
|
||||
TestEnd();
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Compiles the templated class
|
||||
template class TestAB<float>;
|
||||
template class TestAB<double>;
|
||||
template class TestAB<float2>;
|
||||
template class TestAB<double2>;
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
85
test/correctness/testab.h
Normal file
85
test/correctness/testab.h
Normal file
|
@ -0,0 +1,85 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file tests any mat-mat (A,B) routine. It contains two types of tests: one testing
|
||||
// all sorts of input combinations, and one deliberatly testing with invalid values.
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#ifndef CLBLAST_TEST_CORRECTNESS_TESTAB_H_
|
||||
#define CLBLAST_TEST_CORRECTNESS_TESTAB_H_
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#include "correctness/tester.h"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// See comment at top of file for a description of the class
|
||||
template <typename T>
|
||||
class TestAB: public Tester<T> {
|
||||
public:
|
||||
|
||||
// Uses several variables from the Tester class
|
||||
using Tester<T>::context_;
|
||||
using Tester<T>::queue_;
|
||||
using Tester<T>::kLayouts;
|
||||
using Tester<T>::kTransposes;
|
||||
|
||||
// Uses several helper functions from the Tester class
|
||||
using Tester<T>::TestStart;
|
||||
using Tester<T>::TestEnd;
|
||||
using Tester<T>::TestSimilarity;
|
||||
using Tester<T>::TestErrorCount;
|
||||
using Tester<T>::TestErrorCodes;
|
||||
using Tester<T>::GetExampleScalars;
|
||||
using Tester<T>::GetOffsets;
|
||||
using Tester<T>::PrecisionSupported;
|
||||
|
||||
// Test settings for the regular test. Append to this list in case more tests are required.
|
||||
const std::vector<size_t> kMatrixDims = { 7, 64 };
|
||||
const std::vector<size_t> kOffsets = GetOffsets();
|
||||
const std::vector<T> kAlphaValues = GetExampleScalars();
|
||||
const std::vector<T> kBetaValues = GetExampleScalars();
|
||||
|
||||
// Test settings for the invalid test
|
||||
const size_t kBufferSize = 64;
|
||||
|
||||
// Shorthand for a BLAS routine
|
||||
using Routine = std::function<StatusCode(const Arguments<T>&,
|
||||
const Buffer&, const Buffer&,
|
||||
CommandQueue&)>;
|
||||
|
||||
// Constructor, initializes the base class tester and input data
|
||||
TestAB(int argc, char *argv[], const bool silent,
|
||||
const std::string &name, const std::vector<std::string> &options,
|
||||
const Routine clblast_lambda, const Routine clblas_lambda);
|
||||
|
||||
// The test functions, taking no inputs
|
||||
void TestRegular(Arguments<T> &args, const std::string &name);
|
||||
void TestInvalidBufferSizes(Arguments<T> &args, const std::string &name);
|
||||
|
||||
private:
|
||||
|
||||
// Source data to test with
|
||||
std::vector<T> a_source_;
|
||||
std::vector<T> b_source_;
|
||||
|
||||
// The routines to test
|
||||
Routine clblast_lambda_;
|
||||
Routine clblas_lambda_;
|
||||
};
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
||||
|
||||
// CLBLAST_TEST_CORRECTNESS_TESTAB_H_
|
||||
#endif
|
|
@ -45,7 +45,7 @@ TestABC<T>::TestABC(int argc, char *argv[], const bool silent,
|
|||
|
||||
// Tests the routine for a wide variety of parameters
|
||||
template <typename T>
|
||||
void TestABC<T>::TestRegular(Arguments<T> &args, const std::string &name) {
|
||||
void TestABC<T>::TestRegular(Arguments<T> &args, const std::string &name, const bool symmetric) {
|
||||
if (!PrecisionSupported()) { return; }
|
||||
TestStart("regular behaviour", name);
|
||||
|
||||
|
@ -63,6 +63,7 @@ void TestABC<T>::TestRegular(Arguments<T> &args, const std::string &name) {
|
|||
args.m = m;
|
||||
for (auto &n: kMatrixDims) {
|
||||
args.n = n;
|
||||
if (symmetric && m != n) { continue; }
|
||||
for (auto &k: kMatrixDims) {
|
||||
args.k = k;
|
||||
|
||||
|
|
|
@ -64,7 +64,7 @@ class TestABC: public Tester<T> {
|
|||
const Routine clblast_lambda, const Routine clblas_lambda);
|
||||
|
||||
// The test functions, taking no inputs
|
||||
void TestRegular(Arguments<T> &args, const std::string &name);
|
||||
void TestRegular(Arguments<T> &args, const std::string &name, const bool symmetric);
|
||||
void TestInvalidBufferSizes(Arguments<T> &args, const std::string &name);
|
||||
|
||||
private:
|
||||
|
|
191
test/correctness/testac.cc
Normal file
191
test/correctness/testac.cc
Normal file
|
@ -0,0 +1,191 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file implements the TestAC class (see the header for information about the class).
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
#include "correctness/testac.h"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Constructor, initializes the base class tester and input data
|
||||
template <typename T>
|
||||
TestAC<T>::TestAC(int argc, char *argv[], const bool silent,
|
||||
const std::string &name, const std::vector<std::string> &options,
|
||||
const Routine clblast_lambda, const Routine clblas_lambda):
|
||||
Tester<T>{argc, argv, silent, name, options},
|
||||
clblast_lambda_(clblast_lambda),
|
||||
clblas_lambda_(clblas_lambda) {
|
||||
|
||||
// Computes the maximum sizes. This allows for a single set of input/output buffers.
|
||||
auto max_dim = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
|
||||
auto max_ld = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
|
||||
auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
|
||||
|
||||
// Creates test input data
|
||||
a_source_.resize(max_dim*max_ld + max_offset);
|
||||
c_source_.resize(max_dim*max_ld + max_offset);
|
||||
PopulateVector(a_source_);
|
||||
PopulateVector(c_source_);
|
||||
}
|
||||
|
||||
// ===============================================================================================
|
||||
|
||||
// Tests the routine for a wide variety of parameters
|
||||
template <typename T>
|
||||
void TestAC<T>::TestRegular(Arguments<T> &args, const std::string &name) {
|
||||
if (!PrecisionSupported()) { return; }
|
||||
TestStart("regular behaviour", name);
|
||||
|
||||
// Computes whether or not the matrices are transposed. Note that we assume a default of
|
||||
// column-major and no-transpose. If one of them is different (but not both), then rotated
|
||||
// is considered true.
|
||||
auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
|
||||
(args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
|
||||
auto c_rotated = (args.layout == Layout::kRowMajor);
|
||||
|
||||
// Iterates over the matrix dimensions
|
||||
for (auto &n: kMatrixDims) {
|
||||
args.n = n;
|
||||
for (auto &k: kMatrixDims) {
|
||||
args.k = k;
|
||||
|
||||
// Computes the second dimensions of the matrices taking the rotation into account
|
||||
auto a_two = (a_rotated) ? n : k;
|
||||
auto c_two = (c_rotated) ? n : n;
|
||||
|
||||
// Iterates over the leading-dimension values and the offsets
|
||||
for (auto &a_ld: kMatrixDims) {
|
||||
args.a_ld = a_ld;
|
||||
for (auto &a_offset: kOffsets) {
|
||||
args.a_offset = a_offset;
|
||||
for (auto &c_ld: kMatrixDims) {
|
||||
args.c_ld = c_ld;
|
||||
for (auto &c_offset: kOffsets) {
|
||||
args.c_offset = c_offset;
|
||||
|
||||
// Computes the buffer sizes
|
||||
auto a_size = a_two * a_ld + a_offset;
|
||||
auto c_size = c_two * c_ld + c_offset;
|
||||
if (a_size < 1 || c_size < 1) { continue; }
|
||||
|
||||
// Creates the OpenCL buffers
|
||||
auto a_mat = Buffer(context_, CL_MEM_READ_WRITE, a_size*sizeof(T));
|
||||
auto r_mat = Buffer(context_, CL_MEM_READ_WRITE, c_size*sizeof(T));
|
||||
auto s_mat = Buffer(context_, CL_MEM_READ_WRITE, c_size*sizeof(T));
|
||||
|
||||
// Iterates over the values for alpha and beta
|
||||
for (auto &alpha: kAlphaValues) {
|
||||
args.alpha = alpha;
|
||||
for (auto &beta: kBetaValues) {
|
||||
args.beta = beta;
|
||||
|
||||
// Runs the reference clBLAS code
|
||||
a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
|
||||
r_mat.WriteBuffer(queue_, c_size*sizeof(T), c_source_);
|
||||
auto status1 = clblas_lambda_(args, a_mat, r_mat, queue_);
|
||||
|
||||
// Runs the CLBlast code
|
||||
a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
|
||||
s_mat.WriteBuffer(queue_, c_size*sizeof(T), c_source_);
|
||||
auto status2 = clblast_lambda_(args, a_mat, s_mat, queue_);
|
||||
|
||||
// Tests for equality of the two status codes
|
||||
if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
|
||||
TestErrorCodes(status1, status2, args);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Downloads the results
|
||||
std::vector<T> r_result(c_size, static_cast<T>(0));
|
||||
std::vector<T> s_result(c_size, static_cast<T>(0));
|
||||
r_mat.ReadBuffer(queue_, c_size*sizeof(T), r_result);
|
||||
s_mat.ReadBuffer(queue_, c_size*sizeof(T), s_result);
|
||||
|
||||
// Checks for differences in the output
|
||||
auto errors = size_t{0};
|
||||
for (auto idn0=size_t{0}; idn0<n; ++idn0) {
|
||||
for (auto idn1=size_t{0}; idn1<n; ++idn1) {
|
||||
auto index = idn0*args.c_ld + idn1 + args.c_offset;
|
||||
if (!TestSimilarity(r_result[index], s_result[index])) {
|
||||
errors++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Tests the error count (should be zero)
|
||||
TestErrorCount(errors, n*n, args);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
TestEnd();
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Tests the routine for cases with invalid OpenCL memory buffer sizes. Tests only on return-types,
|
||||
// does not test for results (if any).
|
||||
template <typename T>
|
||||
void TestAC<T>::TestInvalidBufferSizes(Arguments<T> &args, const std::string &name) {
|
||||
if (!PrecisionSupported()) { return; }
|
||||
TestStart("invalid buffer sizes", name);
|
||||
|
||||
// Sets example test parameters
|
||||
args.m = kBufferSize;
|
||||
args.n = kBufferSize;
|
||||
args.k = kBufferSize;
|
||||
args.a_ld = kBufferSize;
|
||||
args.c_ld = kBufferSize;
|
||||
args.a_offset = 0;
|
||||
args.c_offset = 0;
|
||||
|
||||
// Iterates over test buffer sizes
|
||||
const std::vector<size_t> kBufferSizes = {0, kBufferSize*kBufferSize-1, kBufferSize*kBufferSize};
|
||||
for (auto &a_size: kBufferSizes) {
|
||||
for (auto &c_size: kBufferSizes) {
|
||||
|
||||
// Creates the OpenCL buffers. Note: we are not using the C++ version since we explicitly
|
||||
// want to be able to create invalid buffers (no error checking here).
|
||||
auto a = clCreateBuffer(context_(), CL_MEM_READ_WRITE, a_size*sizeof(T), nullptr, nullptr);
|
||||
auto a_mat = Buffer(a);
|
||||
auto r = clCreateBuffer(context_(), CL_MEM_READ_WRITE, c_size*sizeof(T), nullptr, nullptr);
|
||||
auto r_mat = Buffer(r);
|
||||
auto s = clCreateBuffer(context_(), CL_MEM_READ_WRITE, c_size*sizeof(T), nullptr, nullptr);
|
||||
auto s_mat = Buffer(s);
|
||||
|
||||
// Runs the two routines
|
||||
auto status1 = clblas_lambda_(args, a_mat, r_mat, queue_);
|
||||
auto status2 = clblast_lambda_(args, a_mat, s_mat, queue_);
|
||||
|
||||
// Tests for equality of the two status codes
|
||||
TestErrorCodes(status1, status2, args);
|
||||
}
|
||||
}
|
||||
TestEnd();
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Compiles the templated class
|
||||
template class TestAC<float>;
|
||||
template class TestAC<double>;
|
||||
template class TestAC<float2>;
|
||||
template class TestAC<double2>;
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
85
test/correctness/testac.h
Normal file
85
test/correctness/testac.h
Normal file
|
@ -0,0 +1,85 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file tests any mat-mat (A,C) routine. It contains two types of tests: one testing
|
||||
// all sorts of input combinations, and one deliberatly testing with invalid values.
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#ifndef CLBLAST_TEST_CORRECTNESS_TESTAC_H_
|
||||
#define CLBLAST_TEST_CORRECTNESS_TESTAC_H_
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#include "correctness/tester.h"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// See comment at top of file for a description of the class
|
||||
template <typename T>
|
||||
class TestAC: public Tester<T> {
|
||||
public:
|
||||
|
||||
// Uses several variables from the Tester class
|
||||
using Tester<T>::context_;
|
||||
using Tester<T>::queue_;
|
||||
using Tester<T>::kLayouts;
|
||||
using Tester<T>::kTransposes;
|
||||
|
||||
// Uses several helper functions from the Tester class
|
||||
using Tester<T>::TestStart;
|
||||
using Tester<T>::TestEnd;
|
||||
using Tester<T>::TestSimilarity;
|
||||
using Tester<T>::TestErrorCount;
|
||||
using Tester<T>::TestErrorCodes;
|
||||
using Tester<T>::GetExampleScalars;
|
||||
using Tester<T>::GetOffsets;
|
||||
using Tester<T>::PrecisionSupported;
|
||||
|
||||
// Test settings for the regular test. Append to this list in case more tests are required.
|
||||
const std::vector<size_t> kMatrixDims = { 7, 64 };
|
||||
const std::vector<size_t> kOffsets = GetOffsets();
|
||||
const std::vector<T> kAlphaValues = GetExampleScalars();
|
||||
const std::vector<T> kBetaValues = GetExampleScalars();
|
||||
|
||||
// Test settings for the invalid test
|
||||
const size_t kBufferSize = 64;
|
||||
|
||||
// Shorthand for a BLAS routine
|
||||
using Routine = std::function<StatusCode(const Arguments<T>&,
|
||||
const Buffer&, const Buffer&,
|
||||
CommandQueue&)>;
|
||||
|
||||
// Constructor, initializes the base class tester and input data
|
||||
TestAC(int argc, char *argv[], const bool silent,
|
||||
const std::string &name, const std::vector<std::string> &options,
|
||||
const Routine clblast_lambda, const Routine clblas_lambda);
|
||||
|
||||
// The test functions, taking no inputs
|
||||
void TestRegular(Arguments<T> &args, const std::string &name);
|
||||
void TestInvalidBufferSizes(Arguments<T> &args, const std::string &name);
|
||||
|
||||
private:
|
||||
|
||||
// Source data to test with
|
||||
std::vector<T> a_source_;
|
||||
std::vector<T> c_source_;
|
||||
|
||||
// The routines to test
|
||||
Routine clblast_lambda_;
|
||||
Routine clblas_lambda_;
|
||||
};
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
||||
|
||||
// CLBLAST_TEST_CORRECTNESS_TESTAC_H_
|
||||
#endif
|
|
@ -115,7 +115,7 @@ void ClientAXY(int argc, char *argv[], Routine3<T> client_routine,
|
|||
auto a_two = (args.layout == Layout::kRowMajor) ? args.m : args.n;
|
||||
|
||||
// Computes the vector sizes in case the matrix is transposed
|
||||
auto a_transposed = (args.a_transpose == Transpose::kYes);
|
||||
auto a_transposed = (args.a_transpose != Transpose::kNo);
|
||||
auto m_real = (a_transposed) ? args.n : args.m;
|
||||
auto n_real = (a_transposed) ? args.m : args.n;
|
||||
|
||||
|
@ -163,13 +163,13 @@ template void ClientAXY<double2>(int, char **, Routine3<double2>, const std::vec
|
|||
|
||||
// =================================================================================================
|
||||
|
||||
// This is the matrix-matrix-matrix variant of the set-up/tear-down client routine.
|
||||
// This is the matrix-matrix variant of the set-up/tear-down client routine.
|
||||
template <typename T>
|
||||
void ClientABC(int argc, char *argv[], Routine3<T> client_routine,
|
||||
const std::vector<std::string> &options) {
|
||||
void ClientAC(int argc, char *argv[], Routine2<T> client_routine,
|
||||
const std::vector<std::string> &options) {
|
||||
|
||||
// Function to determine how to find the default value of the leading dimension of matrix A
|
||||
auto default_ld_a = [](const Arguments<T> args) { return args.m; };
|
||||
auto default_ld_a = [](const Arguments<T> args) { return args.k; };
|
||||
|
||||
// Simple command line argument parser with defaults
|
||||
auto args = ParseArguments<T>(argc, argv, options, default_ld_a);
|
||||
|
@ -188,9 +188,83 @@ void ClientABC(int argc, char *argv[], Routine3<T> client_routine,
|
|||
// Computes whether or not the matrices are transposed. Note that we assume a default of
|
||||
// column-major and no-transpose. If one of them is different (but not both), then rotated
|
||||
// is considered true.
|
||||
auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose == Transpose::kYes) ||
|
||||
auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
|
||||
(args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
|
||||
auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose == Transpose::kYes) ||
|
||||
|
||||
// Iterates over all "num_step" values jumping by "step" each time
|
||||
auto s = size_t{0};
|
||||
while(true) {
|
||||
|
||||
// Computes the data sizes
|
||||
auto a_two = (a_rotated) ? args.n : args.k;
|
||||
auto a_size = a_two * args.a_ld + args.a_offset;
|
||||
auto c_size = args.n * args.c_ld + args.c_offset;
|
||||
|
||||
// Populates input host matrices with random data
|
||||
std::vector<T> a_source(a_size);
|
||||
std::vector<T> c_source(c_size);
|
||||
PopulateVector(a_source);
|
||||
PopulateVector(c_source);
|
||||
|
||||
// Creates the matrices on the device
|
||||
auto a_buffer = Buffer(context, CL_MEM_READ_WRITE, a_size*sizeof(T));
|
||||
auto c_buffer = Buffer(context, CL_MEM_READ_WRITE, c_size*sizeof(T));
|
||||
a_buffer.WriteBuffer(queue, a_size*sizeof(T), a_source);
|
||||
c_buffer.WriteBuffer(queue, c_size*sizeof(T), c_source);
|
||||
|
||||
// Runs the routine-specific code
|
||||
client_routine(args, a_buffer, c_buffer, queue);
|
||||
|
||||
// Makes the jump to the next step
|
||||
++s;
|
||||
if (s >= args.num_steps) { break; }
|
||||
args.n += args.step;
|
||||
args.k += args.step;
|
||||
args.a_ld += args.step;
|
||||
args.c_ld += args.step;
|
||||
}
|
||||
|
||||
// Cleans-up and returns
|
||||
if (args.compare_clblas) { clblasTeardown(); }
|
||||
}
|
||||
|
||||
// Compiles the above function
|
||||
template void ClientAC<float>(int, char **, Routine2<float>, const std::vector<std::string>&);
|
||||
template void ClientAC<double>(int, char **, Routine2<double>, const std::vector<std::string>&);
|
||||
template void ClientAC<float2>(int, char **, Routine2<float2>, const std::vector<std::string>&);
|
||||
template void ClientAC<double2>(int, char **, Routine2<double2>, const std::vector<std::string>&);
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// This is the matrix-matrix-matrix variant of the set-up/tear-down client routine.
|
||||
template <typename T>
|
||||
void ClientABC(int argc, char *argv[], Routine3<T> client_routine,
|
||||
const std::vector<std::string> &options, const bool symmetric) {
|
||||
|
||||
// Function to determine how to find the default value of the leading dimension of matrix A
|
||||
auto default_ld_a = [&symmetric](const Arguments<T> args) { return (symmetric) ? args.n : args.m; };
|
||||
|
||||
// Simple command line argument parser with defaults
|
||||
auto args = ParseArguments<T>(argc, argv, options, default_ld_a);
|
||||
if (args.print_help) { return; }
|
||||
if (symmetric) { args.m = args.n; }
|
||||
|
||||
// Prints the header of the output table
|
||||
PrintTableHeader(args.silent, options);
|
||||
|
||||
// Initializes OpenCL and the libraries
|
||||
auto platform = Platform(args.platform_id);
|
||||
auto device = Device(platform, kDeviceType, args.device_id);
|
||||
auto context = Context(device);
|
||||
auto queue = CommandQueue(context, device);
|
||||
if (args.compare_clblas) { clblasSetup(); }
|
||||
|
||||
// Computes whether or not the matrices are transposed. Note that we assume a default of
|
||||
// column-major and no-transpose. If one of them is different (but not both), then rotated
|
||||
// is considered true.
|
||||
auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
|
||||
(args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
|
||||
auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose != Transpose::kNo) ||
|
||||
(args.layout == Layout::kRowMajor && args.b_transpose == Transpose::kNo);
|
||||
auto c_rotated = (args.layout == Layout::kRowMajor);
|
||||
|
||||
|
@ -241,10 +315,10 @@ void ClientABC(int argc, char *argv[], Routine3<T> client_routine,
|
|||
}
|
||||
|
||||
// Compiles the above function
|
||||
template void ClientABC<float>(int, char **, Routine3<float>, const std::vector<std::string>&);
|
||||
template void ClientABC<double>(int, char **, Routine3<double>, const std::vector<std::string>&);
|
||||
template void ClientABC<float2>(int, char **, Routine3<float2>, const std::vector<std::string>&);
|
||||
template void ClientABC<double2>(int, char **, Routine3<double2>, const std::vector<std::string>&);
|
||||
template void ClientABC<float>(int, char **, Routine3<float>, const std::vector<std::string>&, const bool);
|
||||
template void ClientABC<double>(int, char **, Routine3<double>, const std::vector<std::string>&, const bool);
|
||||
template void ClientABC<float2>(int, char **, Routine3<float2>, const std::vector<std::string>&, const bool);
|
||||
template void ClientABC<double2>(int, char **, Routine3<double2>, const std::vector<std::string>&, const bool);
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
|
|
|
@ -52,8 +52,11 @@ template <typename T>
|
|||
void ClientAXY(int argc, char *argv[], Routine3<T> client_routine,
|
||||
const std::vector<std::string> &options);
|
||||
template <typename T>
|
||||
void ClientAC(int argc, char *argv[], Routine2<T> client_routine,
|
||||
const std::vector<std::string> &options);
|
||||
template <typename T>
|
||||
void ClientABC(int argc, char *argv[], Routine3<T> client_routine,
|
||||
const std::vector<std::string> &options);
|
||||
const std::vector<std::string> &options, const bool symmetric);
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
|
|
94
test/performance/graphs/xsyr2k.r
Normal file
94
test/performance/graphs/xsyr2k.r
Normal file
|
@ -0,0 +1,94 @@
|
|||
|
||||
# ==================================================================================================
|
||||
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
# project uses a tab-size of two spaces and a max-width of 100 characters per line.
|
||||
#
|
||||
# Author(s):
|
||||
# Cedric Nugteren <www.cedricnugteren.nl>
|
||||
#
|
||||
# This file implements the performance script for the Xsyr2k routine
|
||||
#
|
||||
# ==================================================================================================
|
||||
|
||||
# Includes the common functions
|
||||
args <- commandArgs(trailingOnly = FALSE)
|
||||
thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)])))
|
||||
source(file.path(dirname(thisfile), "common.r"))
|
||||
|
||||
# ==================================================================================================
|
||||
|
||||
# Settings
|
||||
routine_name <- "xsyr2k"
|
||||
parameters <- c("-n","-k","-layout","-triangle","-transA",
|
||||
"-num_steps","-step","-runs","-precision")
|
||||
precision <- 32
|
||||
|
||||
# Sets the names of the test-cases
|
||||
test_names <- list(
|
||||
"multiples of 128",
|
||||
"multiples of 128 (+1)",
|
||||
"around n=k=512",
|
||||
"around n=k=1536",
|
||||
"layouts and transposing (n=k=1024)",
|
||||
"powers of 2"
|
||||
)
|
||||
|
||||
# Defines the test-cases
|
||||
test_values <- list(
|
||||
list(c(128, 128, 0, 0, 0, 16, 128, num_runs, precision)),
|
||||
list(c(129, 129, 0, 0, 0, 16, 128, num_runs, precision)),
|
||||
list(c(512, 512, 0, 0, 0, 16, 1, num_runs, precision)),
|
||||
list(c(1536, 1536, 0, 0, 0, 16, 1, num_runs, precision)),
|
||||
list(
|
||||
c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
|
||||
c(1024, 1024, 0, 0, 1, 1, 0, num_runs, precision),
|
||||
c(1024, 1024, 0, 1, 0, 1, 0, num_runs, precision),
|
||||
c(1024, 1024, 0, 1, 1, 1, 0, num_runs, precision),
|
||||
c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision),
|
||||
c(1024, 1024, 1, 0, 1, 1, 0, num_runs, precision),
|
||||
c(1024, 1024, 1, 1, 0, 1, 0, num_runs, precision),
|
||||
c(1024, 1024, 1, 1, 1, 1, 0, num_runs, precision)
|
||||
),
|
||||
list(
|
||||
c(8, 8, 0, 0, 0, 1, 0, num_runs, precision),
|
||||
c(16, 16, 0, 0, 0, 1, 0, num_runs, precision),
|
||||
c(32, 32, 0, 0, 0, 1, 0, num_runs, precision),
|
||||
c(64, 64, 0, 0, 0, 1, 0, num_runs, precision),
|
||||
c(128, 128, 0, 0, 0, 1, 0, num_runs, precision),
|
||||
c(256, 256, 0, 0, 0, 1, 0, num_runs, precision),
|
||||
c(512, 512, 0, 0, 0, 1, 0, num_runs, precision),
|
||||
c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
|
||||
c(2048, 2048, 0, 0, 0, 1, 0, num_runs, precision),
|
||||
c(4096, 4096, 0, 0, 0, 1, 0, num_runs, precision),
|
||||
c(8192, 8192, 0, 0, 0, 1, 0, num_runs, precision)
|
||||
)
|
||||
)
|
||||
|
||||
# Defines the x-labels corresponding to the test-cases
|
||||
test_xlabels <- list(
|
||||
"matrix sizes (n=k)",
|
||||
"matrix sizes (n=k)",
|
||||
"matrix sizes (n=k)",
|
||||
"matrix sizes (n=k)",
|
||||
"layout (row/col), triangle (u/l), transA (n/y)",
|
||||
"matrix sizes (n=k)"
|
||||
)
|
||||
|
||||
# Defines the x-axis of the test-cases
|
||||
test_xaxis <- list(
|
||||
c("n", ""),
|
||||
c("n", ""),
|
||||
c("n", ""),
|
||||
c("n", ""),
|
||||
list(1:8, c("row,u,n", "row,u,y", "row,l,n", "row,l,y",
|
||||
"col,u,n", "col,u,y", "col,l,n", "col,l,y")),
|
||||
c("n", "x")
|
||||
)
|
||||
|
||||
# ==================================================================================================
|
||||
|
||||
# Start the script
|
||||
main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values,
|
||||
test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE)
|
||||
|
||||
# ==================================================================================================
|
94
test/performance/graphs/xsyrk.r
Normal file
94
test/performance/graphs/xsyrk.r
Normal file
|
@ -0,0 +1,94 @@
|
|||
|
||||
# ==================================================================================================
|
||||
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
# project uses a tab-size of two spaces and a max-width of 100 characters per line.
|
||||
#
|
||||
# Author(s):
|
||||
# Cedric Nugteren <www.cedricnugteren.nl>
|
||||
#
|
||||
# This file implements the performance script for the Xsyrk routine
|
||||
#
|
||||
# ==================================================================================================
|
||||
|
||||
# Includes the common functions
|
||||
args <- commandArgs(trailingOnly = FALSE)
|
||||
thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)])))
|
||||
source(file.path(dirname(thisfile), "common.r"))
|
||||
|
||||
# ==================================================================================================
|
||||
|
||||
# Settings
|
||||
routine_name <- "xsyrk"
|
||||
parameters <- c("-n","-k","-layout","-triangle","-transA",
|
||||
"-num_steps","-step","-runs","-precision")
|
||||
precision <- 32
|
||||
|
||||
# Sets the names of the test-cases
|
||||
test_names <- list(
|
||||
"multiples of 128",
|
||||
"multiples of 128 (+1)",
|
||||
"around n=k=512",
|
||||
"around n=k=2048",
|
||||
"layouts and transposing (n=k=1024)",
|
||||
"powers of 2"
|
||||
)
|
||||
|
||||
# Defines the test-cases
|
||||
test_values <- list(
|
||||
list(c(128, 128, 0, 0, 0, 16, 128, num_runs, precision)),
|
||||
list(c(129, 129, 0, 0, 0, 16, 128, num_runs, precision)),
|
||||
list(c(512, 512, 0, 0, 0, 16, 1, num_runs, precision)),
|
||||
list(c(2048, 2048, 0, 0, 0, 16, 1, num_runs, precision)),
|
||||
list(
|
||||
c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
|
||||
c(1024, 1024, 0, 0, 1, 1, 0, num_runs, precision),
|
||||
c(1024, 1024, 0, 1, 0, 1, 0, num_runs, precision),
|
||||
c(1024, 1024, 0, 1, 1, 1, 0, num_runs, precision),
|
||||
c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision),
|
||||
c(1024, 1024, 1, 0, 1, 1, 0, num_runs, precision),
|
||||
c(1024, 1024, 1, 1, 0, 1, 0, num_runs, precision),
|
||||
c(1024, 1024, 1, 1, 1, 1, 0, num_runs, precision)
|
||||
),
|
||||
list(
|
||||
c(8, 8, 0, 0, 0, 1, 0, num_runs, precision),
|
||||
c(16, 16, 0, 0, 0, 1, 0, num_runs, precision),
|
||||
c(32, 32, 0, 0, 0, 1, 0, num_runs, precision),
|
||||
c(64, 64, 0, 0, 0, 1, 0, num_runs, precision),
|
||||
c(128, 128, 0, 0, 0, 1, 0, num_runs, precision),
|
||||
c(256, 256, 0, 0, 0, 1, 0, num_runs, precision),
|
||||
c(512, 512, 0, 0, 0, 1, 0, num_runs, precision),
|
||||
c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
|
||||
c(2048, 2048, 0, 0, 0, 1, 0, num_runs, precision),
|
||||
c(4096, 4096, 0, 0, 0, 1, 0, num_runs, precision),
|
||||
c(8192, 8192, 0, 0, 0, 1, 0, num_runs, precision)
|
||||
)
|
||||
)
|
||||
|
||||
# Defines the x-labels corresponding to the test-cases
|
||||
test_xlabels <- list(
|
||||
"matrix sizes (n=k)",
|
||||
"matrix sizes (n=k)",
|
||||
"matrix sizes (n=k)",
|
||||
"matrix sizes (n=k)",
|
||||
"layout (row/col), triangle (u/l), transA (n/y)",
|
||||
"matrix sizes (n=k)"
|
||||
)
|
||||
|
||||
# Defines the x-axis of the test-cases
|
||||
test_xaxis <- list(
|
||||
c("n", ""),
|
||||
c("n", ""),
|
||||
c("n", ""),
|
||||
c("n", ""),
|
||||
list(1:8, c("row,u,n", "row,u,y", "row,l,n", "row,l,y",
|
||||
"col,u,n", "col,u,y", "col,l,n", "col,l,y")),
|
||||
c("n", "x")
|
||||
)
|
||||
|
||||
# ==================================================================================================
|
||||
|
||||
# Start the script
|
||||
main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values,
|
||||
test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE)
|
||||
|
||||
# ==================================================================================================
|
|
@ -72,7 +72,7 @@ void PerformanceXgemm(const Arguments<T> &args,
|
|||
|
||||
// Prints the performance of both libraries
|
||||
const auto flops = 2 * args.m * args.n * args.k;
|
||||
const auto bytes = (args.m*args.k + args.k*args.n + args.m*args.n) * sizeof(T);
|
||||
const auto bytes = (args.m*args.k + args.k*args.n + 2*args.m*args.n) * sizeof(T);
|
||||
const auto output_ints = std::vector<size_t>{args.m, args.n, args.k,
|
||||
static_cast<size_t>(args.layout),
|
||||
static_cast<size_t>(args.a_transpose),
|
||||
|
@ -96,10 +96,10 @@ void ClientXgemm(int argc, char *argv[]) {
|
|||
kArgAlpha, kArgBeta};
|
||||
switch(GetPrecision(argc, argv)) {
|
||||
case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
||||
case Precision::kSingle: ClientABC<float>(argc, argv, PerformanceXgemm<float>, o); break;
|
||||
case Precision::kDouble: ClientABC<double>(argc, argv, PerformanceXgemm<double>, o); break;
|
||||
case Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode");
|
||||
case Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode");
|
||||
case Precision::kSingle: ClientABC<float>(argc, argv, PerformanceXgemm<float>, o, false); break;
|
||||
case Precision::kDouble: ClientABC<double>(argc, argv, PerformanceXgemm<double>, o, false); break;
|
||||
case Precision::kComplexSingle: ClientABC<float2>(argc, argv, PerformanceXgemm<float2>, o, false); break;
|
||||
case Precision::kComplexDouble: ClientABC<double2>(argc, argv, PerformanceXgemm<double2>, o, false); break;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -72,7 +72,7 @@ void PerformanceXsymm(const Arguments<T> &args,
|
|||
|
||||
// Prints the performance of both libraries
|
||||
const auto flops = 2 * args.m * args.n * args.m;
|
||||
const auto bytes = (args.m*args.m + args.m*args.n + args.m*args.n) * sizeof(T);
|
||||
const auto bytes = (args.m*args.m + args.m*args.n + 2*args.m*args.n) * sizeof(T);
|
||||
const auto output_ints = std::vector<size_t>{args.m, args.n,
|
||||
static_cast<size_t>(args.layout),
|
||||
static_cast<size_t>(args.triangle),
|
||||
|
@ -96,10 +96,10 @@ void ClientXsymm(int argc, char *argv[]) {
|
|||
kArgAlpha, kArgBeta};
|
||||
switch(GetPrecision(argc, argv)) {
|
||||
case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
||||
case Precision::kSingle: ClientABC<float>(argc, argv, PerformanceXsymm<float>, o); break;
|
||||
case Precision::kDouble: ClientABC<double>(argc, argv, PerformanceXsymm<double>, o); break;
|
||||
case Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode");
|
||||
case Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode");
|
||||
case Precision::kSingle: ClientABC<float>(argc, argv, PerformanceXsymm<float>, o, false); break;
|
||||
case Precision::kDouble: ClientABC<double>(argc, argv, PerformanceXsymm<double>, o, false); break;
|
||||
case Precision::kComplexSingle: ClientABC<float2>(argc, argv, PerformanceXsymm<float2>, o, false); break;
|
||||
case Precision::kComplexDouble: ClientABC<double2>(argc, argv, PerformanceXsymm<double2>, o, false); break;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
115
test/performance/routines/xsyr2k.cc
Normal file
115
test/performance/routines/xsyr2k.cc
Normal file
|
@ -0,0 +1,115 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file implements the Xsyr2k command-line interface tester.
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <exception>
|
||||
|
||||
#include "wrapper_clblas.h"
|
||||
#include "performance/client.h"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// The client, used for performance testing. It contains the function calls to CLBlast and to other
|
||||
// libraries to compare against.
|
||||
template <typename T>
|
||||
void PerformanceXsyr2k(const Arguments<T> &args,
|
||||
const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
|
||||
CommandQueue &queue) {
|
||||
|
||||
// Creates the CLBlast lambda
|
||||
auto clblast_lambda = [&args, &a_mat, &b_mat, &c_mat, &queue]() {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Syr2k(args.layout, args.triangle, args.a_transpose,
|
||||
args.n, args.k,
|
||||
args.alpha,
|
||||
a_mat(), args.a_offset, args.a_ld,
|
||||
b_mat(), args.b_offset, args.b_ld,
|
||||
args.beta,
|
||||
c_mat(), args.c_offset, args.c_ld,
|
||||
&queue_plain, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
if (status != StatusCode::kSuccess) {
|
||||
throw std::runtime_error("CLBlast error: "+ToString(static_cast<int>(status)));
|
||||
}
|
||||
};
|
||||
|
||||
// Creates the clBLAS lambda (for comparison)
|
||||
auto clblas_lambda = [&args, &a_mat, &b_mat, &c_mat, &queue]() {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXsyr2k(static_cast<clblasOrder>(args.layout),
|
||||
static_cast<clblasUplo>(args.triangle),
|
||||
static_cast<clblasTranspose>(args.a_transpose),
|
||||
args.n, args.k,
|
||||
args.alpha,
|
||||
a_mat(), args.a_offset, args.a_ld,
|
||||
b_mat(), args.b_offset, args.b_ld,
|
||||
args.beta,
|
||||
c_mat(), args.c_offset, args.c_ld,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
if (status != CL_SUCCESS) {
|
||||
throw std::runtime_error("clBLAS error: "+ToString(static_cast<int>(status)));
|
||||
}
|
||||
};
|
||||
|
||||
// Runs the routines and collect the timings
|
||||
auto ms_clblast = TimedExecution(args.num_runs, clblast_lambda);
|
||||
auto ms_clblas = TimedExecution(args.num_runs, clblas_lambda);
|
||||
|
||||
// Prints the performance of both libraries
|
||||
const auto flops = 2 * args.n * args.n * args.k;
|
||||
const auto bytes = (args.n*args.k + args.n*args.n) * sizeof(T);
|
||||
const auto output_ints = std::vector<size_t>{args.n, args.k,
|
||||
static_cast<size_t>(args.layout),
|
||||
static_cast<size_t>(args.triangle),
|
||||
static_cast<size_t>(args.a_transpose),
|
||||
args.a_ld, args.b_ld, args.c_ld,
|
||||
args.a_offset, args.b_offset, args.c_offset};
|
||||
const auto output_strings = std::vector<std::string>{ToString(args.alpha),
|
||||
ToString(args.beta)};
|
||||
PrintTableRow(output_ints, output_strings, args.no_abbrv,
|
||||
ms_clblast, ms_clblas, flops, bytes);
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Main function which calls the common client code with the routine-specific function as argument.
|
||||
void ClientXsyr2k(int argc, char *argv[]) {
|
||||
const auto o = std::vector<std::string>{kArgN, kArgK,
|
||||
kArgLayout, kArgTriangle, kArgATransp,
|
||||
kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
|
||||
kArgAOffset, kArgBOffset, kArgCOffset,
|
||||
kArgAlpha, kArgBeta};
|
||||
switch(GetPrecision(argc, argv)) {
|
||||
case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
||||
case Precision::kSingle: ClientABC<float>(argc, argv, PerformanceXsyr2k<float>, o, true); break;
|
||||
case Precision::kDouble: ClientABC<double>(argc, argv, PerformanceXsyr2k<double>, o, true); break;
|
||||
case Precision::kComplexSingle: ClientABC<float2>(argc, argv, PerformanceXsyr2k<float2>, o, true); break;
|
||||
case Precision::kComplexDouble: ClientABC<double2>(argc, argv, PerformanceXsyr2k<double2>, o, true); break;
|
||||
}
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
||||
|
||||
// Main function (not within the clblast namespace)
|
||||
int main(int argc, char *argv[]) {
|
||||
clblast::ClientXsyr2k(argc, argv);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// =================================================================================================
|
113
test/performance/routines/xsyrk.cc
Normal file
113
test/performance/routines/xsyrk.cc
Normal file
|
@ -0,0 +1,113 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file implements the Xsyrk command-line interface tester.
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <exception>
|
||||
|
||||
#include "wrapper_clblas.h"
|
||||
#include "performance/client.h"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// The client, used for performance testing. It contains the function calls to CLBlast and to other
|
||||
// libraries to compare against.
|
||||
template <typename T>
|
||||
void PerformanceXsyrk(const Arguments<T> &args,
|
||||
const Buffer &a_mat, const Buffer &c_mat,
|
||||
CommandQueue &queue) {
|
||||
|
||||
// Creates the CLBlast lambda
|
||||
auto clblast_lambda = [&args, &a_mat, &c_mat, &queue]() {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Syrk(args.layout, args.triangle, args.a_transpose,
|
||||
args.n, args.k,
|
||||
args.alpha,
|
||||
a_mat(), args.a_offset, args.a_ld,
|
||||
args.beta,
|
||||
c_mat(), args.c_offset, args.c_ld,
|
||||
&queue_plain, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
if (status != StatusCode::kSuccess) {
|
||||
throw std::runtime_error("CLBlast error: "+ToString(static_cast<int>(status)));
|
||||
}
|
||||
};
|
||||
|
||||
// Creates the clBLAS lambda (for comparison)
|
||||
auto clblas_lambda = [&args, &a_mat, &c_mat, &queue]() {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXsyrk(static_cast<clblasOrder>(args.layout),
|
||||
static_cast<clblasUplo>(args.triangle),
|
||||
static_cast<clblasTranspose>(args.a_transpose),
|
||||
args.n, args.k,
|
||||
args.alpha,
|
||||
a_mat(), args.a_offset, args.a_ld,
|
||||
args.beta,
|
||||
c_mat(), args.c_offset, args.c_ld,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
if (status != CL_SUCCESS) {
|
||||
throw std::runtime_error("clBLAS error: "+ToString(static_cast<int>(status)));
|
||||
}
|
||||
};
|
||||
|
||||
// Runs the routines and collect the timings
|
||||
auto ms_clblast = TimedExecution(args.num_runs, clblast_lambda);
|
||||
auto ms_clblas = TimedExecution(args.num_runs, clblas_lambda);
|
||||
|
||||
// Prints the performance of both libraries
|
||||
const auto flops = args.n * args.n * args.k;
|
||||
const auto bytes = (args.n*args.k + args.n*args.n) * sizeof(T);
|
||||
const auto output_ints = std::vector<size_t>{args.n, args.k,
|
||||
static_cast<size_t>(args.layout),
|
||||
static_cast<size_t>(args.triangle),
|
||||
static_cast<size_t>(args.a_transpose),
|
||||
args.a_ld, args.c_ld,
|
||||
args.a_offset, args.c_offset};
|
||||
const auto output_strings = std::vector<std::string>{ToString(args.alpha),
|
||||
ToString(args.beta)};
|
||||
PrintTableRow(output_ints, output_strings, args.no_abbrv,
|
||||
ms_clblast, ms_clblas, flops, bytes);
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Main function which calls the common client code with the routine-specific function as argument.
|
||||
void ClientXsyrk(int argc, char *argv[]) {
|
||||
const auto o = std::vector<std::string>{kArgN, kArgK,
|
||||
kArgLayout, kArgTriangle, kArgATransp,
|
||||
kArgALeadDim, kArgCLeadDim,
|
||||
kArgAOffset, kArgCOffset,
|
||||
kArgAlpha, kArgBeta};
|
||||
switch(GetPrecision(argc, argv)) {
|
||||
case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
||||
case Precision::kSingle: ClientAC<float>(argc, argv, PerformanceXsyrk<float>, o); break;
|
||||
case Precision::kDouble: ClientAC<double>(argc, argv, PerformanceXsyrk<double>, o); break;
|
||||
case Precision::kComplexSingle: ClientAC<float2>(argc, argv, PerformanceXsyrk<float2>, o); break;
|
||||
case Precision::kComplexDouble: ClientAC<double2>(argc, argv, PerformanceXsyrk<double2>, o); break;
|
||||
}
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
||||
|
||||
// Main function (not within the clblast namespace)
|
||||
int main(int argc, char *argv[]) {
|
||||
clblast::ClientXsyrk(argc, argv);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// =================================================================================================
|
|
@ -201,7 +201,7 @@ clblasStatus clblasXgemm(
|
|||
num_queues, queues, num_wait_events, wait_events, events);
|
||||
}
|
||||
|
||||
// This calls {clblasSsymm, clblasDsymm} with the arguments forwarded.
|
||||
// This calls {clblasSsymm, clblasDsymm, clblasCsymm, clblasZsymm} with the arguments forwarded.
|
||||
clblasStatus clblasXsymm(
|
||||
clblasOrder layout, clblasSide side, clblasUplo triangle,
|
||||
size_t m, size_t n, float alpha,
|
||||
|
@ -267,6 +267,130 @@ clblasStatus clblasXsymm(
|
|||
num_queues, queues, num_wait_events, wait_events, events);
|
||||
}
|
||||
|
||||
// This calls {clblasSsyrk, clblasDsyrk, clblasCsyrk, clblasZsyrk} with the arguments forwarded.
|
||||
clblasStatus clblasXsyrk(
|
||||
clblasOrder layout, clblasUplo triangle, clblasTranspose tran_a,
|
||||
size_t n, size_t k, float alpha,
|
||||
const cl_mem a_mat, size_t a_offset, size_t a_ld, float beta,
|
||||
cl_mem c_mat, size_t c_offset, size_t c_ld,
|
||||
cl_uint num_queues, cl_command_queue *queues,
|
||||
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
|
||||
return clblasSsyrk(layout, triangle, tran_a,
|
||||
n, k, alpha,
|
||||
a_mat, a_offset, a_ld, beta,
|
||||
c_mat, c_offset, c_ld,
|
||||
num_queues, queues, num_wait_events, wait_events, events);
|
||||
}
|
||||
clblasStatus clblasXsyrk(
|
||||
clblasOrder layout, clblasUplo triangle, clblasTranspose tran_a,
|
||||
size_t n, size_t k, double alpha,
|
||||
const cl_mem a_mat, size_t a_offset, size_t a_ld, double beta,
|
||||
cl_mem c_mat, size_t c_offset, size_t c_ld,
|
||||
cl_uint num_queues, cl_command_queue *queues,
|
||||
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
|
||||
return clblasDsyrk(layout, triangle, tran_a,
|
||||
n, k, alpha,
|
||||
a_mat, a_offset, a_ld, beta,
|
||||
c_mat, c_offset, c_ld,
|
||||
num_queues, queues, num_wait_events, wait_events, events);
|
||||
}
|
||||
clblasStatus clblasXsyrk(
|
||||
clblasOrder layout, clblasUplo triangle, clblasTranspose tran_a,
|
||||
size_t n, size_t k, float2 alpha,
|
||||
const cl_mem a_mat, size_t a_offset, size_t a_ld, float2 beta,
|
||||
cl_mem c_mat, size_t c_offset, size_t c_ld,
|
||||
cl_uint num_queues, cl_command_queue *queues,
|
||||
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
|
||||
auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}};
|
||||
auto cl_beta = cl_float2{{beta.real(), beta.imag()}};
|
||||
return clblasCsyrk(layout, triangle, tran_a,
|
||||
n, k, cl_alpha,
|
||||
a_mat, a_offset, a_ld, cl_beta,
|
||||
c_mat, c_offset, c_ld,
|
||||
num_queues, queues, num_wait_events, wait_events, events);
|
||||
}
|
||||
clblasStatus clblasXsyrk(
|
||||
clblasOrder layout, clblasUplo triangle, clblasTranspose tran_a,
|
||||
size_t n, size_t k, double2 alpha,
|
||||
const cl_mem a_mat, size_t a_offset, size_t a_ld, double2 beta,
|
||||
cl_mem c_mat, size_t c_offset, size_t c_ld,
|
||||
cl_uint num_queues, cl_command_queue *queues,
|
||||
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
|
||||
auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}};
|
||||
auto cl_beta = cl_double2{{beta.real(), beta.imag()}};
|
||||
return clblasZsyrk(layout, triangle, tran_a,
|
||||
n, k, cl_alpha,
|
||||
a_mat, a_offset, a_ld, cl_beta,
|
||||
c_mat, c_offset, c_ld,
|
||||
num_queues, queues, num_wait_events, wait_events, events);
|
||||
}
|
||||
|
||||
// This calls {clblasSsyr2k, clblasDsyr2k, clblasCsyr2k, clblasZsyr2k} with the arguments forwarded.
|
||||
clblasStatus clblasXsyr2k(
|
||||
clblasOrder layout, clblasUplo triangle, clblasTranspose tran_ab,
|
||||
size_t n, size_t k, float alpha,
|
||||
const cl_mem a_mat, size_t a_offset, size_t a_ld,
|
||||
const cl_mem b_mat, size_t b_offset, size_t b_ld, float beta,
|
||||
cl_mem c_mat, size_t c_offset, size_t c_ld,
|
||||
cl_uint num_queues, cl_command_queue *queues,
|
||||
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
|
||||
return clblasSsyr2k(layout, triangle, tran_ab,
|
||||
n, k, alpha,
|
||||
a_mat, a_offset, a_ld,
|
||||
b_mat, b_offset, b_ld, beta,
|
||||
c_mat, c_offset, c_ld,
|
||||
num_queues, queues, num_wait_events, wait_events, events);
|
||||
}
|
||||
clblasStatus clblasXsyr2k(
|
||||
clblasOrder layout, clblasUplo triangle, clblasTranspose tran_ab,
|
||||
size_t n, size_t k, double alpha,
|
||||
const cl_mem a_mat, size_t a_offset, size_t a_ld,
|
||||
const cl_mem b_mat, size_t b_offset, size_t b_ld, double beta,
|
||||
cl_mem c_mat, size_t c_offset, size_t c_ld,
|
||||
cl_uint num_queues, cl_command_queue *queues,
|
||||
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
|
||||
return clblasDsyr2k(layout, triangle, tran_ab,
|
||||
n, k, alpha,
|
||||
a_mat, a_offset, a_ld,
|
||||
b_mat, b_offset, b_ld, beta,
|
||||
c_mat, c_offset, c_ld,
|
||||
num_queues, queues, num_wait_events, wait_events, events);
|
||||
}
|
||||
clblasStatus clblasXsyr2k(
|
||||
clblasOrder layout, clblasUplo triangle, clblasTranspose tran_ab,
|
||||
size_t n, size_t k, float2 alpha,
|
||||
const cl_mem a_mat, size_t a_offset, size_t a_ld,
|
||||
const cl_mem b_mat, size_t b_offset, size_t b_ld, float2 beta,
|
||||
cl_mem c_mat, size_t c_offset, size_t c_ld,
|
||||
cl_uint num_queues, cl_command_queue *queues,
|
||||
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
|
||||
auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}};
|
||||
auto cl_beta = cl_float2{{beta.real(), beta.imag()}};
|
||||
return clblasCsyr2k(layout, triangle, tran_ab,
|
||||
n, k, cl_alpha,
|
||||
a_mat, a_offset, a_ld,
|
||||
b_mat, b_offset, b_ld, cl_beta,
|
||||
c_mat, c_offset, c_ld,
|
||||
num_queues, queues, num_wait_events, wait_events, events);
|
||||
}
|
||||
clblasStatus clblasXsyr2k(
|
||||
clblasOrder layout, clblasUplo triangle, clblasTranspose tran_ab,
|
||||
size_t n, size_t k, double2 alpha,
|
||||
const cl_mem a_mat, size_t a_offset, size_t a_ld,
|
||||
const cl_mem b_mat, size_t b_offset, size_t b_ld, double2 beta,
|
||||
cl_mem c_mat, size_t c_offset, size_t c_ld,
|
||||
cl_uint num_queues, cl_command_queue *queues,
|
||||
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
|
||||
auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}};
|
||||
auto cl_beta = cl_double2{{beta.real(), beta.imag()}};
|
||||
return clblasZsyr2k(layout, triangle, tran_ab,
|
||||
n, k, cl_alpha,
|
||||
a_mat, a_offset, a_ld,
|
||||
b_mat, b_offset, b_ld, cl_beta,
|
||||
c_mat, c_offset, c_ld,
|
||||
num_queues, queues, num_wait_events, wait_events, events);
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
||||
|
||||
|
|
Loading…
Reference in a new issue