Merge pull request #9 from CNugteren/level3_routines

Added SYRK and SYR2K level-3 routines
This commit is contained in:
Cedric Nugteren 2015-06-26 20:56:21 +02:00
commit 77e2157485
37 changed files with 2263 additions and 193 deletions

View file

@ -1,4 +1,9 @@
Development version (next release)
- Added level-3 routines:
* SSYRK/DSYRK/CSYRK/ZSYRK
* SSYR2K/DSYR2K/CSYR2K/ZSYR2K
Version 0.2.0
- Added support for complex conjugate transpose
- Several host-code performance improvements

View file

@ -97,8 +97,10 @@ set(KERNELS copy pad transpose padtranspose xaxpy xgemv xgemm)
set(SAMPLE_PROGRAMS sgemm)
set(ROUTINES_XY xaxpy)
set(ROUTINES_AXY xgemv)
set(ROUTINES_ABC xgemm xsymm)
set(ROUTINES ${ROUTINES_XY} ${ROUTINES_AXY} ${ROUTINES_ABC})
set(ROUTINES_ABC xgemm xsymm xsyr2k)
set(ROUTINES_AB )
set(ROUTINES_AC xsyrk)
set(ROUTINES ${ROUTINES_XY} ${ROUTINES_AXY} ${ROUTINES_ABC} ${ROUTINES_AB} ${ROUTINES_AC})
# ==================================================================================================
@ -172,6 +174,8 @@ if(TESTS)
add_library(test_correctness_xy OBJECT test/correctness/testxy.cc)
add_library(test_correctness_axy OBJECT test/correctness/testaxy.cc)
add_library(test_correctness_abc OBJECT test/correctness/testabc.cc)
add_library(test_correctness_ab OBJECT test/correctness/testab.cc)
add_library(test_correctness_ac OBJECT test/correctness/testac.cc)
# Compiles the correctness-tests
foreach(ROUTINE ${ROUTINES_XY})
@ -179,22 +183,32 @@ if(TESTS)
$<TARGET_OBJECTS:test_correctness_common>
$<TARGET_OBJECTS:test_correctness_xy>
test/correctness/routines/${ROUTINE}.cc)
target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
install(TARGETS test_${ROUTINE} DESTINATION bin)
endforeach()
foreach(ROUTINE ${ROUTINES_AXY})
add_executable(test_${ROUTINE}
$<TARGET_OBJECTS:test_correctness_common>
$<TARGET_OBJECTS:test_correctness_axy>
test/correctness/routines/${ROUTINE}.cc)
target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
install(TARGETS test_${ROUTINE} DESTINATION bin)
endforeach()
foreach(ROUTINE ${ROUTINES_ABC})
add_executable(test_${ROUTINE}
$<TARGET_OBJECTS:test_correctness_common>
$<TARGET_OBJECTS:test_correctness_abc>
test/correctness/routines/${ROUTINE}.cc)
endforeach()
foreach(ROUTINE ${ROUTINES_AB})
add_executable(test_${ROUTINE}
$<TARGET_OBJECTS:test_correctness_common>
$<TARGET_OBJECTS:test_correctness_ab>
test/correctness/routines/${ROUTINE}.cc)
endforeach()
foreach(ROUTINE ${ROUTINES_AC})
add_executable(test_${ROUTINE}
$<TARGET_OBJECTS:test_correctness_common>
$<TARGET_OBJECTS:test_correctness_ac>
test/correctness/routines/${ROUTINE}.cc)
endforeach()
foreach(ROUTINE ${ROUTINES})
target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
install(TARGETS test_${ROUTINE} DESTINATION bin)
endforeach()

View file

@ -124,7 +124,7 @@ These graphs can be generated automatically on your own device. First, compile C
Supported routines
-------------
CLBlast is in active development and currently does not support the full set of BLAS routines. The currently supported routines are marked with `x` in the following tables:
CLBlast is in active development and currently does not support the full set of BLAS routines. The currently supported routines are marked with '✔' in the following tables:
| Level-1 | S | D | C | Z | Notes |
| ---------|---|---|---|---|---------|
@ -135,7 +135,7 @@ CLBlast is in active development and currently does not support the full set of
| xSWAP | | | | | |
| xSCAL | | | | | +CS +ZD |
| xCOPY | | | | | |
| xAXPY |`x`|`x`|`x`|`x`| |
| xAXPY | ✔ | ✔ | ✔ | ✔ | |
| xDOT | | | - | - | +DS |
| xDOTU | - | - | | | |
| xDOTC | - | - | | | |
@ -147,7 +147,7 @@ CLBlast is in active development and currently does not support the full set of
| Level-2 | S | D | C | Z | Notes |
| ---------|---|---|---|---|---------|
| xGEMV |`x`|`x`|`x`|`x`| |
| xGEMV | ✔ | ✔ | ✔ | ✔ | |
| xGBMV | | | | | |
| xHEMV | - | - | | | |
| xHBMV | - | - | | | |
@ -175,12 +175,12 @@ CLBlast is in active development and currently does not support the full set of
| Level-3 | S | D | C | Z | Notes |
| ---------|---|---|---|---|---------|
| xGEMM |`x`|`x`|`x`|`x`| |
| xSYMM |`x`|`x`|`x`|`x`| |
| xGEMM | ✔ | ✔ | ✔ | ✔ | |
| xSYMM | ✔ | ✔ | ✔ | ✔ | |
| xHEMM | - | - | | | |
| xSYRK | | | | | |
| xSYRK | ✔ | ✔ | ✔ | ✔ | |
| xHERK | - | - | | | |
| xSYR2K | | | | | |
| xSYR2K | ✔ | ✔ | ✔ | ✔ | |
| xHER2K | - | - | | | |
| xTRMM | | | | | |
| xTRSM | | | | | |

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -107,7 +107,7 @@ StatusCode Gemv(const Layout layout, const Transpose transpose_a,
// =================================================================================================
// BLAS level-3 (matrix-matrix) routines
// Templated-precision generalized matrix-matrix multiplication: SGEMM/DGEMM
// Templated-precision generalized matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM
template <typename T>
StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpose transpose_b,
const size_t m, const size_t n, const size_t k,
@ -118,7 +118,7 @@ StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpos
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
// Templated-precision symmetric matrix-matrix multiplication: SSYMM/DSYMM
// Templated-precision symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM
template <typename T>
StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
const size_t m, const size_t n,
@ -129,6 +129,27 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
// Templated-precision rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK
template <typename T>
StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose transpose_a,
const size_t n, const size_t k,
const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
// Templated-precision rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K
template <typename T>
StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose transpose_ab,
const size_t n, const size_t k,
const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
// =================================================================================================
} // namespace clblast

View file

@ -92,7 +92,8 @@ class Routine {
const size_t dest_ld, const size_t dest_offset,
const Buffer &dest,
const bool do_transpose, const bool do_conjugate,
const bool pad, const Program &program);
const bool pad, const bool upper, const bool lower,
const Program &program);
// Queries the cache and retrieve either a matching program or a boolean whether a match exists.
// The first assumes that the program is available in the cache and will throw an exception

View file

@ -0,0 +1,48 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xsyr2k routine. The precision is implemented using a template argument.
// The implementation is very similar to Xsyrk (see header for details), except for the fact that
// the main XgemmUpper/XgemmLower kernel is called twice: C = AB^T + C and C = BA^T + C.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XSYR2K_H_
#define CLBLAST_ROUTINES_XSYR2K_H_
#include "internal/routine.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class Xsyr2k: public Routine {
public:
Xsyr2k(CommandQueue &queue, Event &event);
// Templated-precision implementation of the routine
StatusCode DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
const size_t n, const size_t k,
const T alpha,
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
private:
// Static variable to get the precision
const static Precision precision_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XSYR2K_H_
#endif

View file

@ -0,0 +1,49 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xsyrk routine. The precision is implemented using a template argument.
// The implementation is based on the regular Xgemm routine and kernel, but with two main changes:
// 1) The final unpad(transpose) kernel updates only the upper/lower triangular part.
// 2) The main Xgemm kernel masks workgroups not contributing to usefull data. This is only for
// performance reasons, as the actual masking is done later (see the first point).
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XSYRK_H_
#define CLBLAST_ROUTINES_XSYRK_H_
#include "internal/routine.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class Xsyrk: public Routine {
public:
Xsyrk(CommandQueue &queue, Event &event);
// Templated-precision implementation of the routine
StatusCode DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
const size_t n, const size_t k,
const T alpha,
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
const T beta,
const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
private:
// Static variable to get the precision
const static Precision precision_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XSYRK_H_
#endif

View file

@ -26,6 +26,8 @@
// BLAS level-3 includes
#include "internal/routines/xgemm.h"
#include "internal/routines/xsymm.h"
#include "internal/routines/xsyrk.h"
#include "internal/routines/xsyr2k.h"
namespace clblast {
// =================================================================================================
@ -209,7 +211,7 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
std::string kernel_source =
#include "kernels/xgemm.opencl"
auto status = routine.SetUp(common_source1 + common_source2 + common_source3 + common_source4 +
kernel_source);
kernel_source);
if (status != StatusCode::kSuccess) { return status; }
// Runs the routine
@ -243,5 +245,119 @@ template StatusCode Symm<double2>(const Layout, const Side, const Triangle,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// =================================================================================================
// SYRK
template <typename T>
StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose transpose_a,
const size_t n, const size_t k, const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = CommandQueue(*queue);
auto event_cpp = Event(*event);
auto routine = Xsyrk<T>(queue_cpp, event_cpp);
// Loads the kernel source-code as an include (C++11 raw string literal)
std::string common_source1 =
#include "kernels/copy.opencl"
std::string common_source2 =
#include "kernels/pad.opencl"
std::string common_source3 =
#include "kernels/transpose.opencl"
std::string common_source4 =
#include "kernels/padtranspose.opencl"
std::string kernel_source =
#include "kernels/xgemm.opencl"
auto status = routine.SetUp(common_source1 + common_source2 + common_source3 + common_source4 +
kernel_source);
if (status != StatusCode::kSuccess) { return status; }
// Runs the routine
return routine.DoSyrk(layout, triangle, transpose_a, n, k, alpha,
Buffer(a_buffer), a_offset, a_ld, beta,
Buffer(c_buffer), c_offset, c_ld);
}
template StatusCode Syrk<float>(const Layout, const Triangle, const Transpose,
const size_t, const size_t, const float,
const cl_mem, const size_t, const size_t, const float,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Syrk<double>(const Layout, const Triangle, const Transpose,
const size_t, const size_t, const double,
const cl_mem, const size_t, const size_t, const double,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Syrk<float2>(const Layout, const Triangle, const Transpose,
const size_t, const size_t, const float2,
const cl_mem, const size_t, const size_t, const float2,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Syrk<double2>(const Layout, const Triangle, const Transpose,
const size_t, const size_t, const double2,
const cl_mem, const size_t, const size_t, const double2,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// =================================================================================================
// SYR2K
template <typename T>
StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose transpose_ab,
const size_t n, const size_t k, const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = CommandQueue(*queue);
auto event_cpp = Event(*event);
auto routine = Xsyr2k<T>(queue_cpp, event_cpp);
// Loads the kernel source-code as an include (C++11 raw string literal)
std::string common_source1 =
#include "kernels/copy.opencl"
std::string common_source2 =
#include "kernels/pad.opencl"
std::string common_source3 =
#include "kernels/transpose.opencl"
std::string common_source4 =
#include "kernels/padtranspose.opencl"
std::string kernel_source =
#include "kernels/xgemm.opencl"
auto status = routine.SetUp(common_source1 + common_source2 + common_source3 + common_source4 +
kernel_source);
if (status != StatusCode::kSuccess) { return status; }
// Runs the routine
return routine.DoSyr2k(layout, triangle, transpose_ab, n, k, alpha,
Buffer(a_buffer), a_offset, a_ld,
Buffer(b_buffer), b_offset, b_ld, beta,
Buffer(c_buffer), c_offset, c_ld);
}
template StatusCode Syr2k<float>(const Layout, const Triangle, const Transpose,
const size_t, const size_t, const float,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t, const float,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Syr2k<double>(const Layout, const Triangle, const Transpose,
const size_t, const size_t, const double,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t, const double,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Syr2k<float2>(const Layout, const Triangle, const Transpose,
const size_t, const size_t, const float2,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t, const float2,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Syr2k<double2>(const Layout, const Triangle, const Transpose,
const size_t, const size_t, const double2,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t, const double2,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// =================================================================================================
} // namespace clblast

View file

@ -86,7 +86,8 @@ __kernel void UnPadMatrix(const int src_one, const int src_two,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest) {
__global real* dest,
const int upper, const int lower) {
// Loops over the work per thread in both dimensions
#pragma unroll
@ -95,11 +96,18 @@ __kernel void UnPadMatrix(const int src_one, const int src_two,
#pragma unroll
for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
if (id_two < dest_two && id_one < dest_one) {
// Masking in case of triangular matrices: updates only the upper or lower part
bool condition = true;
if (upper == 1) { condition = (id_two >= id_one); }
else if (lower == 1) { condition = (id_two <= id_one); }
if (condition) {
// Copies the value into the destination matrix. This is always within bounds of the source
// matrix, as we know that the destination matrix is smaller than the source.
dest[id_two*dest_ld + id_one + dest_offset] = src[id_two*src_ld + id_one + src_offset];
if (id_two < dest_two && id_one < dest_one) {
dest[id_two*dest_ld + id_one + dest_offset] = src[id_two*src_ld + id_one + src_offset];
}
}
}
}

View file

@ -100,7 +100,8 @@ __kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest) {
__global real* dest,
const int upper, const int lower) {
// Local memory to store a tile of the matrix (for coalescing)
__local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
@ -137,10 +138,17 @@ __kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
const int id_dest_one = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(0);
const int id_dest_two = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(1);
// Stores the transposed value in the destination matrix
if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
// Masking in case of triangular matrices: updates only the upper or lower part
bool condition = true;
if (upper == 1) { condition = (id_dest_one >= id_dest_two); }
else if (lower == 1) { condition = (id_dest_one <= id_dest_two); }
if (condition) {
// Stores the transposed value in the destination matrix
if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
}
}
}
}

View file

@ -127,6 +127,55 @@ R"(
// =================================================================================================
// Initializes the accumulation registers to zero
inline void InitAccRegisters(realM cpm[NWI][MWI/VWM]) {
#pragma unroll
for (int mi=0; mi<MWI/VWM; ++mi) {
#pragma unroll
for (int ni=0; ni<NWI; ++ni) {
#if VWM == 1
SetToZero(cpm[ni][mi]);
#elif VWM == 2
SetToZero(cpm[ni][mi].x);
SetToZero(cpm[ni][mi].y);
#elif VWM == 4
SetToZero(cpm[ni][mi].x);
SetToZero(cpm[ni][mi].y);
SetToZero(cpm[ni][mi].z);
SetToZero(cpm[ni][mi].w);
#elif VWM == 8
SetToZero(cpm[ni][mi].s0);
SetToZero(cpm[ni][mi].s1);
SetToZero(cpm[ni][mi].s2);
SetToZero(cpm[ni][mi].s3);
SetToZero(cpm[ni][mi].s4);
SetToZero(cpm[ni][mi].s5);
SetToZero(cpm[ni][mi].s6);
SetToZero(cpm[ni][mi].s7);
#elif VWM == 16
SetToZero(cpm[ni][mi].s0);
SetToZero(cpm[ni][mi].s1);
SetToZero(cpm[ni][mi].s2);
SetToZero(cpm[ni][mi].s3);
SetToZero(cpm[ni][mi].s4);
SetToZero(cpm[ni][mi].s5);
SetToZero(cpm[ni][mi].s6);
SetToZero(cpm[ni][mi].s7);
SetToZero(cpm[ni][mi].s8);
SetToZero(cpm[ni][mi].s9);
SetToZero(cpm[ni][mi].sA);
SetToZero(cpm[ni][mi].sB);
SetToZero(cpm[ni][mi].sC);
SetToZero(cpm[ni][mi].sD);
SetToZero(cpm[ni][mi].sE);
SetToZero(cpm[ni][mi].sF);
#endif
}
}
}
// =================================================================================================
// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
// caching the A input matrix.
#if SA == 1
@ -272,71 +321,6 @@ inline void LocalToPrivateB(__local realN* blm, realN bpm[NWI/VWN], const int kg
// =================================================================================================
// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication
// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int kSizeM,
const real alpha, const real beta) {
#pragma unroll
for (int ni=0; ni<NWI; ++ni) {
#pragma unroll
for (int mi=0; mi<MWI/VWM; ++mi) {
#if STRM == 0
int mg = mi + get_local_id(0)*(MWI/VWM);
#elif STRM == 1
int mg = get_local_id(0) + mi*MDIMC;
#endif
#if STRN == 0
int ng = ni + get_local_id(1)*NWI;
#elif STRN == 1
int ng = ni%VWN + get_local_id(1)*VWN + (ni/VWN)*VWN*NDIMC;
#endif
int idm = mg + get_group_id(0)*(MWG/VWM);
int idn = ng + get_group_id(1)*NWG;
int index = idn*(kSizeM/VWM) + idm;
realM cval = cgm[index];
#if VWM == 1
AXPBY(cgm[index], alpha, cpm[ni][mi], beta, cval);
#elif VWM == 2
AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);
AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
#elif VWM == 4
AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);
AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
AXPBY(cgm[index].z, alpha, cpm[ni][mi].z, beta, cval.z);
AXPBY(cgm[index].w, alpha, cpm[ni][mi].w, beta, cval.w);
#elif VWM == 8
AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
#elif VWM == 16
AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
AXPBY(cgm[index].s8, alpha, cpm[ni][mi].s8, beta, cval.s8);
AXPBY(cgm[index].s9, alpha, cpm[ni][mi].s9, beta, cval.s9);
AXPBY(cgm[index].sA, alpha, cpm[ni][mi].sA, beta, cval.sA);
AXPBY(cgm[index].sB, alpha, cpm[ni][mi].sB, beta, cval.sB);
AXPBY(cgm[index].sC, alpha, cpm[ni][mi].sC, beta, cval.sC);
AXPBY(cgm[index].sD, alpha, cpm[ni][mi].sD, beta, cval.sD);
AXPBY(cgm[index].sE, alpha, cpm[ni][mi].sE, beta, cval.sE);
AXPBY(cgm[index].sF, alpha, cpm[ni][mi].sF, beta, cval.sF);
#endif
}
}
}
// =================================================================================================
// The vectorised multiply-add function
inline realM MultiplyAddVector(realM cvec, const realM avec, const real bval) {
#if USE_VECTOR_MAD == 1
@ -432,77 +416,97 @@ inline void MultiplyAccumulate(realM cpm[NWI][MWI/VWM], realM apm[MWI/VWM], real
// =================================================================================================
// Main entry of the kernel. This function contains the basic skeleton, the functionality is
// provided by the inlined functions above
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
const real alpha, const real beta,
const __global realM* restrict agm,
const __global realN* restrict bgm,
__global realM* cgm) {
// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication
// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int kSizeM,
const real alpha, const real beta) {
#pragma unroll
for (int ni=0; ni<NWI; ++ni) {
#pragma unroll
for (int mi=0; mi<MWI/VWM; ++mi) {
#if STRM == 0
int mg = mi + get_local_id(0)*(MWI/VWM);
#elif STRM == 1
int mg = get_local_id(0) + mi*MDIMC;
#endif
#if STRN == 0
int ng = ni + get_local_id(1)*NWI;
#elif STRN == 1
int ng = ni%VWN + get_local_id(1)*VWN + (ni/VWN)*VWN*NDIMC;
#endif
int idm = mg + get_group_id(0)*(MWG/VWM);
int idn = ng + get_group_id(1)*NWG;
// Combined thread identifier
// The final multiplication with alpha and the addition with beta*C
int index = idn*(kSizeM/VWM) + idm;
realM cval = cgm[index];
#if VWM == 1
AXPBY(cgm[index], alpha, cpm[ni][mi], beta, cval);
#elif VWM == 2
AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);
AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
#elif VWM == 4
AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);
AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
AXPBY(cgm[index].z, alpha, cpm[ni][mi].z, beta, cval.z);
AXPBY(cgm[index].w, alpha, cpm[ni][mi].w, beta, cval.w);
#elif VWM == 8
AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
#elif VWM == 16
AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
AXPBY(cgm[index].s8, alpha, cpm[ni][mi].s8, beta, cval.s8);
AXPBY(cgm[index].s9, alpha, cpm[ni][mi].s9, beta, cval.s9);
AXPBY(cgm[index].sA, alpha, cpm[ni][mi].sA, beta, cval.sA);
AXPBY(cgm[index].sB, alpha, cpm[ni][mi].sB, beta, cval.sB);
AXPBY(cgm[index].sC, alpha, cpm[ni][mi].sC, beta, cval.sC);
AXPBY(cgm[index].sD, alpha, cpm[ni][mi].sD, beta, cval.sD);
AXPBY(cgm[index].sE, alpha, cpm[ni][mi].sE, beta, cval.sE);
AXPBY(cgm[index].sF, alpha, cpm[ni][mi].sF, beta, cval.sF);
#endif
}
}
}
// =================================================================================================
// Main body of the matrix-multiplication algorithm. It calls the (inlined) functions above.
inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
const __global realM* restrict agm, const __global realN* restrict bgm,
__global realM* cgm, realM cpm[NWI][MWI/VWM],
#if SA == 1 && SB == 1
__local realM* alm, __local realN* blm
#elif SA == 1
__local realM* alm
#elif SB == 1
__local realN* blm
#endif
) {
// Allocates workitem-private memory (registers)
realM apm[MWI/VWM];
realN bpm[NWI/VWN];
// Combined thread identifier (volatile to disable caching)
#if SA == 1 || SB == 1
volatile int tid = get_local_id(0) + MDIMC*get_local_id(1);
#endif
// Allocates workgroup-private memory (local memory)
#if SA == 1
__local realM alm[KWG * MWG/VWM];
#endif
#if SB == 1
__local realN blm[KWG * NWG/VWN];
#endif
// Allocates workitem-private memory (registers)
realM apm[MWI/VWM];
realN bpm[NWI/VWN];
realM cpm[NWI][MWI/VWM];
// Initializes the accumulation registers
#pragma unroll
for (int mi=0; mi<MWI/VWM; ++mi) {
#pragma unroll
for (int ni=0; ni<NWI; ++ni) {
#if VWM == 1
SetToZero(cpm[ni][mi]);
#elif VWM == 2
SetToZero(cpm[ni][mi].x);
SetToZero(cpm[ni][mi].y);
#elif VWM == 4
SetToZero(cpm[ni][mi].x);
SetToZero(cpm[ni][mi].y);
SetToZero(cpm[ni][mi].z);
SetToZero(cpm[ni][mi].w);
#elif VWM == 8
SetToZero(cpm[ni][mi].s0);
SetToZero(cpm[ni][mi].s1);
SetToZero(cpm[ni][mi].s2);
SetToZero(cpm[ni][mi].s3);
SetToZero(cpm[ni][mi].s4);
SetToZero(cpm[ni][mi].s5);
SetToZero(cpm[ni][mi].s6);
SetToZero(cpm[ni][mi].s7);
#elif VWM == 16
SetToZero(cpm[ni][mi].s0);
SetToZero(cpm[ni][mi].s1);
SetToZero(cpm[ni][mi].s2);
SetToZero(cpm[ni][mi].s3);
SetToZero(cpm[ni][mi].s4);
SetToZero(cpm[ni][mi].s5);
SetToZero(cpm[ni][mi].s6);
SetToZero(cpm[ni][mi].s7);
SetToZero(cpm[ni][mi].s8);
SetToZero(cpm[ni][mi].s9);
SetToZero(cpm[ni][mi].sA);
SetToZero(cpm[ni][mi].sB);
SetToZero(cpm[ni][mi].sC);
SetToZero(cpm[ni][mi].sD);
SetToZero(cpm[ni][mi].sE);
SetToZero(cpm[ni][mi].sF);
#endif
}
}
InitAccRegisters(cpm);
// Loops over all workgroup tiles
for (int kwg=0; kwg<kSizeK; kwg+=KWG) {
@ -515,8 +519,6 @@ __kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
#if SB == 1
GlobalToLocalB(bgm, blm, kSizeN, tid, kwg);
#endif
// Synchronizes all threads in a workgroup
#if SA == 1 || SB == 1
barrier(CLK_LOCAL_MEM_FENCE);
#endif
@ -552,19 +554,126 @@ __kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
MultiplyAccumulate(cpm, apm, bpm);
}
}
// Synchronizes all threads in a workgroup
#if SA == 1 || SB == 1
barrier(CLK_LOCAL_MEM_FENCE);
#endif
}
}
// Stores an MWG * NWG tile of results and perform the multiplication with alpha and beta
// =================================================================================================
// Main entry point of the kernel. This is the regular full version.
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
const real alpha, const real beta,
const __global realM* restrict agm,
const __global realN* restrict bgm,
__global realM* cgm) {
// Allocates workgroup-private memory (local memory)
#if SA == 1
__local realM alm[KWG * MWG/VWM];
#endif
#if SB == 1
__local realN blm[KWG * NWG/VWN];
#endif
// Computes the matrix-multiplication and stores the result in register memory
realM cpm[NWI][MWI/VWM];
#if SA == 1 && SB == 1
XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
#elif SA == 1
XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
#elif SB == 1
XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
#else
XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm);
#endif
// Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
StoreResults(cgm, cpm, kSizeM, alpha, beta);
}
// =================================================================================================
// Main entry point of the kernel. This is the upper-triangular version.
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
__kernel void XgemmUpper(const int kSizeN, const int kSizeK,
const real alpha, const real beta,
const __global realM* restrict agm,
const __global realN* restrict bgm,
__global realM* cgm) {
// Skip these threads if they do not contain threads contributing to the upper-triangle
if (get_group_id(1)*NWG < get_group_id(0)*MWG) {
return;
}
// Allocates workgroup-private memory (local memory)
#if SA == 1
__local realM alm[KWG * MWG/VWM];
#endif
#if SB == 1
__local realN blm[KWG * NWG/VWN];
#endif
// Computes the matrix-multiplication and stores the result in register memory
realM cpm[NWI][MWI/VWM];
#if SA == 1 && SB == 1
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
#elif SA == 1
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
#elif SB == 1
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
#else
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
#endif
// Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
StoreResults(cgm, cpm, kSizeN, alpha, beta);
}
// =================================================================================================
// Main entry point of the kernel. This is the lower-triangular version.
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
__kernel void XgemmLower(const int kSizeN, const int kSizeK,
const real alpha, const real beta,
const __global realM* restrict agm,
const __global realN* restrict bgm,
__global realM* cgm) {
// Skip these threads if they do not contain threads contributing to the lower-triangle
if (get_group_id(1)*NWG > get_group_id(0)*MWG) {
return;
}
// Allocates workgroup-private memory (local memory)
#if SA == 1
__local realM alm[KWG * MWG/VWM];
#endif
#if SB == 1
__local realN blm[KWG * NWG/VWN];
#endif
// Computes the matrix-multiplication and stores the result in register memory
realM cpm[NWI][MWI/VWM];
#if SA == 1 && SB == 1
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
#elif SA == 1
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
#elif SB == 1
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
#else
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
#endif
// Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
StoreResults(cgm, cpm, kSizeN, alpha, beta);
}
// =================================================================================================
// End of the C++11 raw string literal
)";

View file

@ -210,11 +210,13 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
const size_t dest_ld, const size_t dest_offset,
const Buffer &dest,
const bool do_transpose, const bool do_conjugate,
const bool pad, const Program &program) {
const bool pad, const bool upper, const bool lower,
const Program &program) {
// Determines whether or not the fast-version could potentially be used
auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) &&
(src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld);
(src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld) &&
(upper == false) && (lower == false);
// Determines the right kernel
auto kernel_name = std::string{};
@ -267,6 +269,10 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
if (pad) {
kernel.SetArgument(10, static_cast<int>(do_conjugate));
}
else {
kernel.SetArgument(10, static_cast<int>(upper));
kernel.SetArgument(11, static_cast<int>(lower));
}
}
// Launches the kernel and returns the error code. Uses global and local thread sizes based on

View file

@ -108,18 +108,18 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
// them up until they reach a certain multiple of size (kernel parameter dependent).
status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
m_ceiled, k_ceiled, m_ceiled, 0, temp_a,
a_do_transpose, a_conjugate, true, program);
a_do_transpose, a_conjugate, true, false, false, program);
if (ErrorIn(status)) { return status; }
status = PadCopyTransposeMatrix(b_one, b_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, temp_b,
b_do_transpose, b_conjugate, true, program);
b_do_transpose, b_conjugate, true, false, false, program);
if (ErrorIn(status)) { return status; }
// Only necessary for matrix C if it used both as input and output
if (beta != static_cast<T>(0)) {
status = PadCopyTransposeMatrix(c_one, c_two, c_ld, c_offset, c_buffer,
m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
c_do_transpose, false, true, program);
c_do_transpose, false, true, false, false, program);
if (ErrorIn(status)) { return status; }
}
@ -151,7 +151,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
// Runs the post-processing kernel
status = PadCopyTransposeMatrix(m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
c_one, c_two, c_ld, c_offset, c_buffer,
c_do_transpose, false, false, program);
c_do_transpose, false, false, false, false, program);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation

169
src/routines/xsyr2k.cc Normal file
View file

@ -0,0 +1,169 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xsyr2k class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/xsyr2k.h"
#include <string>
#include <vector>
namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xsyr2k<float>::precision_ = Precision::kSingle;
template <> const Precision Xsyr2k<double>::precision_ = Precision::kDouble;
template <> const Precision Xsyr2k<float2>::precision_ = Precision::kComplexSingle;
template <> const Precision Xsyr2k<double2>::precision_ = Precision::kComplexDouble;
// =================================================================================================
// Constructor: forwards to base class constructor
template <typename T>
Xsyr2k<T>::Xsyr2k(CommandQueue &queue, Event &event):
Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
const size_t n, const size_t k,
const T alpha,
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
// Computes whether or not the matrices are transposed in memory. This is based on their layout
// (row or column-major) and whether or not they are requested to be pre-transposed.
auto ab_rotated = (layout == Layout::kColMajor && ab_transpose != Transpose::kNo) ||
(layout == Layout::kRowMajor && ab_transpose == Transpose::kNo);
auto c_rotated = (layout == Layout::kRowMajor);
// In case of complex data-types, the transpose can also become a conjugate transpose
auto ab_conjugate = (ab_transpose == Transpose::kConjugate);
// Computes the first and second dimensions of the A and B matrices taking the layout into account
auto ab_one = (ab_rotated) ? k : n;
auto ab_two = (ab_rotated) ? n : k;
// Tests the matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and
// their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
// OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
// space. Also tests that the leading dimensions of:
// matrix A cannot be less than N when rotated, or less than K when not-rotated
// matrix B cannot be less than N when rotated, or less than K when not-rotated
// matrix C cannot be less than N
auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld, sizeof(T));
if (ErrorIn(status)) { return status; }
status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld, sizeof(T));
if (ErrorIn(status)) { return status; }
status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
if (ErrorIn(status)) { return status; }
// Calculates the ceiled versions of n and k
auto n_ceiled = Ceil(n, db_["NWG"]);
auto k_ceiled = Ceil(k, db_["KWG"]);
// Decides which kernel to run: the upper-triangular or lower-triangular version
auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
// Allocates space on the device for padded and/or transposed input and output matrices.
try {
auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
auto temp_b = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
// Loads the program from the database
auto& program = GetProgramFromCache();
// Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to
// fill them up until they reach a certain multiple of size (kernel parameter dependent).
status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, temp_a,
ab_rotated, ab_conjugate, true, false, false, program);
if (ErrorIn(status)) { return status; }
status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, temp_b,
ab_rotated, ab_conjugate, true, false, false, program);
if (ErrorIn(status)) { return status; }
// Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
// modify the other triangle.
status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
c_rotated, false, true, false, false, program);
if (ErrorIn(status)) { return status; }
// Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
try {
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n_ceiled));
kernel.SetArgument(1, static_cast<int>(k_ceiled));
kernel.SetArgument(2, alpha);
kernel.SetArgument(3, beta);
kernel.SetArgument(4, temp_a());
kernel.SetArgument(5, temp_b());
kernel.SetArgument(6, temp_c());
// Computes the global and local thread sizes
auto global = std::vector<size_t>{
(n_ceiled * db_["MDIMC"]) / db_["MWG"],
(n_ceiled * db_["NDIMC"]) / db_["NWG"]
};
auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
// Launches the kernel
status = RunKernel(kernel, global, local);
if (ErrorIn(status)) { return status; }
// Swaps the arguments for matrices A and B, and sets 'beta' to 1
auto one = static_cast<T>(1);
kernel.SetArgument(3, one);
kernel.SetArgument(4, temp_b());
kernel.SetArgument(5, temp_a());
// Runs the kernel again
status = RunKernel(kernel, global, local);
if (ErrorIn(status)) { return status; }
// Runs the post-processing kernel
auto upper = (triangle == Triangle::kUpper);
auto lower = (triangle == Triangle::kLower);
status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
n, n, c_ld, c_offset, c_buffer,
c_rotated, false, false, upper, lower, program);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
} catch (...) { return StatusCode::kTempBufferAllocFailure; }
}
// =================================================================================================
// Compiles the templated class
template class Xsyr2k<float>;
template class Xsyr2k<double>;
template class Xsyr2k<float2>;
template class Xsyr2k<double2>;
// =================================================================================================
} // namespace clblast

150
src/routines/xsyrk.cc Normal file
View file

@ -0,0 +1,150 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xsyrk class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/xsyrk.h"
#include <string>
#include <vector>
namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xsyrk<float>::precision_ = Precision::kSingle;
template <> const Precision Xsyrk<double>::precision_ = Precision::kDouble;
template <> const Precision Xsyrk<float2>::precision_ = Precision::kComplexSingle;
template <> const Precision Xsyrk<double2>::precision_ = Precision::kComplexDouble;
// =================================================================================================
// Constructor: forwards to base class constructor
template <typename T>
Xsyrk<T>::Xsyrk(CommandQueue &queue, Event &event):
Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
const size_t n, const size_t k,
const T alpha,
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
const T beta,
const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
// Computes whether or not the matrices are transposed in memory. This is based on their layout
// (row or column-major) and whether or not they are requested to be pre-transposed.
auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) ||
(layout == Layout::kRowMajor && a_transpose == Transpose::kNo);
auto c_rotated = (layout == Layout::kRowMajor);
// In case of complex data-types, the transpose can also become a conjugate transpose
auto a_conjugate = (a_transpose == Transpose::kConjugate);
// Computes the first and second dimensions of the A matrix taking the layout into account
auto a_one = (a_rotated) ? k : n;
auto a_two = (a_rotated) ? n : k;
// Tests the two matrices (A, C) for validity, first from a perspective of the OpenCL buffers and
// their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
// OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
// space. Also tests that the leading dimensions of:
// matrix A cannot be less than N when rotated, or less than K when not-rotated
// matrix C cannot be less than N
auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
if (ErrorIn(status)) { return status; }
status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
if (ErrorIn(status)) { return status; }
// Calculates the ceiled versions of n and k
auto n_ceiled = Ceil(n, db_["NWG"]);
auto k_ceiled = Ceil(k, db_["KWG"]);
// Decides which kernel to run: the upper-triangular or lower-triangular version
auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
// Allocates space on the device for padded and/or transposed input and output matrices.
try {
auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
// Loads the program from the database
auto& program = GetProgramFromCache();
// Runs the pre-processing kernel. This transposes the matrix A, but also pads zeros to
// fill them up until they reach a certain multiple of size (kernel parameter dependent).
status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, temp_a,
a_rotated, a_conjugate, true, false, false, program);
if (ErrorIn(status)) { return status; }
// Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
// modify the other triangle.
status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
c_rotated, false, true, false, false, program);
if (ErrorIn(status)) { return status; }
// Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
try {
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n_ceiled));
kernel.SetArgument(1, static_cast<int>(k_ceiled));
kernel.SetArgument(2, alpha);
kernel.SetArgument(3, beta);
kernel.SetArgument(4, temp_a());
kernel.SetArgument(5, temp_a());
kernel.SetArgument(6, temp_c());
// Computes the global and local thread sizes
auto global = std::vector<size_t>{
(n_ceiled * db_["MDIMC"]) / db_["MWG"],
(n_ceiled * db_["NDIMC"]) / db_["NWG"]
};
auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
// Launches the kernel
status = RunKernel(kernel, global, local);
if (ErrorIn(status)) { return status; }
// Runs the post-processing kernel
auto upper = (triangle == Triangle::kUpper);
auto lower = (triangle == Triangle::kLower);
status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
n, n, c_ld, c_offset, c_buffer,
c_rotated, false, false, upper, lower, program);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
} catch (...) { return StatusCode::kTempBufferAllocFailure; }
}
// =================================================================================================
// Compiles the templated class
template class Xsyrk<float>;
template class Xsyrk<double>;
template class Xsyrk<float2>;
template class Xsyrk<double2>;
// =================================================================================================
} // namespace clblast

View file

@ -76,7 +76,7 @@ void XgemmTest(int argc, char *argv[], const bool silent, const std::string &nam
const auto case_name = ToString(layout)+" "+ToString(a_transpose)+" "+ToString(b_transpose);
// Runs the tests
tester.TestRegular(args, case_name);
tester.TestRegular(args, case_name, false);
tester.TestInvalidBufferSizes(args, case_name);
}
}

View file

@ -76,7 +76,7 @@ void XsymmTest(int argc, char *argv[], const bool silent, const std::string &nam
const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle);
// Runs the tests
tester.TestRegular(args, case_name);
tester.TestRegular(args, case_name, true);
tester.TestInvalidBufferSizes(args, case_name);
}
}

View file

@ -0,0 +1,99 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the tests for the Xsyr2k routine. It is based on the TestABC class.
//
// =================================================================================================
#include "wrapper_clblas.h"
#include "correctness/testabc.h"
namespace clblast {
// =================================================================================================
// The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison.
template <typename T>
void Xsyr2kTest(int argc, char *argv[], const bool silent, const std::string &name) {
// Creates the CLBlast lambda
auto clblast_lambda = [](const Arguments<T> &args,
const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
CommandQueue &queue) -> StatusCode {
auto queue_plain = queue();
auto event = cl_event{};
return Syr2k(args.layout, args.triangle, args.a_transpose,
args.n, args.k,
args.alpha,
a_mat(), args.a_offset, args.a_ld,
b_mat(), args.b_offset, args.b_ld,
args.beta,
c_mat(), args.c_offset, args.c_ld,
&queue_plain, &event);
};
// Creates the clBLAS lambda (for comparison)
auto clblas_lambda = [](const Arguments<T> &args,
const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
CommandQueue &queue) -> StatusCode {
auto queue_plain = queue();
auto event = cl_event{};
auto status = clblasXsyr2k(static_cast<clblasOrder>(args.layout),
static_cast<clblasUplo>(args.triangle),
static_cast<clblasTranspose>(args.a_transpose),
args.n, args.k,
args.alpha,
a_mat(), args.a_offset, args.a_ld,
b_mat(), args.b_offset, args.b_ld,
args.beta,
c_mat(), args.c_offset, args.c_ld,
1, &queue_plain, 0, nullptr, &event);
return static_cast<StatusCode>(status);
};
// Initializes the arguments relevant for this routine
auto args = Arguments<T>{};
const auto options = std::vector<std::string>{kArgN, kArgK, kArgLayout,
kArgTriangle, kArgATransp,
kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
kArgAOffset, kArgBOffset, kArgCOffset};
// Creates a tester
TestABC<T> tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda};
// Loops over the test-cases from a data-layout point of view
for (auto &layout: tester.kLayouts) {
args.layout = layout;
for (auto &triangle: {Triangle::kUpper, Triangle::kLower}) {
args.triangle = triangle;
for (auto &ab_transpose: {Transpose::kNo, Transpose::kYes}) { // No conjugate here since it is
args.a_transpose = ab_transpose; // not supported by clBLAS
args.b_transpose = ab_transpose;
const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(ab_transpose);
// Runs the tests
tester.TestRegular(args, case_name, true);
tester.TestInvalidBufferSizes(args, case_name);
}
}
}
}
// =================================================================================================
} // namespace clblast
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
clblast::Xsyr2kTest<float>(argc, argv, false, "SSYR2K");
clblast::Xsyr2kTest<double>(argc, argv, true, "DSYR2K");
clblast::Xsyr2kTest<clblast::float2>(argc, argv, true, "CSYR2K");
clblast::Xsyr2kTest<clblast::double2>(argc, argv, true, "ZSYR2K");
return 0;
}
// =================================================================================================

View file

@ -0,0 +1,96 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the tests for the Xsyrk routine. It is based on the TestAC class.
//
// =================================================================================================
#include "wrapper_clblas.h"
#include "correctness/testac.h"
namespace clblast {
// =================================================================================================
// The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison.
template <typename T>
void XsyrkTest(int argc, char *argv[], const bool silent, const std::string &name) {
// Creates the CLBlast lambda
auto clblast_lambda = [](const Arguments<T> &args,
const Buffer &a_mat, const Buffer &c_mat,
CommandQueue &queue) -> StatusCode {
auto queue_plain = queue();
auto event = cl_event{};
return Syrk(args.layout, args.triangle, args.a_transpose,
args.n, args.k,
args.alpha,
a_mat(), args.a_offset, args.a_ld,
args.beta,
c_mat(), args.c_offset, args.c_ld,
&queue_plain, &event);
};
// Creates the clBLAS lambda (for comparison)
auto clblas_lambda = [](const Arguments<T> &args,
const Buffer &a_mat, const Buffer &c_mat,
CommandQueue &queue) -> StatusCode {
auto queue_plain = queue();
auto event = cl_event{};
auto status = clblasXsyrk(static_cast<clblasOrder>(args.layout),
static_cast<clblasUplo>(args.triangle),
static_cast<clblasTranspose>(args.a_transpose),
args.n, args.k,
args.alpha,
a_mat(), args.a_offset, args.a_ld,
args.beta,
c_mat(), args.c_offset, args.c_ld,
1, &queue_plain, 0, nullptr, &event);
return static_cast<StatusCode>(status);
};
// Initializes the arguments relevant for this routine
auto args = Arguments<T>{};
const auto options = std::vector<std::string>{kArgN, kArgK, kArgLayout,
kArgTriangle, kArgATransp,
kArgALeadDim, kArgCLeadDim,
kArgAOffset, kArgCOffset};
// Creates a tester
TestAC<T> tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda};
// Loops over the test-cases from a data-layout point of view
for (auto &layout: tester.kLayouts) {
args.layout = layout;
for (auto &triangle: {Triangle::kUpper, Triangle::kLower}) {
args.triangle = triangle;
for (auto &a_transpose: {Transpose::kNo, Transpose::kYes}) { // No conjugate here since it is
args.a_transpose = a_transpose; // not supported by clBLAS
const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(a_transpose);
// Runs the tests
tester.TestRegular(args, case_name);
tester.TestInvalidBufferSizes(args, case_name);
}
}
}
}
// =================================================================================================
} // namespace clblast
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
clblast::XsyrkTest<float>(argc, argv, false, "SSYRK");
clblast::XsyrkTest<double>(argc, argv, true, "DSYRK");
clblast::XsyrkTest<clblast::float2>(argc, argv, true, "CSYRK");
clblast::XsyrkTest<clblast::double2>(argc, argv, true, "ZSYRK");
return 0;
}
// =================================================================================================

192
test/correctness/testab.cc Normal file
View file

@ -0,0 +1,192 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the TestAB class (see the header for information about the class).
//
// =================================================================================================
#include <algorithm>
#include "correctness/testab.h"
namespace clblast {
// =================================================================================================
// Constructor, initializes the base class tester and input data
template <typename T>
TestAB<T>::TestAB(int argc, char *argv[], const bool silent,
const std::string &name, const std::vector<std::string> &options,
const Routine clblast_lambda, const Routine clblas_lambda):
Tester<T>{argc, argv, silent, name, options},
clblast_lambda_(clblast_lambda),
clblas_lambda_(clblas_lambda) {
// Computes the maximum sizes. This allows for a single set of input/output buffers.
auto max_dim = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
auto max_ld = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
// Creates test input data
a_source_.resize(max_dim*max_ld + max_offset);
b_source_.resize(max_dim*max_ld + max_offset);
PopulateVector(a_source_);
PopulateVector(b_source_);
}
// ===============================================================================================
// Tests the routine for a wide variety of parameters
template <typename T>
void TestAB<T>::TestRegular(Arguments<T> &args, const std::string &name) {
if (!PrecisionSupported()) { return; }
TestStart("regular behaviour", name);
// Computes whether or not the matrices are transposed. Note that we assume a default of
// column-major and no-transpose. If one of them is different (but not both), then rotated
// is considered true.
auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
(args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
auto b_rotated = (args.layout == Layout::kRowMajor);
// Iterates over the matrix dimensions
for (auto &m: kMatrixDims) {
args.m = m;
for (auto &n: kMatrixDims) {
args.n = n;
// Computes the second dimensions of the matrices taking the rotation into account
auto a_two = (a_rotated) ? n : n;
auto b_two = (b_rotated) ? m : n;
// Iterates over the leading-dimension values and the offsets
for (auto &a_ld: kMatrixDims) {
args.a_ld = a_ld;
for (auto &a_offset: kOffsets) {
args.a_offset = a_offset;
for (auto &b_ld: kMatrixDims) {
args.b_ld = b_ld;
for (auto &b_offset: kOffsets) {
args.b_offset = b_offset;
// Computes the buffer sizes
auto a_size = a_two * a_ld + a_offset;
auto b_size = b_two * b_ld + b_offset;
if (a_size < 1 || b_size < 1) { continue; }
// Creates the OpenCL buffers
auto a_mat = Buffer(context_, CL_MEM_READ_WRITE, a_size*sizeof(T));
auto r_mat = Buffer(context_, CL_MEM_READ_WRITE, b_size*sizeof(T));
auto s_mat = Buffer(context_, CL_MEM_READ_WRITE, b_size*sizeof(T));
// Iterates over the values for alpha and beta
for (auto &alpha: kAlphaValues) {
args.alpha = alpha;
for (auto &beta: kBetaValues) {
args.beta = beta;
// Runs the reference clBLAS code
a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
r_mat.WriteBuffer(queue_, b_size*sizeof(T), b_source_);
auto status1 = clblas_lambda_(args, a_mat, r_mat, queue_);
// Runs the CLBlast code
a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
s_mat.WriteBuffer(queue_, b_size*sizeof(T), b_source_);
auto status2 = clblast_lambda_(args, a_mat, s_mat, queue_);
// Tests for equality of the two status codes
if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
TestErrorCodes(status1, status2, args);
continue;
}
// Downloads the results
std::vector<T> r_result(b_size, static_cast<T>(0));
std::vector<T> s_result(b_size, static_cast<T>(0));
r_mat.ReadBuffer(queue_, b_size*sizeof(T), r_result);
s_mat.ReadBuffer(queue_, b_size*sizeof(T), s_result);
// Checks for differences in the output
auto errors = size_t{0};
for (auto idm=size_t{0}; idm<m; ++idm) {
for (auto idn=size_t{0}; idn<n; ++idn) {
auto index = (args.layout == Layout::kRowMajor) ?
idm*args.b_ld + idn + args.b_offset:
idn*args.b_ld + idm + args.b_offset;
if (!TestSimilarity(r_result[index], s_result[index])) {
errors++;
}
}
}
// Tests the error count (should be zero)
TestErrorCount(errors, m*n, args);
}
}
}
}
}
}
}
}
TestEnd();
}
// =================================================================================================
// Tests the routine for cases with invalid OpenCL memory buffer sizes. Tests only on return-types,
// does not test for results (if any).
template <typename T>
void TestAB<T>::TestInvalidBufferSizes(Arguments<T> &args, const std::string &name) {
if (!PrecisionSupported()) { return; }
TestStart("invalid buffer sizes", name);
// Sets example test parameters
args.m = kBufferSize;
args.n = kBufferSize;
args.a_ld = kBufferSize;
args.b_ld = kBufferSize;
args.a_offset = 0;
args.b_offset = 0;
// Iterates over test buffer sizes
const std::vector<size_t> kBufferSizes = {0, kBufferSize*kBufferSize-1, kBufferSize*kBufferSize};
for (auto &a_size: kBufferSizes) {
for (auto &b_size: kBufferSizes) {
// Creates the OpenCL buffers. Note: we are not using the C++ version since we explicitly
// want to be able to create invalid buffers (no error checking here).
auto a = clCreateBuffer(context_(), CL_MEM_READ_WRITE, a_size*sizeof(T), nullptr, nullptr);
auto a_mat = Buffer(a);
auto r = clCreateBuffer(context_(), CL_MEM_READ_WRITE, b_size*sizeof(T), nullptr, nullptr);
auto r_mat = Buffer(r);
auto s = clCreateBuffer(context_(), CL_MEM_READ_WRITE, b_size*sizeof(T), nullptr, nullptr);
auto s_mat = Buffer(s);
// Runs the two routines
auto status1 = clblas_lambda_(args, a_mat, r_mat, queue_);
auto status2 = clblast_lambda_(args, a_mat, s_mat, queue_);
// Tests for equality of the two status codes
TestErrorCodes(status1, status2, args);
}
}
TestEnd();
}
// =================================================================================================
// Compiles the templated class
template class TestAB<float>;
template class TestAB<double>;
template class TestAB<float2>;
template class TestAB<double2>;
// =================================================================================================
} // namespace clblast

85
test/correctness/testab.h Normal file
View file

@ -0,0 +1,85 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file tests any mat-mat (A,B) routine. It contains two types of tests: one testing
// all sorts of input combinations, and one deliberatly testing with invalid values.
//
// =================================================================================================
#ifndef CLBLAST_TEST_CORRECTNESS_TESTAB_H_
#define CLBLAST_TEST_CORRECTNESS_TESTAB_H_
#include <vector>
#include <string>
#include "correctness/tester.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class TestAB: public Tester<T> {
public:
// Uses several variables from the Tester class
using Tester<T>::context_;
using Tester<T>::queue_;
using Tester<T>::kLayouts;
using Tester<T>::kTransposes;
// Uses several helper functions from the Tester class
using Tester<T>::TestStart;
using Tester<T>::TestEnd;
using Tester<T>::TestSimilarity;
using Tester<T>::TestErrorCount;
using Tester<T>::TestErrorCodes;
using Tester<T>::GetExampleScalars;
using Tester<T>::GetOffsets;
using Tester<T>::PrecisionSupported;
// Test settings for the regular test. Append to this list in case more tests are required.
const std::vector<size_t> kMatrixDims = { 7, 64 };
const std::vector<size_t> kOffsets = GetOffsets();
const std::vector<T> kAlphaValues = GetExampleScalars();
const std::vector<T> kBetaValues = GetExampleScalars();
// Test settings for the invalid test
const size_t kBufferSize = 64;
// Shorthand for a BLAS routine
using Routine = std::function<StatusCode(const Arguments<T>&,
const Buffer&, const Buffer&,
CommandQueue&)>;
// Constructor, initializes the base class tester and input data
TestAB(int argc, char *argv[], const bool silent,
const std::string &name, const std::vector<std::string> &options,
const Routine clblast_lambda, const Routine clblas_lambda);
// The test functions, taking no inputs
void TestRegular(Arguments<T> &args, const std::string &name);
void TestInvalidBufferSizes(Arguments<T> &args, const std::string &name);
private:
// Source data to test with
std::vector<T> a_source_;
std::vector<T> b_source_;
// The routines to test
Routine clblast_lambda_;
Routine clblas_lambda_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_TEST_CORRECTNESS_TESTAB_H_
#endif

View file

@ -45,7 +45,7 @@ TestABC<T>::TestABC(int argc, char *argv[], const bool silent,
// Tests the routine for a wide variety of parameters
template <typename T>
void TestABC<T>::TestRegular(Arguments<T> &args, const std::string &name) {
void TestABC<T>::TestRegular(Arguments<T> &args, const std::string &name, const bool symmetric) {
if (!PrecisionSupported()) { return; }
TestStart("regular behaviour", name);
@ -63,6 +63,7 @@ void TestABC<T>::TestRegular(Arguments<T> &args, const std::string &name) {
args.m = m;
for (auto &n: kMatrixDims) {
args.n = n;
if (symmetric && m != n) { continue; }
for (auto &k: kMatrixDims) {
args.k = k;

View file

@ -64,7 +64,7 @@ class TestABC: public Tester<T> {
const Routine clblast_lambda, const Routine clblas_lambda);
// The test functions, taking no inputs
void TestRegular(Arguments<T> &args, const std::string &name);
void TestRegular(Arguments<T> &args, const std::string &name, const bool symmetric);
void TestInvalidBufferSizes(Arguments<T> &args, const std::string &name);
private:

191
test/correctness/testac.cc Normal file
View file

@ -0,0 +1,191 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the TestAC class (see the header for information about the class).
//
// =================================================================================================
#include <algorithm>
#include "correctness/testac.h"
namespace clblast {
// =================================================================================================
// Constructor, initializes the base class tester and input data
template <typename T>
TestAC<T>::TestAC(int argc, char *argv[], const bool silent,
const std::string &name, const std::vector<std::string> &options,
const Routine clblast_lambda, const Routine clblas_lambda):
Tester<T>{argc, argv, silent, name, options},
clblast_lambda_(clblast_lambda),
clblas_lambda_(clblas_lambda) {
// Computes the maximum sizes. This allows for a single set of input/output buffers.
auto max_dim = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
auto max_ld = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
// Creates test input data
a_source_.resize(max_dim*max_ld + max_offset);
c_source_.resize(max_dim*max_ld + max_offset);
PopulateVector(a_source_);
PopulateVector(c_source_);
}
// ===============================================================================================
// Tests the routine for a wide variety of parameters
template <typename T>
void TestAC<T>::TestRegular(Arguments<T> &args, const std::string &name) {
if (!PrecisionSupported()) { return; }
TestStart("regular behaviour", name);
// Computes whether or not the matrices are transposed. Note that we assume a default of
// column-major and no-transpose. If one of them is different (but not both), then rotated
// is considered true.
auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
(args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
auto c_rotated = (args.layout == Layout::kRowMajor);
// Iterates over the matrix dimensions
for (auto &n: kMatrixDims) {
args.n = n;
for (auto &k: kMatrixDims) {
args.k = k;
// Computes the second dimensions of the matrices taking the rotation into account
auto a_two = (a_rotated) ? n : k;
auto c_two = (c_rotated) ? n : n;
// Iterates over the leading-dimension values and the offsets
for (auto &a_ld: kMatrixDims) {
args.a_ld = a_ld;
for (auto &a_offset: kOffsets) {
args.a_offset = a_offset;
for (auto &c_ld: kMatrixDims) {
args.c_ld = c_ld;
for (auto &c_offset: kOffsets) {
args.c_offset = c_offset;
// Computes the buffer sizes
auto a_size = a_two * a_ld + a_offset;
auto c_size = c_two * c_ld + c_offset;
if (a_size < 1 || c_size < 1) { continue; }
// Creates the OpenCL buffers
auto a_mat = Buffer(context_, CL_MEM_READ_WRITE, a_size*sizeof(T));
auto r_mat = Buffer(context_, CL_MEM_READ_WRITE, c_size*sizeof(T));
auto s_mat = Buffer(context_, CL_MEM_READ_WRITE, c_size*sizeof(T));
// Iterates over the values for alpha and beta
for (auto &alpha: kAlphaValues) {
args.alpha = alpha;
for (auto &beta: kBetaValues) {
args.beta = beta;
// Runs the reference clBLAS code
a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
r_mat.WriteBuffer(queue_, c_size*sizeof(T), c_source_);
auto status1 = clblas_lambda_(args, a_mat, r_mat, queue_);
// Runs the CLBlast code
a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
s_mat.WriteBuffer(queue_, c_size*sizeof(T), c_source_);
auto status2 = clblast_lambda_(args, a_mat, s_mat, queue_);
// Tests for equality of the two status codes
if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
TestErrorCodes(status1, status2, args);
continue;
}
// Downloads the results
std::vector<T> r_result(c_size, static_cast<T>(0));
std::vector<T> s_result(c_size, static_cast<T>(0));
r_mat.ReadBuffer(queue_, c_size*sizeof(T), r_result);
s_mat.ReadBuffer(queue_, c_size*sizeof(T), s_result);
// Checks for differences in the output
auto errors = size_t{0};
for (auto idn0=size_t{0}; idn0<n; ++idn0) {
for (auto idn1=size_t{0}; idn1<n; ++idn1) {
auto index = idn0*args.c_ld + idn1 + args.c_offset;
if (!TestSimilarity(r_result[index], s_result[index])) {
errors++;
}
}
}
// Tests the error count (should be zero)
TestErrorCount(errors, n*n, args);
}
}
}
}
}
}
}
}
TestEnd();
}
// =================================================================================================
// Tests the routine for cases with invalid OpenCL memory buffer sizes. Tests only on return-types,
// does not test for results (if any).
template <typename T>
void TestAC<T>::TestInvalidBufferSizes(Arguments<T> &args, const std::string &name) {
if (!PrecisionSupported()) { return; }
TestStart("invalid buffer sizes", name);
// Sets example test parameters
args.m = kBufferSize;
args.n = kBufferSize;
args.k = kBufferSize;
args.a_ld = kBufferSize;
args.c_ld = kBufferSize;
args.a_offset = 0;
args.c_offset = 0;
// Iterates over test buffer sizes
const std::vector<size_t> kBufferSizes = {0, kBufferSize*kBufferSize-1, kBufferSize*kBufferSize};
for (auto &a_size: kBufferSizes) {
for (auto &c_size: kBufferSizes) {
// Creates the OpenCL buffers. Note: we are not using the C++ version since we explicitly
// want to be able to create invalid buffers (no error checking here).
auto a = clCreateBuffer(context_(), CL_MEM_READ_WRITE, a_size*sizeof(T), nullptr, nullptr);
auto a_mat = Buffer(a);
auto r = clCreateBuffer(context_(), CL_MEM_READ_WRITE, c_size*sizeof(T), nullptr, nullptr);
auto r_mat = Buffer(r);
auto s = clCreateBuffer(context_(), CL_MEM_READ_WRITE, c_size*sizeof(T), nullptr, nullptr);
auto s_mat = Buffer(s);
// Runs the two routines
auto status1 = clblas_lambda_(args, a_mat, r_mat, queue_);
auto status2 = clblast_lambda_(args, a_mat, s_mat, queue_);
// Tests for equality of the two status codes
TestErrorCodes(status1, status2, args);
}
}
TestEnd();
}
// =================================================================================================
// Compiles the templated class
template class TestAC<float>;
template class TestAC<double>;
template class TestAC<float2>;
template class TestAC<double2>;
// =================================================================================================
} // namespace clblast

85
test/correctness/testac.h Normal file
View file

@ -0,0 +1,85 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file tests any mat-mat (A,C) routine. It contains two types of tests: one testing
// all sorts of input combinations, and one deliberatly testing with invalid values.
//
// =================================================================================================
#ifndef CLBLAST_TEST_CORRECTNESS_TESTAC_H_
#define CLBLAST_TEST_CORRECTNESS_TESTAC_H_
#include <vector>
#include <string>
#include "correctness/tester.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class TestAC: public Tester<T> {
public:
// Uses several variables from the Tester class
using Tester<T>::context_;
using Tester<T>::queue_;
using Tester<T>::kLayouts;
using Tester<T>::kTransposes;
// Uses several helper functions from the Tester class
using Tester<T>::TestStart;
using Tester<T>::TestEnd;
using Tester<T>::TestSimilarity;
using Tester<T>::TestErrorCount;
using Tester<T>::TestErrorCodes;
using Tester<T>::GetExampleScalars;
using Tester<T>::GetOffsets;
using Tester<T>::PrecisionSupported;
// Test settings for the regular test. Append to this list in case more tests are required.
const std::vector<size_t> kMatrixDims = { 7, 64 };
const std::vector<size_t> kOffsets = GetOffsets();
const std::vector<T> kAlphaValues = GetExampleScalars();
const std::vector<T> kBetaValues = GetExampleScalars();
// Test settings for the invalid test
const size_t kBufferSize = 64;
// Shorthand for a BLAS routine
using Routine = std::function<StatusCode(const Arguments<T>&,
const Buffer&, const Buffer&,
CommandQueue&)>;
// Constructor, initializes the base class tester and input data
TestAC(int argc, char *argv[], const bool silent,
const std::string &name, const std::vector<std::string> &options,
const Routine clblast_lambda, const Routine clblas_lambda);
// The test functions, taking no inputs
void TestRegular(Arguments<T> &args, const std::string &name);
void TestInvalidBufferSizes(Arguments<T> &args, const std::string &name);
private:
// Source data to test with
std::vector<T> a_source_;
std::vector<T> c_source_;
// The routines to test
Routine clblast_lambda_;
Routine clblas_lambda_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_TEST_CORRECTNESS_TESTAC_H_
#endif

View file

@ -115,7 +115,7 @@ void ClientAXY(int argc, char *argv[], Routine3<T> client_routine,
auto a_two = (args.layout == Layout::kRowMajor) ? args.m : args.n;
// Computes the vector sizes in case the matrix is transposed
auto a_transposed = (args.a_transpose == Transpose::kYes);
auto a_transposed = (args.a_transpose != Transpose::kNo);
auto m_real = (a_transposed) ? args.n : args.m;
auto n_real = (a_transposed) ? args.m : args.n;
@ -163,13 +163,13 @@ template void ClientAXY<double2>(int, char **, Routine3<double2>, const std::vec
// =================================================================================================
// This is the matrix-matrix-matrix variant of the set-up/tear-down client routine.
// This is the matrix-matrix variant of the set-up/tear-down client routine.
template <typename T>
void ClientABC(int argc, char *argv[], Routine3<T> client_routine,
const std::vector<std::string> &options) {
void ClientAC(int argc, char *argv[], Routine2<T> client_routine,
const std::vector<std::string> &options) {
// Function to determine how to find the default value of the leading dimension of matrix A
auto default_ld_a = [](const Arguments<T> args) { return args.m; };
auto default_ld_a = [](const Arguments<T> args) { return args.k; };
// Simple command line argument parser with defaults
auto args = ParseArguments<T>(argc, argv, options, default_ld_a);
@ -188,9 +188,83 @@ void ClientABC(int argc, char *argv[], Routine3<T> client_routine,
// Computes whether or not the matrices are transposed. Note that we assume a default of
// column-major and no-transpose. If one of them is different (but not both), then rotated
// is considered true.
auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose == Transpose::kYes) ||
auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
(args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose == Transpose::kYes) ||
// Iterates over all "num_step" values jumping by "step" each time
auto s = size_t{0};
while(true) {
// Computes the data sizes
auto a_two = (a_rotated) ? args.n : args.k;
auto a_size = a_two * args.a_ld + args.a_offset;
auto c_size = args.n * args.c_ld + args.c_offset;
// Populates input host matrices with random data
std::vector<T> a_source(a_size);
std::vector<T> c_source(c_size);
PopulateVector(a_source);
PopulateVector(c_source);
// Creates the matrices on the device
auto a_buffer = Buffer(context, CL_MEM_READ_WRITE, a_size*sizeof(T));
auto c_buffer = Buffer(context, CL_MEM_READ_WRITE, c_size*sizeof(T));
a_buffer.WriteBuffer(queue, a_size*sizeof(T), a_source);
c_buffer.WriteBuffer(queue, c_size*sizeof(T), c_source);
// Runs the routine-specific code
client_routine(args, a_buffer, c_buffer, queue);
// Makes the jump to the next step
++s;
if (s >= args.num_steps) { break; }
args.n += args.step;
args.k += args.step;
args.a_ld += args.step;
args.c_ld += args.step;
}
// Cleans-up and returns
if (args.compare_clblas) { clblasTeardown(); }
}
// Compiles the above function
template void ClientAC<float>(int, char **, Routine2<float>, const std::vector<std::string>&);
template void ClientAC<double>(int, char **, Routine2<double>, const std::vector<std::string>&);
template void ClientAC<float2>(int, char **, Routine2<float2>, const std::vector<std::string>&);
template void ClientAC<double2>(int, char **, Routine2<double2>, const std::vector<std::string>&);
// =================================================================================================
// This is the matrix-matrix-matrix variant of the set-up/tear-down client routine.
template <typename T>
void ClientABC(int argc, char *argv[], Routine3<T> client_routine,
const std::vector<std::string> &options, const bool symmetric) {
// Function to determine how to find the default value of the leading dimension of matrix A
auto default_ld_a = [&symmetric](const Arguments<T> args) { return (symmetric) ? args.n : args.m; };
// Simple command line argument parser with defaults
auto args = ParseArguments<T>(argc, argv, options, default_ld_a);
if (args.print_help) { return; }
if (symmetric) { args.m = args.n; }
// Prints the header of the output table
PrintTableHeader(args.silent, options);
// Initializes OpenCL and the libraries
auto platform = Platform(args.platform_id);
auto device = Device(platform, kDeviceType, args.device_id);
auto context = Context(device);
auto queue = CommandQueue(context, device);
if (args.compare_clblas) { clblasSetup(); }
// Computes whether or not the matrices are transposed. Note that we assume a default of
// column-major and no-transpose. If one of them is different (but not both), then rotated
// is considered true.
auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
(args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose != Transpose::kNo) ||
(args.layout == Layout::kRowMajor && args.b_transpose == Transpose::kNo);
auto c_rotated = (args.layout == Layout::kRowMajor);
@ -241,10 +315,10 @@ void ClientABC(int argc, char *argv[], Routine3<T> client_routine,
}
// Compiles the above function
template void ClientABC<float>(int, char **, Routine3<float>, const std::vector<std::string>&);
template void ClientABC<double>(int, char **, Routine3<double>, const std::vector<std::string>&);
template void ClientABC<float2>(int, char **, Routine3<float2>, const std::vector<std::string>&);
template void ClientABC<double2>(int, char **, Routine3<double2>, const std::vector<std::string>&);
template void ClientABC<float>(int, char **, Routine3<float>, const std::vector<std::string>&, const bool);
template void ClientABC<double>(int, char **, Routine3<double>, const std::vector<std::string>&, const bool);
template void ClientABC<float2>(int, char **, Routine3<float2>, const std::vector<std::string>&, const bool);
template void ClientABC<double2>(int, char **, Routine3<double2>, const std::vector<std::string>&, const bool);
// =================================================================================================

View file

@ -52,8 +52,11 @@ template <typename T>
void ClientAXY(int argc, char *argv[], Routine3<T> client_routine,
const std::vector<std::string> &options);
template <typename T>
void ClientAC(int argc, char *argv[], Routine2<T> client_routine,
const std::vector<std::string> &options);
template <typename T>
void ClientABC(int argc, char *argv[], Routine3<T> client_routine,
const std::vector<std::string> &options);
const std::vector<std::string> &options, const bool symmetric);
// =================================================================================================

View file

@ -0,0 +1,94 @@
# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project uses a tab-size of two spaces and a max-width of 100 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
#
# This file implements the performance script for the Xsyr2k routine
#
# ==================================================================================================
# Includes the common functions
args <- commandArgs(trailingOnly = FALSE)
thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)])))
source(file.path(dirname(thisfile), "common.r"))
# ==================================================================================================
# Settings
routine_name <- "xsyr2k"
parameters <- c("-n","-k","-layout","-triangle","-transA",
"-num_steps","-step","-runs","-precision")
precision <- 32
# Sets the names of the test-cases
test_names <- list(
"multiples of 128",
"multiples of 128 (+1)",
"around n=k=512",
"around n=k=1536",
"layouts and transposing (n=k=1024)",
"powers of 2"
)
# Defines the test-cases
test_values <- list(
list(c(128, 128, 0, 0, 0, 16, 128, num_runs, precision)),
list(c(129, 129, 0, 0, 0, 16, 128, num_runs, precision)),
list(c(512, 512, 0, 0, 0, 16, 1, num_runs, precision)),
list(c(1536, 1536, 0, 0, 0, 16, 1, num_runs, precision)),
list(
c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 0, 0, 1, 1, 0, num_runs, precision),
c(1024, 1024, 0, 1, 0, 1, 0, num_runs, precision),
c(1024, 1024, 0, 1, 1, 1, 0, num_runs, precision),
c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1, 0, 1, 1, 0, num_runs, precision),
c(1024, 1024, 1, 1, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1, 1, 1, 1, 0, num_runs, precision)
),
list(
c(8, 8, 0, 0, 0, 1, 0, num_runs, precision),
c(16, 16, 0, 0, 0, 1, 0, num_runs, precision),
c(32, 32, 0, 0, 0, 1, 0, num_runs, precision),
c(64, 64, 0, 0, 0, 1, 0, num_runs, precision),
c(128, 128, 0, 0, 0, 1, 0, num_runs, precision),
c(256, 256, 0, 0, 0, 1, 0, num_runs, precision),
c(512, 512, 0, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
c(2048, 2048, 0, 0, 0, 1, 0, num_runs, precision),
c(4096, 4096, 0, 0, 0, 1, 0, num_runs, precision),
c(8192, 8192, 0, 0, 0, 1, 0, num_runs, precision)
)
)
# Defines the x-labels corresponding to the test-cases
test_xlabels <- list(
"matrix sizes (n=k)",
"matrix sizes (n=k)",
"matrix sizes (n=k)",
"matrix sizes (n=k)",
"layout (row/col), triangle (u/l), transA (n/y)",
"matrix sizes (n=k)"
)
# Defines the x-axis of the test-cases
test_xaxis <- list(
c("n", ""),
c("n", ""),
c("n", ""),
c("n", ""),
list(1:8, c("row,u,n", "row,u,y", "row,l,n", "row,l,y",
"col,u,n", "col,u,y", "col,l,n", "col,l,y")),
c("n", "x")
)
# ==================================================================================================
# Start the script
main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values,
test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE)
# ==================================================================================================

View file

@ -0,0 +1,94 @@
# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project uses a tab-size of two spaces and a max-width of 100 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
#
# This file implements the performance script for the Xsyrk routine
#
# ==================================================================================================
# Includes the common functions
args <- commandArgs(trailingOnly = FALSE)
thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)])))
source(file.path(dirname(thisfile), "common.r"))
# ==================================================================================================
# Settings
routine_name <- "xsyrk"
parameters <- c("-n","-k","-layout","-triangle","-transA",
"-num_steps","-step","-runs","-precision")
precision <- 32
# Sets the names of the test-cases
test_names <- list(
"multiples of 128",
"multiples of 128 (+1)",
"around n=k=512",
"around n=k=2048",
"layouts and transposing (n=k=1024)",
"powers of 2"
)
# Defines the test-cases
test_values <- list(
list(c(128, 128, 0, 0, 0, 16, 128, num_runs, precision)),
list(c(129, 129, 0, 0, 0, 16, 128, num_runs, precision)),
list(c(512, 512, 0, 0, 0, 16, 1, num_runs, precision)),
list(c(2048, 2048, 0, 0, 0, 16, 1, num_runs, precision)),
list(
c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 0, 0, 1, 1, 0, num_runs, precision),
c(1024, 1024, 0, 1, 0, 1, 0, num_runs, precision),
c(1024, 1024, 0, 1, 1, 1, 0, num_runs, precision),
c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1, 0, 1, 1, 0, num_runs, precision),
c(1024, 1024, 1, 1, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1, 1, 1, 1, 0, num_runs, precision)
),
list(
c(8, 8, 0, 0, 0, 1, 0, num_runs, precision),
c(16, 16, 0, 0, 0, 1, 0, num_runs, precision),
c(32, 32, 0, 0, 0, 1, 0, num_runs, precision),
c(64, 64, 0, 0, 0, 1, 0, num_runs, precision),
c(128, 128, 0, 0, 0, 1, 0, num_runs, precision),
c(256, 256, 0, 0, 0, 1, 0, num_runs, precision),
c(512, 512, 0, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
c(2048, 2048, 0, 0, 0, 1, 0, num_runs, precision),
c(4096, 4096, 0, 0, 0, 1, 0, num_runs, precision),
c(8192, 8192, 0, 0, 0, 1, 0, num_runs, precision)
)
)
# Defines the x-labels corresponding to the test-cases
test_xlabels <- list(
"matrix sizes (n=k)",
"matrix sizes (n=k)",
"matrix sizes (n=k)",
"matrix sizes (n=k)",
"layout (row/col), triangle (u/l), transA (n/y)",
"matrix sizes (n=k)"
)
# Defines the x-axis of the test-cases
test_xaxis <- list(
c("n", ""),
c("n", ""),
c("n", ""),
c("n", ""),
list(1:8, c("row,u,n", "row,u,y", "row,l,n", "row,l,y",
"col,u,n", "col,u,y", "col,l,n", "col,l,y")),
c("n", "x")
)
# ==================================================================================================
# Start the script
main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values,
test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE)
# ==================================================================================================

View file

@ -72,7 +72,7 @@ void PerformanceXgemm(const Arguments<T> &args,
// Prints the performance of both libraries
const auto flops = 2 * args.m * args.n * args.k;
const auto bytes = (args.m*args.k + args.k*args.n + args.m*args.n) * sizeof(T);
const auto bytes = (args.m*args.k + args.k*args.n + 2*args.m*args.n) * sizeof(T);
const auto output_ints = std::vector<size_t>{args.m, args.n, args.k,
static_cast<size_t>(args.layout),
static_cast<size_t>(args.a_transpose),
@ -96,10 +96,10 @@ void ClientXgemm(int argc, char *argv[]) {
kArgAlpha, kArgBeta};
switch(GetPrecision(argc, argv)) {
case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case Precision::kSingle: ClientABC<float>(argc, argv, PerformanceXgemm<float>, o); break;
case Precision::kDouble: ClientABC<double>(argc, argv, PerformanceXgemm<double>, o); break;
case Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode");
case Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode");
case Precision::kSingle: ClientABC<float>(argc, argv, PerformanceXgemm<float>, o, false); break;
case Precision::kDouble: ClientABC<double>(argc, argv, PerformanceXgemm<double>, o, false); break;
case Precision::kComplexSingle: ClientABC<float2>(argc, argv, PerformanceXgemm<float2>, o, false); break;
case Precision::kComplexDouble: ClientABC<double2>(argc, argv, PerformanceXgemm<double2>, o, false); break;
}
}

View file

@ -72,7 +72,7 @@ void PerformanceXsymm(const Arguments<T> &args,
// Prints the performance of both libraries
const auto flops = 2 * args.m * args.n * args.m;
const auto bytes = (args.m*args.m + args.m*args.n + args.m*args.n) * sizeof(T);
const auto bytes = (args.m*args.m + args.m*args.n + 2*args.m*args.n) * sizeof(T);
const auto output_ints = std::vector<size_t>{args.m, args.n,
static_cast<size_t>(args.layout),
static_cast<size_t>(args.triangle),
@ -96,10 +96,10 @@ void ClientXsymm(int argc, char *argv[]) {
kArgAlpha, kArgBeta};
switch(GetPrecision(argc, argv)) {
case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case Precision::kSingle: ClientABC<float>(argc, argv, PerformanceXsymm<float>, o); break;
case Precision::kDouble: ClientABC<double>(argc, argv, PerformanceXsymm<double>, o); break;
case Precision::kComplexSingle: throw std::runtime_error("Unsupported precision mode");
case Precision::kComplexDouble: throw std::runtime_error("Unsupported precision mode");
case Precision::kSingle: ClientABC<float>(argc, argv, PerformanceXsymm<float>, o, false); break;
case Precision::kDouble: ClientABC<double>(argc, argv, PerformanceXsymm<double>, o, false); break;
case Precision::kComplexSingle: ClientABC<float2>(argc, argv, PerformanceXsymm<float2>, o, false); break;
case Precision::kComplexDouble: ClientABC<double2>(argc, argv, PerformanceXsymm<double2>, o, false); break;
}
}

View file

@ -0,0 +1,115 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xsyr2k command-line interface tester.
//
// =================================================================================================
#include <string>
#include <vector>
#include <exception>
#include "wrapper_clblas.h"
#include "performance/client.h"
namespace clblast {
// =================================================================================================
// The client, used for performance testing. It contains the function calls to CLBlast and to other
// libraries to compare against.
template <typename T>
void PerformanceXsyr2k(const Arguments<T> &args,
const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
CommandQueue &queue) {
// Creates the CLBlast lambda
auto clblast_lambda = [&args, &a_mat, &b_mat, &c_mat, &queue]() {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Syr2k(args.layout, args.triangle, args.a_transpose,
args.n, args.k,
args.alpha,
a_mat(), args.a_offset, args.a_ld,
b_mat(), args.b_offset, args.b_ld,
args.beta,
c_mat(), args.c_offset, args.c_ld,
&queue_plain, &event);
clWaitForEvents(1, &event);
if (status != StatusCode::kSuccess) {
throw std::runtime_error("CLBlast error: "+ToString(static_cast<int>(status)));
}
};
// Creates the clBLAS lambda (for comparison)
auto clblas_lambda = [&args, &a_mat, &b_mat, &c_mat, &queue]() {
auto queue_plain = queue();
auto event = cl_event{};
auto status = clblasXsyr2k(static_cast<clblasOrder>(args.layout),
static_cast<clblasUplo>(args.triangle),
static_cast<clblasTranspose>(args.a_transpose),
args.n, args.k,
args.alpha,
a_mat(), args.a_offset, args.a_ld,
b_mat(), args.b_offset, args.b_ld,
args.beta,
c_mat(), args.c_offset, args.c_ld,
1, &queue_plain, 0, nullptr, &event);
clWaitForEvents(1, &event);
if (status != CL_SUCCESS) {
throw std::runtime_error("clBLAS error: "+ToString(static_cast<int>(status)));
}
};
// Runs the routines and collect the timings
auto ms_clblast = TimedExecution(args.num_runs, clblast_lambda);
auto ms_clblas = TimedExecution(args.num_runs, clblas_lambda);
// Prints the performance of both libraries
const auto flops = 2 * args.n * args.n * args.k;
const auto bytes = (args.n*args.k + args.n*args.n) * sizeof(T);
const auto output_ints = std::vector<size_t>{args.n, args.k,
static_cast<size_t>(args.layout),
static_cast<size_t>(args.triangle),
static_cast<size_t>(args.a_transpose),
args.a_ld, args.b_ld, args.c_ld,
args.a_offset, args.b_offset, args.c_offset};
const auto output_strings = std::vector<std::string>{ToString(args.alpha),
ToString(args.beta)};
PrintTableRow(output_ints, output_strings, args.no_abbrv,
ms_clblast, ms_clblas, flops, bytes);
}
// =================================================================================================
// Main function which calls the common client code with the routine-specific function as argument.
void ClientXsyr2k(int argc, char *argv[]) {
const auto o = std::vector<std::string>{kArgN, kArgK,
kArgLayout, kArgTriangle, kArgATransp,
kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
kArgAOffset, kArgBOffset, kArgCOffset,
kArgAlpha, kArgBeta};
switch(GetPrecision(argc, argv)) {
case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case Precision::kSingle: ClientABC<float>(argc, argv, PerformanceXsyr2k<float>, o, true); break;
case Precision::kDouble: ClientABC<double>(argc, argv, PerformanceXsyr2k<double>, o, true); break;
case Precision::kComplexSingle: ClientABC<float2>(argc, argv, PerformanceXsyr2k<float2>, o, true); break;
case Precision::kComplexDouble: ClientABC<double2>(argc, argv, PerformanceXsyr2k<double2>, o, true); break;
}
}
// =================================================================================================
} // namespace clblast
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
clblast::ClientXsyr2k(argc, argv);
return 0;
}
// =================================================================================================

View file

@ -0,0 +1,113 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xsyrk command-line interface tester.
//
// =================================================================================================
#include <string>
#include <vector>
#include <exception>
#include "wrapper_clblas.h"
#include "performance/client.h"
namespace clblast {
// =================================================================================================
// The client, used for performance testing. It contains the function calls to CLBlast and to other
// libraries to compare against.
template <typename T>
void PerformanceXsyrk(const Arguments<T> &args,
const Buffer &a_mat, const Buffer &c_mat,
CommandQueue &queue) {
// Creates the CLBlast lambda
auto clblast_lambda = [&args, &a_mat, &c_mat, &queue]() {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Syrk(args.layout, args.triangle, args.a_transpose,
args.n, args.k,
args.alpha,
a_mat(), args.a_offset, args.a_ld,
args.beta,
c_mat(), args.c_offset, args.c_ld,
&queue_plain, &event);
clWaitForEvents(1, &event);
if (status != StatusCode::kSuccess) {
throw std::runtime_error("CLBlast error: "+ToString(static_cast<int>(status)));
}
};
// Creates the clBLAS lambda (for comparison)
auto clblas_lambda = [&args, &a_mat, &c_mat, &queue]() {
auto queue_plain = queue();
auto event = cl_event{};
auto status = clblasXsyrk(static_cast<clblasOrder>(args.layout),
static_cast<clblasUplo>(args.triangle),
static_cast<clblasTranspose>(args.a_transpose),
args.n, args.k,
args.alpha,
a_mat(), args.a_offset, args.a_ld,
args.beta,
c_mat(), args.c_offset, args.c_ld,
1, &queue_plain, 0, nullptr, &event);
clWaitForEvents(1, &event);
if (status != CL_SUCCESS) {
throw std::runtime_error("clBLAS error: "+ToString(static_cast<int>(status)));
}
};
// Runs the routines and collect the timings
auto ms_clblast = TimedExecution(args.num_runs, clblast_lambda);
auto ms_clblas = TimedExecution(args.num_runs, clblas_lambda);
// Prints the performance of both libraries
const auto flops = args.n * args.n * args.k;
const auto bytes = (args.n*args.k + args.n*args.n) * sizeof(T);
const auto output_ints = std::vector<size_t>{args.n, args.k,
static_cast<size_t>(args.layout),
static_cast<size_t>(args.triangle),
static_cast<size_t>(args.a_transpose),
args.a_ld, args.c_ld,
args.a_offset, args.c_offset};
const auto output_strings = std::vector<std::string>{ToString(args.alpha),
ToString(args.beta)};
PrintTableRow(output_ints, output_strings, args.no_abbrv,
ms_clblast, ms_clblas, flops, bytes);
}
// =================================================================================================
// Main function which calls the common client code with the routine-specific function as argument.
void ClientXsyrk(int argc, char *argv[]) {
const auto o = std::vector<std::string>{kArgN, kArgK,
kArgLayout, kArgTriangle, kArgATransp,
kArgALeadDim, kArgCLeadDim,
kArgAOffset, kArgCOffset,
kArgAlpha, kArgBeta};
switch(GetPrecision(argc, argv)) {
case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case Precision::kSingle: ClientAC<float>(argc, argv, PerformanceXsyrk<float>, o); break;
case Precision::kDouble: ClientAC<double>(argc, argv, PerformanceXsyrk<double>, o); break;
case Precision::kComplexSingle: ClientAC<float2>(argc, argv, PerformanceXsyrk<float2>, o); break;
case Precision::kComplexDouble: ClientAC<double2>(argc, argv, PerformanceXsyrk<double2>, o); break;
}
}
// =================================================================================================
} // namespace clblast
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
clblast::ClientXsyrk(argc, argv);
return 0;
}
// =================================================================================================

View file

@ -201,7 +201,7 @@ clblasStatus clblasXgemm(
num_queues, queues, num_wait_events, wait_events, events);
}
// This calls {clblasSsymm, clblasDsymm} with the arguments forwarded.
// This calls {clblasSsymm, clblasDsymm, clblasCsymm, clblasZsymm} with the arguments forwarded.
clblasStatus clblasXsymm(
clblasOrder layout, clblasSide side, clblasUplo triangle,
size_t m, size_t n, float alpha,
@ -267,6 +267,130 @@ clblasStatus clblasXsymm(
num_queues, queues, num_wait_events, wait_events, events);
}
// This calls {clblasSsyrk, clblasDsyrk, clblasCsyrk, clblasZsyrk} with the arguments forwarded.
clblasStatus clblasXsyrk(
clblasOrder layout, clblasUplo triangle, clblasTranspose tran_a,
size_t n, size_t k, float alpha,
const cl_mem a_mat, size_t a_offset, size_t a_ld, float beta,
cl_mem c_mat, size_t c_offset, size_t c_ld,
cl_uint num_queues, cl_command_queue *queues,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
return clblasSsyrk(layout, triangle, tran_a,
n, k, alpha,
a_mat, a_offset, a_ld, beta,
c_mat, c_offset, c_ld,
num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXsyrk(
clblasOrder layout, clblasUplo triangle, clblasTranspose tran_a,
size_t n, size_t k, double alpha,
const cl_mem a_mat, size_t a_offset, size_t a_ld, double beta,
cl_mem c_mat, size_t c_offset, size_t c_ld,
cl_uint num_queues, cl_command_queue *queues,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
return clblasDsyrk(layout, triangle, tran_a,
n, k, alpha,
a_mat, a_offset, a_ld, beta,
c_mat, c_offset, c_ld,
num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXsyrk(
clblasOrder layout, clblasUplo triangle, clblasTranspose tran_a,
size_t n, size_t k, float2 alpha,
const cl_mem a_mat, size_t a_offset, size_t a_ld, float2 beta,
cl_mem c_mat, size_t c_offset, size_t c_ld,
cl_uint num_queues, cl_command_queue *queues,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}};
auto cl_beta = cl_float2{{beta.real(), beta.imag()}};
return clblasCsyrk(layout, triangle, tran_a,
n, k, cl_alpha,
a_mat, a_offset, a_ld, cl_beta,
c_mat, c_offset, c_ld,
num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXsyrk(
clblasOrder layout, clblasUplo triangle, clblasTranspose tran_a,
size_t n, size_t k, double2 alpha,
const cl_mem a_mat, size_t a_offset, size_t a_ld, double2 beta,
cl_mem c_mat, size_t c_offset, size_t c_ld,
cl_uint num_queues, cl_command_queue *queues,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}};
auto cl_beta = cl_double2{{beta.real(), beta.imag()}};
return clblasZsyrk(layout, triangle, tran_a,
n, k, cl_alpha,
a_mat, a_offset, a_ld, cl_beta,
c_mat, c_offset, c_ld,
num_queues, queues, num_wait_events, wait_events, events);
}
// This calls {clblasSsyr2k, clblasDsyr2k, clblasCsyr2k, clblasZsyr2k} with the arguments forwarded.
clblasStatus clblasXsyr2k(
clblasOrder layout, clblasUplo triangle, clblasTranspose tran_ab,
size_t n, size_t k, float alpha,
const cl_mem a_mat, size_t a_offset, size_t a_ld,
const cl_mem b_mat, size_t b_offset, size_t b_ld, float beta,
cl_mem c_mat, size_t c_offset, size_t c_ld,
cl_uint num_queues, cl_command_queue *queues,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
return clblasSsyr2k(layout, triangle, tran_ab,
n, k, alpha,
a_mat, a_offset, a_ld,
b_mat, b_offset, b_ld, beta,
c_mat, c_offset, c_ld,
num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXsyr2k(
clblasOrder layout, clblasUplo triangle, clblasTranspose tran_ab,
size_t n, size_t k, double alpha,
const cl_mem a_mat, size_t a_offset, size_t a_ld,
const cl_mem b_mat, size_t b_offset, size_t b_ld, double beta,
cl_mem c_mat, size_t c_offset, size_t c_ld,
cl_uint num_queues, cl_command_queue *queues,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
return clblasDsyr2k(layout, triangle, tran_ab,
n, k, alpha,
a_mat, a_offset, a_ld,
b_mat, b_offset, b_ld, beta,
c_mat, c_offset, c_ld,
num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXsyr2k(
clblasOrder layout, clblasUplo triangle, clblasTranspose tran_ab,
size_t n, size_t k, float2 alpha,
const cl_mem a_mat, size_t a_offset, size_t a_ld,
const cl_mem b_mat, size_t b_offset, size_t b_ld, float2 beta,
cl_mem c_mat, size_t c_offset, size_t c_ld,
cl_uint num_queues, cl_command_queue *queues,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto cl_alpha = cl_float2{{alpha.real(), alpha.imag()}};
auto cl_beta = cl_float2{{beta.real(), beta.imag()}};
return clblasCsyr2k(layout, triangle, tran_ab,
n, k, cl_alpha,
a_mat, a_offset, a_ld,
b_mat, b_offset, b_ld, cl_beta,
c_mat, c_offset, c_ld,
num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXsyr2k(
clblasOrder layout, clblasUplo triangle, clblasTranspose tran_ab,
size_t n, size_t k, double2 alpha,
const cl_mem a_mat, size_t a_offset, size_t a_ld,
const cl_mem b_mat, size_t b_offset, size_t b_ld, double2 beta,
cl_mem c_mat, size_t c_offset, size_t c_ld,
cl_uint num_queues, cl_command_queue *queues,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto cl_alpha = cl_double2{{alpha.real(), alpha.imag()}};
auto cl_beta = cl_double2{{beta.real(), beta.imag()}};
return clblasZsyr2k(layout, triangle, tran_ab,
n, k, cl_alpha,
a_mat, a_offset, a_ld,
b_mat, b_offset, b_ld, cl_beta,
c_mat, c_offset, c_ld,
num_queues, queues, num_wait_events, wait_events, events);
}
// =================================================================================================
} // namespace clblast