Merge pull request #15 from CNugteren/development

Update to version 0.3.0
Cedric Nugteren 2015-07-24 08:30:41 +02:00
commit db6846b791
127 changed files with 6575 additions and 2664 deletions

View file

@ -1,4 +1,16 @@
Version 0.3.0
- Re-organized test/client infrastructure to avoid code duplication
- Added an optional bypass for pre/post-processing kernels in level-3 routines
- Significantly improved performance of level-3 routines on AMD GPUs
- Added level-3 routines:
* CHEMM/ZHEMM
* SSYRK/DSYRK/CSYRK/ZSYRK
* CHERK/ZHERK
* SSYR2K/DSYR2K/CSYR2K/ZSYR2K
* CHER2K/ZHER2K
* STRMM/DTRMM/CTRMM/ZTRMM
Version 0.2.0
- Added support for complex conjugate transpose
- Several host-code performance improvements

View file

@ -13,7 +13,7 @@
cmake_minimum_required(VERSION 2.8.10)
project("clblast" CXX)
set(clblast_VERSION_MAJOR 0)
set(clblast_VERSION_MINOR 2)
set(clblast_VERSION_MINOR 3)
set(clblast_VERSION_PATCH 0)
# Options and their default values
@ -95,17 +95,23 @@ include_directories(${clblast_SOURCE_DIR}/include ${OPENCL_INCLUDE_DIRS})
# Sets the supported routines and the used kernels. New routines and kernels should be added here.
set(KERNELS copy pad transpose padtranspose xaxpy xgemv xgemm)
set(SAMPLE_PROGRAMS sgemm)
set(ROUTINES_XY xaxpy)
set(ROUTINES_AXY xgemv)
set(ROUTINES_ABC xgemm xsymm)
set(ROUTINES ${ROUTINES_XY} ${ROUTINES_AXY} ${ROUTINES_ABC})
set(LEVEL1_ROUTINES xaxpy)
set(LEVEL2_ROUTINES xgemv)
set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm)
set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES})
# ==================================================================================================
# Gathers all source-files
set(SOURCES src/clblast.cc src/database.cc src/routine.cc src/utilities.cc)
foreach(ROUTINE ${ROUTINES})
set(SOURCES ${SOURCES} src/routines/${ROUTINE}.cc)
foreach(ROUTINE ${LEVEL1_ROUTINES})
set(SOURCES ${SOURCES} src/routines/level1/${ROUTINE}.cc)
endforeach()
foreach(ROUTINE ${LEVEL2_ROUTINES})
set(SOURCES ${SOURCES} src/routines/level2/${ROUTINE}.cc)
endforeach()
foreach(ROUTINE ${LEVEL3_ROUTINES})
set(SOURCES ${SOURCES} src/routines/level3/${ROUTINE}.cc)
endforeach()
# Creates and links the library
@ -168,33 +174,23 @@ if(TESTS)
include_directories(${clblast_SOURCE_DIR}/test ${clBLAS_SOURCE_DIR})
# Creates the common correctness-tests objects (requires CMake 2.8.8)
add_library(test_correctness_common OBJECT test/correctness/tester.cc)
add_library(test_correctness_xy OBJECT test/correctness/testxy.cc)
add_library(test_correctness_axy OBJECT test/correctness/testaxy.cc)
add_library(test_correctness_abc OBJECT test/correctness/testabc.cc)
add_library(test_correctness_common OBJECT
test/correctness/tester.cc test/correctness/testblas.cc)
# Compiles the correctness-tests
foreach(ROUTINE ${ROUTINES_XY})
add_executable(test_${ROUTINE}
$<TARGET_OBJECTS:test_correctness_common>
$<TARGET_OBJECTS:test_correctness_xy>
test/correctness/routines/${ROUTINE}.cc)
target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
install(TARGETS test_${ROUTINE} DESTINATION bin)
foreach(ROUTINE ${LEVEL1_ROUTINES})
add_executable(test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
test/correctness/routines/level1/${ROUTINE}.cc)
endforeach()
foreach(ROUTINE ${ROUTINES_AXY})
add_executable(test_${ROUTINE}
$<TARGET_OBJECTS:test_correctness_common>
$<TARGET_OBJECTS:test_correctness_axy>
test/correctness/routines/${ROUTINE}.cc)
target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
install(TARGETS test_${ROUTINE} DESTINATION bin)
foreach(ROUTINE ${LEVEL2_ROUTINES})
add_executable(test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
test/correctness/routines/level2/${ROUTINE}.cc)
endforeach()
foreach(ROUTINE ${ROUTINES_ABC})
add_executable(test_${ROUTINE}
$<TARGET_OBJECTS:test_correctness_common>
$<TARGET_OBJECTS:test_correctness_abc>
test/correctness/routines/${ROUTINE}.cc)
foreach(ROUTINE ${LEVEL3_ROUTINES})
add_executable(test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
test/correctness/routines/level3/${ROUTINE}.cc)
endforeach()
foreach(ROUTINE ${ROUTINES})
target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
install(TARGETS test_${ROUTINE} DESTINATION bin)
endforeach()
@ -203,10 +199,19 @@ if(TESTS)
add_library(test_performance_common OBJECT test/performance/client.cc)
# Compiles the performance-tests
set(TEST_PERF_COMM )
foreach(ROUTINE ${ROUTINES})
foreach(ROUTINE ${LEVEL1_ROUTINES})
add_executable(client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
test/performance/routines/${ROUTINE}.cc)
test/performance/routines/level1/${ROUTINE}.cc)
endforeach()
foreach(ROUTINE ${LEVEL2_ROUTINES})
add_executable(client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
test/performance/routines/level2/${ROUTINE}.cc)
endforeach()
foreach(ROUTINE ${LEVEL3_ROUTINES})
add_executable(client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
test/performance/routines/level3/${ROUTINE}.cc)
endforeach()
foreach(ROUTINE ${ROUTINES})
target_link_libraries(client_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
install(TARGETS client_${ROUTINE} DESTINATION bin)
endforeach()

View file

@ -4,7 +4,7 @@ CLBlast: The tuned OpenCL BLAS library
CLBlast is a modern, lightweight, performant and tunable OpenCL BLAS library written in C++11. It is designed to leverage the full performance potential of a wide variety of OpenCL devices from different vendors, including desktop and laptop GPUs, embedded GPUs, and other accelerators. CLBlast implements BLAS routines: basic linear algebra subprograms operating on vectors and matrices.
__Note that the CLBlast library is actively being developed, and is not mature enough for production environments__. This preview version supports only a minimal number of routines (including `gemm` and `gemv`): others will be added in due time. It also lacks extensive tuning and testing on some common OpenCL platforms: __out-of-the-box performance on some devices might be poor__. See below for more details.
__Note that the CLBlast library is actively being developed, and is not mature enough for production environments__. This preview version doesn't support all routines yet: more will be added in due time. It also lacks extensive tuning on some common OpenCL platforms: __out-of-the-box performance on some devices might be poor__. See below for more details.
Why CLBlast and not clBLAS or cuBLAS?
@ -109,13 +109,13 @@ Performance remarks
The CLBlast library provides pre-tuned parameter-values for a number of OpenCL devices. If your device is not among these, then out-of-the-box performance might be poor. Even if the device is included, performance might be poor in some cases: __the preview version is not thoroughly tested for performance yet__. See above under `Using the tuners` to find out how to tune for your device.
The folder `doc/performance` contains some PDF files with performance results on tested devices. Performance is compared against a tuned version of the clBLAS library. The graphs of the level-3 routines (Xgemm and Xsymm) show the strong points of CLBlast:
The folder `doc/performance` contains some PDF files with performance results on tested devices. Performance is compared against a tuned version of the clBLAS library. The graphs of the level-3 routines (Xgemm, Xsymm, Xsyrk) show the strong points of CLBlast:
* The library reaches a high peak performance for large matrix sizes, in some cases a factor of 2 more than clBLAS.
* The performance for non-power-of-2 sizes (e.g. 1000) is roughly equal to that of power-of-2 sizes (e.g. 1024). This is not the case for clBLAS, which sometimes shows a drop of a factor of 2.
* The performance is also constant for different layouts and transpose options. Again, this is not the case for clBLAS.
The graphs also show the current weak points of CLBlast: its performance for smaller matrix sizes is not very good. Furthermore, although the GEMM kernels perform well on AMD GPUs, the supporting copy and transpose kernels do not.
The graphs also show the current weak points of CLBlast: for small sizes the benefit is minimal or non-existent, and for some specific configurations clBLAS is still faster.
These graphs can be generated automatically on your own device. First, compile CLBlast with the tests enabled. Then, make sure your installation of the reference clBLAS is performance-tuned by running the `tune` executable. Finally, run one of the graph-scripts found in `test/performance/graphs` using R. For example, to generate the Xgemm PDF on device 1 of platform 0:
@ -124,7 +124,7 @@ These graphs can be generated automatically on your own device. First, compile C
Supported routines
-------------
CLBlast is in active development and currently does not support the full set of BLAS routines. The currently supported routines are marked with `x` in the following tables:
CLBlast is in active development and currently does not support the full set of BLAS routines. The currently supported routines are marked with '✔' in the following tables:
| Level-1 | S | D | C | Z | Notes |
| ---------|---|---|---|---|---------|
@ -135,7 +135,7 @@ CLBlast is in active development and currently does not support the full set of
| xSWAP | | | | | |
| xSCAL | | | | | +CS +ZD |
| xCOPY | | | | | |
| xAXPY |`x`|`x`|`x`|`x`| |
| xAXPY | ✔ | ✔ | ✔ | ✔ | |
| xDOT | | | - | - | +DS |
| xDOTU | - | - | | | |
| xDOTC | - | - | | | |
@ -147,7 +147,7 @@ CLBlast is in active development and currently does not support the full set of
| Level-2 | S | D | C | Z | Notes |
| ---------|---|---|---|---|---------|
| xGEMV |`x`|`x`|`x`|`x`| |
| xGEMV | ✔ | ✔ | ✔ | ✔ | |
| xGBMV | | | | | |
| xHEMV | - | - | | | |
| xHBMV | - | - | | | |
@ -175,14 +175,14 @@ CLBlast is in active development and currently does not support the full set of
| Level-3 | S | D | C | Z | Notes |
| ---------|---|---|---|---|---------|
| xGEMM |`x`|`x`|`x`|`x`| |
| xSYMM |`x`|`x`|`x`|`x`| |
| xHEMM | - | - | | | |
| xSYRK | | | | | |
| xHERK | - | - | | | |
| xSYR2K | | | | | |
| xHER2K | - | - | | | |
| xTRMM | | | | | |
| xGEMM | ✔ | ✔ | ✔ | ✔ | |
| xSYMM | ✔ | ✔ | ✔ | ✔ | |
| xHEMM | - | - | ✔ | ✔ | |
| xSYRK | ✔ | ✔ | ✔ | ✔ | |
| xHERK | - | - | ✔ | ✔ | |
| xSYR2K | ✔ | ✔ | ✔ | ✔ | |
| xHER2K | - | - | ✔ | ✔ | |
| xTRMM | ✔ | ✔ | ✔ | ✔ | |
| xTRSM | | | | | |
@ -214,8 +214,6 @@ To-do list before release of version 1.0
- Improve host performance:
* Allow initialization to pre-compile kernels and store to disk
- Improve device performance:
* Enable 'mad()' for AMD devices
* Improve the performance of the copy and transpose kernels
* Tune for a wider range of devices
* Allow users to define custom tuned parameters
- Improve the tuning

16 binary files changed (contents not shown)

View file

@ -75,6 +75,7 @@ enum class Layout { kRowMajor, kColMajor };
enum class Transpose { kNo, kYes, kConjugate };
enum class Side { kLeft, kRight };
enum class Triangle { kUpper, kLower };
enum class Diagonal { kUnit, kNonUnit };
// Precision scoped enum (values in bits)
enum class Precision { kHalf = 16, kSingle = 32, kDouble = 64,
@ -95,7 +96,7 @@ StatusCode Axpy(const size_t n, const T alpha,
// Templated-precision generalized matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV
template <typename T>
StatusCode Gemv(const Layout layout, const Transpose transpose_a,
StatusCode Gemv(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n,
const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
@ -107,9 +108,9 @@ StatusCode Gemv(const Layout layout, const Transpose transpose_a,
// =================================================================================================
// BLAS level-3 (matrix-matrix) routines
// Templated-precision generalized matrix-matrix multiplication: SGEMM/DGEMM
// Templated-precision generalized matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM
template <typename T>
StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpose transpose_b,
StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
const size_t m, const size_t n, const size_t k,
const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
@ -118,7 +119,7 @@ StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpos
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
// Templated-precision symmetric matrix-matrix multiplication: SSYMM/DSYMM
// Templated-precision symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM
template <typename T>
StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
const size_t m, const size_t n,
@ -129,6 +130,81 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
// Templated-precision hermitian matrix-matrix multiplication: CHEMM/ZHEMM
template <typename T>
StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle,
const size_t m, const size_t n,
const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
// Templated-precision rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK
template <typename T>
StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
const size_t n, const size_t k,
const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
// Templated-precision rank-K update of a hermitian matrix: CHERK/ZHERK
template <typename T>
StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
const size_t n, const size_t k,
const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
// Templated-precision rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K
template <typename T>
StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
const size_t n, const size_t k,
const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
// Templated-precision rank-2K update of a hermitian matrix: CHER2K/ZHER2K
template <typename T, typename U>
StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
const size_t n, const size_t k,
const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
const U beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
// Templated-precision triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM
template <typename T>
StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t m, const size_t n,
const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event);
// Templated-precision matrix equation solver: STRSM/DTRSM/CTRSM/ZTRSM
/*
template <typename T>
StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t m, const size_t n,
const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event);
*/
// =================================================================================================
} // namespace clblast
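
To make the intended use of these new level-3 entry points concrete, below is a minimal host-side sketch that calls the templated Syrk routine declared above. It is not part of this commit: the OpenCL set-up (first platform and device, a default in-order queue), the matrix contents, and the choice to flush the queue instead of waiting on the returned event are illustrative assumptions, and error checking is omitted.

// Minimal sketch (not from this commit): single-precision SYRK via the new API.
#include <vector>
#include <CL/cl.h>
#include "clblast.h"

int main() {
  const size_t n = 512, k = 256;

  // Plain OpenCL host set-up (first platform/device, default queue)
  cl_platform_id platform; clGetPlatformIDs(1, &platform, nullptr);
  cl_device_id device; clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, nullptr);
  cl_context context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, nullptr);
  cl_command_queue queue = clCreateCommandQueue(context, device, 0, nullptr);

  // A is n-by-k, C is n-by-n (column-major, leading dimension equals the number of rows)
  std::vector<float> host_a(n * k, 1.0f), host_c(n * n, 0.0f);
  cl_mem a = clCreateBuffer(context, CL_MEM_READ_WRITE, host_a.size() * sizeof(float), nullptr, nullptr);
  cl_mem c = clCreateBuffer(context, CL_MEM_READ_WRITE, host_c.size() * sizeof(float), nullptr, nullptr);
  clEnqueueWriteBuffer(queue, a, CL_TRUE, 0, host_a.size() * sizeof(float), host_a.data(), 0, nullptr, nullptr);
  clEnqueueWriteBuffer(queue, c, CL_TRUE, 0, host_c.size() * sizeof(float), host_c.data(), 0, nullptr, nullptr);

  // C := 2*A*A^T + 1*C, updating only the upper triangle of C
  cl_event event = nullptr;
  auto status = clblast::Syrk<float>(clblast::Layout::kColMajor, clblast::Triangle::kUpper,
                                     clblast::Transpose::kNo, n, k,
                                     2.0f, a, 0, n,
                                     1.0f, c, 0, n,
                                     &queue, &event);
  if (status == clblast::StatusCode::kSuccess) { clFinish(queue); }

  clReleaseMemObject(a); clReleaseMemObject(c);
  clReleaseCommandQueue(queue); clReleaseContext(context);
  return static_cast<int>(status);
}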

View file

@ -24,7 +24,7 @@ const Database::DatabaseEntry Database::CopySingle = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",4}, {"COPY_VW",2} } },
}
},
@ -53,7 +53,7 @@ const Database::DatabaseEntry Database::CopyDouble = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",2}, {"COPY_VW",4} } },
}
},
@ -81,7 +81,7 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
}
},
@ -110,7 +110,7 @@ const Database::DatabaseEntry Database::CopyComplexDouble = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_WPT",4}, {"COPY_VW",2} } },
}
},

View file

@ -24,7 +24,7 @@ const Database::DatabaseEntry Database::PadSingle = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
}
},
@ -53,7 +53,7 @@ const Database::DatabaseEntry Database::PadDouble = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
}
},
@ -81,7 +81,7 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
@ -110,7 +110,7 @@ const Database::DatabaseEntry Database::PadComplexDouble = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
}
},

View file

@ -24,7 +24,7 @@ const Database::DatabaseEntry Database::PadTraSingle = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"PADTRA_TILE",16}, {"PADTRA_WPT",4}, {"PADTRA_PAD",0} } },
}
},
@ -53,7 +53,7 @@ const Database::DatabaseEntry Database::PadTraDouble = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"PADTRA_TILE",8}, {"PADTRA_WPT",4}, {"PADTRA_PAD",0} } },
}
},
@ -81,7 +81,7 @@ const Database::DatabaseEntry Database::PadTraComplexSingle = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",0} } },
}
},
@ -110,7 +110,7 @@ const Database::DatabaseEntry Database::PadTraComplexDouble = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"PADTRA_TILE",8}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
}
},

View file

@ -18,24 +18,24 @@ const Database::DatabaseEntry Database::TraSingle = {
"Transpose", Precision::kSingle, {
{ // NVIDIA GPUs
CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
{ "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
{ "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
{ "Tahiti", { {"TRA_DIM",8}, {"TRA_WPT",8}, {"TRA_PAD",0} } },
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_WPT",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1} } },
}
},
{ // Intel GPUs
CL_DEVICE_TYPE_GPU, "Intel", {
{ "Iris", { {"TRA_DIM",8}, {"TRA_WPT",4}, {"TRA_PAD",0} } },
{ "Iris", { {"TRA_DIM",8}, {"TRA_WPT",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
}
},
{ // Default
CL_DEVICE_TYPE_ALL, kDefault, {
{ kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
{ kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
}
},
}
@ -47,14 +47,14 @@ const Database::DatabaseEntry Database::TraDouble = {
"Transpose", Precision::kDouble, {
{ // NVIDIA GPUs
CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
{ "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
{ "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
{ "Tahiti", { {"TRA_DIM",8}, {"TRA_WPT",8}, {"TRA_PAD",0} } },
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1} } },
}
},
{ // Intel GPUs
@ -63,7 +63,7 @@ const Database::DatabaseEntry Database::TraDouble = {
},
{ // Default
CL_DEVICE_TYPE_ALL, kDefault, {
{ kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
{ kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
}
},
}
@ -75,24 +75,24 @@ const Database::DatabaseEntry Database::TraComplexSingle = {
"Transpose", Precision::kComplexSingle, {
{ // NVIDIA GPUs
CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
{ "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
{ "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
{ "Tahiti", { {"TRA_DIM",8}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1} } },
}
},
{ // Intel GPUs
CL_DEVICE_TYPE_GPU, "Intel", {
{ "Iris", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
{ "Iris", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
}
},
{ // Default
CL_DEVICE_TYPE_ALL, kDefault, {
{ kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
{ kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
}
},
}
@ -104,14 +104,14 @@ const Database::DatabaseEntry Database::TraComplexDouble = {
"Transpose", Precision::kComplexDouble, {
{ // NVIDIA GPUs
CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
{ "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
{ "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
{ "Tahiti", { {"TRA_DIM",8}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1} } },
}
},
{ // Intel GPUs
@ -120,7 +120,7 @@ const Database::DatabaseEntry Database::TraComplexDouble = {
},
{ // Default
CL_DEVICE_TYPE_ALL, kDefault, {
{ kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
{ kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
}
},
}
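
Each database entry above is just a set of name/value pairs selected by vendor and device name. As a hypothetical illustration (the function name and container type are not the library's actual interface), such a parameter set could be turned into OpenCL compile-time defines like this:

// Illustrative only: convert tuned parameters into "-DNAME=value" kernel build options.
#include <cstddef>
#include <map>
#include <string>

std::string AsBuildOptions(const std::map<std::string, std::size_t> &parameters) {
  std::string options;
  for (const auto &parameter : parameters) {
    options += "-D" + parameter.first + "=" + std::to_string(parameter.second) + " ";
  }
  return options;
}

// For {{"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2}} this yields
// "-DTRA_DIM=16 -DTRA_PAD=0 -DTRA_SHUFFLE=1 -DTRA_WPT=2 ".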

View file

@ -24,7 +24,7 @@ const Database::DatabaseEntry Database::XaxpySingle = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"WGS",64}, {"WPT",1}, {"VW",2} } },
}
},
@ -53,7 +53,7 @@ const Database::DatabaseEntry Database::XaxpyDouble = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"WGS",256}, {"WPT",1}, {"VW",1} } },
}
},
@ -80,7 +80,7 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"WGS",64}, {"WPT",1}, {"VW",1} } },
}
},
@ -109,7 +109,7 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"WGS",64}, {"WPT",1}, {"VW",1} } },
}
},

View file

@ -25,8 +25,8 @@ const Database::DatabaseEntry Database::XgemmSingle = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
{ "Tahiti", { {"MWG",128}, {"NWG",128}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",8}, {"VWN",4}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"MWG",128}, {"NWG",128}, {"KWG",32}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",32}, {"NDIMB",8}, {"KWI",2}, {"VWM",4}, {"VWN",4}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } },
}
},
{ // Intel GPUs
@ -55,7 +55,7 @@ const Database::DatabaseEntry Database::XgemmDouble = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"MWG",128}, {"NWG",64}, {"KWG",16}, {"MDIMC",32}, {"NDIMC",8}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",1}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",0}, {"SB",0} } },
}
},
@ -84,13 +84,13 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"MWG",16}, {"NWG",64}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",8}, {"NDIMB",16}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",0} } },
}
},
{ // Intel GPUs
CL_DEVICE_TYPE_GPU, "Intel", {
{ "Iris", { {"MWG",64}, {"NWG",64}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",8}, {"KWI",8}, {"VWM",4}, {"VWN",4}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",0} } },
{ "Iris", { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
}
},
{ // Default
@ -114,7 +114,7 @@ const Database::DatabaseEntry Database::XgemmComplexDouble = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"MWG",128}, {"NWG",32}, {"KWG",16}, {"MDIMC",32}, {"NDIMC",8}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",2}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
}
},

View file

@ -24,7 +24,7 @@ const Database::DatabaseEntry Database::XgemvSingle = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
}
},
@ -53,7 +53,7 @@ const Database::DatabaseEntry Database::XgemvDouble = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
}
},
@ -80,7 +80,7 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
}
},
@ -109,7 +109,7 @@ const Database::DatabaseEntry Database::XgemvComplexDouble = {
}
},
{ // AMD GPUs
CL_DEVICE_TYPE_GPU, "AMD", {
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
}
},

View file

@ -34,20 +34,14 @@ class Routine {
Program program;
std::string device_name;
Precision precision;
std::vector<std::string> routines;
std::string routine_name_;
// Finds out whether the properties match
bool MatchInCache(const std::string &ref_name, const Precision &ref_precision,
const std::vector<std::string> &ref_routines) {
auto ref_size = ref_routines.size();
if (device_name == ref_name && precision == ref_precision && routines.size() == ref_size) {
auto found_match = true;
for (auto i=size_t{0}; i<ref_size; ++i) {
if (routines[i] != ref_routines[i]) { found_match = false; }
}
return found_match;
}
return false;
bool MatchInCache(const std::string &ref_device, const Precision &ref_precision,
const std::string &ref_routine) {
return (device_name == ref_device &&
precision == ref_precision &&
routine_name_ == ref_routine);
}
};
@ -58,11 +52,11 @@ class Routine {
static constexpr bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); }
// Base class constructor
explicit Routine(CommandQueue &queue, Event &event,
explicit Routine(CommandQueue &queue, Event &event, const std::string &name,
const std::vector<std::string> &routines, const Precision precision);
// Set-up phase of the kernel
StatusCode SetUp(const std::string &routine_source);
StatusCode SetUp();
protected:
@ -84,15 +78,18 @@ class Routine {
StatusCode TestVectorY(const size_t n, const Buffer &buffer, const size_t offset,
const size_t inc, const size_t data_size);
// Copies/transposes a matrix and pads/unpads it
// Copies/transposes a matrix and pads/unpads it with zeroes. This method is also able to write
// to symmetric and triangular matrices through optional arguments.
StatusCode PadCopyTransposeMatrix(const size_t src_one, const size_t src_two,
const size_t src_ld, const size_t src_offset,
const Buffer &src,
const size_t dest_one, const size_t dest_two,
const size_t dest_ld, const size_t dest_offset,
const Buffer &dest,
const Program &program, const bool do_pad,
const bool do_transpose, const bool do_conjugate,
const bool pad, const Program &program);
const bool upper = false, const bool lower = false,
const bool diagonal_imag_zero = false);
// Queries the cache and retrieves either a matching program or a boolean indicating whether a
// match exists. The first assumes that the program is available in the cache and will throw an exception
@ -104,6 +101,10 @@ class Routine {
// a derived class.
const Precision precision_;
// The routine's name and its kernel-source in string form
const std::string routine_name_;
std::string source_string_;
// The OpenCL objects, accessible only from derived classes
CommandQueue queue_;
Event event_;
@ -118,7 +119,6 @@ class Routine {
// Connection to the database for all the device-specific parameters
const Database db_;
const std::vector<std::string> routines_;
};
// =================================================================================================
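
The net effect of the interface change above is that the program cache is now keyed on a single routine name, the device name, and the precision, rather than on a whole vector of routine names. A stand-alone sketch of that matching logic (the type names here are illustrative, not the library's own):

// Illustrative analogue of the simplified cache matching described above.
#include <string>
#include <vector>

enum class Precision { kSingle = 32, kDouble = 64 };

struct CacheEntry {
  std::string device_name;
  Precision precision;
  std::string routine_name;
  // ... the compiled program would be stored alongside these keys ...
};

bool ProgramInCache(const std::vector<CacheEntry> &cache, const std::string &device,
                    const Precision precision, const std::string &routine) {
  for (const auto &entry : cache) {
    if (entry.device_name == device && entry.precision == precision &&
        entry.routine_name == routine) { return true; }
  }
  return false;
}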

View file

@ -0,0 +1,58 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xhemm routine. It is based on the generalized matrix multiplication
// routine (Xgemm). The implementation is very similar to the Xsymm routine.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XHEMM_H_
#define CLBLAST_ROUTINES_XHEMM_H_
#include "internal/routines/level3/xgemm.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class Xhemm: public Xgemm<T> {
public:
// Uses several variables from the Routine class
using Routine::db_;
using Routine::context_;
// Uses several helper functions from the Routine class
using Routine::RunKernel;
using Routine::ErrorIn;
using Routine::TestMatrixA;
using Routine::GetProgramFromCache;
// Uses the regular Xgemm routine
using Xgemm<T>::DoGemm;
// Constructor
Xhemm(CommandQueue &queue, Event &event);
// Templated-precision implementation of the routine
StatusCode DoHemm(const Layout layout, const Side side, const Triangle triangle,
const size_t m, const size_t n,
const T alpha,
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XHEMM_H_
#endif

View file

@ -0,0 +1,48 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xher2k routine. The precision is implemented using the template argument
// 'T', whereas the real-valued beta argument is of type 'U'. The implementation is very similar to
// the Xsyr2k routine.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XHER2K_H_
#define CLBLAST_ROUTINES_XHER2K_H_
#include "internal/routine.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T, typename U>
class Xher2k: public Routine {
public:
Xher2k(CommandQueue &queue, Event &event);
// Templated-precision implementation of the routine
StatusCode DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
const size_t n, const size_t k,
const T alpha,
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
const U beta,
const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
private:
// Static variable to get the precision
const static Precision precision_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XHER2K_H_
#endif

View file

@ -0,0 +1,47 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xherk routine. The precision is implemented using the template argument
// 'T', whereas the alpha/beta arguments are of type 'U'. The implementation is very similar to the
// Xsyrk routine.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XHERK_H_
#define CLBLAST_ROUTINES_XHERK_H_
#include "internal/routine.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T, typename U>
class Xherk: public Routine {
public:
Xherk(CommandQueue &queue, Event &event);
// Templated-precision implementation of the routine
StatusCode DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
const size_t n, const size_t k,
const U alpha,
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
const U beta,
const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
private:
// Static variable to get the precision
const static Precision precision_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XHERK_H_
#endif

View file

@ -17,7 +17,7 @@
#ifndef CLBLAST_ROUTINES_XSYMM_H_
#define CLBLAST_ROUTINES_XSYMM_H_
#include "internal/routines/xgemm.h"
#include "internal/routines/level3/xgemm.h"
namespace clblast {
// =================================================================================================

View file

@ -0,0 +1,48 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xsyr2k routine. The precision is implemented using a template argument.
// The implementation is very similar to Xsyrk (see header for details), except for the fact that
// the main XgemmUpper/XgemmLower kernel is called twice: C = AB^T + C and C = BA^T + C.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XSYR2K_H_
#define CLBLAST_ROUTINES_XSYR2K_H_
#include "internal/routine.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class Xsyr2k: public Routine {
public:
Xsyr2k(CommandQueue &queue, Event &event);
// Templated-precision implementation of the routine
StatusCode DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
const size_t n, const size_t k,
const T alpha,
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
private:
// Static variable to get the precision
const static Precision precision_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XSYR2K_H_
#endif
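
One way to realize the two-call scheme described in the header comment, with alpha and beta written out explicitly, is:

$$C \leftarrow \alpha A B^{T} + \beta C, \qquad C \leftarrow \alpha B A^{T} + C, \qquad \text{so that overall } C = \alpha\,(A B^{T} + B A^{T}) + \beta C.$$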

View file

@ -0,0 +1,49 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xsyrk routine. The precision is implemented using a template argument.
// The implementation is based on the regular Xgemm routine and kernel, but with two main changes:
// 1) The final unpad(transpose) kernel updates only the upper/lower triangular part.
// 2) The main Xgemm kernel masks workgroups not contributing to useful data. This is only for
// performance reasons, as the actual masking is done later (see the first point).
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XSYRK_H_
#define CLBLAST_ROUTINES_XSYRK_H_
#include "internal/routine.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class Xsyrk: public Routine {
public:
Xsyrk(CommandQueue &queue, Event &event);
// Templated-precision implementation of the routine
StatusCode DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
const size_t n, const size_t k,
const T alpha,
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
const T beta,
const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
private:
// Static variable to get the precision
const static Precision precision_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XSYRK_H_
#endif
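
As a hypothetical host-side analogue of point 1 above (the actual device kernel is not shown in this diff), assuming column-major storage:

// Illustrative only: copy the padded result back into C, writing a single triangle.
#include <cstddef>
#include <vector>

void UnpadTriangular(const std::vector<float> &c_padded, const std::size_t n_padded,
                     std::vector<float> &c, const std::size_t c_ld, const std::size_t n,
                     const bool upper) {
  for (std::size_t j = 0; j < n; ++j) {      // column index
    for (std::size_t i = 0; i < n; ++i) {    // row index
      const bool in_triangle = upper ? (i <= j) : (i >= j);
      if (in_triangle) { c[j*c_ld + i] = c_padded[j*n_padded + i]; }
    }
  }
}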

View file

@ -0,0 +1,58 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xtrmm routine. The implementation is based on first transforming the
// upper/lower unit/non-unit triangular matrix into a regular matrix and then calling the GEMM
// routine. Therefore, this class inherits from the Xgemm class.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XTRMM_H_
#define CLBLAST_ROUTINES_XTRMM_H_
#include "internal/routines/level3/xgemm.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class Xtrmm: public Xgemm<T> {
public:
// Uses several variables from the Routine class
using Routine::db_;
using Routine::context_;
// Uses several helper functions from the Routine class
using Routine::RunKernel;
using Routine::ErrorIn;
using Routine::TestMatrixA;
using Routine::GetProgramFromCache;
// Uses the regular Xgemm routine
using Xgemm<T>::DoGemm;
// Constructor
Xtrmm(CommandQueue &queue, Event &event);
// Templated-precision implementation of the routine
StatusCode DoTrmm(const Layout layout, const Side side, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t m, const size_t n,
const T alpha,
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer &b_buffer, const size_t b_offset, const size_t b_ld);
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XTRMM_H_
#endif
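
For reference, the operation and the transformation described in the header comment can be summarized as (Side::kLeft case):

$$B \leftarrow \alpha\,\mathrm{op}(A)\,B, \qquad \mathrm{op}(A) \in \{A,\; A^{T},\; A^{H}\},$$

where, before the GEMM call, the non-referenced triangle of A is taken as zero and, for Diagonal::kUnit, its diagonal as one.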

View file

@ -46,6 +46,7 @@ constexpr auto kArgATransp = "transA";
constexpr auto kArgBTransp = "transB";
constexpr auto kArgSide = "side";
constexpr auto kArgTriangle = "triangle";
constexpr auto kArgDiagonal = "diagonal";
constexpr auto kArgXInc = "incx";
constexpr auto kArgYInc = "incy";
constexpr auto kArgXOffset = "offx";
@ -93,6 +94,7 @@ struct Arguments {
Transpose b_transpose = Transpose::kNo;
Side side = Side::kLeft;
Triangle triangle = Triangle::kUpper;
Diagonal diagonal = Diagonal::kUnit;
size_t x_inc = 1;
size_t y_inc = 1;
size_t x_offset = 0;
@ -105,6 +107,11 @@ struct Arguments {
size_t c_offset = 0;
T alpha = T{1.0};
T beta = T{1.0};
size_t x_size = 1;
size_t y_size = 1;
size_t a_size = 1;
size_t b_size = 1;
size_t c_size = 1;
// Tuner-specific arguments
double fraction = 1.0;
// Client-specific arguments
@ -123,6 +130,15 @@ struct Arguments {
bool no_abbrv = false;
};
// Structure containing all possible buffers for test clients
struct Buffers {
Buffer x_vec;
Buffer y_vec;
Buffer a_mat;
Buffer b_mat;
Buffer c_mat;
};
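
A minimal sketch of how a test client might fill in this structure from previously created cl_mem objects (the helper name is illustrative and assumes the Buffer(cl_mem) wrapper constructor used elsewhere in this commit):

// Illustrative only: wrap raw OpenCL buffers for use by a test client.
Buffers MakeBuffers(cl_mem x_mem, cl_mem y_mem, cl_mem a_mem, cl_mem b_mem, cl_mem c_mem) {
  return Buffers{Buffer(x_mem), Buffer(y_mem), Buffer(a_mem), Buffer(b_mem), Buffer(c_mem)};
}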
// =================================================================================================
// Converts a value (e.g. an integer) to a string. This also covers special cases for CLBlast

View file

@ -18,14 +18,20 @@
#include "clblast.h"
// BLAS level-1 includes
#include "internal/routines/xaxpy.h"
#include "internal/routines/level1/xaxpy.h"
// BLAS level-2 includes
#include "internal/routines/xgemv.h"
#include "internal/routines/level2/xgemv.h"
// BLAS level-3 includes
#include "internal/routines/xgemm.h"
#include "internal/routines/xsymm.h"
#include "internal/routines/level3/xgemm.h"
#include "internal/routines/level3/xsymm.h"
#include "internal/routines/level3/xhemm.h"
#include "internal/routines/level3/xsyrk.h"
#include "internal/routines/level3/xherk.h"
#include "internal/routines/level3/xsyr2k.h"
#include "internal/routines/level3/xher2k.h"
#include "internal/routines/level3/xtrmm.h"
namespace clblast {
// =================================================================================================
@ -41,10 +47,8 @@ StatusCode Axpy(const size_t n, const T alpha,
auto event_cpp = Event(*event);
auto routine = Xaxpy<T>(queue_cpp, event_cpp);
// Loads the kernel source-code as an include (C++11 raw string literal)
std::string kernel_source =
#include "kernels/xaxpy.opencl"
auto status = routine.SetUp(kernel_source);
// Compiles the routine's device kernels
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
// Runs the routine
@ -74,7 +78,7 @@ template StatusCode Axpy<double2>(const size_t, const double2,
// GEMV
template <typename T>
StatusCode Gemv(const Layout layout, const Transpose transpose_a,
StatusCode Gemv(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n, const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta,
@ -85,14 +89,12 @@ StatusCode Gemv(const Layout layout, const Transpose transpose_a,
auto event_cpp = Event(*event);
auto routine = Xgemv<T>(queue_cpp, event_cpp);
// Loads the kernel source-code as an include (C++11 raw string literal)
std::string kernel_source =
#include "kernels/xgemv.opencl"
auto status = routine.SetUp(kernel_source);
// Compiles the routine's device kernels
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
// Runs the routine
return routine.DoGemv(layout, transpose_a, m, n, alpha,
return routine.DoGemv(layout, a_transpose, m, n, alpha,
Buffer(a_buffer), a_offset, a_ld,
Buffer(x_buffer), x_offset, x_inc, beta,
Buffer(y_buffer), y_offset, y_inc);
@ -127,7 +129,7 @@ template StatusCode Gemv<double2>(const Layout, const Transpose,
// GEMM
template <typename T>
StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpose transpose_b,
StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
const size_t m, const size_t n, const size_t k, const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta,
@ -137,23 +139,12 @@ StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpos
auto event_cpp = Event(*event);
auto routine = Xgemm<T>(queue_cpp, event_cpp);
// Loads the kernel source-code as an include (C++11 raw string literal)
std::string common_source1 =
#include "kernels/copy.opencl"
std::string common_source2 =
#include "kernels/pad.opencl"
std::string common_source3 =
#include "kernels/transpose.opencl"
std::string common_source4 =
#include "kernels/padtranspose.opencl"
std::string kernel_source =
#include "kernels/xgemm.opencl"
auto status = routine.SetUp(common_source1 + common_source2 + common_source3 + common_source4 +
kernel_source);
// Compiles the routine's device kernels
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
// Runs the routine
return routine.DoGemm(layout, transpose_a, transpose_b, m, n, k, alpha,
return routine.DoGemm(layout, a_transpose, b_transpose, m, n, k, alpha,
Buffer(a_buffer), a_offset, a_ld,
Buffer(b_buffer), b_offset, b_ld, beta,
Buffer(c_buffer), c_offset, c_ld);
@ -197,19 +188,8 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
auto event_cpp = Event(*event);
auto routine = Xsymm<T>(queue_cpp, event_cpp);
// Loads the kernel source-code as an include (C++11 raw string literal)
std::string common_source1 =
#include "kernels/copy.opencl"
std::string common_source2 =
#include "kernels/pad.opencl"
std::string common_source3 =
#include "kernels/transpose.opencl"
std::string common_source4 =
#include "kernels/padtranspose.opencl"
std::string kernel_source =
#include "kernels/xgemm.opencl"
auto status = routine.SetUp(common_source1 + common_source2 + common_source3 + common_source4 +
kernel_source);
// Compiles the routine's device kernels
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
// Runs the routine
@ -244,4 +224,302 @@ template StatusCode Symm<double2>(const Layout, const Side, const Triangle,
cl_command_queue*, cl_event*);
// =================================================================================================
// HEMM
template <typename T>
StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle,
const size_t m, const size_t n, const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = CommandQueue(*queue);
auto event_cpp = Event(*event);
auto routine = Xhemm<T>(queue_cpp, event_cpp);
// Compiles the routine's device kernels
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
// Runs the routine
return routine.DoHemm(layout, side, triangle, m, n, alpha,
Buffer(a_buffer), a_offset, a_ld,
Buffer(b_buffer), b_offset, b_ld, beta,
Buffer(c_buffer), c_offset, c_ld);
}
template StatusCode Hemm<float2>(const Layout, const Side, const Triangle,
const size_t, const size_t, const float2,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t, const float2,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Hemm<double2>(const Layout, const Side, const Triangle,
const size_t, const size_t, const double2,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t, const double2,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// =================================================================================================
// SYRK
template <typename T>
StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
const size_t n, const size_t k, const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = CommandQueue(*queue);
auto event_cpp = Event(*event);
auto routine = Xsyrk<T>(queue_cpp, event_cpp);
// Compiles the routine's device kernels
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
// Runs the routine
return routine.DoSyrk(layout, triangle, a_transpose, n, k, alpha,
Buffer(a_buffer), a_offset, a_ld, beta,
Buffer(c_buffer), c_offset, c_ld);
}
template StatusCode Syrk<float>(const Layout, const Triangle, const Transpose,
const size_t, const size_t, const float,
const cl_mem, const size_t, const size_t, const float,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Syrk<double>(const Layout, const Triangle, const Transpose,
const size_t, const size_t, const double,
const cl_mem, const size_t, const size_t, const double,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Syrk<float2>(const Layout, const Triangle, const Transpose,
const size_t, const size_t, const float2,
const cl_mem, const size_t, const size_t, const float2,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Syrk<double2>(const Layout, const Triangle, const Transpose,
const size_t, const size_t, const double2,
const cl_mem, const size_t, const size_t, const double2,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// =================================================================================================
// HERK
template <typename T>
StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
const size_t n, const size_t k, const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = CommandQueue(*queue);
auto event_cpp = Event(*event);
auto routine = Xherk<std::complex<T>,T>(queue_cpp, event_cpp);
// Compiles the routine's device kernels
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
// Runs the routine
return routine.DoHerk(layout, triangle, a_transpose, n, k, alpha,
Buffer(a_buffer), a_offset, a_ld, beta,
Buffer(c_buffer), c_offset, c_ld);
}
template StatusCode Herk<float>(const Layout, const Triangle, const Transpose,
const size_t, const size_t, const float,
const cl_mem, const size_t, const size_t, const float,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Herk<double>(const Layout, const Triangle, const Transpose,
const size_t, const size_t, const double,
const cl_mem, const size_t, const size_t, const double,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// =================================================================================================
// SYR2K
template <typename T>
StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
const size_t n, const size_t k, const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = CommandQueue(*queue);
auto event_cpp = Event(*event);
auto routine = Xsyr2k<T>(queue_cpp, event_cpp);
// Compiles the routine's device kernels
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
// Runs the routine
return routine.DoSyr2k(layout, triangle, ab_transpose, n, k, alpha,
Buffer(a_buffer), a_offset, a_ld,
Buffer(b_buffer), b_offset, b_ld, beta,
Buffer(c_buffer), c_offset, c_ld);
}
template StatusCode Syr2k<float>(const Layout, const Triangle, const Transpose,
const size_t, const size_t, const float,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t, const float,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Syr2k<double>(const Layout, const Triangle, const Transpose,
const size_t, const size_t, const double,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t, const double,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Syr2k<float2>(const Layout, const Triangle, const Transpose,
const size_t, const size_t, const float2,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t, const float2,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Syr2k<double2>(const Layout, const Triangle, const Transpose,
const size_t, const size_t, const double2,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t, const double2,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// =================================================================================================
// HER2K
template <typename T, typename U>
StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
const size_t n, const size_t k, const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const U beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = CommandQueue(*queue);
auto event_cpp = Event(*event);
auto routine = Xher2k<T,U>(queue_cpp, event_cpp);
// Compiles the routine's device kernels
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
// Runs the routine
return routine.DoHer2k(layout, triangle, ab_transpose, n, k, alpha,
Buffer(a_buffer), a_offset, a_ld,
Buffer(b_buffer), b_offset, b_ld, beta,
Buffer(c_buffer), c_offset, c_ld);
}
template StatusCode Her2k<float2,float>(const Layout, const Triangle, const Transpose,
const size_t, const size_t, const float2,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t, const float,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Her2k<double2,double>(const Layout, const Triangle, const Transpose,
const size_t, const size_t, const double2,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t, const double,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// =================================================================================================
// TRMM
template <typename T>
StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t m, const size_t n,
const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = CommandQueue(*queue);
auto event_cpp = Event(*event);
auto routine = Xtrmm<T>(queue_cpp, event_cpp);
// Compiles the routine's device kernels
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
// Runs the routine
return routine.DoTrmm(layout, side, triangle, a_transpose, diagonal, m, n, alpha,
Buffer(a_buffer), a_offset, a_ld,
Buffer(b_buffer), b_offset, b_ld);
}
template StatusCode Trmm<float>(const Layout, const Side, const Triangle,
const Transpose, const Diagonal,
const size_t, const size_t, const float,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Trmm<double>(const Layout, const Side, const Triangle,
const Transpose, const Diagonal,
const size_t, const size_t, const double,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Trmm<float2>(const Layout, const Side, const Triangle,
const Transpose, const Diagonal,
const size_t, const size_t, const float2,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Trmm<double2>(const Layout, const Side, const Triangle,
const Transpose, const Diagonal,
const size_t, const size_t, const double2,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// =================================================================================================
// TRSM
/*
template <typename T>
StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t m, const size_t n,
const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = CommandQueue(*queue);
auto event_cpp = Event(*event);
auto routine = Xtrsm<T>(queue_cpp, event_cpp);
// Compiles the routine's device kernels
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
// Runs the routine
return routine.DoTrsm(layout, side, triangle, a_transpose, diagonal, m, n, alpha,
Buffer(a_buffer), a_offset, a_ld,
Buffer(b_buffer), b_offset, b_ld);
}
template StatusCode Trsm<float>(const Layout, const Side, const Triangle,
const Transpose, const Diagonal,
const size_t, const size_t, const float,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Trsm<double>(const Layout, const Side, const Triangle,
const Transpose, const Diagonal,
const size_t, const size_t, const double,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Trsm<float2>(const Layout, const Side, const Triangle,
const Transpose, const Diagonal,
const size_t, const size_t, const float2,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode Trsm<double2>(const Layout, const Side, const Triangle,
const Transpose, const Diagonal,
const size_t, const size_t, const double2,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
*/
// =================================================================================================
} // namespace clblast

View file

@ -39,6 +39,7 @@ R"(
typedef float8 real8;
typedef float16 real16;
#define ZERO 0.0f
#define ONE 1.0f
// Double-precision
#elif PRECISION == 64
@ -48,6 +49,7 @@ R"(
typedef double8 real8;
typedef double16 real16;
#define ZERO 0.0
#define ONE 1.0
// Complex single-precision
#elif PRECISION == 3232
@ -61,6 +63,7 @@ R"(
real s8; real s9; real sA; real sB;
real sC; real sD; real sE; real sF;} real16;
#define ZERO 0.0f
#define ONE 1.0f
// Complex Double-precision
#elif PRECISION == 6464
@ -74,12 +77,16 @@ R"(
real s8; real s9; real sA; real sB;
real sC; real sD; real sE; real sF;} real16;
#define ZERO 0.0
#define ONE 1.0
#endif
// =================================================================================================
// Don't use the non-IEEE754 compliant OpenCL built-in mad() instruction
#define USE_CL_MAD 0
// Don't use the non-IEEE754 compliant OpenCL built-in mad() instruction by default. For specific
// devices, this is enabled (see src/routine.cc).
#ifndef USE_CL_MAD
#define USE_CL_MAD 0
#endif
// Sets a variable to zero
#if PRECISION == 3232 || PRECISION == 6464
@ -88,6 +95,20 @@ R"(
#define SetToZero(a) a = ZERO
#endif
// Sets a variable to zero (only the imaginary part)
#if PRECISION == 3232 || PRECISION == 6464
#define ImagToZero(a) a.y = ZERO
#else
#define ImagToZero(a)
#endif
// Sets a variable to one
#if PRECISION == 3232 || PRECISION == 6464
#define SetToOne(a) a.x = ONE; a.y = ZERO
#else
#define SetToOne(a) a = ONE
#endif
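// Worked example of the helpers above: given 'real v;', the following expansions apply.
//   SetToOne(v);    // real precisions:    v = ONE;
//                   // complex precisions: v.x = ONE; v.y = ZERO;
//   ImagToZero(v);  // real precisions:    (no-op)
//                   // complex precisions: v.y = ZERO;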
// Multiply two complex variables (used in the define below)
#if PRECISION == 3232 || PRECISION == 6464
#define MulReal(a, b) a.x*b.x - a.y*b.y
@ -122,6 +143,6 @@ R"(
// =================================================================================================
// End of the C++11 raw string literal
)";
)"
// =================================================================================================

View file

@ -68,6 +68,6 @@ __kernel void CopyMatrix(const int ld,
// =================================================================================================
// End of the C++11 raw string literal
)";
)"
// =================================================================================================

View file

@ -86,7 +86,9 @@ __kernel void UnPadMatrix(const int src_one, const int src_two,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest) {
__global real* dest,
const int upper, const int lower,
const int diagonal_imag_zero) {
// Loops over the work per thread in both dimensions
#pragma unroll
@ -95,11 +97,20 @@ __kernel void UnPadMatrix(const int src_one, const int src_two,
#pragma unroll
for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
if (id_two < dest_two && id_one < dest_one) {
// Masking in case of triangular matrices: updates only the upper or lower part
bool condition = true;
if (upper == 1) { condition = (id_two >= id_one); }
else if (lower == 1) { condition = (id_two <= id_one); }
if (condition) {
// Copies the value into the destination matrix. This is always within bounds of the source
// matrix, as we know that the destination matrix is smaller than the source.
dest[id_two*dest_ld + id_one + dest_offset] = src[id_two*src_ld + id_one + src_offset];
if (id_two < dest_two && id_one < dest_one) {
real value = src[id_two*src_ld + id_one + src_offset];
if (diagonal_imag_zero == 1 && id_one == id_two) { ImagToZero(value); }
dest[id_two*dest_ld + id_one + dest_offset] = value;
}
}
}
}
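// Worked example of the masking above for a 3x3 destination with 'upper == 1': only the elements
// with id_two >= id_one are written, i.e. the (id_one, id_two) pairs (0,0), (0,1), (0,2), (1,1),
// (1,2) and (2,2); the remaining part of the destination is left untouched. When
// 'diagonal_imag_zero == 1', the imaginary part of the diagonal elements (0,0), (1,1) and (2,2)
// is additionally forced to zero (used, for example, when the output of a HERK-type routine must
// have a real diagonal).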
@ -127,15 +138,15 @@ __kernel void SymmLowerToSquared(const int src_dim,
if (id_two < dest_dim && id_one < dest_dim) {
// Loads data from the lower-symmetric matrix
real value;
SetToZero(value);
real result;
SetToZero(result);
if (id_two < src_dim && id_one < src_dim) {
if (id_two <= id_one) { value = src[id_two*src_ld + id_one + src_offset]; }
else { value = src[id_one*src_ld + id_two + src_offset]; }
if (id_two <= id_one) { result = src[id_two*src_ld + id_one + src_offset]; }
else { result = src[id_one*src_ld + id_two + src_offset]; }
}
// Stores the value in the destination matrix
dest[id_two*dest_ld + id_one + dest_offset] = value;
// Stores the result in the destination matrix
dest[id_two*dest_ld + id_one + dest_offset] = result;
}
}
}
@ -160,15 +171,171 @@ __kernel void SymmUpperToSquared(const int src_dim,
if (id_two < dest_dim && id_one < dest_dim) {
// Loads data from the upper-symmetric matrix
real value;
SetToZero(value);
real result;
SetToZero(result);
if (id_two < src_dim && id_one < src_dim) {
if (id_one <= id_two) { value = src[id_two*src_ld + id_one + src_offset]; }
else { value = src[id_one*src_ld + id_two + src_offset]; }
if (id_one <= id_two) { result = src[id_two*src_ld + id_one + src_offset]; }
else { result = src[id_one*src_ld + id_two + src_offset]; }
}
// Stores the value in the destination matrix
dest[id_two*dest_ld + id_one + dest_offset] = value;
// Stores the result in the destination matrix
dest[id_two*dest_ld + id_one + dest_offset] = result;
}
}
}
}
// =================================================================================================
#if PRECISION == 3232 || PRECISION == 6464
// Kernel to populate a squared hermitian matrix, given that the triangle which holds the data is
// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
__kernel void HermLowerToSquared(const int src_dim,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_dim,
const int dest_ld, const int dest_offset,
__global real* dest) {
// Loops over the work per thread in both dimensions
#pragma unroll
for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
#pragma unroll
for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
if (id_two < dest_dim && id_one < dest_dim) {
// Loads data from the lower-hermitian matrix
real result;
SetToZero(result);
if (id_two < src_dim && id_one < src_dim) {
if (id_two <= id_one) {
result = src[id_two*src_ld + id_one + src_offset];
if (id_one == id_two) { result.y = ZERO; }
}
else {
result = src[id_one*src_ld + id_two + src_offset];
COMPLEX_CONJUGATE(result);
}
}
// Stores the result in the destination matrix
dest[id_two*dest_ld + id_one + dest_offset] = result;
}
}
}
}
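// Worked 2x2 example of the kernel above: if the source's lower triangle holds the complex
// values d0 and d1 on the diagonal and b at (id_one, id_two) == (1, 0), then the destination
// receives d0 and d1 with their imaginary parts zeroed on the diagonal, b at (1, 0), and the
// complex conjugate of b at (0, 1). The upper triangle is thus reconstructed by conjugating the
// mirrored lower-triangle element.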
// Same as above, but now the matrix's data is stored in the upper-triangle
__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
__kernel void HermUpperToSquared(const int src_dim,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_dim,
const int dest_ld, const int dest_offset,
__global real* dest) {
// Loops over the work per thread in both dimensions
#pragma unroll
for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
#pragma unroll
for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
if (id_two < dest_dim && id_one < dest_dim) {
// Loads data from the upper-hermitian matrix
real result;
SetToZero(result);
if (id_two < src_dim && id_one < src_dim) {
if (id_one <= id_two) {
result = src[id_two*src_ld + id_one + src_offset];
if (id_one == id_two) { result.y = ZERO; }
}
else {
result = src[id_one*src_ld + id_two + src_offset];
COMPLEX_CONJUGATE(result);
}
}
// Stores the result in the destination matrix
dest[id_two*dest_ld + id_one + dest_offset] = result;
}
}
}
}
#endif
// =================================================================================================
// Kernel to populate a squared triangular matrix, given that the triangle which holds the data is
// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
__kernel void TrmmLowerToSquared(const int src_dim,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_dim,
const int dest_ld, const int dest_offset,
__global real* dest,
const int unit_diagonal) {
// Loops over the work per thread in both dimensions
#pragma unroll
for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
#pragma unroll
for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
if (id_two < dest_dim && id_one < dest_dim) {
// Loads data from the lower-triangular matrix
real result;
SetToZero(result);
if (id_two < src_dim && id_one < src_dim) {
if (id_two <= id_one) { result = src[id_two*src_ld + id_one + src_offset]; }
if (id_two == id_one && unit_diagonal) { SetToOne(result); }
// Else: result is zero
}
// Stores the result in the destination matrix
dest[id_two*dest_ld + id_one + dest_offset] = result;
}
}
}
}
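// Worked example of the kernel above: for a 3x3 lower-triangular source, the destination keeps
// the lower triangle, zeros the strictly-upper part, and, when 'unit_diagonal == 1' (the
// unit-diagonal case of TRMM), overwrites the diagonal with ONE regardless of what is stored
// there, so the source's diagonal entries need not be initialised.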
// Same as above, but now the matrix's data is stored in the upper-triangle
__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
__kernel void TrmmUpperToSquared(const int src_dim,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_dim,
const int dest_ld, const int dest_offset,
__global real* dest,
const int unit_diagonal) {
// Loops over the work per thread in both dimensions
#pragma unroll
for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
#pragma unroll
for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
if (id_two < dest_dim && id_one < dest_dim) {
// Loads data from the upper-triangular matrix
real result;
SetToZero(result);
if (id_two < src_dim && id_one < src_dim) {
if (id_one <= id_two) { result = src[id_two*src_ld + id_one + src_offset]; }
if (id_one == id_two && unit_diagonal) { SetToOne(result); }
// Else: result is zero
}
// Stores the result in the destination matrix
dest[id_two*dest_ld + id_one + dest_offset] = result;
}
}
}
@ -177,6 +344,6 @@ __kernel void SymmUpperToSquared(const int src_dim,
// =================================================================================================
// End of the C++11 raw string literal
)";
)"
// =================================================================================================

View file

@ -100,7 +100,9 @@ __kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest) {
__global real* dest,
const int upper, const int lower,
const int diagonal_imag_zero) {
// Local memory to store a tile of the matrix (for coalescing)
__local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
@ -137,10 +139,18 @@ __kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
const int id_dest_one = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(0);
const int id_dest_two = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(1);
// Stores the transposed value in the destination matrix
if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
// Masking in case of triangular matrices: updates only the upper or lower part
bool condition = true;
if (upper == 1) { condition = (id_dest_one >= id_dest_two); }
else if (lower == 1) { condition = (id_dest_one <= id_dest_two); }
if (condition) {
// Stores the transposed value in the destination matrix
if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
if (diagonal_imag_zero == 1 && id_dest_one == id_dest_two) { ImagToZero(value); }
dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
}
}
}
}
@ -149,6 +159,6 @@ __kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
// =================================================================================================
// End of the C++11 raw string literal
)";
)"
// =================================================================================================

View file

@ -20,13 +20,16 @@ R"(
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
// this kernel file is used outside of the CLBlast library.
#ifndef TRA_DIM
#define TRA_DIM 8 // Number of local threads in the two dimensions (x,y)
#define TRA_DIM 8 // Number of local threads in the two dimensions (x,y)
#endif
#ifndef TRA_WPT
#define TRA_WPT 1 // Work per thread in one dimension and vector-width in the other
#define TRA_WPT 1 // Work per thread in one dimension and vector-width in the other
#endif
#ifndef TRA_PAD
#define TRA_PAD 0 // Padding of the local memory to avoid bank-conflicts
#define TRA_PAD 0 // Padding of the local memory to avoid bank-conflicts
#endif
#ifndef TRA_SHUFFLE
#define TRA_SHUFFLE 0 // Shuffling of the global indices to avoid global memory bank-conflicts
#endif
// =================================================================================================
@ -53,116 +56,94 @@ __kernel void TransposeMatrix(const int ld,
__global const realT* restrict src,
__global realT* dest) {
// Local memory to store a tile of the matrix (for coalescing)
__local real tile[TRA_WPT*TRA_DIM][TRA_WPT*TRA_DIM + TRA_PAD];
// Sets the group identifiers. They might be 'shuffled' around to distribute work in a different
// way over workgroups, breaking memory-bank dependencies.
const int gid0 = get_group_id(0);
#if TRA_SHUFFLE == 1
const int gid1 = (get_group_id(0) + get_group_id(1)) % get_num_groups(0);
#else
const int gid1 = get_group_id(1);
#endif
// Loop over the work per thread
// Local memory to store a tile of the matrix (for coalescing)
__local realT tile[TRA_WPT*TRA_DIM][TRA_DIM + TRA_PAD];
// Loops over the work per thread
#pragma unroll
for (int w_one=0; w_one<TRA_WPT; ++w_one) {
// Computes the identifiers for the source matrix. Note that the local and global dimensions
// do not correspond to each other!
const int id_one = get_group_id(1) * TRA_DIM + get_local_id(0);
const int id_two = (get_group_id(0) * TRA_DIM + get_local_id(1))*TRA_WPT + w_one;
const int id_one = gid1 * TRA_DIM + get_local_id(0);
const int id_two = (gid0 * TRA_DIM + get_local_id(1))*TRA_WPT + w_one;
// Loads data into the local memory
realT value = src[id_two*(ld/TRA_WPT) + id_one];
#if TRA_WPT == 1
tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = value;
#elif TRA_WPT == 2
tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = value.x;
tile[get_local_id(1)*TRA_WPT + 1][get_local_id(0)*TRA_WPT + w_one] = value.y;
#elif TRA_WPT == 4
tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = value.x;
tile[get_local_id(1)*TRA_WPT + 1][get_local_id(0)*TRA_WPT + w_one] = value.y;
tile[get_local_id(1)*TRA_WPT + 2][get_local_id(0)*TRA_WPT + w_one] = value.z;
tile[get_local_id(1)*TRA_WPT + 3][get_local_id(0)*TRA_WPT + w_one] = value.w;
#elif TRA_WPT == 8
tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = value.s0;
tile[get_local_id(1)*TRA_WPT + 1][get_local_id(0)*TRA_WPT + w_one] = value.s1;
tile[get_local_id(1)*TRA_WPT + 2][get_local_id(0)*TRA_WPT + w_one] = value.s2;
tile[get_local_id(1)*TRA_WPT + 3][get_local_id(0)*TRA_WPT + w_one] = value.s3;
tile[get_local_id(1)*TRA_WPT + 4][get_local_id(0)*TRA_WPT + w_one] = value.s4;
tile[get_local_id(1)*TRA_WPT + 5][get_local_id(0)*TRA_WPT + w_one] = value.s5;
tile[get_local_id(1)*TRA_WPT + 6][get_local_id(0)*TRA_WPT + w_one] = value.s6;
tile[get_local_id(1)*TRA_WPT + 7][get_local_id(0)*TRA_WPT + w_one] = value.s7;
#elif TRA_WPT == 16
tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = value.s0;
tile[get_local_id(1)*TRA_WPT + 1][get_local_id(0)*TRA_WPT + w_one] = value.s1;
tile[get_local_id(1)*TRA_WPT + 2][get_local_id(0)*TRA_WPT + w_one] = value.s2;
tile[get_local_id(1)*TRA_WPT + 3][get_local_id(0)*TRA_WPT + w_one] = value.s3;
tile[get_local_id(1)*TRA_WPT + 4][get_local_id(0)*TRA_WPT + w_one] = value.s4;
tile[get_local_id(1)*TRA_WPT + 5][get_local_id(0)*TRA_WPT + w_one] = value.s5;
tile[get_local_id(1)*TRA_WPT + 6][get_local_id(0)*TRA_WPT + w_one] = value.s6;
tile[get_local_id(1)*TRA_WPT + 7][get_local_id(0)*TRA_WPT + w_one] = value.s7;
tile[get_local_id(1)*TRA_WPT + 8][get_local_id(0)*TRA_WPT + w_one] = value.s8;
tile[get_local_id(1)*TRA_WPT + 9][get_local_id(0)*TRA_WPT + w_one] = value.s9;
tile[get_local_id(1)*TRA_WPT + 10][get_local_id(0)*TRA_WPT + w_one] = value.sA;
tile[get_local_id(1)*TRA_WPT + 11][get_local_id(0)*TRA_WPT + w_one] = value.sB;
tile[get_local_id(1)*TRA_WPT + 12][get_local_id(0)*TRA_WPT + w_one] = value.sC;
tile[get_local_id(1)*TRA_WPT + 13][get_local_id(0)*TRA_WPT + w_one] = value.sD;
tile[get_local_id(1)*TRA_WPT + 14][get_local_id(0)*TRA_WPT + w_one] = value.sE;
tile[get_local_id(1)*TRA_WPT + 15][get_local_id(0)*TRA_WPT + w_one] = value.sF;
#endif
tile[get_local_id(0)*TRA_WPT + w_one][get_local_id(1)] = value;
}
// Synchronizes all threads in a workgroup
barrier(CLK_LOCAL_MEM_FENCE);
// Loop over the work per thread
// Loads transposed data from the local memory
realT v[TRA_WPT];
#pragma unroll
for (int w_one=0; w_one<TRA_WPT; ++w_one) {
v[w_one] = tile[get_local_id(1)*TRA_WPT + w_one][get_local_id(0)];
}
// Performs the register-level transpose of the vectorized data
realT results[TRA_WPT];
#if TRA_WPT == 1
results[0] = v[0];
#elif TRA_WPT == 2
results[0] = (realT) (v[0].x, v[1].x);
results[1] = (realT) (v[0].y, v[1].y);
#elif TRA_WPT == 4
results[0] = (realT) (v[0].x, v[1].x, v[2].x, v[3].x);
results[1] = (realT) (v[0].y, v[1].y, v[2].y, v[3].y);
results[2] = (realT) (v[0].z, v[1].z, v[2].z, v[3].z);
results[3] = (realT) (v[0].w, v[1].w, v[2].w, v[3].w);
#elif TRA_WPT == 8
results[0] = (realT) (v[0].s0, v[1].s0, v[2].s0, v[3].s0, v[4].s0, v[5].s0, v[6].s0, v[7].s0);
results[1] = (realT) (v[0].s1, v[1].s1, v[2].s1, v[3].s1, v[4].s1, v[5].s1, v[6].s1, v[7].s1);
results[2] = (realT) (v[0].s2, v[1].s2, v[2].s2, v[3].s2, v[4].s2, v[5].s2, v[6].s2, v[7].s2);
results[3] = (realT) (v[0].s3, v[1].s3, v[2].s3, v[3].s3, v[4].s3, v[5].s3, v[6].s3, v[7].s3);
results[4] = (realT) (v[0].s4, v[1].s4, v[2].s4, v[3].s4, v[4].s4, v[5].s4, v[6].s4, v[7].s4);
results[5] = (realT) (v[0].s5, v[1].s5, v[2].s5, v[3].s5, v[4].s5, v[5].s5, v[6].s5, v[7].s5);
results[6] = (realT) (v[0].s6, v[1].s6, v[2].s6, v[3].s6, v[4].s6, v[5].s6, v[6].s6, v[7].s6);
results[7] = (realT) (v[0].s7, v[1].s7, v[2].s7, v[3].s7, v[4].s7, v[5].s7, v[6].s7, v[7].s7);
#elif TRA_WPT == 16
results[ 0] = (realT) (v[0].s0, v[1].s0, v[2].s0, v[3].s0, v[4].s0, v[5].s0, v[6].s0, v[7].s0, v[8].s0, v[9].s0, v[10].s0, v[11].s0, v[12].s0, v[13].s0, v[14].s0, v[15].s0);
results[ 1] = (realT) (v[0].s1, v[1].s1, v[2].s1, v[3].s1, v[4].s1, v[5].s1, v[6].s1, v[7].s1, v[8].s1, v[9].s1, v[10].s1, v[11].s1, v[12].s1, v[13].s1, v[14].s1, v[15].s1);
results[ 2] = (realT) (v[0].s2, v[1].s2, v[2].s2, v[3].s2, v[4].s2, v[5].s2, v[6].s2, v[7].s2, v[8].s2, v[9].s2, v[10].s2, v[11].s2, v[12].s2, v[13].s2, v[14].s2, v[15].s2);
results[ 3] = (realT) (v[0].s3, v[1].s3, v[2].s3, v[3].s3, v[4].s3, v[5].s3, v[6].s3, v[7].s3, v[8].s3, v[9].s3, v[10].s3, v[11].s3, v[12].s3, v[13].s3, v[14].s3, v[15].s3);
results[ 4] = (realT) (v[0].s4, v[1].s4, v[2].s4, v[3].s4, v[4].s4, v[5].s4, v[6].s4, v[7].s4, v[8].s4, v[9].s4, v[10].s4, v[11].s4, v[12].s4, v[13].s4, v[14].s4, v[15].s4);
results[ 5] = (realT) (v[0].s5, v[1].s5, v[2].s5, v[3].s5, v[4].s5, v[5].s5, v[6].s5, v[7].s5, v[8].s5, v[9].s5, v[10].s5, v[11].s5, v[12].s5, v[13].s5, v[14].s5, v[15].s5);
results[ 6] = (realT) (v[0].s6, v[1].s6, v[2].s6, v[3].s6, v[4].s6, v[5].s6, v[6].s6, v[7].s6, v[8].s6, v[9].s6, v[10].s6, v[11].s6, v[12].s6, v[13].s6, v[14].s6, v[15].s6);
results[ 7] = (realT) (v[0].s7, v[1].s7, v[2].s7, v[3].s7, v[4].s7, v[5].s7, v[6].s7, v[7].s7, v[8].s7, v[9].s7, v[10].s7, v[11].s7, v[12].s7, v[13].s7, v[14].s7, v[15].s7);
results[ 8] = (realT) (v[0].s8, v[1].s8, v[2].s8, v[3].s8, v[4].s8, v[5].s8, v[6].s8, v[7].s8, v[8].s8, v[9].s8, v[10].s8, v[11].s8, v[12].s8, v[13].s8, v[14].s8, v[15].s8);
results[ 9] = (realT) (v[0].s9, v[1].s9, v[2].s9, v[3].s9, v[4].s9, v[5].s9, v[6].s9, v[7].s9, v[8].s9, v[9].s9, v[10].s9, v[11].s9, v[12].s9, v[13].s9, v[14].s9, v[15].s9);
results[10] = (realT) (v[0].sA, v[1].sA, v[2].sA, v[3].sA, v[4].sA, v[5].sA, v[6].sA, v[7].sA, v[8].sA, v[9].sA, v[10].sA, v[11].sA, v[12].sA, v[13].sA, v[14].sA, v[15].sA);
results[11] = (realT) (v[0].sB, v[1].sB, v[2].sB, v[3].sB, v[4].sB, v[5].sB, v[6].sB, v[7].sB, v[8].sB, v[9].sB, v[10].sB, v[11].sB, v[12].sB, v[13].sB, v[14].sB, v[15].sB);
results[12] = (realT) (v[0].sC, v[1].sC, v[2].sC, v[3].sC, v[4].sC, v[5].sC, v[6].sC, v[7].sC, v[8].sC, v[9].sC, v[10].sC, v[11].sC, v[12].sC, v[13].sC, v[14].sC, v[15].sC);
results[13] = (realT) (v[0].sD, v[1].sD, v[2].sD, v[3].sD, v[4].sD, v[5].sD, v[6].sD, v[7].sD, v[8].sD, v[9].sD, v[10].sD, v[11].sD, v[12].sD, v[13].sD, v[14].sD, v[15].sD);
results[14] = (realT) (v[0].sE, v[1].sE, v[2].sE, v[3].sE, v[4].sE, v[5].sE, v[6].sE, v[7].sE, v[8].sE, v[9].sE, v[10].sE, v[11].sE, v[12].sE, v[13].sE, v[14].sE, v[15].sE);
results[15] = (realT) (v[0].sF, v[1].sF, v[2].sF, v[3].sF, v[4].sF, v[5].sF, v[6].sF, v[7].sF, v[8].sF, v[9].sF, v[10].sF, v[11].sF, v[12].sF, v[13].sF, v[14].sF, v[15].sF);
#endif
// Stores the results into the destination matrix
#pragma unroll
for (int w_two=0; w_two<TRA_WPT; ++w_two) {
// Computes the identifiers for the destination matrix
const int id_one = get_global_id(0);
const int id_two = get_global_id(1)*TRA_WPT + w_two;
// Stores the transposed value in the destination matrix
realT value;
#if TRA_WPT == 1
value = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
#elif TRA_WPT == 2
value.x = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
value.y = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 1];
#elif TRA_WPT == 4
value.x = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
value.y = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 1];
value.z = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 2];
value.w = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 3];
#elif TRA_WPT == 8
value.s0 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
value.s1 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 1];
value.s2 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 2];
value.s3 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 3];
value.s4 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 4];
value.s5 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 5];
value.s6 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 6];
value.s7 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 7];
#elif TRA_WPT == 16
value.s0 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
value.s1 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 1];
value.s2 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 2];
value.s3 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 3];
value.s4 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 4];
value.s5 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 5];
value.s6 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 6];
value.s7 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 7];
value.s8 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 8];
value.s9 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 9];
value.sA = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 10];
value.sB = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 11];
value.sC = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 12];
value.sD = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 13];
value.sE = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 14];
value.sF = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 15];
#endif
dest[id_two*(ld/TRA_WPT) + id_one] = value;
const int id_one = gid0*TRA_DIM + get_local_id(0);
const int id_two = (gid1*TRA_DIM + get_local_id(1))*TRA_WPT + w_two;
dest[id_two*(ld/TRA_WPT) + id_one] = results[w_two];
}
}
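// Worked example of the register-level transpose above for TRA_WPT == 2: if a thread loads
// v[0] = (a, b) and v[1] = (c, d) from the local tile, the x components of the two inputs form
// results[0] = (a, c) and the y components form results[1] = (b, d). Combined with the
// local-memory tile, this yields a full transpose while both the loads and the stores remain
// vectorised and coalesced.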
// =================================================================================================
// End of the C++11 raw string literal
)";
)"
// =================================================================================================

View file

@ -123,6 +123,6 @@ __kernel void XaxpyFast(const int n, const real alpha,
// =================================================================================================
// End of the C++11 raw string literal
)";
)"
// =================================================================================================

View file

@ -127,6 +127,55 @@ R"(
// =================================================================================================
// Initializes the accumulation registers to zero
inline void InitAccRegisters(realM cpm[NWI][MWI/VWM]) {
#pragma unroll
for (int mi=0; mi<MWI/VWM; ++mi) {
#pragma unroll
for (int ni=0; ni<NWI; ++ni) {
#if VWM == 1
SetToZero(cpm[ni][mi]);
#elif VWM == 2
SetToZero(cpm[ni][mi].x);
SetToZero(cpm[ni][mi].y);
#elif VWM == 4
SetToZero(cpm[ni][mi].x);
SetToZero(cpm[ni][mi].y);
SetToZero(cpm[ni][mi].z);
SetToZero(cpm[ni][mi].w);
#elif VWM == 8
SetToZero(cpm[ni][mi].s0);
SetToZero(cpm[ni][mi].s1);
SetToZero(cpm[ni][mi].s2);
SetToZero(cpm[ni][mi].s3);
SetToZero(cpm[ni][mi].s4);
SetToZero(cpm[ni][mi].s5);
SetToZero(cpm[ni][mi].s6);
SetToZero(cpm[ni][mi].s7);
#elif VWM == 16
SetToZero(cpm[ni][mi].s0);
SetToZero(cpm[ni][mi].s1);
SetToZero(cpm[ni][mi].s2);
SetToZero(cpm[ni][mi].s3);
SetToZero(cpm[ni][mi].s4);
SetToZero(cpm[ni][mi].s5);
SetToZero(cpm[ni][mi].s6);
SetToZero(cpm[ni][mi].s7);
SetToZero(cpm[ni][mi].s8);
SetToZero(cpm[ni][mi].s9);
SetToZero(cpm[ni][mi].sA);
SetToZero(cpm[ni][mi].sB);
SetToZero(cpm[ni][mi].sC);
SetToZero(cpm[ni][mi].sD);
SetToZero(cpm[ni][mi].sE);
SetToZero(cpm[ni][mi].sF);
#endif
}
}
}
// =================================================================================================
// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
// caching the A input matrix.
#if SA == 1
@ -272,71 +321,6 @@ inline void LocalToPrivateB(__local realN* blm, realN bpm[NWI/VWN], const int kg
// =================================================================================================
// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication
// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int kSizeM,
const real alpha, const real beta) {
#pragma unroll
for (int ni=0; ni<NWI; ++ni) {
#pragma unroll
for (int mi=0; mi<MWI/VWM; ++mi) {
#if STRM == 0
int mg = mi + get_local_id(0)*(MWI/VWM);
#elif STRM == 1
int mg = get_local_id(0) + mi*MDIMC;
#endif
#if STRN == 0
int ng = ni + get_local_id(1)*NWI;
#elif STRN == 1
int ng = ni%VWN + get_local_id(1)*VWN + (ni/VWN)*VWN*NDIMC;
#endif
int idm = mg + get_group_id(0)*(MWG/VWM);
int idn = ng + get_group_id(1)*NWG;
int index = idn*(kSizeM/VWM) + idm;
realM cval = cgm[index];
#if VWM == 1
AXPBY(cgm[index], alpha, cpm[ni][mi], beta, cval);
#elif VWM == 2
AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);
AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
#elif VWM == 4
AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);
AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
AXPBY(cgm[index].z, alpha, cpm[ni][mi].z, beta, cval.z);
AXPBY(cgm[index].w, alpha, cpm[ni][mi].w, beta, cval.w);
#elif VWM == 8
AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
#elif VWM == 16
AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
AXPBY(cgm[index].s8, alpha, cpm[ni][mi].s8, beta, cval.s8);
AXPBY(cgm[index].s9, alpha, cpm[ni][mi].s9, beta, cval.s9);
AXPBY(cgm[index].sA, alpha, cpm[ni][mi].sA, beta, cval.sA);
AXPBY(cgm[index].sB, alpha, cpm[ni][mi].sB, beta, cval.sB);
AXPBY(cgm[index].sC, alpha, cpm[ni][mi].sC, beta, cval.sC);
AXPBY(cgm[index].sD, alpha, cpm[ni][mi].sD, beta, cval.sD);
AXPBY(cgm[index].sE, alpha, cpm[ni][mi].sE, beta, cval.sE);
AXPBY(cgm[index].sF, alpha, cpm[ni][mi].sF, beta, cval.sF);
#endif
}
}
}
// =================================================================================================
// The vectorised multiply-add function
inline realM MultiplyAddVector(realM cvec, const realM avec, const real bval) {
#if USE_VECTOR_MAD == 1
@ -432,77 +416,97 @@ inline void MultiplyAccumulate(realM cpm[NWI][MWI/VWM], realM apm[MWI/VWM], real
// =================================================================================================
// Main entry of the kernel. This function contains the basic skeleton; the functionality is
// provided by the inlined functions above
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
const real alpha, const real beta,
const __global realM* restrict agm,
const __global realN* restrict bgm,
__global realM* cgm) {
// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication
// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int kSizeM,
const real alpha, const real beta) {
#pragma unroll
for (int ni=0; ni<NWI; ++ni) {
#pragma unroll
for (int mi=0; mi<MWI/VWM; ++mi) {
#if STRM == 0
int mg = mi + get_local_id(0)*(MWI/VWM);
#elif STRM == 1
int mg = get_local_id(0) + mi*MDIMC;
#endif
#if STRN == 0
int ng = ni + get_local_id(1)*NWI;
#elif STRN == 1
int ng = ni%VWN + get_local_id(1)*VWN + (ni/VWN)*VWN*NDIMC;
#endif
int idm = mg + get_group_id(0)*(MWG/VWM);
int idn = ng + get_group_id(1)*NWG;
// Combined thread identifier
// The final multiplication with alpha and the addition with beta*C
int index = idn*(kSizeM/VWM) + idm;
realM cval = cgm[index];
#if VWM == 1
AXPBY(cgm[index], alpha, cpm[ni][mi], beta, cval);
#elif VWM == 2
AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);
AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
#elif VWM == 4
AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);
AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
AXPBY(cgm[index].z, alpha, cpm[ni][mi].z, beta, cval.z);
AXPBY(cgm[index].w, alpha, cpm[ni][mi].w, beta, cval.w);
#elif VWM == 8
AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
#elif VWM == 16
AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
AXPBY(cgm[index].s8, alpha, cpm[ni][mi].s8, beta, cval.s8);
AXPBY(cgm[index].s9, alpha, cpm[ni][mi].s9, beta, cval.s9);
AXPBY(cgm[index].sA, alpha, cpm[ni][mi].sA, beta, cval.sA);
AXPBY(cgm[index].sB, alpha, cpm[ni][mi].sB, beta, cval.sB);
AXPBY(cgm[index].sC, alpha, cpm[ni][mi].sC, beta, cval.sC);
AXPBY(cgm[index].sD, alpha, cpm[ni][mi].sD, beta, cval.sD);
AXPBY(cgm[index].sE, alpha, cpm[ni][mi].sE, beta, cval.sE);
AXPBY(cgm[index].sF, alpha, cpm[ni][mi].sF, beta, cval.sF);
#endif
}
}
}
// =================================================================================================
// Main body of the matrix-multiplication algorithm. It calls the (inlined) functions above.
inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
const __global realM* restrict agm, const __global realN* restrict bgm,
__global realM* cgm, realM cpm[NWI][MWI/VWM]
#if SA == 1 && SB == 1
, __local realM* alm, __local realN* blm
#elif SA == 1
, __local realM* alm
#elif SB == 1
, __local realN* blm
#endif
) {
// Allocates workitem-private memory (registers)
realM apm[MWI/VWM];
realN bpm[NWI/VWN];
// Combined thread identifier (volatile to disable caching)
#if SA == 1 || SB == 1
volatile int tid = get_local_id(0) + MDIMC*get_local_id(1);
#endif
// Allocates workgroup-private memory (local memory)
#if SA == 1
__local realM alm[KWG * MWG/VWM];
#endif
#if SB == 1
__local realN blm[KWG * NWG/VWN];
#endif
// Allocates workitem-private memory (registers)
realM apm[MWI/VWM];
realN bpm[NWI/VWN];
realM cpm[NWI][MWI/VWM];
// Initializes the accumulation registers
#pragma unroll
for (int mi=0; mi<MWI/VWM; ++mi) {
#pragma unroll
for (int ni=0; ni<NWI; ++ni) {
#if VWM == 1
SetToZero(cpm[ni][mi]);
#elif VWM == 2
SetToZero(cpm[ni][mi].x);
SetToZero(cpm[ni][mi].y);
#elif VWM == 4
SetToZero(cpm[ni][mi].x);
SetToZero(cpm[ni][mi].y);
SetToZero(cpm[ni][mi].z);
SetToZero(cpm[ni][mi].w);
#elif VWM == 8
SetToZero(cpm[ni][mi].s0);
SetToZero(cpm[ni][mi].s1);
SetToZero(cpm[ni][mi].s2);
SetToZero(cpm[ni][mi].s3);
SetToZero(cpm[ni][mi].s4);
SetToZero(cpm[ni][mi].s5);
SetToZero(cpm[ni][mi].s6);
SetToZero(cpm[ni][mi].s7);
#elif VWM == 16
SetToZero(cpm[ni][mi].s0);
SetToZero(cpm[ni][mi].s1);
SetToZero(cpm[ni][mi].s2);
SetToZero(cpm[ni][mi].s3);
SetToZero(cpm[ni][mi].s4);
SetToZero(cpm[ni][mi].s5);
SetToZero(cpm[ni][mi].s6);
SetToZero(cpm[ni][mi].s7);
SetToZero(cpm[ni][mi].s8);
SetToZero(cpm[ni][mi].s9);
SetToZero(cpm[ni][mi].sA);
SetToZero(cpm[ni][mi].sB);
SetToZero(cpm[ni][mi].sC);
SetToZero(cpm[ni][mi].sD);
SetToZero(cpm[ni][mi].sE);
SetToZero(cpm[ni][mi].sF);
#endif
}
}
InitAccRegisters(cpm);
// Loops over all workgroup tiles
for (int kwg=0; kwg<kSizeK; kwg+=KWG) {
@ -515,8 +519,6 @@ __kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
#if SB == 1
GlobalToLocalB(bgm, blm, kSizeN, tid, kwg);
#endif
// Synchronizes all threads in a workgroup
#if SA == 1 || SB == 1
barrier(CLK_LOCAL_MEM_FENCE);
#endif
@ -552,20 +554,130 @@ __kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
MultiplyAccumulate(cpm, apm, bpm);
}
}
// Synchronizes all threads in a workgroup
#if SA == 1 || SB == 1
barrier(CLK_LOCAL_MEM_FENCE);
#endif
}
// Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
StoreResults(cgm, cpm, kSizeM, alpha, beta);
}
// =================================================================================================
// The upper-triangular and lower-triangular kernels are only used in special cases
#if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K)
// End of the C++11 raw string literal
)";
// Main entry point of the kernel. This is the upper-triangular version.
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
__kernel void XgemmUpper(const int kSizeN, const int kSizeK,
const real alpha, const real beta,
const __global realM* restrict agm,
const __global realN* restrict bgm,
__global realM* cgm) {
// Skip this workgroup if it does not contain threads contributing to the upper-triangle
if (get_group_id(1)*NWG < get_group_id(0)*MWG) {
return;
}
// Allocates workgroup-private memory (local memory)
#if SA == 1
__local realM alm[KWG * MWG/VWM];
#endif
#if SB == 1
__local realN blm[KWG * NWG/VWN];
#endif
// Computes the matrix-multiplication and stores the result in register memory
realM cpm[NWI][MWI/VWM];
#if SA == 1 && SB == 1
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
#elif SA == 1
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
#elif SB == 1
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
#else
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
#endif
// Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
StoreResults(cgm, cpm, kSizeN, alpha, beta);
}
// Main entry point of the kernel. This is the lower-triangular version.
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
__kernel void XgemmLower(const int kSizeN, const int kSizeK,
const real alpha, const real beta,
const __global realM* restrict agm,
const __global realN* restrict bgm,
__global realM* cgm) {
// Skip this workgroup if it does not contain threads contributing to the lower-triangle
if (get_group_id(1)*NWG > get_group_id(0)*MWG) {
return;
}
// Allocates workgroup-private memory (local memory)
#if SA == 1
__local realM alm[KWG * MWG/VWM];
#endif
#if SB == 1
__local realN blm[KWG * NWG/VWN];
#endif
// Computes the matrix-multiplication and stores the result in register memory
realM cpm[NWI][MWI/VWM];
#if SA == 1 && SB == 1
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
#elif SA == 1
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
#elif SB == 1
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
#else
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
#endif
// Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
StoreResults(cgm, cpm, kSizeN, alpha, beta);
}
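// Worked example of the workgroup-level masking above, assuming square tiles with
// MWG == NWG == 64 (illustrative values): workgroup (get_group_id(0), get_group_id(1)) == (2, 1)
// covers a tile starting at row 128 and column 64, which lies entirely below the diagonal, so
// XgemmUpper returns immediately (1*64 < 2*64) while XgemmLower computes it. Diagonal workgroups
// such as (1, 1) are computed by either kernel; the exact per-element triangle is masked later
// by the unpad kernels' 'upper'/'lower' arguments (see pad.opencl above).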
// =================================================================================================
// If not using a triangular version, include the regular kernel
#else
// Main entry point of the kernel. This is the regular full version.
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
const real alpha, const real beta,
const __global realM* restrict agm,
const __global realN* restrict bgm,
__global realM* cgm) {
// Allocates workgroup-private memory (local memory)
#if SA == 1
__local realM alm[KWG * MWG/VWM];
#endif
#if SB == 1
__local realN blm[KWG * NWG/VWN];
#endif
// Computes the matrix-multiplication and stores the result in register memory
realM cpm[NWI][MWI/VWM];
#if SA == 1 && SB == 1
XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
#elif SA == 1
XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
#elif SB == 1
XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
#else
XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm);
#endif
// Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
StoreResults(cgm, cpm, kSizeM, alpha, beta);
}
#endif
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View file

@ -368,6 +368,6 @@ __kernel void XgemvFastRot(const int m, const int n, const real alpha, const rea
// =================================================================================================
// End of the C++11 raw string literal
)";
)"
// =================================================================================================

View file

@ -22,9 +22,10 @@ namespace clblast {
std::vector<Routine::ProgramCache> Routine::program_cache_;
// Constructor: not much here, because no status codes can be returned
Routine::Routine(CommandQueue &queue, Event &event,
Routine::Routine(CommandQueue &queue, Event &event, const std::string &name,
const std::vector<std::string> &routines, const Precision precision):
precision_(precision),
routine_name_(name),
queue_(queue),
event_(event),
context_(queue_.GetContext()),
@ -33,14 +34,13 @@ Routine::Routine(CommandQueue &queue, Event &event,
max_work_item_dimensions_(device_.MaxWorkItemDimensions()),
max_work_item_sizes_(device_.MaxWorkItemSizes()),
max_work_group_size_(device_.MaxWorkGroupSize()),
db_(queue_, routines, precision_),
routines_(routines) {
db_(queue_, routines, precision_) {
}
// =================================================================================================
// Separate set-up function to allow for status codes to be returned
StatusCode Routine::SetUp(const std::string &routine_source) {
StatusCode Routine::SetUp() {
// Queries the cache to see whether or not the compiled kernel is already there. If not, it will
// be built and added to the cache.
@ -63,12 +63,24 @@ StatusCode Routine::SetUp(const std::string &routine_source) {
// Loads the common header (typedefs and defines and such)
std::string common_header =
#include "kernels/common.opencl"
#include "kernels/common.opencl"
;
// Collects the parameters for this device in the form of defines, and adds the precision
auto defines = db_.GetDefines();
defines += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";
auto source_string = defines + common_header + routine_source;
// Adds the name of the routine as a define
defines += "#define ROUTINE_"+routine_name_+"\n";
// For specific devices, use the non-IEEE754 compliant OpenCL mad() instruction. This can improve
// performance, but might result in reduced accuracy.
if (device_.Vendor() == "AMD") {
defines += "#define USE_CL_MAD 1\n";
}
// Combines everything together into a single source string
auto source_string = defines + common_header + source_string_;
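// As an illustration (values are hypothetical), for a single-precision GEMM routine on an AMD
// device the assembled 'source_string' would start roughly with the tuning parameters from the
// database (e.g. "#define MWG 64"), followed by
//   #define PRECISION 32
//   #define ROUTINE_GEMM
//   #define USE_CL_MAD 1
// and then the contents of common.opencl and the routine's own kernel files.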
// Compiles the kernel
try {
@ -85,7 +97,7 @@ StatusCode Routine::SetUp(const std::string &routine_source) {
if (status == CL_INVALID_BINARY) { return StatusCode::kInvalidBinary; }
// Store the compiled program in the cache
program_cache_.push_back({program, device_name_, precision_, routines_});
program_cache_.push_back({program, device_name_, precision_, routine_name_});
} catch (...) { return StatusCode::kBuildProgramFailure; }
}
@ -202,19 +214,22 @@ StatusCode Routine::TestVectorY(const size_t n, const Buffer &buffer, const size
// =================================================================================================
// Copies a matrix and pads it with zeros
// Copies or transposes a matrix and pads/unpads it with zeros
StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t src_two,
const size_t src_ld, const size_t src_offset,
const Buffer &src,
const size_t dest_one, const size_t dest_two,
const size_t dest_ld, const size_t dest_offset,
const Buffer &dest,
const Program &program, const bool do_pad,
const bool do_transpose, const bool do_conjugate,
const bool pad, const Program &program) {
const bool upper, const bool lower,
const bool diagonal_imag_zero) {
// Determines whether or not the fast-version could potentially be used
auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) &&
(src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld);
(src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld) &&
(upper == false) && (lower == false) && (diagonal_imag_zero == false);
// Determines the right kernel
auto kernel_name = std::string{};
@ -227,7 +242,7 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
}
else {
use_fast_kernel = false;
kernel_name = (pad) ? "PadTransposeMatrix" : "UnPadTransposeMatrix";
kernel_name = (do_pad) ? "PadTransposeMatrix" : "UnPadTransposeMatrix";
}
}
else {
@ -239,7 +254,7 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
}
else {
use_fast_kernel = false;
kernel_name = (pad) ? "PadMatrix" : "UnPadMatrix";
kernel_name = (do_pad) ? "PadMatrix" : "UnPadMatrix";
}
}
@ -264,9 +279,14 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
kernel.SetArgument(7, static_cast<int>(dest_ld));
kernel.SetArgument(8, static_cast<int>(dest_offset));
kernel.SetArgument(9, dest());
if (pad) {
if (do_pad) {
kernel.SetArgument(10, static_cast<int>(do_conjugate));
}
else {
kernel.SetArgument(10, static_cast<int>(upper));
kernel.SetArgument(11, static_cast<int>(lower));
kernel.SetArgument(12, static_cast<int>(diagonal_imag_zero));
}
}
// Launches the kernel and returns the error code. Uses global and local thread sizes based on
@ -310,7 +330,7 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
// otherwise.
const Program& Routine::GetProgramFromCache() const {
for (auto &cached_program: program_cache_) {
if (cached_program.MatchInCache(device_name_, precision_, routines_)) {
if (cached_program.MatchInCache(device_name_, precision_, routine_name_)) {
return cached_program.program;
}
}
@ -320,7 +340,7 @@ const Program& Routine::GetProgramFromCache() const {
// Queries the cache to see whether or not the compiled kernel is already there
bool Routine::ProgramIsInCache() const {
for (auto &cached_program: program_cache_) {
if (cached_program.MatchInCache(device_name_, precision_, routines_)) { return true; }
if (cached_program.MatchInCache(device_name_, precision_, routine_name_)) { return true; }
}
return false;
}

View file

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "internal/routines/xaxpy.h"
#include "internal/routines/level1/xaxpy.h"
#include <string>
#include <vector>
@ -30,7 +30,10 @@ template <> const Precision Xaxpy<double2>::precision_ = Precision::kComplexDoub
// Constructor: forwards to base class constructor
template <typename T>
Xaxpy<T>::Xaxpy(CommandQueue &queue, Event &event):
Routine(queue, event, {"Xaxpy"}, precision_) {
Routine(queue, event, "AXPY", {"Xaxpy"}, precision_) {
source_string_ =
#include "../../kernels/xaxpy.opencl"
;
}
// =================================================================================================

View file

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "internal/routines/xgemv.h"
#include "internal/routines/level2/xgemv.h"
#include <string>
#include <vector>
@ -30,7 +30,10 @@ template <> const Precision Xgemv<double2>::precision_ = Precision::kComplexDoub
// Constructor: forwards to base class constructor
template <typename T>
Xgemv<T>::Xgemv(CommandQueue &queue, Event &event):
Routine(queue, event, {"Xgemv"}, precision_) {
Routine(queue, event, "GEMV", {"Xgemv"}, precision_) {
source_string_ =
#include "../../kernels/xgemv.opencl"
;
}
// =================================================================================================

View file

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "internal/routines/xgemm.h"
#include "internal/routines/level3/xgemm.h"
#include <string>
#include <vector>
@ -30,7 +30,14 @@ template <> const Precision Xgemm<double2>::precision_ = Precision::kComplexDoub
// Constructor: forwards to base class constructor
template <typename T>
Xgemm<T>::Xgemm(CommandQueue &queue, Event &event):
Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) {
Routine(queue, event, "GEMM", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
source_string_ =
#include "../../kernels/copy.opencl"
#include "../../kernels/pad.opencl"
#include "../../kernels/transpose.opencl"
#include "../../kernels/padtranspose.opencl"
#include "../../kernels/xgemm.opencl"
;
}
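// Note on the include pattern above: each .opencl file starts with 'R"(' and now ends with ')"'
// without a semicolon, so consecutive #include directives expand to adjacent raw string
// literals, which the compiler concatenates into a single string. A minimal sketch of the
// mechanism, using hypothetical file names:
//
//   // part1.opencl contains:  R"( __kernel void KernelA() {} )"
//   // part2.opencl contains:  R"( __kernel void KernelB() {} )"
//   std::string source =
//   #include "part1.opencl"
//   #include "part2.opencl"
//   ;  // 'source' now holds the text of both kernels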
// =================================================================================================
@ -95,31 +102,48 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
auto n_ceiled = Ceil(n, db_["NWG"]);
auto k_ceiled = Ceil(k, db_["KWG"]);
// Allocates space on the device for padded and/or transposed input and output matrices.
// The padded/transposed input/output matrices: if memory allocation fails, throw an exception
try {
auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*m_ceiled*sizeof(T));
auto temp_b = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, m_ceiled*n_ceiled*sizeof(T));
// Loads the program from the database
auto& program = GetProgramFromCache();
// Runs the pre-processing kernels. This transposes the matrices, but also pads zeros to fill
// them up until they reach a certain multiple of size (kernel parameter dependent).
status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
m_ceiled, k_ceiled, m_ceiled, 0, temp_a,
a_do_transpose, a_conjugate, true, program);
if (ErrorIn(status)) { return status; }
status = PadCopyTransposeMatrix(b_one, b_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, temp_b,
b_do_transpose, b_conjugate, true, program);
if (ErrorIn(status)) { return status; }
// Determines whether or not temporary matrices are needed
auto a_no_temp = a_one == m_ceiled && a_two == k_ceiled && a_ld == m_ceiled && a_offset == 0 &&
a_do_transpose == false && a_conjugate == false;
auto b_no_temp = b_one == n_ceiled && b_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
b_do_transpose == false && b_conjugate == false;
auto c_no_temp = c_one == m_ceiled && c_two == n_ceiled && c_ld == m_ceiled && c_offset == 0 &&
c_do_transpose == false;
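// Example of the bypass above (illustrative case): for a column-major, non-transposed GEMM in
// which m, n and k are already multiples of MWG, NWG and KWG, the offsets are zero, and the
// leading dimensions equal the ceiled sizes, all three *_no_temp flags are true, so no temporary
// buffers are created and the pre/post-processing kernels are skipped entirely.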
// Only necessary for matrix C if it used both as input and output
if (beta != static_cast<T>(0)) {
// Creates the temporary matrices
auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*m_ceiled*sizeof(T));
auto b_temp = (b_no_temp) ? b_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
auto c_temp = (c_no_temp) ? c_buffer : Buffer(context_, CL_MEM_READ_WRITE, m_ceiled*n_ceiled*sizeof(T));
// Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
// to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
// case nothing has to be done, these kernels can be skipped.
if (!a_no_temp) {
status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
m_ceiled, k_ceiled, m_ceiled, 0, a_temp,
program, true, a_do_transpose, a_conjugate);
if (ErrorIn(status)) { return status; }
}
// As above, but now for matrix B
if (!b_no_temp) {
status = PadCopyTransposeMatrix(b_one, b_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
program, true, b_do_transpose, b_conjugate);
if (ErrorIn(status)) { return status; }
}
// As above, but now for matrix C. This is only necessary if C is used both as input and output.
if (!c_no_temp && beta != static_cast<T>(0)) {
status = PadCopyTransposeMatrix(c_one, c_two, c_ld, c_offset, c_buffer,
m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
c_do_transpose, false, true, program);
m_ceiled, n_ceiled, m_ceiled, 0, c_temp,
program, true, c_do_transpose, false);
if (ErrorIn(status)) { return status; }
}
@@ -133,9 +157,9 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
kernel.SetArgument(2, static_cast<int>(k_ceiled));
kernel.SetArgument(3, alpha);
kernel.SetArgument(4, beta);
kernel.SetArgument(5, temp_a());
kernel.SetArgument(6, temp_b());
kernel.SetArgument(7, temp_c());
kernel.SetArgument(5, a_temp());
kernel.SetArgument(6, b_temp());
kernel.SetArgument(7, c_temp());
// Computes the global and local thread sizes
auto global = std::vector<size_t>{
@@ -148,11 +172,13 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
status = RunKernel(kernel, global, local);
if (ErrorIn(status)) { return status; }
// Runs the post-processing kernel
status = PadCopyTransposeMatrix(m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
c_one, c_two, c_ld, c_offset, c_buffer,
c_do_transpose, false, false, program);
if (ErrorIn(status)) { return status; }
// Runs the post-processing kernel if needed
if (!c_no_temp) {
status = PadCopyTransposeMatrix(m_ceiled, n_ceiled, m_ceiled, 0, c_temp,
c_one, c_two, c_ld, c_offset, c_buffer,
program, false, c_do_transpose, false);
if (ErrorIn(status)) { return status; }
}
// Successfully finished the computation
return StatusCode::kSuccess;
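
A minimal stand-alone sketch of how the ceiled dimensions and the new a_no_temp test above fit together. The Ceil/CeilDiv helpers are re-implemented here with the round-up semantics the routine appears to rely on, and the MWG/KWG values are invented for the example; the real values come from the per-device tuning database.

#include <cstddef>
#include <cstdio>

// Assumed semantics of the CLBlast helpers: round x up to the next multiple of y.
static std::size_t CeilDiv(std::size_t x, std::size_t y) { return (x + y - 1) / y; }
static std::size_t Ceil(std::size_t x, std::size_t y) { return CeilDiv(x, y) * y; }

int main() {
  const std::size_t MWG = 64, KWG = 16;  // illustrative tuning parameters

  // A 100x30 column-major matrix A with ld == 100, no offset, no transpose, no conjugation.
  const std::size_t a_one = 100, a_two = 30, a_ld = 100, a_offset = 0;
  const bool a_do_transpose = false, a_conjugate = false;

  const std::size_t m_ceiled = Ceil(a_one, MWG);  // 128: rounded up to a multiple of MWG
  const std::size_t k_ceiled = Ceil(a_two, KWG);  //  32: rounded up to a multiple of KWG

  // Mirrors the a_no_temp test in DoGemm: the pre-processing kernel and its temporary
  // buffer can be skipped only if A already has the padded shape and layout.
  const bool a_no_temp = a_one == m_ceiled && a_two == k_ceiled && a_ld == m_ceiled &&
                         a_offset == 0 && !a_do_transpose && !a_conjugate;

  std::printf("m_ceiled=%zu k_ceiled=%zu a_no_temp=%d\n", m_ceiled, k_ceiled, a_no_temp);
  return 0;  // prints: m_ceiled=128 k_ceiled=32 a_no_temp=0
}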

View file

@@ -0,0 +1,130 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xhemm class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level3/xhemm.h"
#include <string>
#include <vector>
namespace clblast {
// =================================================================================================
// Constructor: forwards to base class constructor
template <typename T>
Xhemm<T>::Xhemm(CommandQueue &queue, Event &event):
Xgemm<T>(queue, event) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle triangle,
const size_t m, const size_t n,
const T alpha,
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; }
// Computes the k dimension. This is based on whether or not the hermitian matrix is A (on the
// left) or B (on the right) in the Xgemm routine.
auto k = (side == Side::kLeft) ? m : n;
// Checks for validity of the squared A matrix
auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T));
if (ErrorIn(status)) { return status; }
// Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
// default) and on whether we are dealing with an upper or lower triangle of the hermitian matrix
bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
(triangle == Triangle::kLower && layout == Layout::kRowMajor));
auto kernel_name = (is_upper) ? "HermUpperToSquared" : "HermLowerToSquared";
// Temporary buffer for a copy of the hermitian matrix
try {
auto temp_herm = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T));
// Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm
// routine afterwards
try {
auto& program = GetProgramFromCache();
auto kernel = Kernel(program, kernel_name);
// Sets the arguments for the hermitian-to-squared kernel
kernel.SetArgument(0, static_cast<int>(k));
kernel.SetArgument(1, static_cast<int>(a_ld));
kernel.SetArgument(2, static_cast<int>(a_offset));
kernel.SetArgument(3, a_buffer());
kernel.SetArgument(4, static_cast<int>(k));
kernel.SetArgument(5, static_cast<int>(k));
kernel.SetArgument(6, static_cast<int>(0));
kernel.SetArgument(7, temp_herm());
// Uses the common padding kernel's thread configuration. This is allowed, since the
// hermitian-to-squared kernel uses the same parameters.
auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
status = RunKernel(kernel, global, local);
if (ErrorIn(status)) { return status; }
// Runs the regular Xgemm code with either "C := AB+C" or ...
if (side == Side::kLeft) {
status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
m, n, k,
alpha,
temp_herm, 0, k,
b_buffer, b_offset, b_ld,
beta,
c_buffer, c_offset, c_ld);
}
// ... with "C := BA+C". Note that A and B are now reversed.
else {
status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
m, n, k,
alpha,
b_buffer, b_offset, b_ld,
temp_herm, 0, k,
beta,
c_buffer, c_offset, c_ld);
// A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
switch(status) {
case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break;
case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break;
case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break;
case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break;
case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break;
case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break;
}
}
// Return the status of the Xgemm routine
return status;
} catch (...) { return StatusCode::kInvalidKernel; }
} catch (...) { return StatusCode::kTempBufferAllocFailure; }
}
// =================================================================================================
// Compiles the templated class
template class Xhemm<float2>;
template class Xhemm<double2>;
// =================================================================================================
} // namespace clblast
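
A rough host-side illustration of what the HermLowerToSquared step is meant to produce before DoGemm runs: the stored lower triangle of a k x k column-major Hermitian matrix is expanded into a full general matrix by conjugate mirroring. The real kernel operates on OpenCL buffers and handles leading dimensions and offsets; treating the diagonal as real is the usual HEMM convention and an assumption here.

#include <complex>
#include <cstddef>
#include <cstdio>
#include <vector>

// Expands the lower triangle of a k x k column-major Hermitian matrix into a full matrix:
// upper elements become the conjugates of their mirrored lower elements.
std::vector<std::complex<float>> HermLowerToSquared(const std::vector<std::complex<float>> &a,
                                                    std::size_t k) {
  std::vector<std::complex<float>> full(k * k);
  for (std::size_t j = 0; j < k; ++j) {
    for (std::size_t i = 0; i < k; ++i) {
      if (i > j)        full[i + j * k] = a[i + j * k];                 // stored lower part
      else if (i == j)  full[i + j * k] = {a[i + j * k].real(), 0.0f};  // diagonal assumed real
      else              full[i + j * k] = std::conj(a[j + i * k]);      // mirrored upper part
    }
  }
  return full;
}

int main() {
  const std::size_t k = 2;
  // Column-major 2x2 input; only the lower triangle (and the diagonal's real part) matters.
  std::vector<std::complex<float>> a = {{1.0f, 9.0f}, {2.0f, 3.0f},   // column 0
                                        {0.0f, 0.0f}, {4.0f, 0.0f}};  // column 1
  auto full = HermLowerToSquared(a, k);
  for (std::size_t i = 0; i < k; ++i)
    std::printf("(%g,%g) (%g,%g)\n", full[i].real(), full[i].imag(),
                full[i + k].real(), full[i + k].imag());
  return 0;  // prints: (1,0) (2,-3)  then  (2,3) (4,0)
}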

View file

@@ -0,0 +1,207 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xher2k class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level3/xher2k.h"
#include <string>
#include <vector>
namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xher2k<float2,float>::precision_ = Precision::kComplexSingle;
template <> const Precision Xher2k<double2,double>::precision_ = Precision::kComplexDouble;
// =================================================================================================
// Constructor: forwards to base class constructor
template <typename T, typename U>
Xher2k<T,U>::Xher2k(CommandQueue &queue, Event &event):
Routine(queue, event, "HER2K", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
source_string_ =
#include "../../kernels/copy.opencl"
#include "../../kernels/pad.opencl"
#include "../../kernels/transpose.opencl"
#include "../../kernels/padtranspose.opencl"
#include "../../kernels/xgemm.opencl"
;
}
// =================================================================================================
// The main routine
template <typename T, typename U>
StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
const size_t n, const size_t k,
const T alpha,
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
const U beta,
const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
// Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or
// to matrix A (argument: conjugate transpose)
auto ab_conjugate = (ab_transpose != Transpose::kNo);
// Computes whether or not the matrices are transposed in memory. This is based on their layout
// (row or column-major) and whether or not they are requested to be pre-transposed.
auto ab_rotated = (layout == Layout::kColMajor && ab_conjugate) ||
(layout == Layout::kRowMajor && !ab_conjugate);
auto c_rotated = (layout == Layout::kRowMajor);
// Computes the first and second dimensions of the A and B matrices taking the layout into account
auto ab_one = (ab_rotated) ? k : n;
auto ab_two = (ab_rotated) ? n : k;
// Tests the matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and
// their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
// OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
// space. Also tests that the leading dimensions of:
// matrix A cannot be less than N when rotated, or less than K when not-rotated
// matrix B cannot be less than N when rotated, or less than K when not-rotated
// matrix C cannot be less than N
auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld, sizeof(T));
if (ErrorIn(status)) { return status; }
status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld, sizeof(T));
if (ErrorIn(status)) { return status; }
status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
if (ErrorIn(status)) { return status; }
// Calculates the ceiled versions of n and k
auto n_ceiled = Ceil(n, db_["NWG"]);
auto k_ceiled = Ceil(k, db_["KWG"]);
// Decides which kernel to run: the upper-triangular or lower-triangular version
auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
// The padded/transposed input/output matrices: if memory allocation fails, throw an exception
try {
// Loads the program from the database
auto& program = GetProgramFromCache();
// Determines whether or not temporary matrices are needed
auto a1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
ab_rotated == false && ab_conjugate == false;
auto a2_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
ab_rotated == false && ab_conjugate == true;
auto b1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
ab_rotated == false && ab_conjugate == false;
auto b2_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
ab_rotated == false && ab_conjugate == true;
// Creates the temporary matrices
auto a1_temp = (a1_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
auto a2_temp = (a2_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
auto b1_temp = (b1_no_temp) ? b_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
auto b2_temp = (b2_no_temp) ? b_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
auto c_temp = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
// Runs the pre-processing kernels. This transposes the matrices A and B, but also pads them with
// zeros to fill them up until they reach a certain multiple of size (kernel parameter dependent). In
// case nothing has to be done, these kernels can be skipped.
if (!a1_no_temp) {
status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a1_temp,
program, true, ab_rotated, ab_conjugate);
if (ErrorIn(status)) { return status; }
}
if (!a2_no_temp) {
status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a2_temp,
program, true, ab_rotated, !ab_conjugate);
if (ErrorIn(status)) { return status; }
}
if (!b1_no_temp) {
status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b1_temp,
program, true, ab_rotated, ab_conjugate);
if (ErrorIn(status)) { return status; }
}
if (!b2_no_temp) {
status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b2_temp,
program, true, ab_rotated, !ab_conjugate);
if (ErrorIn(status)) { return status; }
}
// Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
// modify the other triangle.
status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
program, true, c_rotated, false);
if (ErrorIn(status)) { return status; }
// Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
try {
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
auto complex_beta = T{beta, static_cast<U>(0.0)};
kernel.SetArgument(0, static_cast<int>(n_ceiled));
kernel.SetArgument(1, static_cast<int>(k_ceiled));
kernel.SetArgument(2, alpha);
kernel.SetArgument(3, complex_beta);
kernel.SetArgument(4, a1_temp());
kernel.SetArgument(5, b2_temp());
kernel.SetArgument(6, c_temp());
// Computes the global and local thread sizes
auto global = std::vector<size_t>{
(n_ceiled * db_["MDIMC"]) / db_["MWG"],
(n_ceiled * db_["NDIMC"]) / db_["NWG"]
};
auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
// Launches the kernel
status = RunKernel(kernel, global, local);
if (ErrorIn(status)) { return status; }
// Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha
auto conjugate_alpha = T{alpha.real(), -alpha.imag()};
auto complex_one = T{static_cast<U>(1.0), static_cast<U>(0.0)};
kernel.SetArgument(2, conjugate_alpha);
kernel.SetArgument(3, complex_one);
kernel.SetArgument(4, b1_temp());
kernel.SetArgument(5, a2_temp());
// Runs the kernel again
status = RunKernel(kernel, global, local);
if (ErrorIn(status)) { return status; }
// Runs the post-processing kernel
auto upper = (triangle == Triangle::kUpper);
auto lower = (triangle == Triangle::kLower);
status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
program, false, c_rotated, false, upper, lower, true);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
} catch (...) { return StatusCode::kTempBufferAllocFailure; }
}
// =================================================================================================
// Compiles the templated class
template class Xher2k<float2,float>;
template class Xher2k<double2,double>;
// =================================================================================================
} // namespace clblast
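
Since the routine above realises HER2K as two launches of the Xgemm kernel (the second with the A/B operands swapped, alpha conjugated and beta fixed to one), a scalar reference of the intended end result can help when reading it: C := alpha*A*B^H + conj(alpha)*B*A^H + beta*C on the stored triangle only, with a real diagonal. This is plain textbook HER2K semantics for the no-transpose, column-major, lower-triangle case, not CLBlast code.

#include <complex>
#include <cstddef>
#include <vector>

// Scalar reference for CHER2K/ZHER2K, no-transpose case, column-major, lower triangle stored.
// C (n x n) := alpha*A*B^H + conj(alpha)*B*A^H + beta*C, with A and B of size n x k.
void Her2kLowerReference(std::size_t n, std::size_t k,
                         std::complex<double> alpha, double beta,
                         const std::vector<std::complex<double>> &a,
                         const std::vector<std::complex<double>> &b,
                         std::vector<std::complex<double>> &c) {
  for (std::size_t j = 0; j < n; ++j) {
    for (std::size_t i = j; i < n; ++i) {  // only the lower triangle of C is touched
      std::complex<double> sum = 0.0;
      for (std::size_t l = 0; l < k; ++l) {
        sum += alpha * a[i + l * n] * std::conj(b[j + l * n]) +
               std::conj(alpha) * b[i + l * n] * std::conj(a[j + l * n]);
      }
      c[i + j * n] = beta * c[i + j * n] + sum;
      if (i == j) c[i + j * n].imag(0.0);  // the diagonal of a Hermitian result is real
    }
  }
}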

View file

@@ -0,0 +1,175 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xherk class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level3/xherk.h"
#include <string>
#include <vector>
namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xherk<float2,float>::precision_ = Precision::kComplexSingle;
template <> const Precision Xherk<double2,double>::precision_ = Precision::kComplexDouble;
// =================================================================================================
// Constructor: forwards to base class constructor
template <typename T, typename U>
Xherk<T,U>::Xherk(CommandQueue &queue, Event &event):
Routine(queue, event, "HERK", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
source_string_ =
#include "../../kernels/copy.opencl"
#include "../../kernels/pad.opencl"
#include "../../kernels/transpose.opencl"
#include "../../kernels/padtranspose.opencl"
#include "../../kernels/xgemm.opencl"
;
}
// =================================================================================================
// The main routine
template <typename T, typename U>
StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
const size_t n, const size_t k,
const U alpha,
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
const U beta,
const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
// Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or
// to matrix A (argument: conjugate transpose)
auto a_conjugate = (a_transpose != Transpose::kNo);
auto b_conjugate = (a_transpose == Transpose::kNo);
// Computes whether or not the matrices are transposed in memory. This is based on their layout
// (row or column-major) and whether or not they are requested to be pre-transposed.
auto a_rotated = (layout == Layout::kColMajor && a_conjugate) ||
(layout == Layout::kRowMajor && !a_conjugate);
auto c_rotated = (layout == Layout::kRowMajor);
// Computes the first and second dimensions of the A matrix taking the layout into account
auto a_one = (a_rotated) ? k : n;
auto a_two = (a_rotated) ? n : k;
// Tests the two matrices (A, C) for validity, first from a perspective of the OpenCL buffers and
// their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
// OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
// space. Also tests that the leading dimensions of:
// matrix A cannot be less than N when rotated, or less than K when not-rotated
// matrix C cannot be less than N
auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
if (ErrorIn(status)) { return status; }
status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
if (ErrorIn(status)) { return status; }
// Calculates the ceiled versions of n and k
auto n_ceiled = Ceil(n, db_["NWG"]);
auto k_ceiled = Ceil(k, db_["KWG"]);
// Decides which kernel to run: the upper-triangular or lower-triangular version
auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
// The padded/transposed input/output matrices: if memory allocation fails, throw an exception
try {
// Loads the program from the database
auto& program = GetProgramFromCache();
// Determines whether or not temporary matrices are needed
auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
a_rotated == false && a_conjugate == false;
auto b_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
a_rotated == false && b_conjugate == false;
// Creates the temporary matrices
auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
auto b_temp = (b_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
auto c_temp = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
// Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
// to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
// case nothing has to be done, these kernels can be skipped. Two copies are created.
if (!a_no_temp) {
status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
program, true, a_rotated, a_conjugate);
if (ErrorIn(status)) { return status; }
}
if (!b_no_temp) {
status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
program, true, a_rotated, b_conjugate);
if (ErrorIn(status)) { return status; }
}
// Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
// modify the other triangle.
status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
program, true, c_rotated, false);
if (ErrorIn(status)) { return status; }
// Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
try {
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
auto complex_alpha = T{alpha, static_cast<U>(0.0)};
auto complex_beta = T{beta, static_cast<U>(0.0)};
kernel.SetArgument(0, static_cast<int>(n_ceiled));
kernel.SetArgument(1, static_cast<int>(k_ceiled));
kernel.SetArgument(2, complex_alpha);
kernel.SetArgument(3, complex_beta);
kernel.SetArgument(4, a_temp());
kernel.SetArgument(5, b_temp());
kernel.SetArgument(6, c_temp());
// Computes the global and local thread sizes
auto global = std::vector<size_t>{
(n_ceiled * db_["MDIMC"]) / db_["MWG"],
(n_ceiled * db_["NDIMC"]) / db_["NWG"]
};
auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
// Launches the kernel
status = RunKernel(kernel, global, local);
if (ErrorIn(status)) { return status; }
// Runs the post-processing kernel
auto upper = (triangle == Triangle::kUpper);
auto lower = (triangle == Triangle::kLower);
status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
program, false, c_rotated, false, upper, lower, true);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
} catch (...) { return StatusCode::kTempBufferAllocFailure; }
}
// =================================================================================================
// Compiles the templated class
template class Xherk<float2,float>;
template class Xherk<double2,double>;
// =================================================================================================
} // namespace clblast
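
One detail worth spelling out: judging from how the temporaries are used here, the XgemmUpper/XgemmLower kernel multiplies two independent n x k operands as M1*M2^T, so HERK obtains A*A^H by feeding the padded A as the first operand and a conjugated copy of A as the second (hence the a_temp/b_temp pair above). A small numerical check of that identity, with a hypothetical GemmNT helper standing in for the kernel; the kernel contract is an inference from this file, not a documented API.

#include <complex>
#include <cstddef>
#include <cstdio>
#include <vector>

using cd = std::complex<double>;

// Hypothetical stand-in for the kernel's assumed contract: C (n x n) := M1 * M2^T,
// with M1 and M2 stored column-major as n x k.
std::vector<cd> GemmNT(std::size_t n, std::size_t k,
                       const std::vector<cd> &m1, const std::vector<cd> &m2) {
  std::vector<cd> c(n * n, 0.0);
  for (std::size_t j = 0; j < n; ++j)
    for (std::size_t i = 0; i < n; ++i)
      for (std::size_t l = 0; l < k; ++l)
        c[i + j * n] += m1[i + l * n] * m2[j + l * n];
  return c;
}

int main() {
  const std::size_t n = 2, k = 2;
  std::vector<cd> a = {{1, 2}, {3, -1}, {0, 1}, {2, 0}};  // 2x2, column-major

  // A conjugated copy of A: this is what the second pre-processing pass (b_temp) produces.
  std::vector<cd> a_conj(a.size());
  for (std::size_t i = 0; i < a.size(); ++i) a_conj[i] = std::conj(a[i]);

  // A * conj(A)^T equals A * A^H, which is exactly the HERK product.
  auto c = GemmNT(n, k, a, a_conj);
  for (std::size_t i = 0; i < n; ++i)
    std::printf("(%g,%g) (%g,%g)\n", c[i].real(), c[i].imag(),
                c[i + n].real(), c[i + n].imag());
  return 0;  // prints: (6,0) (1,9)  then  (1,-9) (14,0)
}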

View file

@@ -11,7 +11,7 @@
//
// =================================================================================================
#include "internal/routines/xsymm.h"
#include "internal/routines/level3/xsymm.h"
#include <string>
#include <vector>
@@ -42,14 +42,14 @@ StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle
// Computes the k dimension. This is based on whether or not the symmetric matrix is A (on the
// left) or B (on the right) in the Xgemm routine.
size_t k = (side == Side::kLeft) ? m : n;
auto k = (side == Side::kLeft) ? m : n;
// Checks for validity of the squared A matrix
auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T));
if (ErrorIn(status)) { return status; }
// Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
// default) and on whether we are dealing with an upper or lower triangle of the symmetrix matrix
// default) and on whether we are dealing with an upper or lower triangle of the symmetric matrix
bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
(triangle == Triangle::kLower && layout == Layout::kRowMajor));
auto kernel_name = (is_upper) ? "SymmUpperToSquared" : "SymmLowerToSquared";
@@ -75,7 +75,7 @@ StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle
kernel.SetArgument(7, temp_symm());
// Uses the common padding kernel's thread configuration. This is allowed, since the
// symmetry-to-squared kernel uses the same parameters.
// symmetric-to-squared kernel uses the same parameters.
auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
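
A quick worked example of the thread configuration reused above (and by the new hermitian- and triangular-to-squared kernels): the element count is divided by the work-per-thread and then rounded up to a whole number of work-groups. The helper semantics and the PAD_* values below are assumptions for illustration; the real values come from the tuning database.

#include <cstddef>
#include <cstdio>

static std::size_t CeilDiv(std::size_t x, std::size_t y) { return (x + y - 1) / y; }  // round-up division
static std::size_t Ceil(std::size_t x, std::size_t y) { return CeilDiv(x, y) * y; }   // round up to a multiple

int main() {
  const std::size_t k = 1000;                     // matrix dimension handled by the kernel
  const std::size_t PAD_WPTX = 2, PAD_DIMX = 16;  // illustrative tuning parameters
  const std::size_t PAD_WPTY = 1, PAD_DIMY = 8;

  // Same shape as the code above: each thread handles WPT elements, and the thread count is
  // rounded up so that it divides evenly into DIMX x DIMY work-groups.
  const std::size_t global_x = Ceil(CeilDiv(k, PAD_WPTX), PAD_DIMX);  // CeilDiv(1000,2)=500 -> 512
  const std::size_t global_y = Ceil(CeilDiv(k, PAD_WPTY), PAD_DIMY);  // CeilDiv(1000,1)=1000 -> 1000
  std::printf("global = {%zu, %zu}, local = {%zu, %zu}\n", global_x, global_y, PAD_DIMX, PAD_DIMY);
  return 0;
}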

View file

@@ -0,0 +1,186 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xsyr2k class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level3/xsyr2k.h"
#include <string>
#include <vector>
namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xsyr2k<float>::precision_ = Precision::kSingle;
template <> const Precision Xsyr2k<double>::precision_ = Precision::kDouble;
template <> const Precision Xsyr2k<float2>::precision_ = Precision::kComplexSingle;
template <> const Precision Xsyr2k<double2>::precision_ = Precision::kComplexDouble;
// =================================================================================================
// Constructor: forwards to base class constructor
template <typename T>
Xsyr2k<T>::Xsyr2k(CommandQueue &queue, Event &event):
Routine(queue, event, "SYR2K", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
source_string_ =
#include "../../kernels/copy.opencl"
#include "../../kernels/pad.opencl"
#include "../../kernels/transpose.opencl"
#include "../../kernels/padtranspose.opencl"
#include "../../kernels/xgemm.opencl"
;
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
const size_t n, const size_t k,
const T alpha,
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
// Computes whether or not the matrices are transposed in memory. This is based on their layout
// (row or column-major) and whether or not they are requested to be pre-transposed.
auto ab_rotated = (layout == Layout::kColMajor && ab_transpose != Transpose::kNo) ||
(layout == Layout::kRowMajor && ab_transpose == Transpose::kNo);
auto c_rotated = (layout == Layout::kRowMajor);
// Computes the first and second dimensions of the A and B matrices taking the layout into account
auto ab_one = (ab_rotated) ? k : n;
auto ab_two = (ab_rotated) ? n : k;
// Tests the matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and
// their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
// OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
// space. Also tests that the leading dimensions of:
// matrix A cannot be less than N when rotated, or less than K when not-rotated
// matrix B cannot be less than N when rotated, or less than K when not-rotated
// matrix C cannot be less than N
auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld, sizeof(T));
if (ErrorIn(status)) { return status; }
status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld, sizeof(T));
if (ErrorIn(status)) { return status; }
status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
if (ErrorIn(status)) { return status; }
// Calculates the ceiled versions of n and k
auto n_ceiled = Ceil(n, db_["NWG"]);
auto k_ceiled = Ceil(k, db_["KWG"]);
// Decides which kernel to run: the upper-triangular or lower-triangular version
auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
// The padded/transposed input/output matrices: if memory allocation fails, throw an exception
try {
// Loads the program from the database
auto& program = GetProgramFromCache();
// Determines whether or not temporary matrices are needed
auto a_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
ab_rotated == false;
auto b_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
ab_rotated == false;
// Creates the temporary matrices
auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
auto b_temp = (b_no_temp) ? b_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
auto c_temp = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
// Runs the pre-processing kernels. This transposes the matrices A and B, but also pads them with
// zeros to fill them up until they reach a certain multiple of size (kernel parameter dependent). In
// case nothing has to be done, these kernels can be skipped.
if (!a_no_temp) {
status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
program, true, ab_rotated, false);
if (ErrorIn(status)) { return status; }
}
if (!b_no_temp) {
status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
program, true, ab_rotated, false);
if (ErrorIn(status)) { return status; }
}
// Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
// modify the other triangle.
status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
program, true, c_rotated, false);
if (ErrorIn(status)) { return status; }
// Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
try {
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n_ceiled));
kernel.SetArgument(1, static_cast<int>(k_ceiled));
kernel.SetArgument(2, alpha);
kernel.SetArgument(3, beta);
kernel.SetArgument(4, a_temp());
kernel.SetArgument(5, b_temp());
kernel.SetArgument(6, c_temp());
// Computes the global and local thread sizes
auto global = std::vector<size_t>{
(n_ceiled * db_["MDIMC"]) / db_["MWG"],
(n_ceiled * db_["NDIMC"]) / db_["NWG"]
};
auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
// Launches the kernel
status = RunKernel(kernel, global, local);
if (ErrorIn(status)) { return status; }
// Swaps the arguments for matrices A and B, and sets 'beta' to 1
auto one = static_cast<T>(1);
kernel.SetArgument(3, one);
kernel.SetArgument(4, b_temp());
kernel.SetArgument(5, a_temp());
// Runs the kernel again
status = RunKernel(kernel, global, local);
if (ErrorIn(status)) { return status; }
// Runs the post-processing kernel
auto upper = (triangle == Triangle::kUpper);
auto lower = (triangle == Triangle::kLower);
status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
program, false, c_rotated, false, upper, lower, false);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
} catch (...) { return StatusCode::kTempBufferAllocFailure; }
}
// =================================================================================================
// Compiles the templated class
template class Xsyr2k<float>;
template class Xsyr2k<double>;
template class Xsyr2k<float2>;
template class Xsyr2k<double2>;
// =================================================================================================
} // namespace clblast
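
The SYR2K path above reuses the same Xgemm kernel twice: a first launch for alpha*A*B^T + beta*C, then a second launch with the operands swapped and beta set to one, which adds alpha*B*A^T on top. A small host-side check of that equivalence, assuming the kernel's contract is C := alpha*M1*M2^T + beta*C on column-major n x k operands (an inference from this file, not a documented API):

#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

// Hypothetical scalar stand-in for one launch of the XgemmUpper/XgemmLower kernel:
// C := alpha * M1 * M2^T + beta * C, all matrices column-major, M1/M2 of size n x k.
void GemmNT(std::size_t n, std::size_t k, double alpha, double beta,
            const std::vector<double> &m1, const std::vector<double> &m2,
            std::vector<double> &c) {
  for (std::size_t j = 0; j < n; ++j)
    for (std::size_t i = 0; i < n; ++i) {
      double sum = 0.0;
      for (std::size_t l = 0; l < k; ++l) sum += m1[i + l * n] * m2[j + l * n];
      c[i + j * n] = alpha * sum + beta * c[i + j * n];
    }
}

int main() {
  const std::size_t n = 2, k = 3;
  const double alpha = 2.0, beta = 0.5;
  const std::vector<double> a = {1, 2, 3, 4, 5, 6};  // 2x3, column-major
  const std::vector<double> b = {6, 5, 4, 3, 2, 1};
  const std::vector<double> c = {1, 1, 1, 1};        // 2x2

  // Two passes, exactly as in DoSyr2k: swap the operands and use beta = 1 the second time.
  std::vector<double> two_pass = c;
  GemmNT(n, k, alpha, beta, a, b, two_pass);  // alpha*A*B^T + beta*C
  GemmNT(n, k, alpha, 1.0, b, a, two_pass);   // ... + alpha*B*A^T

  // Direct SYR2K reference: C := alpha*(A*B^T + B*A^T) + beta*C.
  std::vector<double> reference = c;
  for (std::size_t j = 0; j < n; ++j)
    for (std::size_t i = 0; i < n; ++i) {
      double sum = 0.0;
      for (std::size_t l = 0; l < k; ++l)
        sum += a[i + l * n] * b[j + l * n] + b[i + l * n] * a[j + l * n];
      reference[i + j * n] = alpha * sum + beta * reference[i + j * n];
    }

  double max_diff = 0.0;
  for (std::size_t i = 0; i < n * n; ++i)
    max_diff = std::fmax(max_diff, std::fabs(two_pass[i] - reference[i]));
  std::printf("max difference: %g\n", max_diff);  // prints: max difference: 0
  return 0;
}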

View file

@@ -0,0 +1,163 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xsyrk class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level3/xsyrk.h"
#include <string>
#include <vector>
namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xsyrk<float>::precision_ = Precision::kSingle;
template <> const Precision Xsyrk<double>::precision_ = Precision::kDouble;
template <> const Precision Xsyrk<float2>::precision_ = Precision::kComplexSingle;
template <> const Precision Xsyrk<double2>::precision_ = Precision::kComplexDouble;
// =================================================================================================
// Constructor: forwards to base class constructor
template <typename T>
Xsyrk<T>::Xsyrk(CommandQueue &queue, Event &event):
Routine(queue, event, "SYRK", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
source_string_ =
#include "../../kernels/copy.opencl"
#include "../../kernels/pad.opencl"
#include "../../kernels/transpose.opencl"
#include "../../kernels/padtranspose.opencl"
#include "../../kernels/xgemm.opencl"
;
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
const size_t n, const size_t k,
const T alpha,
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
const T beta,
const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
// Computes whether or not the matrices are transposed in memory. This is based on their layout
// (row or column-major) and whether or not they are requested to be pre-transposed.
auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) ||
(layout == Layout::kRowMajor && a_transpose == Transpose::kNo);
auto c_rotated = (layout == Layout::kRowMajor);
// Computes the first and second dimensions of the A matrix taking the layout into account
auto a_one = (a_rotated) ? k : n;
auto a_two = (a_rotated) ? n : k;
// Tests the two matrices (A, C) for validity, first from a perspective of the OpenCL buffers and
// their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
// OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
// space. Also tests that the leading dimensions of:
// matrix A cannot be less than N when rotated, or less than K when not-rotated
// matrix C cannot be less than N
auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
if (ErrorIn(status)) { return status; }
status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
if (ErrorIn(status)) { return status; }
// Calculates the ceiled versions of n and k
auto n_ceiled = Ceil(n, db_["NWG"]);
auto k_ceiled = Ceil(k, db_["KWG"]);
// Decides which kernel to run: the upper-triangular or lower-triangular version
auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
// The padded/transposed input/output matrices: if memory allocation fails, throw an exception
try {
// Loads the program from the database
auto& program = GetProgramFromCache();
// Determines whether or not temporary matrices are needed
auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
a_rotated == false;
// Creates the temporary matrices
auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
auto c_temp = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
// Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
// to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
// case nothing has to be done, these kernels can be skipped.
if (!a_no_temp) {
status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
program, true, a_rotated, false);
if (ErrorIn(status)) { return status; }
}
// Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
// modify the other triangle.
status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
program, true, c_rotated, false);
if (ErrorIn(status)) { return status; }
// Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
try {
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n_ceiled));
kernel.SetArgument(1, static_cast<int>(k_ceiled));
kernel.SetArgument(2, alpha);
kernel.SetArgument(3, beta);
kernel.SetArgument(4, a_temp());
kernel.SetArgument(5, a_temp());
kernel.SetArgument(6, c_temp());
// Computes the global and local thread sizes
auto global = std::vector<size_t>{
(n_ceiled * db_["MDIMC"]) / db_["MWG"],
(n_ceiled * db_["NDIMC"]) / db_["NWG"]
};
auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
// Launches the kernel
status = RunKernel(kernel, global, local);
if (ErrorIn(status)) { return status; }
// Runs the post-processing kernel
auto upper = (triangle == Triangle::kUpper);
auto lower = (triangle == Triangle::kLower);
status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
program, false, c_rotated, false, upper, lower, false);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
} catch (...) { return StatusCode::kTempBufferAllocFailure; }
}
// =================================================================================================
// Compiles the templated class
template class Xsyrk<float>;
template class Xsyrk<double>;
template class Xsyrk<float2>;
template class Xsyrk<double2>;
// =================================================================================================
} // namespace clblast
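
The upper/lower flags handed to the post-processing call above exist because SYRK, like the other rank-k updates in this commit, may only touch the requested triangle of the user's C: the kernel works on a full padded copy, and only the stored triangle is written back. A scalar sketch of that write-back idea; the flag semantics are inferred from the call sites here, not taken from a documented interface.

#include <cstddef>
#include <vector>

// Copies only the requested triangle of an n_ceiled x n_ceiled temporary back into the user's
// n x n matrix C (column-major, leading dimension c_ld), leaving the other triangle untouched.
// Illustrative only; in CLBlast this happens inside an OpenCL kernel.
void WriteBackTriangle(std::size_t n, std::size_t n_ceiled, bool upper,
                       const std::vector<double> &c_temp,
                       std::vector<double> &c, std::size_t c_ld) {
  for (std::size_t j = 0; j < n; ++j) {
    for (std::size_t i = 0; i < n; ++i) {
      const bool in_triangle = upper ? (i <= j) : (i >= j);
      if (in_triangle) c[i + j * c_ld] = c_temp[i + j * n_ceiled];
    }
  }
}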

View file

@@ -0,0 +1,135 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xtrmm class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level3/xtrmm.h"
#include <string>
#include <vector>
namespace clblast {
// =================================================================================================
// Constructor: forwards to base class constructor
template <typename T>
Xtrmm<T>::Xtrmm(CommandQueue &queue, Event &event):
Xgemm<T>(queue, event) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t m, const size_t n,
const T alpha,
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer &b_buffer, const size_t b_offset, const size_t b_ld) {
// Makes sure all dimensions are larger than zero
if ((m == 0) || (n == 0)) { return StatusCode::kInvalidDimension; }
// Computes the k dimension. This is based on whether the triangular matrix appears as A (on the
// left) or as B (on the right) in the Xgemm routine.
auto k = (side == Side::kLeft) ? m : n;
// Checks for validity of the triangular A matrix
auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T));
if (ErrorIn(status)) { return status; }
// Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
// default) and on whether we are dealing with an upper or lower triangle of the triangular matrix
bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
(triangle == Triangle::kLower && layout == Layout::kRowMajor));
auto kernel_name = (is_upper) ? "TrmmUpperToSquared" : "TrmmLowerToSquared";
// Determines whether or not the triangular matrix is unit-diagonal
auto unit_diagonal = (diagonal == Diagonal::kUnit) ? true : false;
// Temporary buffer for a copy of the triangular matrix
try {
auto temp_triangular = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T));
// Creates a general matrix from the triangular matrix to be able to run the regular Xgemm
// routine afterwards
try {
auto& program = GetProgramFromCache();
auto kernel = Kernel(program, kernel_name);
// Sets the arguments for the triangular-to-squared kernel
kernel.SetArgument(0, static_cast<int>(k));
kernel.SetArgument(1, static_cast<int>(a_ld));
kernel.SetArgument(2, static_cast<int>(a_offset));
kernel.SetArgument(3, a_buffer());
kernel.SetArgument(4, static_cast<int>(k));
kernel.SetArgument(5, static_cast<int>(k));
kernel.SetArgument(6, static_cast<int>(0));
kernel.SetArgument(7, temp_triangular());
kernel.SetArgument(8, static_cast<int>(unit_diagonal));
// Uses the common padding kernel's thread configuration. This is allowed, since the
// triangular-to-squared kernel uses the same parameters.
auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
status = RunKernel(kernel, global, local);
if (ErrorIn(status)) { return status; }
// Runs the regular Xgemm code with either "B := alpha*A*B" or ...
if (side == Side::kLeft) {
status = DoGemm(layout, a_transpose, Transpose::kNo,
m, n, k,
alpha,
temp_triangular, 0, k,
b_buffer, b_offset, b_ld,
static_cast<T>(0.0),
b_buffer, b_offset, b_ld);
}
// ... with "B := alpha*B*A". Note that A and B are now reversed.
else {
status = DoGemm(layout, Transpose::kNo, a_transpose,
m, n, k,
alpha,
b_buffer, b_offset, b_ld,
temp_triangular, 0, k,
static_cast<T>(0.0),
b_buffer, b_offset, b_ld);
// A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
switch(status) {
case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break;
case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break;
case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break;
case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break;
case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break;
case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break;
}
}
// Return the status of the Xgemm routine
return status;
} catch (...) { return StatusCode::kInvalidKernel; }
} catch (...) { return StatusCode::kTempBufferAllocFailure; }
}
// =================================================================================================
// Compiles the templated class
template class Xtrmm<float>;
template class Xtrmm<double>;
template class Xtrmm<float2>;
template class Xtrmm<double2>;
// =================================================================================================
} // namespace clblast
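
As with HEMM, TRMM first expands the stored triangle of A into a full general matrix so that the regular Xgemm path can be reused; the extra kernel argument above selects whether the diagonal is taken from A or forced to one. A host-side sketch of the intended effect of TrmmLowerToSquared (padding, offsets and leading dimensions omitted; the exact kernel behaviour is an assumption).

#include <cstddef>
#include <vector>

// Expands the lower triangle of a k x k column-major matrix into a full general matrix:
// elements outside the triangle become zero, and for a unit-diagonal matrix the diagonal
// is set to one regardless of what is stored there.
std::vector<double> TrmmLowerToSquared(const std::vector<double> &a, std::size_t k,
                                       bool unit_diagonal) {
  std::vector<double> full(k * k, 0.0);
  for (std::size_t j = 0; j < k; ++j) {
    for (std::size_t i = j; i < k; ++i) {
      full[i + j * k] = a[i + j * k];          // lower triangle, including the diagonal
    }
    if (unit_diagonal) full[j + j * k] = 1.0;  // Diagonal::kUnit: the stored diagonal is not referenced
  }
  return full;
}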

View file

@@ -30,11 +30,10 @@ void CopyTune(const Arguments<T> &args,
// This points to the CopyMatrix kernel as found in the CLBlast library. This is just one example
// of a copy kernel. However, all copy-kernels use the same tuning parameters, so one has to be
// chosen as a representative.
std::string common_source =
#include "../src/kernels/common.opencl"
std::string kernel_source =
#include "../src/kernels/copy.opencl"
auto sources = common_source + kernel_source;
std::string sources =
#include "../src/kernels/common.opencl"
#include "../src/kernels/copy.opencl"
;
auto id = tuner.AddKernelFromString(sources, "CopyMatrix", {args.m, args.n}, {1, 1});
tuner.SetReferenceFromString(sources, "CopyMatrix", {args.m, args.n}, {8, 8});
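
The tuner changes above (and the routine constructors earlier in this commit) rely on one trick: each .opencl file appears to wrap its body in a C++11 raw string literal, so a row of #include directives concatenates adjacent string literals into a single source string, terminated by the lone semicolon on the next line. A minimal reproduction of that pattern; the file contents and the R"( wrapping shown here are assumptions for illustration.

#include <cstdio>
#include <string>

// A hypothetical "my_kernel.opencl" would contain, verbatim:
//
//   R"(
//   __kernel void Dummy(__global float* x) { x[get_global_id(0)] += 1.0f; }
//   )"
//
// so that
//
//   std::string sources =
//   #include "common.opencl"
//   #include "my_kernel.opencl"
//   ;
//
// expands to one concatenated string literal. The same concatenation, spelled out inline:
int main() {
  std::string sources =
      R"(
      // contents of common.opencl would go here
      )"
      R"(
      __kernel void Dummy(__global float* x) { x[get_global_id(0)] += 1.0f; }
      )";
  std::printf("%zu characters of OpenCL source\n", sources.size());
  return 0;
}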

View file

@@ -30,11 +30,10 @@ void PadTune(const Arguments<T> &args,
// This points to the PadMatrix kernel as found in the CLBlast library. This is just one
// example of a pad kernel. However, all pad-kernels use the same tuning parameters, so one has
// to be chosen as a representative.
std::string common_source =
#include "../src/kernels/common.opencl"
std::string kernel_source =
#include "../src/kernels/pad.opencl"
auto sources = common_source + kernel_source;
std::string sources =
#include "../src/kernels/common.opencl"
#include "../src/kernels/pad.opencl"
;
auto id = tuner.AddKernelFromString(sources, "PadMatrix", {args.m, args.n}, {1, 1});
tuner.SetReferenceFromString(sources, "PadMatrix", {args.m, args.n}, {8, 8});

View file

@@ -30,11 +30,10 @@ void PadTransposeTune(const Arguments<T> &args,
// This points to the PadTransposeMatrix kernel as found in the CLBlast library. This is just one
// example of a transpose kernel. However, all kernels use the same tuning parameters, so one has
// to be chosen as a representative.
std::string common_source =
#include "../src/kernels/common.opencl"
std::string kernel_source =
#include "../src/kernels/padtranspose.opencl"
auto sources = common_source + kernel_source;
std::string sources =
#include "../src/kernels/common.opencl"
#include "../src/kernels/padtranspose.opencl"
;
auto id = tuner.AddKernelFromString(sources, "PadTransposeMatrix", {args.m, args.n}, {1, 1});
tuner.SetReferenceFromString(sources, "PadTransposeMatrix", {args.m, args.n}, {8, 8});

View file

@@ -30,11 +30,10 @@ void TransposeTune(const Arguments<T> &args,
// This points to the PadTransposeMatrix kernel as found in the CLBlast library. This is just one
// example of a transpose kernel. However, all kernels use the same tuning parameters, so one has
// to be chosen as a representative.
std::string common_source =
#include "../src/kernels/common.opencl"
std::string kernel_source =
#include "../src/kernels/transpose.opencl"
auto sources = common_source + kernel_source;
std::string sources =
#include "../src/kernels/common.opencl"
#include "../src/kernels/transpose.opencl"
;
auto id = tuner.AddKernelFromString(sources, "TransposeMatrix", {args.m, args.n}, {1, 1});
tuner.SetReferenceFromString(sources, "TransposeMatrix", {args.m, args.n}, {8, 8});
@@ -42,6 +41,7 @@ void TransposeTune(const Arguments<T> &args,
tuner.AddParameter(id, "TRA_DIM", {4, 8, 16, 32, 64});
tuner.AddParameter(id, "TRA_WPT", {1, 2, 4, 8, 16});
tuner.AddParameter(id, "TRA_PAD", {0, 1});
tuner.AddParameter(id, "TRA_SHUFFLE", {0, 1});
// Tests for a specific precision
tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)});

View file

@@ -34,11 +34,10 @@ void XaxpyTune(const Arguments<T> &args,
}
// This points to the XaxpyFast kernel as found in the CLBlast library
std::string common_source =
#include "../src/kernels/common.opencl"
std::string kernel_source =
#include "../src/kernels/xaxpy.opencl"
auto sources = common_source + kernel_source;
std::string sources =
#include "../src/kernels/common.opencl"
#include "../src/kernels/xaxpy.opencl"
;
auto id = tuner.AddKernelFromString(sources, "XaxpyFast", {args.n}, {1});
tuner.SetReferenceFromString(sources, "XaxpyFast", {args.n}, {64});

View file

@@ -30,11 +30,10 @@ void XgemmTune(const Arguments<T> &args,
cltune::Tuner &tuner) {
// This points to the Xgemm kernel as found in the CLBlast library and its golden reference
std::string common_source =
#include "../src/kernels/common.opencl"
std::string kernel_source =
#include "../src/kernels/xgemm.opencl"
auto sources = common_source + kernel_source;
std::string sources =
#include "../src/kernels/common.opencl"
#include "../src/kernels/xgemm.opencl"
;
auto id = tuner.AddKernelFromString(sources, "Xgemm", {args.m, args.n}, {1, 1});
tuner.SetReferenceFromString(sources, "Xgemm", {args.m, args.n}, {8, 8});

View file

@@ -36,11 +36,10 @@ void XgemvTune(const Arguments<T> &args, const size_t variation,
auto a_rotated = (variation == 3) ? 1 : 0;
// This points to the Xgemv kernel as found in the CLBlast library
std::string common_source =
#include "../src/kernels/common.opencl"
std::string kernel_source =
#include "../src/kernels/xgemv.opencl"
auto sources = common_source + kernel_source;
std::string sources =
#include "../src/kernels/common.opencl"
#include "../src/kernels/xgemv.opencl"
;
auto id = tuner.AddKernelFromString(sources, kernel_name, {args.m}, {1});
tuner.SetReferenceFromString(sources, "Xgemv", {args.m}, {64});

View file

@@ -79,6 +79,13 @@ std::string ToString(Triangle value) {
}
}
template <>
std::string ToString(Diagonal value) {
switch(value) {
case Diagonal::kUnit: return ToString(static_cast<int>(value))+" (unit)";
case Diagonal::kNonUnit: return ToString(static_cast<int>(value))+" (non-unit)";
}
}
template <>
std::string ToString(Precision value) {
switch(value) {
case Precision::kHalf: return ToString(static_cast<int>(value))+" (half)";
@@ -143,6 +150,7 @@ template Layout GetArgument<Layout>(const int, char **, std::string&, const std:
template Transpose GetArgument<Transpose>(const int, char **, std::string&, const std::string&, const Transpose);
template Side GetArgument<Side>(const int, char **, std::string&, const std::string&, const Side);
template Triangle GetArgument<Triangle>(const int, char **, std::string&, const std::string&, const Triangle);
template Diagonal GetArgument<Diagonal>(const int, char **, std::string&, const std::string&, const Diagonal);
template Precision GetArgument<Precision>(const int, char **, std::string&, const std::string&, const Precision);
// =================================================================================================

View file

@@ -0,0 +1,81 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the tests for the Xaxpy routine.
//
// =================================================================================================
#include "correctness/testblas.h"
#include "routines/level1/xaxpy.h"
namespace clblast {
// =================================================================================================
// The correctness tester
template <typename T>
void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
// Creates a tester
TestBlas<T,T> tester{argc, argv, silent, name, TestXaxpy<T>::GetOptions(),
TestXaxpy<T>::RunRoutine, TestXaxpy<T>::RunReference,
TestXaxpy<T>::DownloadResult, TestXaxpy<T>::GetResultIndex,
TestXaxpy<T>::ResultID1, TestXaxpy<T>::ResultID2};
// This variable holds the arguments relevant for this routine
auto args = Arguments<T>{};
// Creates the arguments vector for the regular tests
auto regular_test_vector = std::vector<Arguments<T>>{};
for (auto &n: tester.kVectorDims) { args.n = n;
for (auto &x_inc: tester.kIncrements) { args.x_inc = x_inc;
for (auto &x_offset: tester.kOffsets) { args.x_offset = x_offset;
for (auto &y_inc: tester.kIncrements) { args.y_inc = y_inc;
for (auto &y_offset: tester.kOffsets) { args.y_offset = y_offset;
for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
args.x_size = TestXaxpy<T>::GetSizeX(args);
args.y_size = TestXaxpy<T>::GetSizeY(args);
if (args.x_size<1 || args.y_size<1) { continue; }
regular_test_vector.push_back(args);
}
}
}
}
}
}
// Creates the arguments vector for the invalid-buffer tests
auto invalid_test_vector = std::vector<Arguments<T>>{};
args.n = tester.kBufferSize;
args.x_inc = args.y_inc = 1;
args.x_offset = args.y_offset = 0;
for (auto &x_size: tester.kVecSizes) { args.x_size = x_size;
for (auto &y_size: tester.kVecSizes) { args.y_size = y_size;
invalid_test_vector.push_back(args);
}
}
// Runs the tests
const auto case_name = "default";
tester.TestRegular(regular_test_vector, case_name);
tester.TestInvalid(invalid_test_vector, case_name);
}
// =================================================================================================
} // namespace clblast
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
clblast::RunTest<float>(argc, argv, false, "SAXPY");
clblast::RunTest<double>(argc, argv, true, "DAXPY");
clblast::RunTest<clblast::float2>(argc, argv, true, "CAXPY");
clblast::RunTest<clblast::double2>(argc, argv, true, "ZAXPY");
return 0;
}
// =================================================================================================

View file

@@ -0,0 +1,99 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the tests for the Xgemv routine.
//
// =================================================================================================
#include "correctness/testblas.h"
#include "routines/level2/xgemv.h"
namespace clblast {
// =================================================================================================
// The correctness tester
template <typename T>
void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
// Creates a tester
TestBlas<T,T> tester{argc, argv, silent, name, TestXgemv<T>::GetOptions(),
TestXgemv<T>::RunRoutine, TestXgemv<T>::RunReference,
TestXgemv<T>::DownloadResult, TestXgemv<T>::GetResultIndex,
TestXgemv<T>::ResultID1, TestXgemv<T>::ResultID2};
// This variable holds the arguments relevant for this routine
auto args = Arguments<T>{};
// Loops over the test-cases from a data-layout point of view
for (auto &layout: tester.kLayouts) { args.layout = layout;
for (auto &a_transpose: tester.kTransposes) { args.a_transpose = a_transpose;
// Creates the arguments vector for the regular tests
auto regular_test_vector = std::vector<Arguments<T>>{};
for (auto &m: tester.kMatrixVectorDims) { args.m = m;
for (auto &n: tester.kMatrixVectorDims) { args.n = n;
for (auto &a_ld: tester.kMatrixVectorDims) { args.a_ld = a_ld;
for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
for (auto &x_inc: tester.kIncrements) { args.x_inc = x_inc;
for (auto &x_offset: tester.kOffsets) { args.x_offset = x_offset;
for (auto &y_inc: tester.kIncrements) { args.y_inc = y_inc;
for (auto &y_offset: tester.kOffsets) { args.y_offset = y_offset;
for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
for (auto &beta: tester.kBetaValues) { args.beta = beta;
args.a_size = TestXgemv<T>::GetSizeA(args);
args.x_size = TestXgemv<T>::GetSizeX(args);
args.y_size = TestXgemv<T>::GetSizeY(args);
if (args.a_size<1 || args.x_size<1 || args.y_size<1) { continue; }
regular_test_vector.push_back(args);
}
}
}
}
}
}
}
}
}
}
// Creates the arguments vector for the invalid-buffer tests
auto invalid_test_vector = std::vector<Arguments<T>>{};
args.m = args.n = tester.kBufferSize;
args.a_ld = tester.kBufferSize;
args.x_inc = args.y_inc = 1;
args.a_offset = args.x_offset = args.y_offset = 0;
for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
for (auto &x_size: tester.kVecSizes) { args.x_size = x_size;
for (auto &y_size: tester.kVecSizes) { args.y_size = y_size;
invalid_test_vector.push_back(args);
}
}
}
// Runs the tests
const auto case_name = ToString(layout)+" "+ToString(a_transpose);
tester.TestRegular(regular_test_vector, case_name);
tester.TestInvalid(invalid_test_vector, case_name);
}
}
}
// =================================================================================================
} // namespace clblast
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
clblast::RunTest<float>(argc, argv, false, "SGEMV");
clblast::RunTest<double>(argc, argv, true, "DGEMV");
clblast::RunTest<clblast::float2>(argc, argv, true, "CGEMV");
clblast::RunTest<clblast::double2>(argc, argv, true, "ZGEMV");
return 0;
}
// =================================================================================================
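For reference, the operation these combinations exercise is GEMV: y := alpha*op(A)*x + beta*y. The tester's reference results come from clBLAS (see the removed lambda-based XgemvTest further down in this diff); purely as an illustration of the mathematics being verified, and not as code from this commit, the column-major, non-transposed case can be written out as follows (function name and signature are hypothetical):

#include <cstddef>
#include <vector>

// Illustrative sketch only: y := alpha*A*x + beta*y for a column-major,
// non-transposed m-by-n matrix A with leading dimension a_ld, honouring the same
// offset and increment parameters that the test loops above iterate over.
void reference_gemv_colmajor_notrans(std::size_t m, std::size_t n, float alpha,
                                     const std::vector<float> &a, std::size_t a_offset,
                                     std::size_t a_ld,
                                     const std::vector<float> &x, std::size_t x_offset,
                                     std::size_t x_inc, float beta,
                                     std::vector<float> &y, std::size_t y_offset,
                                     std::size_t y_inc) {
  for (std::size_t i = 0; i < m; ++i) {
    auto sum = 0.0f;
    for (std::size_t j = 0; j < n; ++j) {
      sum += a[a_offset + j*a_ld + i] * x[x_offset + j*x_inc];  // A(i,j) * x(j)
    }
    y[y_offset + i*y_inc] = alpha*sum + beta*y[y_offset + i*y_inc];
  }
}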

View file

@@ -0,0 +1,102 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the tests for the Xgemm routine.
//
// =================================================================================================
#include "correctness/testblas.h"
#include "routines/level3/xgemm.h"
namespace clblast {
// =================================================================================================
// The correctness tester
template <typename T>
void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
// Creates a tester
TestBlas<T,T> tester{argc, argv, silent, name, TestXgemm<T>::GetOptions(),
TestXgemm<T>::RunRoutine, TestXgemm<T>::RunReference,
TestXgemm<T>::DownloadResult, TestXgemm<T>::GetResultIndex,
TestXgemm<T>::ResultID1, TestXgemm<T>::ResultID2};
// This variable holds the arguments relevant for this routine
auto args = Arguments<T>{};
// Loops over the test-cases from a data-layout point of view
for (auto &layout: tester.kLayouts) { args.layout = layout;
for (auto &a_transpose: tester.kTransposes) { args.a_transpose = a_transpose;
for (auto &b_transpose: tester.kTransposes) { args.b_transpose = b_transpose;
// Creates the arguments vector for the regular tests
auto regular_test_vector = std::vector<Arguments<T>>{};
for (auto &m: tester.kMatrixDims) { args.m = m;
for (auto &n: tester.kMatrixDims) { args.n = n;
for (auto &k: tester.kMatrixDims) { args.k = k;
for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
for (auto &beta: tester.kBetaValues) { args.beta = beta;
args.a_size = TestXgemm<T>::GetSizeA(args);
args.b_size = TestXgemm<T>::GetSizeB(args);
args.c_size = TestXgemm<T>::GetSizeC(args);
if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; }
regular_test_vector.push_back(args);
}
}
}
}
}
}
}
}
}
}
}
// Creates the arguments vector for the invalid-buffer tests
auto invalid_test_vector = std::vector<Arguments<T>>{};
args.m = args.n = args.k = tester.kBufferSize;
args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize;
args.a_offset = args.b_offset = args.c_offset = 0;
for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
invalid_test_vector.push_back(args);
}
}
}
// Runs the tests
const auto case_name = ToString(layout)+" "+ToString(a_transpose)+" "+ToString(b_transpose);
tester.TestRegular(regular_test_vector, case_name);
tester.TestInvalid(invalid_test_vector, case_name);
}
}
}
}
// =================================================================================================
} // namespace clblast
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
clblast::RunTest<float>(argc, argv, false, "SGEMM");
clblast::RunTest<double>(argc, argv, true, "DGEMM");
clblast::RunTest<clblast::float2>(argc, argv, true, "CGEMM");
clblast::RunTest<clblast::double2>(argc, argv, true, "ZGEMM");
return 0;
}
// =================================================================================================
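The matrix-size helpers (TestXgemm<T>::GetSizeA/GetSizeB/GetSizeC) are likewise defined in headers outside this diff. The removed TestABC implementation near the bottom of this commit spells out the rule they presumably follow: a matrix counts as "rotated" when exactly one of row-major layout or transposition applies, and its buffer then spans second-dimension * leading-dimension + offset elements. A hedged sketch of that computation for GEMM's m-by-k input matrix A (the function below is purely illustrative):

#include <cstddef>

// Sketch of the assumed buffer-size rule for GEMM's m-by-k input matrix A,
// mirroring the a_rotated/a_two logic of the removed TestABC::TestRegular below.
// With the default of column-major storage and no transpose, A occupies k columns
// of a_ld elements each; if exactly one of "row-major" or "transposed" applies,
// the roles of m and k swap.
std::size_t assumed_gemm_a_buffer_size(bool row_major, bool a_transposed,
                                       std::size_t m, std::size_t k,
                                       std::size_t a_ld, std::size_t a_offset) {
  const bool a_rotated = (row_major != a_transposed);  // exactly one deviation
  const std::size_t a_two = a_rotated ? m : k;         // the non-leading dimension
  return a_two * a_ld + a_offset;
}

The B (k-by-n) and C (m-by-n) matrices follow the same pattern with their own dimensions, as the removed TestABC code shows.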

View file

@@ -0,0 +1,98 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the tests for the Xhemm routine.
//
// =================================================================================================
#include "correctness/testblas.h"
#include "routines/level3/xhemm.h"
namespace clblast {
// =================================================================================================
// The correctness tester
template <typename T>
void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
// Creates a tester
TestBlas<T,T> tester{argc, argv, silent, name, TestXhemm<T>::GetOptions(),
TestXhemm<T>::RunRoutine, TestXhemm<T>::RunReference,
TestXhemm<T>::DownloadResult, TestXhemm<T>::GetResultIndex,
TestXhemm<T>::ResultID1, TestXhemm<T>::ResultID2};
// This variable holds the arguments relevant for this routine
auto args = Arguments<T>{};
// Loops over the test-cases from a data-layout point of view
for (auto &layout: tester.kLayouts) { args.layout = layout;
for (auto &side: tester.kSides) { args.side = side;
for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
// Creates the arguments vector for the regular tests
auto regular_test_vector = std::vector<Arguments<T>>{};
for (auto &m: tester.kMatrixDims) { args.m = m;
for (auto &n: tester.kMatrixDims) { args.n = n;
for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
for (auto &beta: tester.kBetaValues) { args.beta = beta;
args.a_size = TestXhemm<T>::GetSizeA(args);
args.b_size = TestXhemm<T>::GetSizeB(args);
args.c_size = TestXhemm<T>::GetSizeC(args);
if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; }
regular_test_vector.push_back(args);
}
}
}
}
}
}
}
}
}
}
// Creates the arguments vector for the invalid-buffer tests
auto invalid_test_vector = std::vector<Arguments<T>>{};
args.m = args.n = tester.kBufferSize;
args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize;
args.a_offset = args.b_offset = args.c_offset = 0;
for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
invalid_test_vector.push_back(args);
}
}
}
// Runs the tests
const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle);
tester.TestRegular(regular_test_vector, case_name);
tester.TestInvalid(invalid_test_vector, case_name);
}
}
}
}
// =================================================================================================
} // namespace clblast
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
clblast::RunTest<clblast::float2>(argc, argv, true, "CHEMM");
clblast::RunTest<clblast::double2>(argc, argv, true, "ZHEMM");
return 0;
}
// =================================================================================================
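HEMM is the Hermitian counterpart of SYMM, which is why the main() above only instantiates the complex precisions CHEMM and ZHEMM. The operation under test is the standard BLAS definition:

  side == Side::kLeft :  C := alpha*A*B + beta*C   (A an m-by-m Hermitian matrix)
  side == Side::kRight:  C := alpha*B*A + beta*C   (A an n-by-n Hermitian matrix)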

View file

@@ -0,0 +1,100 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the tests for the Xher2k routine.
//
// =================================================================================================
#include "correctness/testblas.h"
#include "routines/level3/xher2k.h"
namespace clblast {
// =================================================================================================
// The correctness tester
template <typename T, typename U>
void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
// Creates a tester
TestBlas<T,U> tester{argc, argv, silent, name, TestXher2k<T,U>::GetOptions(),
TestXher2k<T,U>::RunRoutine, TestXher2k<T,U>::RunReference,
TestXher2k<T,U>::DownloadResult, TestXher2k<T,U>::GetResultIndex,
TestXher2k<T,U>::ResultID1, TestXher2k<T,U>::ResultID2};
// This variable holds the arguments relevant for this routine
auto args = Arguments<U>{};
// Loops over the test-cases from a data-layout point of view
for (auto &layout: tester.kLayouts) { args.layout = layout;
for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
for (auto &ab_transpose: {Transpose::kNo, Transpose::kConjugate}) { // Regular transpose not a
args.a_transpose = ab_transpose; // valid BLAS option
args.b_transpose = ab_transpose;
// Creates the arguments vector for the regular tests
auto regular_test_vector = std::vector<Arguments<U>>{};
for (auto &n: tester.kMatrixDims) { args.n = n;
for (auto &k: tester.kMatrixDims) { args.k = k;
for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
for (auto &beta: tester.kBetaValues) { args.beta = beta;
args.a_size = TestXher2k<T,U>::GetSizeA(args);
args.b_size = TestXher2k<T,U>::GetSizeB(args);
args.c_size = TestXher2k<T,U>::GetSizeC(args);
if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; }
regular_test_vector.push_back(args);
}
}
}
}
}
}
}
}
}
}
// Creates the arguments vector for the invalid-buffer tests
auto invalid_test_vector = std::vector<Arguments<U>>{};
args.n = args.k = tester.kBufferSize;
args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize;
args.a_offset = args.b_offset = args.c_offset = 0;
for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
invalid_test_vector.push_back(args);
}
}
}
// Runs the tests
const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(ab_transpose);
tester.TestRegular(regular_test_vector, case_name);
tester.TestInvalid(invalid_test_vector, case_name);
}
}
}
}
// =================================================================================================
} // namespace clblast
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
clblast::RunTest<clblast::float2,float>(argc, argv, false, "CHER2K");
clblast::RunTest<clblast::double2,double>(argc, argv, true, "ZHER2K");
return 0;
}
// =================================================================================================

View file

@@ -0,0 +1,92 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the tests for the Xherk routine.
//
// =================================================================================================
#include "correctness/testblas.h"
#include "routines/level3/xherk.h"
namespace clblast {
// =================================================================================================
// The correctness tester
template <typename T, typename U>
void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
// Creates a tester
TestBlas<T,U> tester{argc, argv, silent, name, TestXherk<T,U>::GetOptions(),
TestXherk<T,U>::RunRoutine, TestXherk<T,U>::RunReference,
TestXherk<T,U>::DownloadResult, TestXherk<T,U>::GetResultIndex,
TestXherk<T,U>::ResultID1, TestXherk<T,U>::ResultID2};
// This variable holds the arguments relevant for this routine
auto args = Arguments<U>{};
// Loops over the test-cases from a data-layout point of view
for (auto &layout: tester.kLayouts) { args.layout = layout;
for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
for (auto &a_transpose: {Transpose::kNo, Transpose::kConjugate}) { // Regular transpose not a
args.a_transpose = a_transpose; // valid BLAS option
// Creates the arguments vector for the regular tests
auto regular_test_vector = std::vector<Arguments<U>>{};
for (auto &n: tester.kMatrixDims) { args.n = n;
for (auto &k: tester.kMatrixDims) { args.k = k;
for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
for (auto &beta: tester.kBetaValues) { args.beta = beta;
args.a_size = TestXherk<T,U>::GetSizeA(args);
args.c_size = TestXherk<T,U>::GetSizeC(args);
if (args.a_size<1 || args.c_size<1) { continue; }
regular_test_vector.push_back(args);
}
}
}
}
}
}
}
}
// Creates the arguments vector for the invalid-buffer tests
auto invalid_test_vector = std::vector<Arguments<U>>{};
args.n = args.k = tester.kBufferSize;
args.a_ld = args.c_ld = tester.kBufferSize;
args.a_offset = args.c_offset = 0;
for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
invalid_test_vector.push_back(args);
}
}
// Runs the tests
const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(a_transpose);
tester.TestRegular(regular_test_vector, case_name);
tester.TestInvalid(invalid_test_vector, case_name);
}
}
}
}
// =================================================================================================
} // namespace clblast
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
clblast::RunTest<clblast::float2,float>(argc, argv, false, "CHERK");
clblast::RunTest<clblast::double2,double>(argc, argv, true, "ZHERK");
return 0;
}
// =================================================================================================
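The restricted transpose loops in the Xher2k and Xherk testers above (only Transpose::kNo and Transpose::kConjugate, as the inline comments note) follow directly from the BLAS definitions of the Hermitian rank-k updates, which only exist in conjugate-transpose form:

  xHERK :  C := alpha*A*A^H + beta*C                        (Transpose::kNo)
           C := alpha*A^H*A + beta*C                        (Transpose::kConjugate)
  xHER2K:  C := alpha*A*B^H + conj(alpha)*B*A^H + beta*C    (Transpose::kNo)
           C := alpha*A^H*B + conj(alpha)*B^H*A + beta*C    (Transpose::kConjugate)

Beta (and, for HERK, alpha as well) is real-valued in these definitions, which is presumably why these testers are parameterized over both a complex data type T and a real scalar type U.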

View file

@@ -0,0 +1,100 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the tests for the Xsymm routine.
//
// =================================================================================================
#include "correctness/testblas.h"
#include "routines/level3/xsymm.h"
namespace clblast {
// =================================================================================================
// The correctness tester
template <typename T>
void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
// Creates a tester
TestBlas<T,T> tester{argc, argv, silent, name, TestXsymm<T>::GetOptions(),
TestXsymm<T>::RunRoutine, TestXsymm<T>::RunReference,
TestXsymm<T>::DownloadResult, TestXsymm<T>::GetResultIndex,
TestXsymm<T>::ResultID1, TestXsymm<T>::ResultID2};
// This variable holds the arguments relevant for this routine
auto args = Arguments<T>{};
// Loops over the test-cases from a data-layout point of view
for (auto &layout: tester.kLayouts) { args.layout = layout;
for (auto &side: tester.kSides) { args.side = side;
for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
// Creates the arguments vector for the regular tests
auto regular_test_vector = std::vector<Arguments<T>>{};
for (auto &m: tester.kMatrixDims) { args.m = m;
for (auto &n: tester.kMatrixDims) { args.n = n;
for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
for (auto &beta: tester.kBetaValues) { args.beta = beta;
args.a_size = TestXsymm<T>::GetSizeA(args);
args.b_size = TestXsymm<T>::GetSizeB(args);
args.c_size = TestXsymm<T>::GetSizeC(args);
if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; }
regular_test_vector.push_back(args);
}
}
}
}
}
}
}
}
}
}
// Creates the arguments vector for the invalid-buffer tests
auto invalid_test_vector = std::vector<Arguments<T>>{};
args.m = args.n = tester.kBufferSize;
args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize;
args.a_offset = args.b_offset = args.c_offset = 0;
for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
invalid_test_vector.push_back(args);
}
}
}
// Runs the tests
const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle);
tester.TestRegular(regular_test_vector, case_name);
tester.TestInvalid(invalid_test_vector, case_name);
}
}
}
}
// =================================================================================================
} // namespace clblast
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
clblast::RunTest<float>(argc, argv, false, "SSYMM");
clblast::RunTest<double>(argc, argv, true, "DSYMM");
clblast::RunTest<clblast::float2>(argc, argv, true, "CSYMM");
clblast::RunTest<clblast::double2>(argc, argv, true, "ZSYMM");
return 0;
}
// =================================================================================================

View file

@@ -0,0 +1,102 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the tests for the Xsyr2k routine.
//
// =================================================================================================
#include "correctness/testblas.h"
#include "routines/level3/xsyr2k.h"
namespace clblast {
// =================================================================================================
// The correctness tester
template <typename T>
void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
// Creates a tester
TestBlas<T,T> tester{argc, argv, silent, name, TestXsyr2k<T>::GetOptions(),
TestXsyr2k<T>::RunRoutine, TestXsyr2k<T>::RunReference,
TestXsyr2k<T>::DownloadResult, TestXsyr2k<T>::GetResultIndex,
TestXsyr2k<T>::ResultID1, TestXsyr2k<T>::ResultID2};
// This variable holds the arguments relevant for this routine
auto args = Arguments<T>{};
// Loops over the test-cases from a data-layout point of view
for (auto &layout: tester.kLayouts) { args.layout = layout;
for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
for (auto &ab_transpose: {Transpose::kNo, Transpose::kYes}) { // No conjugate here since it
args.a_transpose = ab_transpose; // is not supported by clBLAS
args.b_transpose = ab_transpose;
// Creates the arguments vector for the regular tests
auto regular_test_vector = std::vector<Arguments<T>>{};
for (auto &n: tester.kMatrixDims) { args.n = n;
for (auto &k: tester.kMatrixDims) { args.k = k;
for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
for (auto &beta: tester.kBetaValues) { args.beta = beta;
args.a_size = TestXsyr2k<T>::GetSizeA(args);
args.b_size = TestXsyr2k<T>::GetSizeB(args);
args.c_size = TestXsyr2k<T>::GetSizeC(args);
if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; }
regular_test_vector.push_back(args);
}
}
}
}
}
}
}
}
}
}
// Creates the arguments vector for the invalid-buffer tests
auto invalid_test_vector = std::vector<Arguments<T>>{};
args.n = args.k = tester.kBufferSize;
args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize;
args.a_offset = args.b_offset = args.c_offset = 0;
for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
invalid_test_vector.push_back(args);
}
}
}
// Runs the tests
const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(ab_transpose);
tester.TestRegular(regular_test_vector, case_name);
tester.TestInvalid(invalid_test_vector, case_name);
}
}
}
}
// =================================================================================================
} // namespace clblast
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
clblast::RunTest<float>(argc, argv, false, "SSYR2K");
clblast::RunTest<double>(argc, argv, true, "DSYR2K");
clblast::RunTest<clblast::float2>(argc, argv, true, "CSYR2K");
clblast::RunTest<clblast::double2>(argc, argv, true, "ZSYR2K");
return 0;
}
// =================================================================================================

View file

@@ -0,0 +1,94 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the tests for the Xsyrk routine.
//
// =================================================================================================
#include "correctness/testblas.h"
#include "routines/level3/xsyrk.h"
namespace clblast {
// =================================================================================================
// The correctness tester
template <typename T>
void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
// Creates a tester
TestBlas<T,T> tester{argc, argv, silent, name, TestXsyrk<T>::GetOptions(),
TestXsyrk<T>::RunRoutine, TestXsyrk<T>::RunReference,
TestXsyrk<T>::DownloadResult, TestXsyrk<T>::GetResultIndex,
TestXsyrk<T>::ResultID1, TestXsyrk<T>::ResultID2};
// This variable holds the arguments relevant for this routine
auto args = Arguments<T>{};
// Loops over the test-cases from a data-layout point of view
for (auto &layout: tester.kLayouts) { args.layout = layout;
for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
for (auto &a_transpose: {Transpose::kNo, Transpose::kYes}) { // No conjugate here since it
args.a_transpose = a_transpose; // is not supported by clBLAS
// Creates the arguments vector for the regular tests
auto regular_test_vector = std::vector<Arguments<T>>{};
for (auto &n: tester.kMatrixDims) { args.n = n;
for (auto &k: tester.kMatrixDims) { args.k = k;
for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
for (auto &beta: tester.kBetaValues) { args.beta = beta;
args.a_size = TestXsyrk<T>::GetSizeA(args);
args.c_size = TestXsyrk<T>::GetSizeC(args);
if (args.a_size<1 || args.c_size<1) { continue; }
regular_test_vector.push_back(args);
}
}
}
}
}
}
}
}
// Creates the arguments vector for the invalid-buffer tests
auto invalid_test_vector = std::vector<Arguments<T>>{};
args.n = args.k = tester.kBufferSize;
args.a_ld = args.c_ld = tester.kBufferSize;
args.a_offset = args.c_offset = 0;
for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
invalid_test_vector.push_back(args);
}
}
// Runs the tests
const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(a_transpose);
tester.TestRegular(regular_test_vector, case_name);
tester.TestInvalid(invalid_test_vector, case_name);
}
}
}
}
// =================================================================================================
} // namespace clblast
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
clblast::RunTest<float>(argc, argv, false, "SSYRK");
clblast::RunTest<double>(argc, argv, true, "DSYRK");
clblast::RunTest<clblast::float2>(argc, argv, true, "CSYRK");
clblast::RunTest<clblast::double2>(argc, argv, true, "ZSYRK");
return 0;
}
// =================================================================================================
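Conversely, the Xsyr2k and Xsyrk testers above loop over Transpose::kNo and Transpose::kYes only. The inline comments attribute the missing conjugate case to clBLAS; it also matches the BLAS standard, where the complex symmetric rank-k updates are defined for the 'N' and 'T' forms only, the conjugate-transpose variants being covered by the HERK/HER2K family instead:

  xSYRK:  C := alpha*A*A^T + beta*C    (Transpose::kNo)
          C := alpha*A^T*A + beta*C    (Transpose::kYes)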

View file

@@ -0,0 +1,96 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the tests for the Xtrmm routine.
//
// =================================================================================================
#include "correctness/testblas.h"
#include "routines/level3/xtrmm.h"
namespace clblast {
// =================================================================================================
// The correctness tester
template <typename T>
void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
// Creates a tester
TestBlas<T,T> tester{argc, argv, silent, name, TestXtrmm<T>::GetOptions(),
TestXtrmm<T>::RunRoutine, TestXtrmm<T>::RunReference,
TestXtrmm<T>::DownloadResult, TestXtrmm<T>::GetResultIndex,
TestXtrmm<T>::ResultID1, TestXtrmm<T>::ResultID2};
// This variable holds the arguments relevant for this routine
auto args = Arguments<T>{};
// Loops over the test-cases from a data-layout point of view
for (auto &layout: tester.kLayouts) { args.layout = layout;
for (auto &side: tester.kSides) { args.side = side;
for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
for (auto &a_transpose: tester.kTransposes) { args.a_transpose = a_transpose;
for (auto &diagonal: tester.kDiagonals) { args.diagonal = diagonal;
// Creates the arguments vector for the regular tests
auto regular_test_vector = std::vector<Arguments<T>>{};
for (auto &m: tester.kMatrixDims) { args.m = m;
for (auto &n: tester.kMatrixDims) { args.n = n;
for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
args.a_size = TestXtrmm<T>::GetSizeA(args);
args.b_size = TestXtrmm<T>::GetSizeB(args);
if (args.a_size<1 || args.b_size<1) { continue; }
regular_test_vector.push_back(args);
}
}
}
}
}
}
}
// Creates the arguments vector for the invalid-buffer tests
auto invalid_test_vector = std::vector<Arguments<T>>{};
args.m = args.n = tester.kBufferSize;
args.a_ld = args.b_ld = tester.kBufferSize;
args.a_offset = args.b_offset = 0;
for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
invalid_test_vector.push_back(args);
}
}
// Runs the tests
const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle)+" "+
ToString(a_transpose)+" "+ToString(diagonal);
tester.TestRegular(regular_test_vector, case_name);
tester.TestInvalid(invalid_test_vector, case_name);
}
}
}
}
}
}
// =================================================================================================
} // namespace clblast
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
clblast::RunTest<float>(argc, argv, false, "STRMM");
clblast::RunTest<double>(argc, argv, true, "DTRMM");
clblast::RunTest<clblast::float2>(argc, argv, true, "CTRMM");
clblast::RunTest<clblast::double2>(argc, argv, true, "ZTRMM");
return 0;
}
// =================================================================================================
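Unlike the other level-3 testers above, the Xtrmm loops carry no beta values and use only two buffers, because TRMM overwrites its B operand in place:

  side == Side::kLeft :  B := alpha*op(A)*B
  side == Side::kRight:  B := alpha*B*op(A)

Here op(A) is A, A^T or A^H as selected by a_transpose, and A is an upper or lower triangular matrix with a unit or non-unit diagonal, hence the extra loop over kDiagonals.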

View file

@@ -1,75 +0,0 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the tests for the Xaxpy routine. It is based on the TestXY class.
//
// =================================================================================================
#include "wrapper_clblas.h"
#include "correctness/testxy.h"
namespace clblast {
// =================================================================================================
// The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison.
template <typename T>
void XaxpyTest(int argc, char *argv[], const bool silent, const std::string &name) {
// Creates the CLBlast lambda
auto clblast_lambda = [](const Arguments<T> &args,
const Buffer &x_vec, const Buffer &y_vec,
CommandQueue &queue) -> StatusCode {
auto queue_plain = queue();
auto event = cl_event{};
return Axpy(args.n, args.alpha,
x_vec(), args.x_offset, args.x_inc,
y_vec(), args.y_offset, args.y_inc,
&queue_plain, &event);
};
// Creates the clBLAS lambda (for comparison)
auto clblas_lambda = [](const Arguments<T> &args,
const Buffer &x_vec, const Buffer &y_vec,
CommandQueue &queue) -> StatusCode {
auto queue_plain = queue();
auto event = cl_event{};
auto status = clblasXaxpy(args.n, args.alpha,
x_vec(), args.x_offset, args.x_inc,
y_vec(), args.y_offset, args.y_inc,
1, &queue_plain, 0, nullptr, &event);
return static_cast<StatusCode>(status);
};
// Initializes the arguments relevant for this routine
auto args = Arguments<T>{};
const auto options = std::vector<std::string>{kArgN, kArgXInc, kArgYInc,
kArgXOffset, kArgYOffset, kArgAlpha};
// Creates a tester
TestXY<T> tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda};
// Runs the tests
const auto case_name = "default";
tester.TestRegular(args, case_name);
tester.TestInvalidBufferSizes(args, case_name);
}
// =================================================================================================
} // namespace clblast
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
clblast::XaxpyTest<float>(argc, argv, false, "SAXPY");
clblast::XaxpyTest<double>(argc, argv, true, "DAXPY");
clblast::XaxpyTest<clblast::float2>(argc, argv, true, "CAXPY");
clblast::XaxpyTest<clblast::double2>(argc, argv, true, "ZAXPY");
return 0;
}
// =================================================================================================

View file

@@ -1,98 +0,0 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the tests for the Xgemm routine. It is based on the TestABC class.
//
// =================================================================================================
#include "wrapper_clblas.h"
#include "correctness/testabc.h"
namespace clblast {
// =================================================================================================
// The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison.
template <typename T>
void XgemmTest(int argc, char *argv[], const bool silent, const std::string &name) {
// Creates the CLBlast lambda
auto clblast_lambda = [](const Arguments<T> &args,
const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
CommandQueue &queue) -> StatusCode {
auto queue_plain = queue();
auto event = cl_event{};
return Gemm(args.layout, args.a_transpose, args.b_transpose,
args.m, args.n, args.k,
args.alpha,
a_mat(), args.a_offset, args.a_ld,
b_mat(), args.b_offset, args.b_ld,
args.beta,
c_mat(), args.c_offset, args.c_ld,
&queue_plain, &event);
};
// Creates the clBLAS lambda (for comparison)
auto clblas_lambda = [](const Arguments<T> &args,
const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
CommandQueue &queue) -> StatusCode {
auto queue_plain = queue();
auto event = cl_event{};
auto status = clblasXgemm(static_cast<clblasOrder>(args.layout),
static_cast<clblasTranspose>(args.a_transpose),
static_cast<clblasTranspose>(args.b_transpose),
args.m, args.n, args.k,
args.alpha,
a_mat(), args.a_offset, args.a_ld,
b_mat(), args.b_offset, args.b_ld,
args.beta,
c_mat(), args.c_offset, args.c_ld,
1, &queue_plain, 0, nullptr, &event);
return static_cast<StatusCode>(status);
};
// Initializes the arguments relevant for this routine
auto args = Arguments<T>{};
const auto options = std::vector<std::string>{kArgM, kArgN, kArgK, kArgLayout,
kArgATransp, kArgBTransp,
kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
kArgAOffset, kArgBOffset, kArgCOffset};
// Creates a tester
TestABC<T> tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda};
// Loops over the test-cases from a data-layout point of view
for (auto &layout: tester.kLayouts) {
args.layout = layout;
for (auto &a_transpose: tester.kTransposes) {
args.a_transpose = a_transpose;
for (auto &b_transpose: tester.kTransposes) {
args.b_transpose = b_transpose;
const auto case_name = ToString(layout)+" "+ToString(a_transpose)+" "+ToString(b_transpose);
// Runs the tests
tester.TestRegular(args, case_name);
tester.TestInvalidBufferSizes(args, case_name);
}
}
}
}
// =================================================================================================
} // namespace clblast
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
clblast::XgemmTest<float>(argc, argv, false, "SGEMM");
clblast::XgemmTest<double>(argc, argv, true, "DGEMM");
clblast::XgemmTest<clblast::float2>(argc, argv, true, "CGEMM");
clblast::XgemmTest<clblast::double2>(argc, argv, true, "ZGEMM");
return 0;
}
// =================================================================================================

View file

@@ -1,88 +0,0 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the tests for the Xgemv routine. It is based on the TestAXY class.
//
// =================================================================================================
#include "wrapper_clblas.h"
#include "correctness/testaxy.h"
namespace clblast {
// =================================================================================================
// The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison.
template <typename T>
void XgemvTest(int argc, char *argv[], const bool silent, const std::string &name) {
// Creates the CLBlast lambda
auto clblast_lambda = [](const Arguments<T> &args,
const Buffer &a_mat, const Buffer &x_vec, const Buffer &y_vec,
CommandQueue &queue) -> StatusCode {
auto queue_plain = queue();
auto event = cl_event{};
return Gemv(args.layout, args.a_transpose, args.m, args.n, args.alpha,
a_mat(), args.a_offset, args.a_ld,
x_vec(), args.x_offset, args.x_inc, args.beta,
y_vec(), args.y_offset, args.y_inc,
&queue_plain, &event);
};
// Creates the clBLAS lambda (for comparison)
auto clblas_lambda = [](const Arguments<T> &args,
const Buffer &a_mat, const Buffer &x_vec, const Buffer &y_vec,
CommandQueue &queue) -> StatusCode {
auto queue_plain = queue();
auto event = cl_event{};
auto status = clblasXgemv(static_cast<clblasOrder>(args.layout),
static_cast<clblasTranspose>(args.a_transpose),
args.m, args.n, args.alpha,
a_mat(), args.a_offset, args.a_ld,
x_vec(), args.x_offset, args.x_inc, args.beta,
y_vec(), args.y_offset, args.y_inc,
1, &queue_plain, 0, nullptr, &event);
return static_cast<StatusCode>(status);
};
// Initializes the arguments relevant for this routine
auto args = Arguments<T>{};
const auto options = std::vector<std::string>{kArgM, kArgN, kArgLayout, kArgATransp,
kArgALeadDim, kArgXInc, kArgYInc,
kArgAOffset, kArgXOffset, kArgYOffset};
// Creates a tester
TestAXY<T> tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda};
// Loops over the test-cases from a data-layout point of view
for (auto &layout: tester.kLayouts) {
args.layout = layout;
for (auto &a_transpose: tester.kTransposes) {
args.a_transpose = a_transpose;
const auto case_name = ToString(layout)+" "+ToString(a_transpose);
// Runs the tests
tester.TestRegular(args, case_name);
tester.TestInvalidBufferSizes(args, case_name);
}
}
}
// =================================================================================================
} // namespace clblast
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
clblast::XgemvTest<float>(argc, argv, false, "SGEMV");
clblast::XgemvTest<double>(argc, argv, true, "DGEMV");
clblast::XgemvTest<clblast::float2>(argc, argv, true, "CGEMV");
clblast::XgemvTest<clblast::double2>(argc, argv, true, "ZGEMV");
return 0;
}
// =================================================================================================

View file

@@ -1,98 +0,0 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the tests for the Xsymm routine. It is based on the TestABC class.
//
// =================================================================================================
#include "wrapper_clblas.h"
#include "correctness/testabc.h"
namespace clblast {
// =================================================================================================
// The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison.
template <typename T>
void XsymmTest(int argc, char *argv[], const bool silent, const std::string &name) {
// Creates the CLBlast lambda
auto clblast_lambda = [](const Arguments<T> &args,
const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
CommandQueue &queue) -> StatusCode {
auto queue_plain = queue();
auto event = cl_event{};
return Symm(args.layout, args.side, args.triangle,
args.m, args.n,
args.alpha,
a_mat(), args.a_offset, args.a_ld,
b_mat(), args.b_offset, args.b_ld,
args.beta,
c_mat(), args.c_offset, args.c_ld,
&queue_plain, &event);
};
// Creates the clBLAS lambda (for comparison)
auto clblas_lambda = [](const Arguments<T> &args,
const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
CommandQueue &queue) -> StatusCode {
auto queue_plain = queue();
auto event = cl_event{};
auto status = clblasXsymm(static_cast<clblasOrder>(args.layout),
static_cast<clblasSide>(args.side),
static_cast<clblasUplo>(args.triangle),
args.m, args.n,
args.alpha,
a_mat(), args.a_offset, args.a_ld,
b_mat(), args.b_offset, args.b_ld,
args.beta,
c_mat(), args.c_offset, args.c_ld,
1, &queue_plain, 0, nullptr, &event);
return static_cast<StatusCode>(status);
};
// Initializes the arguments relevant for this routine
auto args = Arguments<T>{};
const auto options = std::vector<std::string>{kArgM, kArgN, kArgLayout,
kArgSide, kArgTriangle,
kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
kArgAOffset, kArgBOffset, kArgCOffset};
// Creates a tester
TestABC<T> tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda};
// Loops over the test-cases from a data-layout point of view
for (auto &layout: tester.kLayouts) {
args.layout = layout;
for (auto &side: {Side::kLeft, Side::kRight}) {
args.side = side;
for (auto &triangle: {Triangle::kUpper, Triangle::kLower}) {
args.triangle = triangle;
const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle);
// Runs the tests
tester.TestRegular(args, case_name);
tester.TestInvalidBufferSizes(args, case_name);
}
}
}
}
// =================================================================================================
} // namespace clblast
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
clblast::XsymmTest<float>(argc, argv, false, "SSYMM");
clblast::XsymmTest<double>(argc, argv, true, "DSYMM");
clblast::XsymmTest<clblast::float2>(argc, argv, true, "CSYMM");
clblast::XsymmTest<clblast::double2>(argc, argv, true, "ZSYMM");
return 0;
}
// =================================================================================================

View file

@@ -1,217 +0,0 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the TestABC class (see the header for information about the class).
//
// =================================================================================================
#include <algorithm>
#include "correctness/testabc.h"
namespace clblast {
// =================================================================================================
// Constructor, initializes the base class tester and input data
template <typename T>
TestABC<T>::TestABC(int argc, char *argv[], const bool silent,
const std::string &name, const std::vector<std::string> &options,
const Routine clblast_lambda, const Routine clblas_lambda):
Tester<T>{argc, argv, silent, name, options},
clblast_lambda_(clblast_lambda),
clblas_lambda_(clblas_lambda) {
// Computes the maximum sizes. This allows for a single set of input/output buffers.
auto max_dim = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
auto max_ld = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
// Creates test input data
a_source_.resize(max_dim*max_ld + max_offset);
b_source_.resize(max_dim*max_ld + max_offset);
c_source_.resize(max_dim*max_ld + max_offset);
PopulateVector(a_source_);
PopulateVector(b_source_);
PopulateVector(c_source_);
}
// ===============================================================================================
// Tests the routine for a wide variety of parameters
template <typename T>
void TestABC<T>::TestRegular(Arguments<T> &args, const std::string &name) {
if (!PrecisionSupported()) { return; }
TestStart("regular behaviour", name);
// Determines whether each matrix is "rotated" in memory. The assumed default is
// column-major storage without transposition; if exactly one of those two properties
// deviates from that default (but not both), the matrix is considered rotated.
auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
(args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose != Transpose::kNo) ||
(args.layout == Layout::kRowMajor && args.b_transpose == Transpose::kNo);
auto c_rotated = (args.layout == Layout::kRowMajor);
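// Example: a row-major, non-transposed A is considered rotated, so its second
// dimension (a_two below) becomes m rather than k: the buffer then spans m rows
// of a_ld elements instead of k columns.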
// Iterates over the matrix dimensions
for (auto &m: kMatrixDims) {
args.m = m;
for (auto &n: kMatrixDims) {
args.n = n;
for (auto &k: kMatrixDims) {
args.k = k;
// Computes the second dimensions of the matrices taking the rotation into account
auto a_two = (a_rotated) ? m : k;
auto b_two = (b_rotated) ? k : n;
auto c_two = (c_rotated) ? m : n;
// Iterates over the leading-dimension values and the offsets
for (auto &a_ld: kMatrixDims) {
args.a_ld = a_ld;
for (auto &a_offset: kOffsets) {
args.a_offset = a_offset;
for (auto &b_ld: kMatrixDims) {
args.b_ld = b_ld;
for (auto &b_offset: kOffsets) {
args.b_offset = b_offset;
for (auto &c_ld: kMatrixDims) {
args.c_ld = c_ld;
for (auto &c_offset: kOffsets) {
args.c_offset = c_offset;
// Computes the buffer sizes
auto a_size = a_two * a_ld + a_offset;
auto b_size = b_two * b_ld + b_offset;
auto c_size = c_two * c_ld + c_offset;
if (a_size < 1 || b_size < 1 || c_size < 1) { continue; }
// Creates the OpenCL buffers
auto a_mat = Buffer(context_, CL_MEM_READ_WRITE, a_size*sizeof(T));
auto b_mat = Buffer(context_, CL_MEM_READ_WRITE, b_size*sizeof(T));
auto r_mat = Buffer(context_, CL_MEM_READ_WRITE, c_size*sizeof(T));
auto s_mat = Buffer(context_, CL_MEM_READ_WRITE, c_size*sizeof(T));
// Iterates over the values for alpha and beta
for (auto &alpha: kAlphaValues) {
args.alpha = alpha;
for (auto &beta: kBetaValues) {
args.beta = beta;
// Runs the reference clBLAS code
a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
b_mat.WriteBuffer(queue_, b_size*sizeof(T), b_source_);
r_mat.WriteBuffer(queue_, c_size*sizeof(T), c_source_);
auto status1 = clblas_lambda_(args, a_mat, b_mat, r_mat, queue_);
// Runs the CLBlast code
a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
b_mat.WriteBuffer(queue_, b_size*sizeof(T), b_source_);
s_mat.WriteBuffer(queue_, c_size*sizeof(T), c_source_);
auto status2 = clblast_lambda_(args, a_mat, b_mat, s_mat, queue_);
// Tests for equality of the two status codes
if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
TestErrorCodes(status1, status2, args);
continue;
}
// Downloads the results
std::vector<T> r_result(c_size, static_cast<T>(0));
std::vector<T> s_result(c_size, static_cast<T>(0));
r_mat.ReadBuffer(queue_, c_size*sizeof(T), r_result);
s_mat.ReadBuffer(queue_, c_size*sizeof(T), s_result);
// Checks for differences in the output
auto errors = size_t{0};
for (auto idm=size_t{0}; idm<m; ++idm) {
for (auto idn=size_t{0}; idn<n; ++idn) {
auto index = (args.layout == Layout::kRowMajor) ?
idm*args.c_ld + idn + args.c_offset:
idn*args.c_ld + idm + args.c_offset;
if (!TestSimilarity(r_result[index], s_result[index])) {
errors++;
}
}
}
// Tests the error count (should be zero)
TestErrorCount(errors, m*n, args);
}
}
}
}
}
}
}
}
}
}
}
TestEnd();
}
// =================================================================================================
// Tests the routine for cases with invalid OpenCL memory buffer sizes. Only the returned status
// codes are checked; the results (if any) are not compared.
template <typename T>
void TestABC<T>::TestInvalidBufferSizes(Arguments<T> &args, const std::string &name) {
if (!PrecisionSupported()) { return; }
TestStart("invalid buffer sizes", name);
// Sets example test parameters
args.m = kBufferSize;
args.n = kBufferSize;
args.k = kBufferSize;
args.a_ld = kBufferSize;
args.b_ld = kBufferSize;
args.c_ld = kBufferSize;
args.a_offset = 0;
args.b_offset = 0;
args.c_offset = 0;
// Iterates over test buffer sizes
const std::vector<size_t> kBufferSizes = {0, kBufferSize*kBufferSize-1, kBufferSize*kBufferSize};
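// Note: a size of zero elements yields an invalid (null) cl_mem handle below, whereas the
// other two sizes are real buffers that are respectively one element short of and exactly
// equal to what the m = n = k = ld = kBufferSize configuration above requires.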
for (auto &a_size: kBufferSizes) {
for (auto &b_size: kBufferSizes) {
for (auto &c_size: kBufferSizes) {
// Creates the OpenCL buffers. Note: we are not using the C++ version since we explicitly
// want to be able to create invalid buffers (no error checking here).
auto a = clCreateBuffer(context_(), CL_MEM_READ_WRITE, a_size*sizeof(T), nullptr, nullptr);
auto a_mat = Buffer(a);
auto b = clCreateBuffer(context_(), CL_MEM_READ_WRITE, b_size*sizeof(T), nullptr, nullptr);
auto b_mat = Buffer(b);
auto r = clCreateBuffer(context_(), CL_MEM_READ_WRITE, c_size*sizeof(T), nullptr, nullptr);
auto r_mat = Buffer(r);
auto s = clCreateBuffer(context_(), CL_MEM_READ_WRITE, c_size*sizeof(T), nullptr, nullptr);
auto s_mat = Buffer(s);
// Runs the two routines
auto status1 = clblas_lambda_(args, a_mat, b_mat, r_mat, queue_);
auto status2 = clblast_lambda_(args, a_mat, b_mat, s_mat, queue_);
// Tests for equality of the two status codes
TestErrorCodes(status1, status2, args);
}
}
}
TestEnd();
}
// =================================================================================================
// Compiles the templated class
template class TestABC<float>;
template class TestABC<double>;
template class TestABC<float2>;
template class TestABC<double2>;
// =================================================================================================
} // namespace clblast

View file

@@ -1,86 +0,0 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file tests any mat-mat-mat (A,B,C) routine. It contains two types of tests: one testing
// all sorts of input combinations, and one deliberately testing with invalid values.
//
// =================================================================================================
#ifndef CLBLAST_TEST_CORRECTNESS_TESTABC_H_
#define CLBLAST_TEST_CORRECTNESS_TESTABC_H_
#include <vector>
#include <string>
#include "correctness/tester.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class TestABC: public Tester<T> {
public:
// Uses several variables from the Tester class
using Tester<T>::context_;
using Tester<T>::queue_;
using Tester<T>::kLayouts;
using Tester<T>::kTransposes;
// Uses several helper functions from the Tester class
using Tester<T>::TestStart;
using Tester<T>::TestEnd;
using Tester<T>::TestSimilarity;
using Tester<T>::TestErrorCount;
using Tester<T>::TestErrorCodes;
using Tester<T>::GetExampleScalars;
using Tester<T>::GetOffsets;
using Tester<T>::PrecisionSupported;
// Test settings for the regular test. Append to this list in case more tests are required.
const std::vector<size_t> kMatrixDims = { 7, 64 };
const std::vector<size_t> kOffsets = GetOffsets();
const std::vector<T> kAlphaValues = GetExampleScalars();
const std::vector<T> kBetaValues = GetExampleScalars();
// Test settings for the invalid test
const size_t kBufferSize = 64;
// Shorthand for a BLAS routine
using Routine = std::function<StatusCode(const Arguments<T>&,
const Buffer&, const Buffer&, const Buffer&,
CommandQueue&)>;
// Constructor, initializes the base class tester and input data
TestABC(int argc, char *argv[], const bool silent,
const std::string &name, const std::vector<std::string> &options,
const Routine clblast_lambda, const Routine clblas_lambda);
// The test functions, taking the routine arguments and a test-case name as inputs
void TestRegular(Arguments<T> &args, const std::string &name);
void TestInvalidBufferSizes(Arguments<T> &args, const std::string &name);
private:
// Source data to test with
std::vector<T> a_source_;
std::vector<T> b_source_;
std::vector<T> c_source_;
// The routines to test
Routine clblast_lambda_;
Routine clblas_lambda_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_TEST_CORRECTNESS_TESTABC_H_
#endif

View file

@@ -1,213 +0,0 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the TestAXY class (see the header for information about the class).
//
// =================================================================================================
#include <algorithm>
#include "correctness/testaxy.h"
namespace clblast {
// =================================================================================================
// Constructor, initializes the base class tester and input data
template <typename T>
TestAXY<T>::TestAXY(int argc, char *argv[], const bool silent,
const std::string &name, const std::vector<std::string> &options,
const Routine clblast_lambda, const Routine clblas_lambda):
Tester<T>{argc, argv, silent, name, options},
clblast_lambda_(clblast_lambda),
clblas_lambda_(clblas_lambda) {
// Computes the maximum sizes. This allows for a single set of input/output buffers.
auto max_dim = *std::max_element(kMatrixVectorDims.begin(), kMatrixVectorDims.end());
auto max_ld = *std::max_element(kMatrixVectorDims.begin(), kMatrixVectorDims.end());
auto max_inc = *std::max_element(kIncrements.begin(), kIncrements.end());
auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
// Creates test input data
a_source_.resize(max_dim*max_ld + max_offset);
x_source_.resize(max_dim*max_inc + max_offset);
y_source_.resize(max_dim*max_inc + max_offset);
PopulateVector(a_source_);
PopulateVector(x_source_);
PopulateVector(y_source_);
}
// ===============================================================================================
// Tests the routine for a wide variety of parameters
template <typename T>
void TestAXY<T>::TestRegular(Arguments<T> &args, const std::string &name) {
if (!PrecisionSupported()) { return; }
TestStart("regular behaviour", name);
// Iterates over the dimension for the matrix and vectors
for (auto &m: kMatrixVectorDims) {
args.m = m;
for (auto &n: kMatrixVectorDims) {
args.n = n;
// Computes the second dimension of the matrix taking the rotation into account
auto a_two = (args.layout == Layout::kRowMajor) ? args.m : args.n;
// Computes the vector sizes in case the matrix is transposed
auto a_transposed = (args.a_transpose == Transpose::kYes);
auto m_real = (a_transposed) ? n : m;
auto n_real = (a_transposed) ? m : n;
// Iterates over the leading-dimension values and the offsets of the matrix
for (auto &a_ld: kMatrixVectorDims) {
args.a_ld = a_ld;
for (auto &a_offset: kOffsets) {
args.a_offset = a_offset;
// Iterates over the increment-values and the offsets of the vectors
for (auto &x_inc: kIncrements) {
args.x_inc = x_inc;
for (auto &x_offset: kOffsets) {
args.x_offset = x_offset;
for (auto &y_inc: kIncrements) {
args.y_inc = y_inc;
for (auto &y_offset: kOffsets) {
args.y_offset = y_offset;
// Computes the buffer sizes
auto a_size = a_two * a_ld + a_offset;
auto x_size = n_real * x_inc + x_offset;
auto y_size = m_real * y_inc + y_offset;
if (a_size < 1 || x_size < 1 || y_size < 1) { continue; }
// Creates the OpenCL buffers
auto a_mat = Buffer(context_, CL_MEM_READ_WRITE, a_size*sizeof(T));
auto x_vec = Buffer(context_, CL_MEM_READ_WRITE, x_size*sizeof(T));
auto r_vec = Buffer(context_, CL_MEM_READ_WRITE, y_size*sizeof(T));
auto s_vec = Buffer(context_, CL_MEM_READ_WRITE, y_size*sizeof(T));
// Iterates over the values for alpha and beta
for (auto &alpha: kAlphaValues) {
args.alpha = alpha;
for (auto &beta: kBetaValues) {
args.beta = beta;
// Runs the reference clBLAS code
a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
x_vec.WriteBuffer(queue_, x_size*sizeof(T), x_source_);
r_vec.WriteBuffer(queue_, y_size*sizeof(T), y_source_);
auto status1 = clblas_lambda_(args, a_mat, x_vec, r_vec, queue_);
// Runs the CLBlast code
a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
x_vec.WriteBuffer(queue_, x_size*sizeof(T), x_source_);
s_vec.WriteBuffer(queue_, y_size*sizeof(T), y_source_);
auto status2 = clblast_lambda_(args, a_mat, x_vec, s_vec, queue_);
// Tests for equality of the two status codes
if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
TestErrorCodes(status1, status2, args);
continue;
}
// Downloads the results
std::vector<T> r_result(y_size, static_cast<T>(0));
std::vector<T> s_result(y_size, static_cast<T>(0));
r_vec.ReadBuffer(queue_, y_size*sizeof(T), r_result);
s_vec.ReadBuffer(queue_, y_size*sizeof(T), s_result);
// Checks for differences in the output
auto errors = size_t{0};
for (auto idm=size_t{0}; idm<m_real; ++idm) {
auto index = idm*y_inc + y_offset;
if (!TestSimilarity(r_result[index], s_result[index])) {
errors++;
}
}
// Tests the error count (should be zero)
TestErrorCount(errors, m_real, args);
}
}
}
}
}
}
}
}
}
}
TestEnd();
}
// =================================================================================================
// Tests the routine for cases with invalid OpenCL memory buffer sizes. Tests only on return-types,
// does not test for results (if any).
template <typename T>
void TestAXY<T>::TestInvalidBufferSizes(Arguments<T> &args, const std::string &name) {
if (!PrecisionSupported()) { return; }
TestStart("invalid buffer sizes", name);
// Sets example test parameters
args.m = kBufferSize;
args.n = kBufferSize;
args.a_ld = kBufferSize;
args.a_offset = 0;
args.x_offset = 0;
args.y_offset = 0;
// Iterates over test buffer sizes
const std::vector<size_t> kMatrixSizes = {0, kBufferSize*kBufferSize-1, kBufferSize*kBufferSize};
const std::vector<size_t> kVectorSizes = {0, kBufferSize - 1, kBufferSize};
for (auto &a_size: kMatrixSizes) {
for (auto &x_size: kVectorSizes) {
for (auto &y_size: kVectorSizes) {
// Iterates over test increments
for (auto &x_inc: kInvalidIncrements) {
args.x_inc = x_inc;
for (auto &y_inc: kInvalidIncrements) {
args.y_inc = y_inc;
// Creates the OpenCL buffers. Note: we are not using the C++ version since we
// explicitly want to be able to create invalid buffers (no error checking here).
auto a = clCreateBuffer(context_(), CL_MEM_READ_WRITE, a_size*sizeof(T), nullptr, nullptr);
auto a_mat = Buffer(a);
auto x = clCreateBuffer(context_(), CL_MEM_READ_WRITE, x_size*sizeof(T), nullptr, nullptr);
auto x_vec = Buffer(x);
auto r = clCreateBuffer(context_(), CL_MEM_READ_WRITE, y_size*sizeof(T), nullptr, nullptr);
auto r_vec = Buffer(r);
auto s = clCreateBuffer(context_(), CL_MEM_READ_WRITE, y_size*sizeof(T), nullptr, nullptr);
auto s_vec = Buffer(s);
// Runs the two routines
auto status1 = clblas_lambda_(args, a_mat, x_vec, r_vec, queue_);
auto status2 = clblast_lambda_(args, a_mat, x_vec, s_vec, queue_);
// Tests for equality of the two status codes
TestErrorCodes(status1, status2, args);
}
}
}
}
}
TestEnd();
}
// =================================================================================================
// Compiles the templated class
template class TestAXY<float>;
template class TestAXY<double>;
template class TestAXY<float2>;
template class TestAXY<double2>;
// =================================================================================================
} // namespace clblast

View file

@ -1,88 +0,0 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file tests any mat-vec-vec (A,X,Y) routine. It contains two types of tests: one testing
// all sorts of input combinations, and one deliberately testing with invalid values.
//
// =================================================================================================
#ifndef CLBLAST_TEST_CORRECTNESS_TESTAXY_H_
#define CLBLAST_TEST_CORRECTNESS_TESTAXY_H_
#include <vector>
#include <string>
#include "correctness/tester.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class TestAXY: public Tester<T> {
public:
// Uses several variables from the Tester class
using Tester<T>::context_;
using Tester<T>::queue_;
using Tester<T>::kLayouts;
using Tester<T>::kTransposes;
// Uses several helper functions from the Tester class
using Tester<T>::TestStart;
using Tester<T>::TestEnd;
using Tester<T>::TestSimilarity;
using Tester<T>::TestErrorCount;
using Tester<T>::TestErrorCodes;
using Tester<T>::GetExampleScalars;
using Tester<T>::GetOffsets;
using Tester<T>::PrecisionSupported;
// Test settings for the regular test. Append to this list in case more tests are required.
const std::vector<size_t> kMatrixVectorDims = { 61, 512 };
const std::vector<size_t> kOffsets = GetOffsets();
const std::vector<size_t> kIncrements = { 1, 2 };
const std::vector<T> kAlphaValues = GetExampleScalars();
const std::vector<T> kBetaValues = GetExampleScalars();
// Test settings for the invalid test
const std::vector<size_t> kInvalidIncrements = { 0, 1 };
const size_t kBufferSize = 64;
// Shorthand for a BLAS routine
using Routine = std::function<StatusCode(const Arguments<T>&,
const Buffer&, const Buffer&, const Buffer&,
CommandQueue&)>;
// Constructor, initializes the base class tester and input data
TestAXY(int argc, char *argv[], const bool silent,
const std::string &name, const std::vector<std::string> &options,
const Routine clblast_lambda, const Routine clblas_lambda);
// The test functions, taking no inputs
void TestRegular(Arguments<T> &args, const std::string &name);
void TestInvalidBufferSizes(Arguments<T> &args, const std::string &name);
private:
// Source data to test with
std::vector<T> a_source_;
std::vector<T> x_source_;
std::vector<T> y_source_;
// The routines to test
Routine clblast_lambda_;
Routine clblas_lambda_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_TEST_CORRECTNESS_TESTAXY_H_
#endif

View file

@ -0,0 +1,189 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the TestBlas class (see the header for information about the class).
//
// =================================================================================================
#include <algorithm>
#include "correctness/testblas.h"
namespace clblast {
// =================================================================================================
// The transpose-options to test with (data-type dependent)
template <> const std::vector<Transpose> TestBlas<float,float>::kTransposes = {Transpose::kNo, Transpose::kYes};
template <> const std::vector<Transpose> TestBlas<double,double>::kTransposes = {Transpose::kNo, Transpose::kYes};
template <> const std::vector<Transpose> TestBlas<float2,float2>::kTransposes = {Transpose::kNo, Transpose::kYes, Transpose::kConjugate};
template <> const std::vector<Transpose> TestBlas<double2,double2>::kTransposes = {Transpose::kNo, Transpose::kYes, Transpose::kConjugate};
template <> const std::vector<Transpose> TestBlas<float2,float>::kTransposes = {Transpose::kNo, Transpose::kConjugate};
template <> const std::vector<Transpose> TestBlas<double2,double>::kTransposes = {Transpose::kNo, Transpose::kConjugate};
// =================================================================================================
// Constructor, initializes the base class tester and input data
template <typename T, typename U>
TestBlas<T,U>::TestBlas(int argc, char *argv[], const bool silent,
const std::string &name, const std::vector<std::string> &options,
const Routine run_routine, const Routine run_reference,
const ResultGet get_result, const ResultIndex get_index,
const ResultIterator get_id1, const ResultIterator get_id2):
Tester<T,U>{argc, argv, silent, name, options},
run_routine_(run_routine),
run_reference_(run_reference),
get_result_(get_result),
get_index_(get_index),
get_id1_(get_id1),
get_id2_(get_id2) {
// Computes the maximum sizes. This allows for a single set of input/output buffers.
auto max_vec = *std::max_element(kVectorDims.begin(), kVectorDims.end());
auto max_inc = *std::max_element(kIncrements.begin(), kIncrements.end());
auto max_mat = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
auto max_ld = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
auto max_matvec = *std::max_element(kMatrixVectorDims.begin(), kMatrixVectorDims.end());
auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
// Creates test input data
x_source_.resize(std::max(max_vec, max_matvec)*max_inc + max_offset);
y_source_.resize(std::max(max_vec, max_matvec)*max_inc + max_offset);
a_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
b_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
c_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
PopulateVector(x_source_);
PopulateVector(y_source_);
PopulateVector(a_source_);
PopulateVector(b_source_);
PopulateVector(c_source_);
}
// ===============================================================================================
// Tests the routine for a wide variety of parameters
template <typename T, typename U>
void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const std::string &name) {
if (!PrecisionSupported<T>(device_)) { return; }
TestStart("regular behaviour", name);
// Iterates over all the to-be-tested combinations of arguments
for (auto &args: test_vector) {
// Runs the reference clBLAS code
auto x_vec1 = Buffer(context_, CL_MEM_READ_WRITE, args.x_size*sizeof(T));
auto y_vec1 = Buffer(context_, CL_MEM_READ_WRITE, args.y_size*sizeof(T));
auto a_mat1 = Buffer(context_, CL_MEM_READ_WRITE, args.a_size*sizeof(T));
auto b_mat1 = Buffer(context_, CL_MEM_READ_WRITE, args.b_size*sizeof(T));
auto c_mat1 = Buffer(context_, CL_MEM_READ_WRITE, args.c_size*sizeof(T));
x_vec1.WriteBuffer(queue_, args.x_size*sizeof(T), x_source_);
y_vec1.WriteBuffer(queue_, args.y_size*sizeof(T), y_source_);
a_mat1.WriteBuffer(queue_, args.a_size*sizeof(T), a_source_);
b_mat1.WriteBuffer(queue_, args.b_size*sizeof(T), b_source_);
c_mat1.WriteBuffer(queue_, args.c_size*sizeof(T), c_source_);
auto buffers1 = Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1};
auto status1 = run_reference_(args, buffers1, queue_);
// Runs the CLBlast code
auto x_vec2 = Buffer(context_, CL_MEM_READ_WRITE, args.x_size*sizeof(T));
auto y_vec2 = Buffer(context_, CL_MEM_READ_WRITE, args.y_size*sizeof(T));
auto a_mat2 = Buffer(context_, CL_MEM_READ_WRITE, args.a_size*sizeof(T));
auto b_mat2 = Buffer(context_, CL_MEM_READ_WRITE, args.b_size*sizeof(T));
auto c_mat2 = Buffer(context_, CL_MEM_READ_WRITE, args.c_size*sizeof(T));
x_vec2.WriteBuffer(queue_, args.x_size*sizeof(T), x_source_);
y_vec2.WriteBuffer(queue_, args.y_size*sizeof(T), y_source_);
a_mat2.WriteBuffer(queue_, args.a_size*sizeof(T), a_source_);
b_mat2.WriteBuffer(queue_, args.b_size*sizeof(T), b_source_);
c_mat2.WriteBuffer(queue_, args.c_size*sizeof(T), c_source_);
auto buffers2 = Buffers{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2};
auto status2 = run_routine_(args, buffers2, queue_);
// Tests for equality of the two status codes
if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
TestErrorCodes(status1, status2, args);
continue;
}
// Downloads the results
auto result1 = get_result_(args, buffers1, queue_);
auto result2 = get_result_(args, buffers2, queue_);
// Checks for differences in the output
auto errors = size_t{0};
for (auto id1=size_t{0}; id1<get_id1_(args); ++id1) {
for (auto id2=size_t{0}; id2<get_id2_(args); ++id2) {
auto index = get_index_(args, id1, id2);
if (!TestSimilarity(result1[index], result2[index])) {
errors++;
}
}
}
// Tests the error count (should be zero)
TestErrorCount(errors, get_id1_(args)*get_id2_(args), args);
}
TestEnd();
}
// =================================================================================================
// Tests the routine for cases with invalid OpenCL memory buffer sizes. Tests only on return-types,
// does not test for results (if any).
template <typename T, typename U>
void TestBlas<T,U>::TestInvalid(std::vector<Arguments<U>> &test_vector, const std::string &name) {
if (!PrecisionSupported<T>(device_)) { return; }
TestStart("invalid buffer sizes", name);
// Iterates over all the to-be-tested combinations of arguments
for (auto &args: test_vector) {
// Creates the OpenCL buffers. Note: we are not using the C++ version since we explicitly
// want to be able to create invalid buffers (no error checking here).
auto x1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr);
auto y1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr);
auto a1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr);
auto b1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr);
auto c1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr);
auto x_vec1 = Buffer(x1);
auto y_vec1 = Buffer(y1);
auto a_mat1 = Buffer(a1);
auto b_mat1 = Buffer(b1);
auto c_mat1 = Buffer(c1);
auto x2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr);
auto y2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr);
auto a2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr);
auto b2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr);
auto c2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr);
auto x_vec2 = Buffer(x2);
auto y_vec2 = Buffer(y2);
auto a_mat2 = Buffer(a2);
auto b_mat2 = Buffer(b2);
auto c_mat2 = Buffer(c2);
// Runs the two routines
auto status1 = run_reference_(args, Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1}, queue_);
auto status2 = run_routine_(args, Buffers{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2}, queue_);
// Tests for equality of the two status codes
TestErrorCodes(status1, status2, args);
}
TestEnd();
}
// =================================================================================================
// Compiles the templated class
template class TestBlas<float, float>;
template class TestBlas<double, double>;
template class TestBlas<float2, float2>;
template class TestBlas<double2, double2>;
template class TestBlas<float2, float>;
template class TestBlas<double2, double>;
// =================================================================================================
} // namespace clblast

test/correctness/testblas.h (new file, 106 lines)
View file

@ -0,0 +1,106 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file tests any CLBlast routine. It contains two types of tests: one testing all sorts of
// input combinations, and one deliberately testing with invalid values.
// Typename T: the data-type of the routine's memory buffers (==precision)
// Typename U: the data-type of the alpha and beta arguments
//
// =================================================================================================
#ifndef CLBLAST_TEST_CORRECTNESS_TESTBLAS_H_
#define CLBLAST_TEST_CORRECTNESS_TESTBLAS_H_
#include <vector>
#include <string>
#include "correctness/tester.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T, typename U>
class TestBlas: public Tester<T,U> {
public:
// Uses several variables from the Tester class
using Tester<T,U>::context_;
using Tester<T,U>::queue_;
using Tester<T,U>::full_test_;
using Tester<T,U>::device_;
// Uses several helper functions from the Tester class
using Tester<T,U>::TestStart;
using Tester<T,U>::TestEnd;
using Tester<T,U>::TestErrorCount;
using Tester<T,U>::TestErrorCodes;
using Tester<T,U>::GetOffsets;
// Test settings for the regular test. Append to these lists in case more tests are required.
const std::vector<size_t> kVectorDims = { 7, 93, 4096 };
const std::vector<size_t> kIncrements = { 1, 2, 7 };
const std::vector<size_t> kMatrixDims = { 7, 64 };
const std::vector<size_t> kMatrixVectorDims = { 61, 512 };
const std::vector<size_t> kOffsets = GetOffsets();
const std::vector<U> kAlphaValues = GetExampleScalars<U>(full_test_);
const std::vector<U> kBetaValues = GetExampleScalars<U>(full_test_);
// Test settings for the invalid tests
const std::vector<size_t> kInvalidIncrements = { 0, 1 };
const size_t kBufferSize = 64;
const std::vector<size_t> kMatSizes = {0, kBufferSize*kBufferSize-1, kBufferSize*kBufferSize};
const std::vector<size_t> kVecSizes = {0, kBufferSize - 1, kBufferSize};
// The layout/transpose/triangle options to test with
const std::vector<Layout> kLayouts = {Layout::kRowMajor, Layout::kColMajor};
const std::vector<Triangle> kTriangles = {Triangle::kUpper, Triangle::kLower};
const std::vector<Side> kSides = {Side::kLeft, Side::kRight};
const std::vector<Diagonal> kDiagonals = {Diagonal::kUnit, Diagonal::kNonUnit};
static const std::vector<Transpose> kTransposes; // Data-type dependent, see .cc-file
// Shorthand for the routine-specific functions passed to the tester
using Routine = std::function<StatusCode(const Arguments<U>&, const Buffers&, CommandQueue&)>;
using ResultGet = std::function<std::vector<T>(const Arguments<U>&, Buffers&, CommandQueue&)>;
using ResultIndex = std::function<size_t(const Arguments<U>&, const size_t, const size_t)>;
using ResultIterator = std::function<size_t(const Arguments<U>&)>;
// Constructor, initializes the base class tester and input data
TestBlas(int argc, char *argv[], const bool silent,
const std::string &name, const std::vector<std::string> &options,
const Routine run_routine, const Routine run_reference, const ResultGet get_result,
const ResultIndex get_index, const ResultIterator get_id1, const ResultIterator get_id2);
// The test functions, taking no inputs
void TestRegular(std::vector<Arguments<U>> &test_vector, const std::string &name);
void TestInvalid(std::vector<Arguments<U>> &test_vector, const std::string &name);
private:
// Source data to test with
std::vector<T> x_source_;
std::vector<T> y_source_;
std::vector<T> a_source_;
std::vector<T> b_source_;
std::vector<T> c_source_;
// The routine-specific functions passed to the tester
Routine run_routine_;
Routine run_reference_;
ResultGet get_result_;
ResultIndex get_index_;
ResultIterator get_id1_;
ResultIterator get_id2_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_TEST_CORRECTNESS_TESTBLAS_H_
#endif
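
A hypothetical usage sketch (not taken from the CLBlast sources) of the generic tester declared above: a per-routine test source constructs a TestBlas<T,U> with four functors that run the routine, read back the checked output, and describe its indexing. The T/U split exists for routines such as CHERK, whose buffers are float2 while alpha and beta are plain float (hence the TestBlas<float2,float> instantiation). The function name RunExampleTests and the placeholder lambdas are invented for this sketch; real tests wrap the CLBlast routine and the clBLAS reference instead.

#include "correctness/testblas.h"

namespace clblast {

template <typename T>
void RunExampleTests(int argc, char *argv[]) {
  // Placeholder functors matching the Routine/ResultGet/ResultIndex/ResultIterator signatures
  auto run_placeholder = [](const Arguments<T> &, const Buffers &, CommandQueue &) {
    return StatusCode::kSuccess;  // a real test calls the CLBlast routine or clBLAS reference here
  };
  auto get_result = [](const Arguments<T> &args, Buffers &, CommandQueue &) {
    return std::vector<T>(args.y_size, static_cast<T>(0));  // a real test reads back y (or C)
  };
  auto get_index = [](const Arguments<T> &args, const size_t id1, const size_t) {
    return id1*args.y_inc + args.y_offset;  // how the checked output elements are laid out
  };
  auto get_id1 = [](const Arguments<T> &args) { return args.n; };
  auto get_id2 = [](const Arguments<T> &) { return size_t{1}; };

  // Runs both test modes; building the argument combinations is omitted from this sketch
  TestBlas<T,T> tester{argc, argv, false, "EXAMPLE", {kArgN, kArgXInc, kArgYInc},
                       run_placeholder, run_placeholder, get_result, get_index,
                       get_id1, get_id2};
  auto test_vector = std::vector<Arguments<T>>{};
  tester.TestRegular(test_vector, "example");
  tester.TestInvalid(test_vector, "example");
}

} // namespace clblast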

View file

@ -21,21 +21,11 @@
namespace clblast {
// =================================================================================================
// The layouts and transpose-options to test with (data-type dependent)
template <typename T>
const std::vector<Layout> Tester<T>::kLayouts = {Layout::kRowMajor, Layout::kColMajor};
template <> const std::vector<Transpose> Tester<float>::kTransposes = {Transpose::kNo, Transpose::kYes};
template <> const std::vector<Transpose> Tester<double>::kTransposes = {Transpose::kNo, Transpose::kYes};
template <> const std::vector<Transpose> Tester<float2>::kTransposes = {Transpose::kNo, Transpose::kYes, Transpose::kConjugate};
template <> const std::vector<Transpose> Tester<double2>::kTransposes = {Transpose::kNo, Transpose::kYes, Transpose::kConjugate};
// =================================================================================================
// General constructor for all CLBlast testers. It prints out the test header to stdout and sets-up
// the clBLAS library for reference.
template <typename T>
Tester<T>::Tester(int argc, char *argv[], const bool silent,
const std::string &name, const std::vector<std::string> &options):
template <typename T, typename U>
Tester<T,U>::Tester(int argc, char *argv[], const bool silent,
const std::string &name, const std::vector<std::string> &options):
help_("Options given/available:\n"),
platform_(Platform(GetArgument(argc, argv, help_, kArgPlatform, size_t{0}))),
device_(Device(platform_, kDeviceType, GetArgument(argc, argv, help_, kArgDevice, size_t{0}))),
@ -61,7 +51,7 @@ Tester<T>::Tester(int argc, char *argv[], const bool silent,
kPrintMessage.c_str(), name.c_str(), kPrintEnd.c_str());
// Checks whether the precision is supported
if (!PrecisionSupported()) {
if (!PrecisionSupported<T>(device_)) {
fprintf(stdout, "\n* All tests skipped: %sUnsupported precision%s\n",
kPrintWarning.c_str(), kPrintEnd.c_str());
return;
@ -86,9 +76,9 @@ Tester<T>::Tester(int argc, char *argv[], const bool silent,
}
// Destructor prints the summary of the test cases and cleans-up the clBLAS library
template <typename T>
Tester<T>::~Tester() {
if (PrecisionSupported()) {
template <typename T, typename U>
Tester<T,U>::~Tester() {
if (PrecisionSupported<T>(device_)) {
fprintf(stdout, "* Completed all test-cases for this routine. Results:\n");
fprintf(stdout, " %lu test(s) passed\n", tests_passed_);
if (tests_skipped_ > 0) { fprintf(stdout, "%s", kPrintWarning.c_str()); }
@ -104,8 +94,8 @@ Tester<T>::~Tester() {
// Function called at the start of each test. This prints a header with information about the
// test and re-initializes all test data-structures.
template <typename T>
void Tester<T>::TestStart(const std::string &test_name, const std::string &test_configuration) {
template <typename T, typename U>
void Tester<T,U>::TestStart(const std::string &test_name, const std::string &test_configuration) {
// Prints the header
fprintf(stdout, "* Testing %s'%s'%s for %s'%s'%s:\n",
@ -123,8 +113,8 @@ void Tester<T>::TestStart(const std::string &test_name, const std::string &test_
// Function called at the end of each test. This prints errors if any occurred. It also prints a
// summary of the number of sub-tests passed/failed.
template <typename T>
void Tester<T>::TestEnd() {
template <typename T, typename U>
void Tester<T,U>::TestEnd() {
fprintf(stdout, "\n");
tests_passed_ += num_passed_;
tests_failed_ += num_skipped_;
@ -147,6 +137,7 @@ void Tester<T>::TestEnd() {
if (o == kArgBTransp) { fprintf(stdout, "%s=%d ", kArgBTransp, entry.args.b_transpose);}
if (o == kArgSide) { fprintf(stdout, "%s=%d ", kArgSide, entry.args.side);}
if (o == kArgTriangle) { fprintf(stdout, "%s=%d ", kArgTriangle, entry.args.triangle);}
if (o == kArgDiagonal) { fprintf(stdout, "%s=%d ", kArgDiagonal, entry.args.diagonal);}
if (o == kArgXInc) { fprintf(stdout, "%s=%lu ", kArgXInc, entry.args.x_inc);}
if (o == kArgYInc) { fprintf(stdout, "%s=%lu ", kArgYInc, entry.args.y_inc);}
if (o == kArgXOffset) { fprintf(stdout, "%s=%lu ", kArgXOffset, entry.args.x_offset);}
@ -181,45 +172,9 @@ void Tester<T>::TestEnd() {
// =================================================================================================
// Compares two floating point values and returns whether they are within an acceptable error
// margin. This replaces GTest's EXPECT_NEAR().
template <typename T>
bool Tester<T>::TestSimilarity(const T val1, const T val2) {
const auto difference = std::fabs(val1 - val2);
// Shortcut, handles infinities
if (val1 == val2) {
return true;
}
// The values are zero or very small: the relative error is less meaningful
else if (val1 == 0 || val2 == 0 || difference < static_cast<T>(kErrorMarginAbsolute)) {
return (difference < static_cast<T>(kErrorMarginAbsolute));
}
// Use relative error
else {
return (difference / (std::fabs(val1)+std::fabs(val2))) < static_cast<T>(kErrorMarginRelative);
}
}
// Specialisations for complex data-types
template <>
bool Tester<float2>::TestSimilarity(const float2 val1, const float2 val2) {
auto real = Tester<float>::TestSimilarity(val1.real(), val2.real());
auto imag = Tester<float>::TestSimilarity(val1.imag(), val2.imag());
return (real && imag);
}
template <>
bool Tester<double2>::TestSimilarity(const double2 val1, const double2 val2) {
auto real = Tester<double>::TestSimilarity(val1.real(), val2.real());
auto imag = Tester<double>::TestSimilarity(val1.imag(), val2.imag());
return (real && imag);
}
// =================================================================================================
// Handles a 'pass' or 'error' depending on whether there are any errors
template <typename T>
void Tester<T>::TestErrorCount(const size_t errors, const size_t size, const Arguments<T> &args) {
template <typename T, typename U>
void Tester<T,U>::TestErrorCount(const size_t errors, const size_t size, const Arguments<U> &args) {
// Finished successfully
if (errors == 0) {
@ -237,9 +192,9 @@ void Tester<T>::TestErrorCount(const size_t errors, const size_t size, const Arg
// Compares two status codes for equality. The outcome can be a pass (they are the same), a warning
// (CLBlast reported a compilation error), or an error (they are different).
template <typename T>
void Tester<T>::TestErrorCodes(const StatusCode clblas_status, const StatusCode clblast_status,
const Arguments<T> &args) {
template <typename T, typename U>
void Tester<T,U>::TestErrorCodes(const StatusCode clblas_status, const StatusCode clblast_status,
const Arguments<U> &args) {
// Finished successfully
if (clblas_status == clblast_status) {
@ -270,62 +225,26 @@ void Tester<T>::TestErrorCodes(const StatusCode clblas_status, const StatusCode
// =================================================================================================
// Retrieves a list of example scalar values, used for the alpha and beta arguments for the various
// routines. This function is specialised for the different data-types.
template <>
const std::vector<float> Tester<float>::GetExampleScalars() {
if (full_test_) { return {0.0f, 1.0f, 3.14f}; }
else { return {3.14f}; }
}
template <>
const std::vector<double> Tester<double>::GetExampleScalars() {
if (full_test_) { return {0.0, 1.0, 3.14}; }
else { return {3.14}; }
}
template <>
const std::vector<float2> Tester<float2>::GetExampleScalars() {
if (full_test_) { return {{0.0f, 0.0f}, {1.0f, 1.3f}, {2.42f, 3.14f}}; }
else { return {{2.42f, 3.14f}}; }
}
template <>
const std::vector<double2> Tester<double2>::GetExampleScalars() {
if (full_test_) { return {{0.0, 0.0}, {1.0, 1.3}, {2.42, 3.14}}; }
else { return {{2.42, 3.14}}; }
}
// Retrieves the offset values to test with
template <typename T>
const std::vector<size_t> Tester<T>::GetOffsets() {
template <typename T, typename U>
const std::vector<size_t> Tester<T,U>::GetOffsets() const {
if (full_test_) { return {0, 10}; }
else { return {0}; }
}
// =================================================================================================
template <> bool Tester<float>::PrecisionSupported() const { return true; }
template <> bool Tester<float2>::PrecisionSupported() const { return true; }
template <> bool Tester<double>::PrecisionSupported() const {
auto extensions = device_.Extensions();
return (extensions.find(kKhronosDoublePrecision) == std::string::npos) ? false : true;
}
template <> bool Tester<double2>::PrecisionSupported() const {
auto extensions = device_.Extensions();
return (extensions.find(kKhronosDoublePrecision) == std::string::npos) ? false : true;
}
// =================================================================================================
// A test can either pass, be skipped, or fail
template <typename T>
void Tester<T>::ReportPass() {
template <typename T, typename U>
void Tester<T,U>::ReportPass() {
num_passed_++;
}
template <typename T>
void Tester<T>::ReportSkipped() {
template <typename T, typename U>
void Tester<T,U>::ReportSkipped() {
num_skipped_++;
}
template <typename T>
void Tester<T>::ReportError(const ErrorLogEntry &error_log_entry) {
template <typename T, typename U>
void Tester<T,U>::ReportError(const ErrorLogEntry &error_log_entry) {
error_log_.push_back(error_log_entry);
num_failed_++;
}
@ -334,8 +253,8 @@ void Tester<T>::ReportError(const ErrorLogEntry &error_log_entry) {
// Prints the test-result symbol to screen. This function limits the maximum number of symbols per
// line by printing newlines once every so many calls.
template <typename T>
void Tester<T>::PrintTestResult(const std::string &message) {
template <typename T, typename U>
void Tester<T,U>::PrintTestResult(const std::string &message) {
if (print_count_ == kResultsPerLine) {
print_count_ = 0;
fprintf(stdout, "\n ");
@ -345,13 +264,98 @@ void Tester<T>::PrintTestResult(const std::string &message) {
print_count_++;
}
// =================================================================================================
// Below are the non-member functions (separated because of otherwise required partial class
// template specialization)
// =================================================================================================
// Compares two floating point values and returns whether they are within an acceptable error
// margin. This replaces GTest's EXPECT_NEAR().
template <typename T>
bool TestSimilarity(const T val1, const T val2) {
const auto difference = std::fabs(val1 - val2);
// Set the allowed error margin for floating-point comparisons
constexpr auto kErrorMarginRelative = 1.0e-2;
constexpr auto kErrorMarginAbsolute = 1.0e-10;
// Shortcut, handles infinities
if (val1 == val2) {
return true;
}
// The values are zero or very small: the relative error is less meaningful
else if (val1 == 0 || val2 == 0 || difference < static_cast<T>(kErrorMarginAbsolute)) {
return (difference < static_cast<T>(kErrorMarginAbsolute));
}
// Use relative error
else {
const auto absolute_sum = std::fabs(val1) + std::fabs(val2);
return (difference / absolute_sum) < static_cast<T>(kErrorMarginRelative);
}
}
// Compiles the default case for non-complex data-types
template bool TestSimilarity<float>(const float, const float);
template bool TestSimilarity<double>(const double, const double);
// Specialisations for complex data-types
template <>
bool TestSimilarity(const float2 val1, const float2 val2) {
auto real = TestSimilarity(val1.real(), val2.real());
auto imag = TestSimilarity(val1.imag(), val2.imag());
return (real && imag);
}
template <>
bool TestSimilarity(const double2 val1, const double2 val2) {
auto real = TestSimilarity(val1.real(), val2.real());
auto imag = TestSimilarity(val1.imag(), val2.imag());
return (real && imag);
}
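// For illustration (hypothetical values, not from the CLBlast sources): how the two margins above
// work out for T == float, with inputs chosen to fall just inside/outside each threshold:
//   TestSimilarity(1.000f, 1.005f) == true    relative error 0.005/2.005 ~ 0.0025 < 1.0e-2
//   TestSimilarity(1.000f, 1.050f) == false   relative error 0.050/2.050 ~ 0.0244 > 1.0e-2
//   TestSimilarity(0.0f, 5.0e-11f) == true    absolute difference 5.0e-11 < 1.0e-10
//   TestSimilarity(0.0f, 1.0e-9f)  == false   absolute difference 1.0e-9  > 1.0e-10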
// =================================================================================================
// Retrieves a list of example scalar values, used for the alpha and beta arguments for the various
// routines. This function is specialised for the different data-types.
template <> const std::vector<float> GetExampleScalars(const bool full_test) {
if (full_test) { return {0.0f, 1.0f, 3.14f}; }
else { return {3.14f}; }
}
template <> const std::vector<double> GetExampleScalars(const bool full_test) {
if (full_test) { return {0.0, 1.0, 3.14}; }
else { return {3.14}; }
}
template <> const std::vector<float2> GetExampleScalars(const bool full_test) {
if (full_test) { return {{0.0f, 0.0f}, {1.0f, 1.3f}, {2.42f, 3.14f}}; }
else { return {{2.42f, 3.14f}}; }
}
template <> const std::vector<double2> GetExampleScalars(const bool full_test) {
if (full_test) { return {{0.0, 0.0}, {1.0, 1.3}, {2.42, 3.14}}; }
else { return {{2.42, 3.14}}; }
}
// =================================================================================================
// Returns false if this precision is not supported by the device
template <> bool PrecisionSupported<float>(const Device &) { return true; }
template <> bool PrecisionSupported<float2>(const Device &) { return true; }
template <> bool PrecisionSupported<double>(const Device &device) {
auto extensions = device.Extensions();
return (extensions.find(kKhronosDoublePrecision) == std::string::npos) ? false : true;
}
template <> bool PrecisionSupported<double2>(const Device &device) {
auto extensions = device.Extensions();
return (extensions.find(kKhronosDoublePrecision) == std::string::npos) ? false : true;
}
// =================================================================================================
// Compiles the templated class
template class Tester<float>;
template class Tester<double>;
template class Tester<float2>;
template class Tester<double2>;
template class Tester<float, float>;
template class Tester<double, double>;
template class Tester<float2, float2>;
template class Tester<double2, double2>;
template class Tester<float2, float>;
template class Tester<double2, double>;
// =================================================================================================
} // namespace clblast

View file

@ -10,6 +10,8 @@
// This file implements the Tester class, providing a test-framework. GTest was used before, but
// was not able to handle certain cases (e.g. template type + parameters). This is its (basic)
// custom replacement.
// Typename T: the data-type of the routine's memory buffers (==precision)
// Typename U: the data-type of the alpha and beta arguments
//
// =================================================================================================
@ -30,7 +32,7 @@ namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
template <typename T, typename U>
class Tester {
public:
@ -43,10 +45,6 @@ class Tester {
// Error percentage is not applicable: error was caused by an incorrect status
static constexpr auto kStatusError = -1.0f;
// Set the allowed error margin for floating-point comparisons
static constexpr auto kErrorMarginRelative = 1.0e-2;
static constexpr auto kErrorMarginAbsolute = 1.0e-10;
// Constants holding start and end strings for terminal-output in colour
const std::string kPrintError{"\x1b[31m"};
const std::string kPrintSuccess{"\x1b[32m"};
@ -62,16 +60,12 @@ class Tester {
const std::string kSkippedCompilation{kPrintWarning + "\\" + kPrintEnd};
const std::string kUnsupportedPrecision{kPrintWarning + "o" + kPrintEnd};
// The layouts and transpose-options to test with
static const std::vector<Layout> kLayouts;
static const std::vector<Transpose> kTransposes;
// This structure combines the above log-entry with a status code an error percentage
struct ErrorLogEntry {
StatusCode status_expect;
StatusCode status_found;
float error_percentage;
Arguments<T> args;
Arguments<U> args;
};
// Creates an instance of the tester, running on a particular OpenCL platform and device. It
@ -84,25 +78,13 @@ class Tester {
void TestStart(const std::string &test_name, const std::string &test_configuration);
void TestEnd();
// Compares two floating point values for similarity. Allows for a certain relative error margin.
static bool TestSimilarity(const T val1, const T val2);
// Tests either an error count (should be zero) or two error codes (must match)
void TestErrorCount(const size_t errors, const size_t size, const Arguments<T> &args);
void TestErrorCount(const size_t errors, const size_t size, const Arguments<U> &args);
void TestErrorCodes(const StatusCode clblas_status, const StatusCode clblast_status,
const Arguments<T> &args);
const Arguments<U> &args);
protected:
// Retrieves a list of example scalars of the right type
const std::vector<T> GetExampleScalars();
// Retrieves a list of offset values to test
const std::vector<size_t> GetOffsets();
// Returns false if this precision is not supported by the device
bool PrecisionSupported() const;
// The help-message
std::string help_;
@ -112,6 +94,12 @@ class Tester {
Context context_;
CommandQueue queue_;
// Whether or not to run the full test-suite or just a smoke test
bool full_test_;
// Retrieves the offset values to test with
const std::vector<size_t> GetOffsets() const;
private:
// Internal methods to report a passed, skipped, or failed test
@ -122,9 +110,6 @@ class Tester {
// Prints the error or success symbol to screen
void PrintTestResult(const std::string &message);
// Whether or not to run the full test-suite or just a smoke test
bool full_test_;
// Logging and counting occurrences of errors
std::vector<ErrorLogEntry> error_log_;
size_t num_passed_;
@ -143,6 +128,25 @@ class Tester {
std::vector<std::string> options_;
};
// =================================================================================================
// Below are the non-member functions (separated because of otherwise required partial class
// template specialization)
// =================================================================================================
// Compares two floating point values and returns whether they are within an acceptable error
// margin. This replaces GTest's EXPECT_NEAR().
template <typename T>
bool TestSimilarity(const T val1, const T val2);
// Retrieves a list of example scalar values, used for the alpha and beta arguments for the various
// routines. This function is specialised for the different data-types.
template <typename T>
const std::vector<T> GetExampleScalars(const bool full_test);
// Returns false if this precision is not supported by the device
template <typename T>
bool PrecisionSupported(const Device &device);
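// For illustration (hypothetical code, not from the CLBlast sources): the free-function form is
// needed because, with the extra U parameter, a per-type variant of a member such as
//
//   template <typename U>
//   bool Tester<float2,U>::TestSimilarity(const float2, const float2);
//
// is not valid C++: a member cannot be specialised for T = float2 with U left open without
// partially specialising the entire Tester class. A free function template, by contrast, can
// simply be fully specialised for float2 and double2, as done in tester.cc.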
// =================================================================================================
} // namespace clblast

View file

@ -1,176 +0,0 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the TestXY class (see the header for information about the class).
//
// =================================================================================================
#include <algorithm>
#include "correctness/testxy.h"
namespace clblast {
// =================================================================================================
// Constructor, initializes the base class tester and input data
template <typename T>
TestXY<T>::TestXY(int argc, char *argv[], const bool silent,
const std::string &name, const std::vector<std::string> &options,
const Routine clblast_lambda, const Routine clblas_lambda):
Tester<T>{argc, argv, silent, name, options},
clblast_lambda_(clblast_lambda),
clblas_lambda_(clblas_lambda) {
// Computes the maximum sizes. This allows for a single set of input/output buffers.
auto max_dim = *std::max_element(kVectorDims.begin(), kVectorDims.end());
auto max_inc = *std::max_element(kIncrements.begin(), kIncrements.end());
auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
// Creates test input data
x_source_.resize(max_dim*max_inc + max_offset);
y_source_.resize(max_dim*max_inc + max_offset);
PopulateVector(x_source_);
PopulateVector(y_source_);
}
// ===============================================================================================
// Tests the routine for a wide variety of parameters
template <typename T>
void TestXY<T>::TestRegular(Arguments<T> &args, const std::string &name) {
if (!PrecisionSupported()) { return; }
TestStart("regular behaviour", name);
// Iterates over the vector dimension
for (auto &n: kVectorDims) {
args.n = n;
// Iterates over the increment-values and the offsets
for (auto &x_inc: kIncrements) {
args.x_inc = x_inc;
for (auto &x_offset: kOffsets) {
args.x_offset = x_offset;
for (auto &y_inc: kIncrements) {
args.y_inc = y_inc;
for (auto &y_offset: kOffsets) {
args.y_offset = y_offset;
// Computes the buffer sizes
auto x_size = n * x_inc + x_offset;
auto y_size = n * y_inc + y_offset;
if (x_size < 1 || y_size < 1) { continue; }
// Creates the OpenCL buffers
auto x_vec = Buffer(context_, CL_MEM_READ_WRITE, x_size*sizeof(T));
auto r_vec = Buffer(context_, CL_MEM_READ_WRITE, y_size*sizeof(T));
auto s_vec = Buffer(context_, CL_MEM_READ_WRITE, y_size*sizeof(T));
// Iterates over the values for alpha
for (auto &alpha: kAlphaValues) {
args.alpha = alpha;
// Runs the reference clBLAS code
x_vec.WriteBuffer(queue_, x_size*sizeof(T), x_source_);
r_vec.WriteBuffer(queue_, y_size*sizeof(T), y_source_);
auto status1 = clblas_lambda_(args, x_vec, r_vec, queue_);
// Runs the CLBlast code
x_vec.WriteBuffer(queue_, x_size*sizeof(T), x_source_);
s_vec.WriteBuffer(queue_, y_size*sizeof(T), y_source_);
auto status2 = clblast_lambda_(args, x_vec, s_vec, queue_);
// Tests for equality of the two status codes
if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
TestErrorCodes(status1, status2, args);
continue;
}
// Downloads the results
std::vector<T> r_result(y_size, static_cast<T>(0));
std::vector<T> s_result(y_size, static_cast<T>(0));
r_vec.ReadBuffer(queue_, y_size*sizeof(T), r_result);
s_vec.ReadBuffer(queue_, y_size*sizeof(T), s_result);
// Checks for differences in the output
auto errors = size_t{0};
for (auto idn=size_t{0}; idn<n; ++idn) {
auto index = idn*y_inc + y_offset;
if (!TestSimilarity(r_result[index], s_result[index])) {
errors++;
}
}
// Tests the error count (should be zero)
TestErrorCount(errors, n, args);
}
}
}
}
}
}
TestEnd();
}
// =================================================================================================
// Tests the routine for cases with invalid OpenCL memory buffer sizes. Tests only on return-types,
// does not test for results (if any).
template <typename T>
void TestXY<T>::TestInvalidBufferSizes(Arguments<T> &args, const std::string &name) {
if (!PrecisionSupported()) { return; }
TestStart("invalid buffer sizes", name);
// Sets example test parameters
args.n = kBufferSize;
args.x_offset = 0;
args.y_offset = 0;
// Iterates over test buffer sizes
const std::vector<size_t> kBufferSizes = {0, kBufferSize - 1, kBufferSize};
for (auto &x_size: kBufferSizes) {
for (auto &y_size: kBufferSizes) {
// Iterates over test increments
for (auto &x_inc: kInvalidIncrements) {
args.x_inc = x_inc;
for (auto &y_inc: kInvalidIncrements) {
args.y_inc = y_inc;
// Creates the OpenCL buffers. Note: we are not using the C++ version since we explicitly
// want to be able to create invalid buffers (no error checking here).
auto x = clCreateBuffer(context_(), CL_MEM_READ_WRITE, x_size*sizeof(T), nullptr, nullptr);
auto x_vec = Buffer(x);
auto r = clCreateBuffer(context_(), CL_MEM_READ_WRITE, y_size*sizeof(T), nullptr, nullptr);
auto r_vec = Buffer(r);
auto s = clCreateBuffer(context_(), CL_MEM_READ_WRITE, y_size*sizeof(T), nullptr, nullptr);
auto s_vec = Buffer(s);
// Runs the two routines
auto status1 = clblas_lambda_(args, x_vec, r_vec, queue_);
auto status2 = clblast_lambda_(args, x_vec, s_vec, queue_);
// Tests for equality of the two status codes
TestErrorCodes(status1, status2, args);
}
}
}
}
TestEnd();
}
// =================================================================================================
// Compiles the templated class
template class TestXY<float>;
template class TestXY<double>;
template class TestXY<float2>;
template class TestXY<double2>;
// =================================================================================================
} // namespace clblast

View file

@ -1,84 +0,0 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file tests any vector-vector (X,Y) routine. It contains two types of tests: one testing
// all sorts of input combinations, and one deliberately testing with invalid values.
//
// =================================================================================================
#ifndef CLBLAST_TEST_CORRECTNESS_TESTXY_H_
#define CLBLAST_TEST_CORRECTNESS_TESTXY_H_
#include <vector>
#include <string>
#include "correctness/tester.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class TestXY: public Tester<T> {
public:
// Uses several variables from the Tester class
using Tester<T>::context_;
using Tester<T>::queue_;
// Uses several helper functions from the Tester class
using Tester<T>::TestStart;
using Tester<T>::TestEnd;
using Tester<T>::TestSimilarity;
using Tester<T>::TestErrorCount;
using Tester<T>::TestErrorCodes;
using Tester<T>::GetExampleScalars;
using Tester<T>::GetOffsets;
using Tester<T>::PrecisionSupported;
// Test settings for the regular test. Append to this list in case more tests are required.
const std::vector<size_t> kVectorDims = { 7, 93, 4096 };
const std::vector<size_t> kOffsets = GetOffsets();
const std::vector<size_t> kIncrements = { 1, 2, 7 };
const std::vector<T> kAlphaValues = GetExampleScalars();
// Test settings for the invalid test
const std::vector<size_t> kInvalidIncrements = { 0, 1 };
const size_t kBufferSize = 512;
// Shorthand for a BLAS routine
using Routine = std::function<StatusCode(const Arguments<T>&,
const Buffer&, const Buffer&,
CommandQueue&)>;
// Constructor, initializes the base class tester and input data
TestXY(int argc, char *argv[], const bool silent,
const std::string &name, const std::vector<std::string> &options,
const Routine clblast_lambda, const Routine clblas_lambda);
// The test functions, taking no inputs
void TestRegular(Arguments<T> &args, const std::string &name);
void TestInvalidBufferSizes(Arguments<T> &args, const std::string &name);
private:
// Source data to test with
std::vector<T> x_source_;
std::vector<T> y_source_;
// The routines to test
Routine clblast_lambda_;
Routine clblas_lambda_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_TEST_CORRECTNESS_TESTXY_H_
#endif

View file

@ -21,249 +21,36 @@
namespace clblast {
// =================================================================================================
// This is the vector-vector variant of the set-up/tear-down client routine.
template <typename T>
void ClientXY(int argc, char *argv[], Routine2<T> client_routine,
const std::vector<std::string> &options) {
// Function to determine how to find the default value of the leading dimension of matrix A.
// Note: this is not relevant for this client but given anyway.
auto default_ld_a = [](const Arguments<T> args) { return args.n; };
// Simple command line argument parser with defaults
auto args = ParseArguments<T>(argc, argv, options, default_ld_a);
if (args.print_help) { return; }
// Prints the header of the output table
PrintTableHeader(args.silent, options);
// Initializes OpenCL and the libraries
auto platform = Platform(args.platform_id);
auto device = Device(platform, kDeviceType, args.device_id);
auto context = Context(device);
auto queue = CommandQueue(context, device);
if (args.compare_clblas) { clblasSetup(); }
// Iterates over all "num_step" values jumping by "step" each time
auto s = size_t{0};
while(true) {
// Computes the data sizes
auto x_size = args.n*args.x_inc + args.x_offset;
auto y_size = args.n*args.y_inc + args.y_offset;
// Populates input host vectors with random data
std::vector<T> x_source(x_size);
std::vector<T> y_source(y_size);
PopulateVector(x_source);
PopulateVector(y_source);
// Creates the vectors on the device
auto x_buffer = Buffer(context, CL_MEM_READ_WRITE, x_size*sizeof(T));
auto y_buffer = Buffer(context, CL_MEM_READ_WRITE, y_size*sizeof(T));
x_buffer.WriteBuffer(queue, x_size*sizeof(T), x_source);
y_buffer.WriteBuffer(queue, y_size*sizeof(T), y_source);
// Runs the routine-specific code
client_routine(args, x_buffer, y_buffer, queue);
// Makes the jump to the next step
++s;
if (s >= args.num_steps) { break; }
args.n += args.step;
}
// Cleans-up and returns
if (args.compare_clblas) { clblasTeardown(); }
// Constructor
template <typename T, typename U>
Client<T,U>::Client(const Routine run_routine, const Routine run_reference,
const std::vector<std::string> &options,
const GetMetric get_flops, const GetMetric get_bytes):
run_routine_(run_routine),
run_reference_(run_reference),
options_(options),
get_flops_(get_flops),
get_bytes_(get_bytes) {
}
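// For illustration (hypothetical, not from the CLBlast sources): assuming GetMetric maps the
// parsed Arguments to an operation or byte count, a GEMM client could supply metrics such as
//   get_flops = [](const Arguments<T> &args) { return 2 * args.m * args.n * args.k; };
//   get_bytes = [](const Arguments<T> &args) {  // reads A, B and C, writes C back
//     return (args.m*args.k + args.k*args.n + 2*args.m*args.n) * sizeof(T);
//   };
// which the client can then divide by the measured runtime to report GFLOPS and GB/s.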
// Compiles the above function
template void ClientXY<float>(int, char **, Routine2<float>, const std::vector<std::string>&);
template void ClientXY<double>(int, char **, Routine2<double>, const std::vector<std::string>&);
template void ClientXY<float2>(int, char **, Routine2<float2>, const std::vector<std::string>&);
template void ClientXY<double2>(int, char **, Routine2<double2>, const std::vector<std::string>&);
// =================================================================================================
// This is the matrix-vector-vector variant of the set-up/tear-down client routine.
template <typename T>
void ClientAXY(int argc, char *argv[], Routine3<T> client_routine,
const std::vector<std::string> &options) {
// Function to determine how to find the default value of the leading dimension of matrix A
auto default_ld_a = [](const Arguments<T> args) { return args.n; };
// Simple command line argument parser with defaults
auto args = ParseArguments<T>(argc, argv, options, default_ld_a);
if (args.print_help) { return; }
// Prints the header of the output table
PrintTableHeader(args.silent, options);
// Initializes OpenCL and the libraries
auto platform = Platform(args.platform_id);
auto device = Device(platform, kDeviceType, args.device_id);
auto context = Context(device);
auto queue = CommandQueue(context, device);
if (args.compare_clblas) { clblasSetup(); }
// Iterates over all "num_step" values jumping by "step" each time
auto s = size_t{0};
while(true) {
// Computes the second dimension of the matrix taking the rotation into account
auto a_two = (args.layout == Layout::kRowMajor) ? args.m : args.n;
// Computes the vector sizes in case the matrix is transposed
auto a_transposed = (args.a_transpose == Transpose::kYes);
auto m_real = (a_transposed) ? args.n : args.m;
auto n_real = (a_transposed) ? args.m : args.n;
// Computes the data sizes
auto a_size = a_two * args.a_ld + args.a_offset;
auto x_size = n_real*args.x_inc + args.x_offset;
auto y_size = m_real*args.y_inc + args.y_offset;
// Populates input host vectors with random data
std::vector<T> a_source(a_size);
std::vector<T> x_source(x_size);
std::vector<T> y_source(y_size);
PopulateVector(a_source);
PopulateVector(x_source);
PopulateVector(y_source);
// Creates the vectors on the device
auto a_buffer = Buffer(context, CL_MEM_READ_WRITE, a_size*sizeof(T));
auto x_buffer = Buffer(context, CL_MEM_READ_WRITE, x_size*sizeof(T));
auto y_buffer = Buffer(context, CL_MEM_READ_WRITE, y_size*sizeof(T));
a_buffer.WriteBuffer(queue, a_size*sizeof(T), a_source);
x_buffer.WriteBuffer(queue, x_size*sizeof(T), x_source);
y_buffer.WriteBuffer(queue, y_size*sizeof(T), y_source);
// Runs the routine-specific code
client_routine(args, a_buffer, x_buffer, y_buffer, queue);
// Makes the jump to the next step
++s;
if (s >= args.num_steps) { break; }
args.m += args.step;
args.n += args.step;
args.a_ld += args.step;
}
// Cleans-up and returns
if (args.compare_clblas) { clblasTeardown(); }
}
// Compiles the above function
template void ClientAXY<float>(int, char **, Routine3<float>, const std::vector<std::string>&);
template void ClientAXY<double>(int, char **, Routine3<double>, const std::vector<std::string>&);
template void ClientAXY<float2>(int, char **, Routine3<float2>, const std::vector<std::string>&);
template void ClientAXY<double2>(int, char **, Routine3<double2>, const std::vector<std::string>&);
// =================================================================================================
// This is the matrix-matrix-matrix variant of the set-up/tear-down client routine.
template <typename T>
void ClientABC(int argc, char *argv[], Routine3<T> client_routine,
const std::vector<std::string> &options) {
// Function to determine how to find the default value of the leading dimension of matrix A
auto default_ld_a = [](const Arguments<T> args) { return args.m; };
// Simple command line argument parser with defaults
auto args = ParseArguments<T>(argc, argv, options, default_ld_a);
if (args.print_help) { return; }
// Prints the header of the output table
PrintTableHeader(args.silent, options);
// Initializes OpenCL and the libraries
auto platform = Platform(args.platform_id);
auto device = Device(platform, kDeviceType, args.device_id);
auto context = Context(device);
auto queue = CommandQueue(context, device);
if (args.compare_clblas) { clblasSetup(); }
// Computes whether or not the matrices are transposed. Note that we assume a default of
// column-major and no-transpose. If one of them is different (but not both), then rotated
// is considered true.
auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose == Transpose::kYes) ||
(args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose == Transpose::kYes) ||
(args.layout == Layout::kRowMajor && args.b_transpose == Transpose::kNo);
auto c_rotated = (args.layout == Layout::kRowMajor);
// Iterates over all "num_step" values jumping by "step" each time
auto s = size_t{0};
while(true) {
// Computes the data sizes
auto a_two = (a_rotated) ? args.m : args.k;
auto b_two = (b_rotated) ? args.k : args.n;
auto c_two = (c_rotated) ? args.m : args.n;
auto a_size = a_two * args.a_ld + args.a_offset;
auto b_size = b_two * args.b_ld + args.b_offset;
auto c_size = c_two * args.c_ld + args.c_offset;
// Populates input host matrices with random data
std::vector<T> a_source(a_size);
std::vector<T> b_source(b_size);
std::vector<T> c_source(c_size);
PopulateVector(a_source);
PopulateVector(b_source);
PopulateVector(c_source);
// Creates the matrices on the device
auto a_buffer = Buffer(context, CL_MEM_READ_WRITE, a_size*sizeof(T));
auto b_buffer = Buffer(context, CL_MEM_READ_WRITE, b_size*sizeof(T));
auto c_buffer = Buffer(context, CL_MEM_READ_WRITE, c_size*sizeof(T));
a_buffer.WriteBuffer(queue, a_size*sizeof(T), a_source);
b_buffer.WriteBuffer(queue, b_size*sizeof(T), b_source);
c_buffer.WriteBuffer(queue, c_size*sizeof(T), c_source);
// Runs the routine-specific code
client_routine(args, a_buffer, b_buffer, c_buffer, queue);
// Makes the jump to the next step
++s;
if (s >= args.num_steps) { break; }
args.m += args.step;
args.n += args.step;
args.k += args.step;
args.a_ld += args.step;
args.b_ld += args.step;
args.c_ld += args.step;
}
// Cleans up and returns
if (args.compare_clblas) { clblasTeardown(); }
}
// Compiles the above function
template void ClientABC<float>(int, char **, Routine3<float>, const std::vector<std::string>&);
template void ClientABC<double>(int, char **, Routine3<double>, const std::vector<std::string>&);
template void ClientABC<float2>(int, char **, Routine3<float2>, const std::vector<std::string>&);
template void ClientABC<double2>(int, char **, Routine3<double2>, const std::vector<std::string>&);
// =================================================================================================
// Parses all arguments available for the CLBlast client testers. Some arguments might not be
// applicable to a particular routine, but they are searched for anyway so that a single common
// argument parser can be used. All arguments have a default value in case they are not found.
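// For example (an illustrative invocation, not prescribed by this commit; the binary name is an
// assumption), running a client as
//   ./client_xgemm -m 2048 -n 2048 -precision 32
// would use the given values for m and n, while k, the layout, the leading dimensions, etc. fall
// back to their defaults.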
template <typename T>
Arguments<T> ParseArguments(int argc, char *argv[], const std::vector<std::string> &options,
const std::function<size_t(const Arguments<T>)> default_ld_a) {
auto args = Arguments<T>{};
template <typename T, typename U>
Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const GetMetric default_a_ld,
const GetMetric default_b_ld, const GetMetric default_c_ld) {
auto args = Arguments<U>{};
auto help = std::string{"Options given/available:\n"};
// These are the options that are not relevant for every client: they are optional
for (auto &o: options) {
for (auto &o: options_) {
// Data-sizes
if (o == kArgM) { args.m = args.k = GetArgument(argc, argv, help, kArgM, 512UL); }
if (o == kArgN) { args.n = GetArgument(argc, argv, help, kArgN, 512UL); }
if (o == kArgK) { args.k = GetArgument(argc, argv, help, kArgK, 512UL); }
if (o == kArgM) { args.m = GetArgument(argc, argv, help, kArgM, 512UL); }
if (o == kArgN) { args.n = GetArgument(argc, argv, help, kArgN, 512UL); }
if (o == kArgK) { args.k = GetArgument(argc, argv, help, kArgK, 512UL); }
// Data-layouts
if (o == kArgLayout) { args.layout = GetArgument(argc, argv, help, kArgLayout, Layout::kRowMajor); }
@ -271,6 +58,7 @@ Arguments<T> ParseArguments(int argc, char *argv[], const std::vector<std::strin
if (o == kArgBTransp) { args.b_transpose = GetArgument(argc, argv, help, kArgBTransp, Transpose::kNo); }
if (o == kArgSide) { args.side = GetArgument(argc, argv, help, kArgSide, Side::kLeft); }
if (o == kArgTriangle) { args.triangle = GetArgument(argc, argv, help, kArgTriangle, Triangle::kUpper); }
if (o == kArgDiagonal) { args.diagonal = GetArgument(argc, argv, help, kArgDiagonal, Diagonal::kUnit); }
// Vector arguments
if (o == kArgXInc) { args.x_inc = GetArgument(argc, argv, help, kArgXInc, size_t{1}); }
@ -279,16 +67,16 @@ Arguments<T> ParseArguments(int argc, char *argv[], const std::vector<std::strin
if (o == kArgYOffset) { args.y_offset = GetArgument(argc, argv, help, kArgYOffset, size_t{0}); }
// Matrix arguments
if (o == kArgALeadDim) { args.a_ld = GetArgument(argc, argv, help, kArgALeadDim, default_ld_a(args)); }
if (o == kArgBLeadDim) { args.b_ld = GetArgument(argc, argv, help, kArgBLeadDim, args.n); }
if (o == kArgCLeadDim) { args.c_ld = GetArgument(argc, argv, help, kArgCLeadDim, args.n); }
if (o == kArgALeadDim) { args.a_ld = GetArgument(argc, argv, help, kArgALeadDim, default_a_ld(args)); }
if (o == kArgBLeadDim) { args.b_ld = GetArgument(argc, argv, help, kArgBLeadDim, default_b_ld(args)); }
if (o == kArgCLeadDim) { args.c_ld = GetArgument(argc, argv, help, kArgCLeadDim, default_c_ld(args)); }
if (o == kArgAOffset) { args.a_offset = GetArgument(argc, argv, help, kArgAOffset, size_t{0}); }
if (o == kArgBOffset) { args.b_offset = GetArgument(argc, argv, help, kArgBOffset, size_t{0}); }
if (o == kArgCOffset) { args.c_offset = GetArgument(argc, argv, help, kArgCOffset, size_t{0}); }
// Scalar values
if (o == kArgAlpha) { args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar<T>()); }
if (o == kArgBeta) { args.beta = GetArgument(argc, argv, help, kArgBeta, GetScalar<T>()); }
if (o == kArgAlpha) { args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar<U>()); }
if (o == kArgBeta) { args.beta = GetArgument(argc, argv, help, kArgBeta, GetScalar<U>()); }
}
// These are the options common to all routines
@ -313,16 +101,92 @@ Arguments<T> ParseArguments(int argc, char *argv[], const std::vector<std::strin
// =================================================================================================
// This is the main performance tester
template <typename T, typename U>
void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes) {
// Prints the header of the output table
PrintTableHeader(args.silent, options_);
// Initializes OpenCL and the libraries
auto platform = Platform(args.platform_id);
auto device = Device(platform, kDeviceType, args.device_id);
auto context = Context(device);
auto queue = CommandQueue(context, device);
if (args.compare_clblas) { clblasSetup(); }
// Iterates over all "num_steps" values, jumping by "step" each time
auto s = size_t{0};
while(true) {
// Sets the buffer sizes (routine-specific)
set_sizes(args);
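// For illustration (a hypothetical example, not taken from this commit), an AXPY-style routine
// could provide a SetMetric functor along the lines of:
//   [](Arguments<float> &args) {
//     args.x_size = args.n*args.x_inc + args.x_offset;
//     args.y_size = args.n*args.y_inc + args.y_offset;
//   }
// (the unused matrix sizes would be given some small non-zero placeholder), so that the buffer
// allocations below match the sizes requested on the command line.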
// Populates the input host vectors and matrices with random data
std::vector<T> x_source(args.x_size);
std::vector<T> y_source(args.y_size);
std::vector<T> a_source(args.a_size);
std::vector<T> b_source(args.b_size);
std::vector<T> c_source(args.c_size);
PopulateVector(x_source);
PopulateVector(y_source);
PopulateVector(a_source);
PopulateVector(b_source);
PopulateVector(c_source);
// Creates the vectors and matrices on the device
auto x_vec = Buffer(context, CL_MEM_READ_WRITE, args.x_size*sizeof(T));
auto y_vec = Buffer(context, CL_MEM_READ_WRITE, args.y_size*sizeof(T));
auto a_mat = Buffer(context, CL_MEM_READ_WRITE, args.a_size*sizeof(T));
auto b_mat = Buffer(context, CL_MEM_READ_WRITE, args.b_size*sizeof(T));
auto c_mat = Buffer(context, CL_MEM_READ_WRITE, args.c_size*sizeof(T));
x_vec.WriteBuffer(queue, args.x_size*sizeof(T), x_source);
y_vec.WriteBuffer(queue, args.y_size*sizeof(T), y_source);
a_mat.WriteBuffer(queue, args.a_size*sizeof(T), a_source);
b_mat.WriteBuffer(queue, args.b_size*sizeof(T), b_source);
c_mat.WriteBuffer(queue, args.c_size*sizeof(T), c_source);
auto buffers = Buffers{x_vec, y_vec, a_mat, b_mat, c_mat};
// Runs the routines and collects the timings
auto ms_clblast = TimedExecution(args.num_runs, args, buffers, queue, run_routine_, "CLBlast");
auto ms_clblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference_, "clBLAS");
// Prints the performance of both libraries
PrintTableRow(args, ms_clblast, ms_clblas);
// Makes the jump to the next step
++s;
if (s >= args.num_steps) { break; }
args.m += args.step;
args.n += args.step;
args.k += args.step;
args.a_ld += args.step;
args.b_ld += args.step;
args.c_ld += args.step;
}
// Cleans up and returns
if (args.compare_clblas) { clblasTeardown(); }
}
// =================================================================================================
// Creates a vector of timing results, filled with execution times of the 'main computation'. The
// timing is performed using the C++11 steady-clock chrono functions. The function returns the
// minimum value found in the vector of timing results, in milliseconds.
double TimedExecution(const size_t num_runs, std::function<void()> main_computation) {
template <typename T, typename U>
double Client<T,U>::TimedExecution(const size_t num_runs, const Arguments<U> &args,
const Buffers &buffers, CommandQueue &queue,
Routine run_blas, const std::string &library_name) {
auto timings = std::vector<double>(num_runs);
for (auto &timing: timings) {
auto start_time = std::chrono::steady_clock::now();
// Executes the main computation
main_computation();
auto status = run_blas(args, buffers, queue);
if (status != StatusCode::kSuccess) {
throw std::runtime_error(library_name+" error: "+ToString(static_cast<int>(status)));
}
// Records and stores the end-time
auto elapsed_time = std::chrono::steady_clock::now() - start_time;
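// The remainder of this function is elided by the diff. Per the comment above, each elapsed time
// is stored in milliseconds and the minimum is returned; a sketch of that reduction (for
// illustration only, not the code from this commit) could be:
//   timing = std::chrono::duration<double, std::milli>(elapsed_time).count();
//   ...
//   return *std::min_element(timings.begin(), timings.end());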
@ -334,7 +198,8 @@ double TimedExecution(const size_t num_runs, std::function<void()> main_computat
// =================================================================================================
// Prints the header of the performance table
void PrintTableHeader(const bool silent, const std::vector<std::string> &args) {
template <typename T, typename U>
void Client<T,U>::PrintTableHeader(const bool silent, const std::vector<std::string> &args) {
if (!silent) {
for (auto i=size_t{0}; i<args.size(); ++i) { fprintf(stdout, "%9s ", ""); }
fprintf(stdout, " | <-- CLBlast --> | <-- clBLAS --> |\n");
@ -345,29 +210,60 @@ void PrintTableHeader(const bool silent, const std::vector<std::string> &args) {
}
// Prints a performance-result row
void PrintTableRow(const std::vector<size_t> &args_int, const std::vector<std::string> &args_string,
const bool no_abbrv, const double ms_clblast, const double ms_clblas,
const unsigned long long flops, const unsigned long long bytes) {
template <typename T, typename U>
void Client<T,U>::PrintTableRow(const Arguments<U>& args, const double ms_clblast,
const double ms_clblas) {
// Creates a vector of relevant variables
auto integers = std::vector<size_t>{};
for (auto &o: options_) {
if (o == kArgM) { integers.push_back(args.m); }
if (o == kArgN) { integers.push_back(args.n); }
else if (o == kArgK) { integers.push_back(args.k); }
else if (o == kArgLayout) { integers.push_back(static_cast<size_t>(args.layout)); }
else if (o == kArgSide) { integers.push_back(static_cast<size_t>(args.side)); }
else if (o == kArgTriangle) { integers.push_back(static_cast<size_t>(args.triangle)); }
else if (o == kArgATransp) { integers.push_back(static_cast<size_t>(args.a_transpose)); }
else if (o == kArgBTransp) { integers.push_back(static_cast<size_t>(args.b_transpose)); }
else if (o == kArgDiagonal) { integers.push_back(static_cast<size_t>(args.diagonal)); }
else if (o == kArgXInc) { integers.push_back(args.x_inc); }
else if (o == kArgYInc) { integers.push_back(args.y_inc); }
else if (o == kArgXOffset) { integers.push_back(args.x_offset); }
else if (o == kArgYOffset) { integers.push_back(args.y_offset); }
else if (o == kArgALeadDim) { integers.push_back(args.a_ld); }
else if (o == kArgBLeadDim) { integers.push_back(args.b_ld); }
else if (o == kArgCLeadDim) { integers.push_back(args.c_ld); }
else if (o == kArgAOffset) { integers.push_back(args.a_offset); }
else if (o == kArgBOffset) { integers.push_back(args.b_offset); }
else if (o == kArgCOffset) { integers.push_back(args.c_offset); }
}
auto strings = std::vector<std::string>{};
for (auto &o: options_) {
if (o == kArgAlpha) { strings.push_back(ToString(args.alpha)); }
else if (o == kArgBeta) { strings.push_back(ToString(args.beta)); }
}
// Computes the GFLOPS and GB/s metrics
auto flops = get_flops_(args);
auto bytes = get_bytes_(args);
auto gflops_clblast = (ms_clblast != 0.0) ? (flops*1e-6)/ms_clblast : 0;
auto gflops_clblas = (ms_clblas != 0.0) ? (flops*1e-6)/ms_clblas : 0;
auto gbs_clblast = (ms_clblast != 0.0) ? (bytes*1e-6)/ms_clblast : 0;
auto gbs_clblas = (ms_clblas != 0.0) ? (bytes*1e-6)/ms_clblas : 0;
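// Worked example (added for clarity, not part of the original diff): a 1024x1024x1024 SGEMM
// performs roughly 2*1024^3 ~= 2.15e9 flops. If get_flops_ reports that count and CLBlast takes
// 10 ms, then gflops_clblast = (2.15e9*1e-6)/10 ~= 215 GFLOPS; the same scaling of get_bytes_
// gives the GB/s numbers.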
// Outputs the argument values
for (auto &argument: args_int) {
if (!no_abbrv && argument >= 1024*1024 && IsMultiple(argument, 1024*1024)) {
for (auto &argument: integers) {
if (!args.no_abbrv && argument >= 1024*1024 && IsMultiple(argument, 1024*1024)) {
fprintf(stdout, "%8luM;", argument/(1024*1024));
}
else if (!no_abbrv && argument >= 1024 && IsMultiple(argument, 1024)) {
else if (!args.no_abbrv && argument >= 1024 && IsMultiple(argument, 1024)) {
fprintf(stdout, "%8luK;", argument/1024);
}
else {
fprintf(stdout, "%9lu;", argument);
}
}
for (auto &argument: args_string) {
for (auto &argument: strings) {
fprintf(stdout, "%9s;", argument.c_str());
}
@ -377,5 +273,15 @@ void PrintTableRow(const std::vector<size_t> &args_int, const std::vector<std::s
ms_clblas, gflops_clblas, gbs_clblas);
}
// =================================================================================================
// Compiles the templated class
template class Client<float,float>;
template class Client<double,double>;
template class Client<float2,float2>;
template class Client<double2,double2>;
template class Client<float2,float>;
template class Client<double2,double>;
// =================================================================================================
} // namespace clblast

View file

@ -7,7 +7,14 @@
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file provides common function declarations to be used with the test clients.
// This class implements the performance-test client. It is generic for all CLBlast routines by
// taking a number of routine-specific functions as arguments, such as how to compute buffer sizes
// or how to get the FLOPS count.
// Typename T: the data-type of the routine's memory buffers (==precision)
// Typename U: the data-type of the alpha and beta arguments
//
// This file also provides the common interface to the performance client (see the 'RunClient'
// function for details).
//
// =================================================================================================
@ -26,61 +33,71 @@
namespace clblast {
// =================================================================================================
// Types of devices to consider
const cl_device_type kDeviceType = CL_DEVICE_TYPE_ALL;
// See comment at top of file for a description of the class
template <typename T, typename U>
class Client {
public:
// Types of devices to consider
const cl_device_type kDeviceType = CL_DEVICE_TYPE_ALL;
// Shorthand for the routine-specific functions passed to the client
using Routine = std::function<StatusCode(const Arguments<U>&, const Buffers&, CommandQueue&)>;
using SetMetric = std::function<void(Arguments<U>&)>;
using GetMetric = std::function<size_t(const Arguments<U>&)>;
// The constructor
Client(const Routine run_routine, const Routine run_reference,
const std::vector<std::string> &options,
const GetMetric get_flops, const GetMetric get_bytes);
// Parses all command-line arguments, filling in the arguments structure. If no value is given on
// the command-line for a particular argument, its default value is used instead.
Arguments<U> ParseArguments(int argc, char *argv[], const GetMetric default_a_ld,
const GetMetric default_b_ld, const GetMetric default_c_ld);
// The main client function, setting up arguments, matrices, OpenCL buffers, etc. After set-up, it
// calls the client routines.
void PerformanceTest(Arguments<U> &args, const SetMetric set_sizes);
private:
// Runs a function a given number of times and returns the execution time of the shortest instance
double TimedExecution(const size_t num_runs, const Arguments<U> &args, const Buffers &buffers,
CommandQueue &queue, Routine run_blas, const std::string &library_name);
// Prints the header of a performance-data table
void PrintTableHeader(const bool silent, const std::vector<std::string> &args);
// Prints a row of performance data, including results of two libraries
void PrintTableRow(const Arguments<U>& args, const double ms_clblast, const double ms_clblas);
// The routine-specific functions passed to the tester
const Routine run_routine_;
const Routine run_reference_;
const std::vector<std::string> options_;
const GetMetric get_flops_;
const GetMetric get_bytes_;
};
// =================================================================================================
// Shorthand for a BLAS routine with 2 or 3 OpenCL buffers as argument
template <typename T>
using Routine2 = std::function<void(const Arguments<T>&,
const Buffer&, const Buffer&,
CommandQueue&)>;
template <typename T>
using Routine3 = std::function<void(const Arguments<T>&,
const Buffer&, const Buffer&, const Buffer&,
CommandQueue&)>;
// The interface to the performance client. This is a separate function in the header such that it
// is automatically compiled for each routine, templated by the parameter "C".
template <typename C, typename T, typename U>
void RunClient(int argc, char *argv[]) {
// =================================================================================================
// Creates a new client
auto client = Client<T,U>(C::RunRoutine, C::RunReference, C::GetOptions(),
C::GetFlops, C::GetBytes);
// These are the main client functions, setting up arguments, matrices, OpenCL buffers, etc. After
// set-up, they call the client routine, passed as an argument to this function.
template <typename T>
void ClientXY(int argc, char *argv[], Routine2<T> client_routine,
const std::vector<std::string> &options);
template <typename T>
void ClientAXY(int argc, char *argv[], Routine3<T> client_routine,
const std::vector<std::string> &options);
template <typename T>
void ClientABC(int argc, char *argv[], Routine3<T> client_routine,
const std::vector<std::string> &options);
// Simple command line argument parser with defaults
auto args = client.ParseArguments(argc, argv, C::DefaultLDA, C::DefaultLDB, C::DefaultLDC);
if (args.print_help) { return; }
// =================================================================================================
// Parses all command-line arguments, filling in the arguments structure. If no value is given on
// the command-line for a particular argument, its default value is used instead.
template <typename T>
Arguments<T> ParseArguments(int argc, char *argv[], const std::vector<std::string> &options,
const std::function<size_t(const Arguments<T>)> default_ld_a);
// Retrieves only the precision command-line argument, since the above function is templated based
// on the precision
Precision GetPrecision(int argc, char *argv[]);
// =================================================================================================
// Runs a function a given number of times and returns the execution time of the shortest instance
double TimedExecution(const size_t num_runs, std::function<void()> main_computation);
// =================================================================================================
// Prints the header of a performance-data table
void PrintTableHeader(const bool silent, const std::vector<std::string> &args);
// Prints a row of performance data, including results of two libraries
void PrintTableRow(const std::vector<size_t> &args_int, const std::vector<std::string> &args_string,
const bool abbreviations, const double ms_clblast, const double ms_clblas,
const unsigned long long flops, const unsigned long long bytes);
// Runs the client
client.PerformanceTest(args, C::SetSizes);
}
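// Illustrative sketch (not part of this commit): a routine descriptor "C" as consumed by
// RunClient above would expose static members roughly as follows. All names and bodies below are
// assumptions added for demonstration only; the real per-routine classes are defined elsewhere.
class ExampleClientXaxpy {
 public:
  static std::vector<std::string> GetOptions() {
    return {kArgN, kArgXInc, kArgYInc, kArgXOffset, kArgYOffset, kArgAlpha};
  }
  static void SetSizes(Arguments<float> &args) {
    args.x_size = args.n*args.x_inc + args.x_offset;
    args.y_size = args.n*args.y_inc + args.y_offset;
    args.a_size = args.b_size = args.c_size = 1;  // unused matrix buffers still get a minimal size
  }
  static size_t DefaultLDA(const Arguments<float> &) { return 1; }  // leading dims unused here
  static size_t DefaultLDB(const Arguments<float> &) { return 1; }
  static size_t DefaultLDC(const Arguments<float> &) { return 1; }
  static size_t GetFlops(const Arguments<float> &args) { return 2*args.n; }          // mul + add
  static size_t GetBytes(const Arguments<float> &args) { return 3*args.n*sizeof(float); }
  static StatusCode RunRoutine(const Arguments<float> &args, const Buffers &buffers,
                               CommandQueue &queue) {
    return StatusCode::kSuccess;  // a real descriptor would call the CLBlast routine here
  }
  static StatusCode RunReference(const Arguments<float> &args, const Buffers &buffers,
                                 CommandQueue &queue) {
    return StatusCode::kSuccess;  // ... and the clBLAS reference routine here
  }
};
// A main() could then simply call: RunClient<ExampleClientXaxpy, float, float>(argc, argv);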
// =================================================================================================
} // namespace clblast

View file

@ -83,7 +83,16 @@ main <- function(routine_name, precision, test_names, test_values,
params_string <- paste(parameters, params_values[[command_id]], collapse=" ")
arguments <- paste(devices_string, params_string, options_string, sep=" ")
print(paste("Running", executable, arguments, sep=" "))
result_string <- system2(command=executable, args=arguments, stdout=TRUE)
raw_result_string <- system2(command=executable, args=arguments, stdout=TRUE)
# Filters the raw output: only lines containing a ";" can be valid result lines
result_string <- c()
for (line in raw_result_string) {
if (grepl(";",line)) {
result_string <- c(result_string, line)
}
}
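# Note: an equivalent vectorized filter (a suggestion for reference, not part of this change)
# would be: result_string <- raw_result_string[grepl(";", raw_result_string)]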
# Reads the result into a dataframe
command_db <- read.csv(text=result_string, sep=";")

View file

@ -35,10 +35,10 @@ test_names <- list(
# Defines the test-cases
test_values <- list(
list(c(128, 128, 128, 0, 0, 0, 16, 128, num_runs, precision)),
list(c(129, 129, 129, 0, 0, 0, 16, 128, num_runs, precision)),
list(c(512, 512, 512, 0, 0, 0, 16, 1, num_runs, precision)),
list(c(2048, 2048, 2048, 0, 0, 0, 16, 1, num_runs, precision)),
list(c( 128, 128, 128, 1, 0, 0, 16, 128, num_runs, precision)),
list(c( 129, 129, 129, 1, 0, 0, 16, 128, num_runs, precision)),
list(c( 512, 512, 512, 1, 0, 0, 16, 1, num_runs, precision)),
list(c(2048, 2048, 2048, 1, 0, 0, 16, 1, num_runs, precision)),
list(
c(1024, 1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1024, 0, 0, 1, 1, 0, num_runs, precision),
@ -50,17 +50,17 @@ test_values <- list(
c(1024, 1024, 1024, 1, 1, 1, 1, 0, num_runs, precision)
),
list(
c(8, 8, 8, 0, 0, 0, 1, 0, num_runs, precision),
c(16, 16, 16, 0, 0, 0, 1, 0, num_runs, precision),
c(32, 32, 32, 0, 0, 0, 1, 0, num_runs, precision),
c(64, 64, 64, 0, 0, 0, 1, 0, num_runs, precision),
c(128, 128, 128, 0, 0, 0, 1, 0, num_runs, precision),
c(256, 256, 256, 0, 0, 0, 1, 0, num_runs, precision),
c(512, 512, 512, 0, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
c(2048, 2048, 2048, 0, 0, 0, 1, 0, num_runs, precision),
c(4096, 4096, 4096, 0, 0, 0, 1, 0, num_runs, precision),
c(8192, 8192, 8192, 0, 0, 0, 1, 0, num_runs, precision)
c( 8, 8, 8, 1, 0, 0, 1, 0, num_runs, precision),
c( 16, 16, 16, 1, 0, 0, 1, 0, num_runs, precision),
c( 32, 32, 32, 1, 0, 0, 1, 0, num_runs, precision),
c( 64, 64, 64, 1, 0, 0, 1, 0, num_runs, precision),
c( 128, 128, 128, 1, 0, 0, 1, 0, num_runs, precision),
c( 256, 256, 256, 1, 0, 0, 1, 0, num_runs, precision),
c( 512, 512, 512, 1, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1024, 1, 0, 0, 1, 0, num_runs, precision),
c(2048, 2048, 2048, 1, 0, 0, 1, 0, num_runs, precision),
c(4096, 4096, 4096, 1, 0, 0, 1, 0, num_runs, precision),
c(8192, 8192, 8192, 1, 0, 0, 1, 0, num_runs, precision)
)
)

View file

@ -19,7 +19,7 @@ source(file.path(dirname(thisfile), "common.r"))
# Settings
routine_name <- "xsymm"
parameters <- c("-m","-n","-layout","-triangle","-side",
parameters <- c("-m","-n","-layout","-side","-triangle",
"-num_steps","-step","-runs","-precision")
precision <- 32
@ -29,16 +29,16 @@ test_names <- list(
"multiples of 128 (+1)",
"around m=n=512",
"around m=n=2048",
"layouts and triangle/side (m=n=1024)",
"layouts and side/triangle (m=n=1024)",
"powers of 2"
)
# Defines the test-cases
test_values <- list(
list(c(128, 128, 0, 0, 0, 16, 128, num_runs, precision)),
list(c(129, 129, 0, 0, 0, 16, 128, num_runs, precision)),
list(c(512, 512, 0, 0, 0, 16, 1, num_runs, precision)),
list(c(2048, 2048, 0, 0, 0, 16, 1, num_runs, precision)),
list(c( 128, 128, 1, 0, 0, 16, 128, num_runs, precision)),
list(c( 129, 129, 1, 0, 0, 16, 128, num_runs, precision)),
list(c( 512, 512, 1, 0, 0, 16, 1, num_runs, precision)),
list(c(2048, 2048, 1, 0, 0, 16, 1, num_runs, precision)),
list(
c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 0, 0, 1, 1, 0, num_runs, precision),
@ -50,17 +50,17 @@ test_values <- list(
c(1024, 1024, 1, 1, 1, 1, 0, num_runs, precision)
),
list(
c(8, 8, 0, 0, 0, 1, 0, num_runs, precision),
c(16, 16, 0, 0, 0, 1, 0, num_runs, precision),
c(32, 32, 0, 0, 0, 1, 0, num_runs, precision),
c(64, 64, 0, 0, 0, 1, 0, num_runs, precision),
c(128, 128, 0, 0, 0, 1, 0, num_runs, precision),
c(256, 256, 0, 0, 0, 1, 0, num_runs, precision),
c(512, 512, 0, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
c(2048, 2048, 0, 0, 0, 1, 0, num_runs, precision),
c(4096, 4096, 0, 0, 0, 1, 0, num_runs, precision),
c(8192, 8192, 0, 0, 0, 1, 0, num_runs, precision)
c( 8, 8, 1, 0, 0, 1, 0, num_runs, precision),
c( 16, 16, 1, 0, 0, 1, 0, num_runs, precision),
c( 32, 32, 1, 0, 0, 1, 0, num_runs, precision),
c( 64, 64, 1, 0, 0, 1, 0, num_runs, precision),
c( 128, 128, 1, 0, 0, 1, 0, num_runs, precision),
c( 256, 256, 1, 0, 0, 1, 0, num_runs, precision),
c( 512, 512, 1, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision),
c(2048, 2048, 1, 0, 0, 1, 0, num_runs, precision),
c(4096, 4096, 1, 0, 0, 1, 0, num_runs, precision),
c(8192, 8192, 1, 0, 0, 1, 0, num_runs, precision)
)
)
@ -70,7 +70,7 @@ test_xlabels <- list(
"matrix sizes (m=n)",
"matrix sizes (m=n)",
"matrix sizes (m=n)",
"layout (row/col), triangle (up/lo), side (l/r)",
"layout (row/col), side (l/r), triangle (up/lo)",
"matrix sizes (m=n)"
)
@ -80,8 +80,8 @@ test_xaxis <- list(
c("m", ""),
c("m", ""),
c("m", ""),
list(1:8, c("row,up,l", "row,up,r", "row,lo,l", "row,lo,r",
"col,up,l", "col,up,r", "col,lo,l", "col,lo,r")),
list(1:8, c("row,l,up", "row,r,up", "row,l,lo", "row,r,lo",
"col,l,up", "col,r,up", "col,l,lo", "col,r,lo")),
c("m", "x")
)

View file

@ -0,0 +1,94 @@
# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project uses a tab-size of two spaces and a max-width of 100 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
#
# This file implements the performance script for the Xsyr2k routine
#
# ==================================================================================================
# Includes the common functions
args <- commandArgs(trailingOnly = FALSE)
thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)])))
source(file.path(dirname(thisfile), "common.r"))
# ==================================================================================================
# Settings
routine_name <- "xsyr2k"
parameters <- c("-n","-k","-layout","-triangle","-transA",
"-num_steps","-step","-runs","-precision")
precision <- 32
# Sets the names of the test-cases
test_names <- list(
"multiples of 128",
"multiples of 128 (+1)",
"around n=k=512",
"around n=k=1536",
"layouts and transposing (n=k=1024)",
"powers of 2"
)
# Defines the test-cases
test_values <- list(
list(c( 128, 128, 1, 0, 0, 16, 128, num_runs, precision)),
list(c( 129, 129, 1, 0, 0, 16, 128, num_runs, precision)),
list(c( 512, 512, 1, 0, 0, 16, 1, num_runs, precision)),
list(c(1536, 1536, 1, 0, 0, 16, 1, num_runs, precision)),
list(
c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 0, 0, 1, 1, 0, num_runs, precision),
c(1024, 1024, 0, 1, 0, 1, 0, num_runs, precision),
c(1024, 1024, 0, 1, 1, 1, 0, num_runs, precision),
c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1, 0, 1, 1, 0, num_runs, precision),
c(1024, 1024, 1, 1, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1, 1, 1, 1, 0, num_runs, precision)
),
list(
c( 8, 8, 1, 0, 0, 1, 0, num_runs, precision),
c( 16, 16, 1, 0, 0, 1, 0, num_runs, precision),
c( 32, 32, 1, 0, 0, 1, 0, num_runs, precision),
c( 64, 64, 1, 0, 0, 1, 0, num_runs, precision),
c( 128, 128, 1, 0, 0, 1, 0, num_runs, precision),
c( 256, 256, 1, 0, 0, 1, 0, num_runs, precision),
c( 512, 512, 1, 0, 0, 1, 0, num_runs, precision),
c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision),
c(2048, 2048, 1, 0, 0, 1, 0, num_runs, precision),
c(4096, 4096, 1, 0, 0, 1, 0, num_runs, precision),
c(8192, 8192, 1, 0, 0, 1, 0, num_runs, precision)
)
)
# Defines the x-labels corresponding to the test-cases
test_xlabels <- list(
"matrix sizes (n=k)",
"matrix sizes (n=k)",
"matrix sizes (n=k)",
"matrix sizes (n=k)",
"layout (row/col), triangle (u/l), transA (n/y)",
"matrix sizes (n=k)"
)
# Defines the x-axis of the test-cases
test_xaxis <- list(
c("n", ""),
c("n", ""),
c("n", ""),
c("n", ""),
list(1:8, c("row,u,n", "row,u,y", "row,l,n", "row,l,y",
"col,u,n", "col,u,y", "col,l,n", "col,l,y")),
c("n", "x")
)
# ==================================================================================================
# Start the script
main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values,
test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE)
# ==================================================================================================

Some files were not shown because too many files have changed in this diff