mirror of
https://github.com/CNugteren/CLBlast.git
synced 2024-07-04 21:36:57 +02:00
Merge pull request #15 from CNugteren/development
Update to version 0.3.0
This commit is contained in:
commit
db6846b791
12
CHANGELOG
12
CHANGELOG
|
@ -1,4 +1,16 @@
|
||||||
|
|
||||||
|
Version 0.3.0
|
||||||
|
- Re-organized test/client infrastructure to avoid code duplication
|
||||||
|
- Added an optional bypass for pre/post-processing kernels in level-3 routines
|
||||||
|
- Significantly improved performance of level-3 routines on AMD GPUs
|
||||||
|
- Added level-3 routines:
|
||||||
|
* CHEMM/ZHEMM
|
||||||
|
* SSYRK/DSYRK/CSYRK/ZSYRK
|
||||||
|
* CHERK/ZHERK
|
||||||
|
* SSYR2K/DSYR2K/CSYR2K/ZSYR2K
|
||||||
|
* CHER2K/ZHER2K
|
||||||
|
* STRMM/DTRMM/CTRMM/ZTRMM
|
||||||
|
|
||||||
Version 0.2.0
|
Version 0.2.0
|
||||||
- Added support for complex conjugate transpose
|
- Added support for complex conjugate transpose
|
||||||
- Several host-code performance improvements
|
- Several host-code performance improvements
|
||||||
|
|
|
@ -13,7 +13,7 @@
|
||||||
cmake_minimum_required(VERSION 2.8.10)
|
cmake_minimum_required(VERSION 2.8.10)
|
||||||
project("clblast" CXX)
|
project("clblast" CXX)
|
||||||
set(clblast_VERSION_MAJOR 0)
|
set(clblast_VERSION_MAJOR 0)
|
||||||
set(clblast_VERSION_MINOR 2)
|
set(clblast_VERSION_MINOR 3)
|
||||||
set(clblast_VERSION_PATCH 0)
|
set(clblast_VERSION_PATCH 0)
|
||||||
|
|
||||||
# Options and their default values
|
# Options and their default values
|
||||||
|
@ -95,17 +95,23 @@ include_directories(${clblast_SOURCE_DIR}/include ${OPENCL_INCLUDE_DIRS})
|
||||||
# Sets the supported routines and the used kernels. New routines and kernels should be added here.
|
# Sets the supported routines and the used kernels. New routines and kernels should be added here.
|
||||||
set(KERNELS copy pad transpose padtranspose xaxpy xgemv xgemm)
|
set(KERNELS copy pad transpose padtranspose xaxpy xgemv xgemm)
|
||||||
set(SAMPLE_PROGRAMS sgemm)
|
set(SAMPLE_PROGRAMS sgemm)
|
||||||
set(ROUTINES_XY xaxpy)
|
set(LEVEL1_ROUTINES xaxpy)
|
||||||
set(ROUTINES_AXY xgemv)
|
set(LEVEL2_ROUTINES xgemv)
|
||||||
set(ROUTINES_ABC xgemm xsymm)
|
set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm)
|
||||||
set(ROUTINES ${ROUTINES_XY} ${ROUTINES_AXY} ${ROUTINES_ABC})
|
set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES})
|
||||||
|
|
||||||
# ==================================================================================================
|
# ==================================================================================================
|
||||||
|
|
||||||
# Gathers all source-files
|
# Gathers all source-files
|
||||||
set(SOURCES src/clblast.cc src/database.cc src/routine.cc src/utilities.cc)
|
set(SOURCES src/clblast.cc src/database.cc src/routine.cc src/utilities.cc)
|
||||||
foreach(ROUTINE ${ROUTINES})
|
foreach(ROUTINE ${LEVEL1_ROUTINES})
|
||||||
set(SOURCES ${SOURCES} src/routines/${ROUTINE}.cc)
|
set(SOURCES ${SOURCES} src/routines/level1/${ROUTINE}.cc)
|
||||||
|
endforeach()
|
||||||
|
foreach(ROUTINE ${LEVEL2_ROUTINES})
|
||||||
|
set(SOURCES ${SOURCES} src/routines/level2/${ROUTINE}.cc)
|
||||||
|
endforeach()
|
||||||
|
foreach(ROUTINE ${LEVEL3_ROUTINES})
|
||||||
|
set(SOURCES ${SOURCES} src/routines/level3/${ROUTINE}.cc)
|
||||||
endforeach()
|
endforeach()
|
||||||
|
|
||||||
# Creates and links the library
|
# Creates and links the library
|
||||||
|
@ -168,33 +174,23 @@ if(TESTS)
|
||||||
include_directories(${clblast_SOURCE_DIR}/test ${clBLAS_SOURCE_DIR})
|
include_directories(${clblast_SOURCE_DIR}/test ${clBLAS_SOURCE_DIR})
|
||||||
|
|
||||||
# Creates the common correctness-tests objects (requires CMake 2.8.8)
|
# Creates the common correctness-tests objects (requires CMake 2.8.8)
|
||||||
add_library(test_correctness_common OBJECT test/correctness/tester.cc)
|
add_library(test_correctness_common OBJECT
|
||||||
add_library(test_correctness_xy OBJECT test/correctness/testxy.cc)
|
test/correctness/tester.cc test/correctness/testblas.cc)
|
||||||
add_library(test_correctness_axy OBJECT test/correctness/testaxy.cc)
|
|
||||||
add_library(test_correctness_abc OBJECT test/correctness/testabc.cc)
|
|
||||||
|
|
||||||
# Compiles the correctness-tests
|
# Compiles the correctness-tests
|
||||||
foreach(ROUTINE ${ROUTINES_XY})
|
foreach(ROUTINE ${LEVEL1_ROUTINES})
|
||||||
add_executable(test_${ROUTINE}
|
add_executable(test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
|
||||||
$<TARGET_OBJECTS:test_correctness_common>
|
test/correctness/routines/level1/${ROUTINE}.cc)
|
||||||
$<TARGET_OBJECTS:test_correctness_xy>
|
|
||||||
test/correctness/routines/${ROUTINE}.cc)
|
|
||||||
target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
|
|
||||||
install(TARGETS test_${ROUTINE} DESTINATION bin)
|
|
||||||
endforeach()
|
endforeach()
|
||||||
foreach(ROUTINE ${ROUTINES_AXY})
|
foreach(ROUTINE ${LEVEL2_ROUTINES})
|
||||||
add_executable(test_${ROUTINE}
|
add_executable(test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
|
||||||
$<TARGET_OBJECTS:test_correctness_common>
|
test/correctness/routines/level2/${ROUTINE}.cc)
|
||||||
$<TARGET_OBJECTS:test_correctness_axy>
|
|
||||||
test/correctness/routines/${ROUTINE}.cc)
|
|
||||||
target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
|
|
||||||
install(TARGETS test_${ROUTINE} DESTINATION bin)
|
|
||||||
endforeach()
|
endforeach()
|
||||||
foreach(ROUTINE ${ROUTINES_ABC})
|
foreach(ROUTINE ${LEVEL3_ROUTINES})
|
||||||
add_executable(test_${ROUTINE}
|
add_executable(test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
|
||||||
$<TARGET_OBJECTS:test_correctness_common>
|
test/correctness/routines/level3/${ROUTINE}.cc)
|
||||||
$<TARGET_OBJECTS:test_correctness_abc>
|
endforeach()
|
||||||
test/correctness/routines/${ROUTINE}.cc)
|
foreach(ROUTINE ${ROUTINES})
|
||||||
target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
|
target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
|
||||||
install(TARGETS test_${ROUTINE} DESTINATION bin)
|
install(TARGETS test_${ROUTINE} DESTINATION bin)
|
||||||
endforeach()
|
endforeach()
|
||||||
|
@ -203,10 +199,19 @@ if(TESTS)
|
||||||
add_library(test_performance_common OBJECT test/performance/client.cc)
|
add_library(test_performance_common OBJECT test/performance/client.cc)
|
||||||
|
|
||||||
# Compiles the performance-tests
|
# Compiles the performance-tests
|
||||||
set(TEST_PERF_COMM )
|
foreach(ROUTINE ${LEVEL1_ROUTINES})
|
||||||
foreach(ROUTINE ${ROUTINES})
|
|
||||||
add_executable(client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
|
add_executable(client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
|
||||||
test/performance/routines/${ROUTINE}.cc)
|
test/performance/routines/level1/${ROUTINE}.cc)
|
||||||
|
endforeach()
|
||||||
|
foreach(ROUTINE ${LEVEL2_ROUTINES})
|
||||||
|
add_executable(client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
|
||||||
|
test/performance/routines/level2/${ROUTINE}.cc)
|
||||||
|
endforeach()
|
||||||
|
foreach(ROUTINE ${LEVEL3_ROUTINES})
|
||||||
|
add_executable(client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
|
||||||
|
test/performance/routines/level3/${ROUTINE}.cc)
|
||||||
|
endforeach()
|
||||||
|
foreach(ROUTINE ${ROUTINES})
|
||||||
target_link_libraries(client_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
|
target_link_libraries(client_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
|
||||||
install(TARGETS client_${ROUTINE} DESTINATION bin)
|
install(TARGETS client_${ROUTINE} DESTINATION bin)
|
||||||
endforeach()
|
endforeach()
|
||||||
|
|
30
README.md
30
README.md
|
@ -4,7 +4,7 @@ CLBlast: The tuned OpenCL BLAS library
|
||||||
|
|
||||||
CLBlast is a modern, lightweight, performant and tunable OpenCL BLAS library written in C++11. It is designed to leverage the full performance potential of a wide variety of OpenCL devices from different vendors, including desktop and laptop GPUs, embedded GPUs, and other accelerators. CLBlast implements BLAS routines: basic linear algebra subprograms operating on vectors and matrices.
|
CLBlast is a modern, lightweight, performant and tunable OpenCL BLAS library written in C++11. It is designed to leverage the full performance potential of a wide variety of OpenCL devices from different vendors, including desktop and laptop GPUs, embedded GPUs, and other accelerators. CLBlast implements BLAS routines: basic linear algebra subprograms operating on vectors and matrices.
|
||||||
|
|
||||||
__Note that the CLBlast library is actively being developed, and is not mature enough for production environments__. This preview-version supports only a minimal amount of routines (including `gemm` and `gemv`): others will be added in due time. It also lacks extensive tuning and testing on some common OpenCL platforms: __out-of-the-box performance on some devices might be poor__. See below for more details.
|
__Note that the CLBlast library is actively being developed, and is not mature enough for production environments__. This preview-version doesn't support all routines yet: others will be added in due time. It also lacks extensive tuning on some common OpenCL platforms: __out-of-the-box performance on some devices might be poor__. See below for more details.
|
||||||
|
|
||||||
|
|
||||||
Why CLBlast and not clBLAS or cuBLAS?
|
Why CLBlast and not clBLAS or cuBLAS?
|
||||||
|
@ -109,13 +109,13 @@ Performance remarks
|
||||||
|
|
||||||
The CLBlast library provides pre-tuned parameter-values for a number of OpenCL devices. If your device is not among these, then out-of-the-box performance might be poor. Even if the device is included performance might be poor in some cases: __the preview version is not thoroughly tested for performance yet__. See above under `Using the tuners` to find out how to tune for your device.
|
The CLBlast library provides pre-tuned parameter-values for a number of OpenCL devices. If your device is not among these, then out-of-the-box performance might be poor. Even if the device is included performance might be poor in some cases: __the preview version is not thoroughly tested for performance yet__. See above under `Using the tuners` to find out how to tune for your device.
|
||||||
|
|
||||||
The folder `doc/performance` contains some PDF files with performance results on tested devices. Performance is compared against a tuned version of the clBLAS library. The graphs of the level-3 routines (Xgemm and Xsymm) show the strong points of CLBlast:
|
The folder `doc/performance` contains some PDF files with performance results on tested devices. Performance is compared against a tuned version of the clBLAS library. The graphs of the level-3 routines (Xgemm, Xsymm, Xsyrk) show the strong points of CLBlast:
|
||||||
|
|
||||||
* The library reaches a high peak performance for large matrix sizes, in some cases a factor 2 more than clBLAS.
|
* The library reaches a high peak performance for large matrix sizes, in some cases a factor 2 more than clBLAS.
|
||||||
* The performance for non-power of 2 values (e.g. 1000) is roughly equal to power of 2 cases (e.g. 1024). This is not the case for clBLAS, which sometimes shows a drop of a factor 2.
|
* The performance for non-power of 2 values (e.g. 1000) is roughly equal to power of 2 cases (e.g. 1024). This is not the case for clBLAS, which sometimes shows a drop of a factor 2.
|
||||||
* The performance is also constant for different layouts and transpose options. Again, this is not the case for clBLAS.
|
* The performance is also constant for different layouts and transpose options. Again, this is not the case for clBLAS.
|
||||||
|
|
||||||
The graphs also show the current weak point of CLBlast: its performance for smaller matrix sizes is not too good. Furthermore, although the GEMM kernels perform well on AMD GPUs, the supporting copy and transpose kernel do not.
|
The graphs also show the current weak points of CLBlast: for small sizes the benefit is minimal or non-existent, and for some specific configurations clBLAS is still faster.
|
||||||
|
|
||||||
These graphs can be generated automatically on your own device. First, compile CLBlast with the tests enabled. Then, make sure your installation of the reference clBLAS is performance-tuned by running the `tune` executable. Finally, run one of the graph-scripts found in `test/performance/graphs` using R. For example, to generate the Xgemm PDF on device 1 of platform 0:
|
These graphs can be generated automatically on your own device. First, compile CLBlast with the tests enabled. Then, make sure your installation of the reference clBLAS is performance-tuned by running the `tune` executable. Finally, run one of the graph-scripts found in `test/performance/graphs` using R. For example, to generate the Xgemm PDF on device 1 of platform 0:
|
||||||
|
|
||||||
|
@ -124,7 +124,7 @@ These graphs can be generated automatically on your own device. First, compile C
|
||||||
Supported routines
|
Supported routines
|
||||||
-------------
|
-------------
|
||||||
|
|
||||||
CLBlast is in active development and currently does not support the full set of BLAS routines. The currently supported routines are marked with `x` in the following tables:
|
CLBlast is in active development and currently does not support the full set of BLAS routines. The currently supported routines are marked with '✔' in the following tables:
|
||||||
|
|
||||||
| Level-1 | S | D | C | Z | Notes |
|
| Level-1 | S | D | C | Z | Notes |
|
||||||
| ---------|---|---|---|---|---------|
|
| ---------|---|---|---|---|---------|
|
||||||
|
@ -135,7 +135,7 @@ CLBlast is in active development and currently does not support the full set of
|
||||||
| xSWAP | | | | | |
|
| xSWAP | | | | | |
|
||||||
| xSCAL | | | | | +CS +ZD |
|
| xSCAL | | | | | +CS +ZD |
|
||||||
| xCOPY | | | | | |
|
| xCOPY | | | | | |
|
||||||
| xAXPY |`x`|`x`|`x`|`x`| |
|
| xAXPY | ✔ | ✔ | ✔ | ✔ | |
|
||||||
| xDOT | | | - | - | +DS |
|
| xDOT | | | - | - | +DS |
|
||||||
| xDOTU | - | - | | | |
|
| xDOTU | - | - | | | |
|
||||||
| xDOTC | - | - | | | |
|
| xDOTC | - | - | | | |
|
||||||
|
@ -147,7 +147,7 @@ CLBlast is in active development and currently does not support the full set of
|
||||||
|
|
||||||
| Level-2 | S | D | C | Z | Notes |
|
| Level-2 | S | D | C | Z | Notes |
|
||||||
| ---------|---|---|---|---|---------|
|
| ---------|---|---|---|---|---------|
|
||||||
| xGEMV |`x`|`x`|`x`|`x`| |
|
| xGEMV | ✔ | ✔ | ✔ | ✔ | |
|
||||||
| xGBMV | | | | | |
|
| xGBMV | | | | | |
|
||||||
| xHEMV | - | - | | | |
|
| xHEMV | - | - | | | |
|
||||||
| xHBMV | - | - | | | |
|
| xHBMV | - | - | | | |
|
||||||
|
@ -175,14 +175,14 @@ CLBlast is in active development and currently does not support the full set of
|
||||||
|
|
||||||
| Level-3 | S | D | C | Z | Notes |
|
| Level-3 | S | D | C | Z | Notes |
|
||||||
| ---------|---|---|---|---|---------|
|
| ---------|---|---|---|---|---------|
|
||||||
| xGEMM |`x`|`x`|`x`|`x`| |
|
| xGEMM | ✔ | ✔ | ✔ | ✔ | |
|
||||||
| xSYMM |`x`|`x`|`x`|`x`| |
|
| xSYMM | ✔ | ✔ | ✔ | ✔ | |
|
||||||
| xHEMM | - | - | | | |
|
| xHEMM | - | - | ✔ | ✔ | |
|
||||||
| xSYRK | | | | | |
|
| xSYRK | ✔ | ✔ | ✔ | ✔ | |
|
||||||
| xHERK | - | - | | | |
|
| xHERK | - | - | ✔ | ✔ | |
|
||||||
| xSYR2K | | | | | |
|
| xSYR2K | ✔ | ✔ | ✔ | ✔ | |
|
||||||
| xHER2K | - | - | | | |
|
| xHER2K | - | - | ✔ | ✔ | |
|
||||||
| xTRMM | | | | | |
|
| xTRMM | ✔ | ✔ | ✔ | ✔ | |
|
||||||
| xTRSM | | | | | |
|
| xTRSM | | | | | |
|
||||||
|
|
||||||
|
|
||||||
|
@ -214,8 +214,6 @@ To-do list before release of version 1.0
|
||||||
- Improve host performance:
|
- Improve host performance:
|
||||||
* Allow initialization to pre-compile kernels and store to disk
|
* Allow initialization to pre-compile kernels and store to disk
|
||||||
- Improve device performance:
|
- Improve device performance:
|
||||||
* Enable 'mad()' for AMD devices
|
|
||||||
* Improve the performance of the copy and transpose kernels
|
|
||||||
* Tune for a wider range of devices
|
* Tune for a wider range of devices
|
||||||
* Allow users to define custom tuned parameters
|
* Allow users to define custom tuned parameters
|
||||||
- Improve the tuning
|
- Improve the tuning
|
||||||
|
|
Binary file not shown.
Binary file not shown.
BIN
doc/performance/GeForce_GTX480/SGEMV.pdf
Normal file
BIN
doc/performance/GeForce_GTX480/SGEMV.pdf
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
doc/performance/Iris/SSYRK.pdf
Normal file
BIN
doc/performance/Iris/SSYRK.pdf
Normal file
Binary file not shown.
BIN
doc/performance/Radeon_HD7950/SAXPY.pdf
Normal file
BIN
doc/performance/Radeon_HD7950/SAXPY.pdf
Normal file
Binary file not shown.
BIN
doc/performance/Radeon_HD7950/SGEMM.pdf
Normal file
BIN
doc/performance/Radeon_HD7950/SGEMM.pdf
Normal file
Binary file not shown.
BIN
doc/performance/Radeon_HD7950/SGEMV.pdf
Normal file
BIN
doc/performance/Radeon_HD7950/SGEMV.pdf
Normal file
Binary file not shown.
BIN
doc/performance/Radeon_HD7950/SSYMM.pdf
Normal file
BIN
doc/performance/Radeon_HD7950/SSYMM.pdf
Normal file
Binary file not shown.
BIN
doc/performance/Radeon_HD7950/SSYRK.pdf
Normal file
BIN
doc/performance/Radeon_HD7950/SSYRK.pdf
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
doc/performance/Tesla_K40m/SGEMV.pdf
Normal file
BIN
doc/performance/Tesla_K40m/SGEMV.pdf
Normal file
Binary file not shown.
Binary file not shown.
BIN
doc/performance/Tesla_K40m/SSYRK.pdf
Normal file
BIN
doc/performance/Tesla_K40m/SSYRK.pdf
Normal file
Binary file not shown.
|
@ -75,6 +75,7 @@ enum class Layout { kRowMajor, kColMajor };
|
||||||
enum class Transpose { kNo, kYes, kConjugate };
|
enum class Transpose { kNo, kYes, kConjugate };
|
||||||
enum class Side { kLeft, kRight };
|
enum class Side { kLeft, kRight };
|
||||||
enum class Triangle { kUpper, kLower };
|
enum class Triangle { kUpper, kLower };
|
||||||
|
enum class Diagonal { kUnit, kNonUnit };
|
||||||
|
|
||||||
// Precision scoped enum (values in bits)
|
// Precision scoped enum (values in bits)
|
||||||
enum class Precision { kHalf = 16, kSingle = 32, kDouble = 64,
|
enum class Precision { kHalf = 16, kSingle = 32, kDouble = 64,
|
||||||
|
@ -95,7 +96,7 @@ StatusCode Axpy(const size_t n, const T alpha,
|
||||||
|
|
||||||
// Templated-precision generalized matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV
|
// Templated-precision generalized matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV
|
||||||
template <typename T>
|
template <typename T>
|
||||||
StatusCode Gemv(const Layout layout, const Transpose transpose_a,
|
StatusCode Gemv(const Layout layout, const Transpose a_transpose,
|
||||||
const size_t m, const size_t n,
|
const size_t m, const size_t n,
|
||||||
const T alpha,
|
const T alpha,
|
||||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
|
@ -107,9 +108,9 @@ StatusCode Gemv(const Layout layout, const Transpose transpose_a,
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
// BLAS level-3 (matrix-matrix) routines
|
// BLAS level-3 (matrix-matrix) routines
|
||||||
|
|
||||||
// Templated-precision generalized matrix-matrix multiplication: SGEMM/DGEMM
|
// Templated-precision generalized matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM
|
||||||
template <typename T>
|
template <typename T>
|
||||||
StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpose transpose_b,
|
StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
|
||||||
const size_t m, const size_t n, const size_t k,
|
const size_t m, const size_t n, const size_t k,
|
||||||
const T alpha,
|
const T alpha,
|
||||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
|
@ -118,7 +119,7 @@ StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpos
|
||||||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event);
|
||||||
|
|
||||||
// Templated-precision symmetric matrix-matrix multiplication: SSYMM/DSYMM
|
// Templated-precision symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM
|
||||||
template <typename T>
|
template <typename T>
|
||||||
StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
|
StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
|
||||||
const size_t m, const size_t n,
|
const size_t m, const size_t n,
|
||||||
|
@ -129,6 +130,81 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
|
||||||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event);
|
||||||
|
|
||||||
|
// Templated-precision hermitian matrix-matrix multiplication: CHEMM/ZHEMM
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle,
|
||||||
|
const size_t m, const size_t n,
|
||||||
|
const T alpha,
|
||||||
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
|
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||||
|
const T beta,
|
||||||
|
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||||
|
cl_command_queue* queue, cl_event* event);
|
||||||
|
|
||||||
|
// Templated-precision rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
|
||||||
|
const size_t n, const size_t k,
|
||||||
|
const T alpha,
|
||||||
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
|
const T beta,
|
||||||
|
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||||
|
cl_command_queue* queue, cl_event* event);
|
||||||
|
|
||||||
|
// Templated-precision rank-K update of a hermitian matrix: CHERK/ZHERK
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
|
||||||
|
const size_t n, const size_t k,
|
||||||
|
const T alpha,
|
||||||
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
|
const T beta,
|
||||||
|
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||||
|
cl_command_queue* queue, cl_event* event);
|
||||||
|
|
||||||
|
// Templated-precision rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
|
||||||
|
const size_t n, const size_t k,
|
||||||
|
const T alpha,
|
||||||
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
|
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||||
|
const T beta,
|
||||||
|
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||||
|
cl_command_queue* queue, cl_event* event);
|
||||||
|
|
||||||
|
// Templated-precision rank-2K update of a hermitian matrix: CHER2K/ZHER2K
|
||||||
|
template <typename T, typename U>
|
||||||
|
StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
|
||||||
|
const size_t n, const size_t k,
|
||||||
|
const T alpha,
|
||||||
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
|
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||||
|
const U beta,
|
||||||
|
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||||
|
cl_command_queue* queue, cl_event* event);
|
||||||
|
|
||||||
|
// Templated-precision triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle,
|
||||||
|
const Transpose a_transpose, const Diagonal diagonal,
|
||||||
|
const size_t m, const size_t n,
|
||||||
|
const T alpha,
|
||||||
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
|
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||||
|
cl_command_queue* queue, cl_event* event);
|
||||||
|
|
||||||
|
// Templated-precision matrix equation solver: STRSM/DTRSM/CTRSM/ZTRSM
|
||||||
|
/*
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle,
|
||||||
|
const Transpose a_transpose, const Diagonal diagonal,
|
||||||
|
const size_t m, const size_t n,
|
||||||
|
const T alpha,
|
||||||
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
|
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||||
|
cl_command_queue* queue, cl_event* event);
|
||||||
|
*/
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
} // namespace clblast
|
} // namespace clblast
|
||||||
|
|
||||||
|
|
|
@ -24,7 +24,7 @@ const Database::DatabaseEntry Database::CopySingle = {
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
CL_DEVICE_TYPE_GPU, "AMD", {
|
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
|
||||||
{ "Tahiti", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",4}, {"COPY_VW",2} } },
|
{ "Tahiti", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",4}, {"COPY_VW",2} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
@ -53,7 +53,7 @@ const Database::DatabaseEntry Database::CopyDouble = {
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
CL_DEVICE_TYPE_GPU, "AMD", {
|
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
|
||||||
{ "Tahiti", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",2}, {"COPY_VW",4} } },
|
{ "Tahiti", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",2}, {"COPY_VW",4} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
@ -81,7 +81,7 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
CL_DEVICE_TYPE_GPU, "AMD", {
|
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
|
||||||
{ "Tahiti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
|
{ "Tahiti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
@ -110,7 +110,7 @@ const Database::DatabaseEntry Database::CopyComplexDouble = {
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
CL_DEVICE_TYPE_GPU, "AMD", {
|
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
|
||||||
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_WPT",4}, {"COPY_VW",2} } },
|
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_WPT",4}, {"COPY_VW",2} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
|
@ -24,7 +24,7 @@ const Database::DatabaseEntry Database::PadSingle = {
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
CL_DEVICE_TYPE_GPU, "AMD", {
|
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
|
||||||
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
|
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
@ -53,7 +53,7 @@ const Database::DatabaseEntry Database::PadDouble = {
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
CL_DEVICE_TYPE_GPU, "AMD", {
|
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
|
||||||
{ "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
|
{ "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
@ -81,7 +81,7 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
CL_DEVICE_TYPE_GPU, "AMD", {
|
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
|
||||||
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
@ -110,7 +110,7 @@ const Database::DatabaseEntry Database::PadComplexDouble = {
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
CL_DEVICE_TYPE_GPU, "AMD", {
|
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
|
||||||
{ "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
|
{ "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
|
@ -24,7 +24,7 @@ const Database::DatabaseEntry Database::PadTraSingle = {
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
CL_DEVICE_TYPE_GPU, "AMD", {
|
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
|
||||||
{ "Tahiti", { {"PADTRA_TILE",16}, {"PADTRA_WPT",4}, {"PADTRA_PAD",0} } },
|
{ "Tahiti", { {"PADTRA_TILE",16}, {"PADTRA_WPT",4}, {"PADTRA_PAD",0} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
@ -53,7 +53,7 @@ const Database::DatabaseEntry Database::PadTraDouble = {
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
CL_DEVICE_TYPE_GPU, "AMD", {
|
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
|
||||||
{ "Tahiti", { {"PADTRA_TILE",8}, {"PADTRA_WPT",4}, {"PADTRA_PAD",0} } },
|
{ "Tahiti", { {"PADTRA_TILE",8}, {"PADTRA_WPT",4}, {"PADTRA_PAD",0} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
@ -81,7 +81,7 @@ const Database::DatabaseEntry Database::PadTraComplexSingle = {
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
CL_DEVICE_TYPE_GPU, "AMD", {
|
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
|
||||||
{ "Tahiti", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",0} } },
|
{ "Tahiti", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",0} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
@ -110,7 +110,7 @@ const Database::DatabaseEntry Database::PadTraComplexDouble = {
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
CL_DEVICE_TYPE_GPU, "AMD", {
|
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
|
||||||
{ "Tahiti", { {"PADTRA_TILE",8}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
|
{ "Tahiti", { {"PADTRA_TILE",8}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
|
@ -18,24 +18,24 @@ const Database::DatabaseEntry Database::TraSingle = {
|
||||||
"Transpose", Precision::kSingle, {
|
"Transpose", Precision::kSingle, {
|
||||||
{ // NVIDIA GPUs
|
{ // NVIDIA GPUs
|
||||||
CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
|
CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
|
||||||
{ "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
|
{ "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
|
||||||
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
|
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
|
||||||
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
|
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
CL_DEVICE_TYPE_GPU, "AMD", {
|
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
|
||||||
{ "Tahiti", { {"TRA_DIM",8}, {"TRA_WPT",8}, {"TRA_PAD",0} } },
|
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_WPT",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Intel GPUs
|
{ // Intel GPUs
|
||||||
CL_DEVICE_TYPE_GPU, "Intel", {
|
CL_DEVICE_TYPE_GPU, "Intel", {
|
||||||
{ "Iris", { {"TRA_DIM",8}, {"TRA_WPT",4}, {"TRA_PAD",0} } },
|
{ "Iris", { {"TRA_DIM",8}, {"TRA_WPT",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
CL_DEVICE_TYPE_ALL, kDefault, {
|
CL_DEVICE_TYPE_ALL, kDefault, {
|
||||||
{ kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
|
{ kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
@ -47,14 +47,14 @@ const Database::DatabaseEntry Database::TraDouble = {
|
||||||
"Transpose", Precision::kDouble, {
|
"Transpose", Precision::kDouble, {
|
||||||
{ // NVIDIA GPUs
|
{ // NVIDIA GPUs
|
||||||
CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
|
CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
|
||||||
{ "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
|
{ "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
|
||||||
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
|
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
|
||||||
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
|
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
CL_DEVICE_TYPE_GPU, "AMD", {
|
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
|
||||||
{ "Tahiti", { {"TRA_DIM",8}, {"TRA_WPT",8}, {"TRA_PAD",0} } },
|
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Intel GPUs
|
{ // Intel GPUs
|
||||||
|
@ -63,7 +63,7 @@ const Database::DatabaseEntry Database::TraDouble = {
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
CL_DEVICE_TYPE_ALL, kDefault, {
|
CL_DEVICE_TYPE_ALL, kDefault, {
|
||||||
{ kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
|
{ kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
@ -75,24 +75,24 @@ const Database::DatabaseEntry Database::TraComplexSingle = {
|
||||||
"Transpose", Precision::kComplexSingle, {
|
"Transpose", Precision::kComplexSingle, {
|
||||||
{ // NVIDIA GPUs
|
{ // NVIDIA GPUs
|
||||||
CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
|
CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
|
||||||
{ "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
|
{ "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
|
||||||
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
|
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
|
||||||
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
|
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
CL_DEVICE_TYPE_GPU, "AMD", {
|
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
|
||||||
{ "Tahiti", { {"TRA_DIM",8}, {"TRA_WPT",2}, {"TRA_PAD",1} } },
|
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Intel GPUs
|
{ // Intel GPUs
|
||||||
CL_DEVICE_TYPE_GPU, "Intel", {
|
CL_DEVICE_TYPE_GPU, "Intel", {
|
||||||
{ "Iris", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
|
{ "Iris", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
CL_DEVICE_TYPE_ALL, kDefault, {
|
CL_DEVICE_TYPE_ALL, kDefault, {
|
||||||
{ kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
|
{ kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
@ -104,14 +104,14 @@ const Database::DatabaseEntry Database::TraComplexDouble = {
|
||||||
"Transpose", Precision::kComplexDouble, {
|
"Transpose", Precision::kComplexDouble, {
|
||||||
{ // NVIDIA GPUs
|
{ // NVIDIA GPUs
|
||||||
CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
|
CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
|
||||||
{ "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
|
{ "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
|
||||||
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
|
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
|
||||||
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1} } },
|
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
CL_DEVICE_TYPE_GPU, "AMD", {
|
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
|
||||||
{ "Tahiti", { {"TRA_DIM",8}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
|
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Intel GPUs
|
{ // Intel GPUs
|
||||||
|
@ -120,7 +120,7 @@ const Database::DatabaseEntry Database::TraComplexDouble = {
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
CL_DEVICE_TYPE_ALL, kDefault, {
|
CL_DEVICE_TYPE_ALL, kDefault, {
|
||||||
{ kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0} } },
|
{ kDefault, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
|
@ -24,7 +24,7 @@ const Database::DatabaseEntry Database::XaxpySingle = {
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
CL_DEVICE_TYPE_GPU, "AMD", {
|
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
|
||||||
{ "Tahiti", { {"WGS",64}, {"WPT",1}, {"VW",2} } },
|
{ "Tahiti", { {"WGS",64}, {"WPT",1}, {"VW",2} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
@ -53,7 +53,7 @@ const Database::DatabaseEntry Database::XaxpyDouble = {
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
CL_DEVICE_TYPE_GPU, "AMD", {
|
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
|
||||||
{ "Tahiti", { {"WGS",256}, {"WPT",1}, {"VW",1} } },
|
{ "Tahiti", { {"WGS",256}, {"WPT",1}, {"VW",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
@ -80,7 +80,7 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = {
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
CL_DEVICE_TYPE_GPU, "AMD", {
|
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
|
||||||
{ "Tahiti", { {"WGS",64}, {"WPT",1}, {"VW",1} } },
|
{ "Tahiti", { {"WGS",64}, {"WPT",1}, {"VW",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
@ -109,7 +109,7 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = {
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
CL_DEVICE_TYPE_GPU, "AMD", {
|
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
|
||||||
{ "Tahiti", { {"WGS",64}, {"WPT",1}, {"VW",1} } },
|
{ "Tahiti", { {"WGS",64}, {"WPT",1}, {"VW",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
|
@ -25,8 +25,8 @@ const Database::DatabaseEntry Database::XgemmSingle = {
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
CL_DEVICE_TYPE_GPU, "AMD", {
|
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
|
||||||
{ "Tahiti", { {"MWG",128}, {"NWG",128}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",8}, {"VWN",4}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
|
{ "Tahiti", { {"MWG",128}, {"NWG",128}, {"KWG",32}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",32}, {"NDIMB",8}, {"KWI",2}, {"VWM",4}, {"VWN",4}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Intel GPUs
|
{ // Intel GPUs
|
||||||
|
@ -55,7 +55,7 @@ const Database::DatabaseEntry Database::XgemmDouble = {
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
CL_DEVICE_TYPE_GPU, "AMD", {
|
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
|
||||||
{ "Tahiti", { {"MWG",128}, {"NWG",64}, {"KWG",16}, {"MDIMC",32}, {"NDIMC",8}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",1}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",0}, {"SB",0} } },
|
{ "Tahiti", { {"MWG",128}, {"NWG",64}, {"KWG",16}, {"MDIMC",32}, {"NDIMC",8}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",1}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",0}, {"SB",0} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
@ -84,13 +84,13 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
CL_DEVICE_TYPE_GPU, "AMD", {
|
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
|
||||||
{ "Tahiti", { {"MWG",16}, {"NWG",64}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",8}, {"NDIMB",16}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",0} } },
|
{ "Tahiti", { {"MWG",16}, {"NWG",64}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",8}, {"NDIMB",16}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",0} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Intel GPUs
|
{ // Intel GPUs
|
||||||
CL_DEVICE_TYPE_GPU, "Intel", {
|
CL_DEVICE_TYPE_GPU, "Intel", {
|
||||||
{ "Iris", { {"MWG",64}, {"NWG",64}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",8}, {"KWI",8}, {"VWM",4}, {"VWN",4}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",0} } },
|
{ "Iris", { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
|
@ -114,7 +114,7 @@ const Database::DatabaseEntry Database::XgemmComplexDouble = {
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
CL_DEVICE_TYPE_GPU, "AMD", {
|
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
|
||||||
{ "Tahiti", { {"MWG",128}, {"NWG",32}, {"KWG",16}, {"MDIMC",32}, {"NDIMC",8}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",2}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
|
{ "Tahiti", { {"MWG",128}, {"NWG",32}, {"KWG",16}, {"MDIMC",32}, {"NDIMC",8}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",2}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
|
@ -24,7 +24,7 @@ const Database::DatabaseEntry Database::XgemvSingle = {
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
CL_DEVICE_TYPE_GPU, "AMD", {
|
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
|
||||||
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
|
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
@ -53,7 +53,7 @@ const Database::DatabaseEntry Database::XgemvDouble = {
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
CL_DEVICE_TYPE_GPU, "AMD", {
|
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
|
||||||
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
|
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
@ -80,7 +80,7 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = {
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
CL_DEVICE_TYPE_GPU, "AMD", {
|
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
|
||||||
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
|
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
@ -109,7 +109,7 @@ const Database::DatabaseEntry Database::XgemvComplexDouble = {
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
CL_DEVICE_TYPE_GPU, "AMD", {
|
CL_DEVICE_TYPE_GPU, "Advanced Micro Devices, Inc.", {
|
||||||
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
|
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
|
@ -34,20 +34,14 @@ class Routine {
|
||||||
Program program;
|
Program program;
|
||||||
std::string device_name;
|
std::string device_name;
|
||||||
Precision precision;
|
Precision precision;
|
||||||
std::vector<std::string> routines;
|
std::string routine_name_;
|
||||||
|
|
||||||
// Finds out whether the properties match
|
// Finds out whether the properties match
|
||||||
bool MatchInCache(const std::string &ref_name, const Precision &ref_precision,
|
bool MatchInCache(const std::string &ref_device, const Precision &ref_precision,
|
||||||
const std::vector<std::string> &ref_routines) {
|
const std::string &ref_routine) {
|
||||||
auto ref_size = ref_routines.size();
|
return (device_name == ref_device &&
|
||||||
if (device_name == ref_name && precision == ref_precision && routines.size() == ref_size) {
|
precision == ref_precision &&
|
||||||
auto found_match = true;
|
routine_name_ == ref_routine);
|
||||||
for (auto i=size_t{0}; i<ref_size; ++i) {
|
|
||||||
if (routines[i] != ref_routines[i]) { found_match = false; }
|
|
||||||
}
|
|
||||||
return found_match;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -58,11 +52,11 @@ class Routine {
|
||||||
static constexpr bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); }
|
static constexpr bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); }
|
||||||
|
|
||||||
// Base class constructor
|
// Base class constructor
|
||||||
explicit Routine(CommandQueue &queue, Event &event,
|
explicit Routine(CommandQueue &queue, Event &event, const std::string &name,
|
||||||
const std::vector<std::string> &routines, const Precision precision);
|
const std::vector<std::string> &routines, const Precision precision);
|
||||||
|
|
||||||
// Set-up phase of the kernel
|
// Set-up phase of the kernel
|
||||||
StatusCode SetUp(const std::string &routine_source);
|
StatusCode SetUp();
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
|
|
||||||
|
@ -84,15 +78,18 @@ class Routine {
|
||||||
StatusCode TestVectorY(const size_t n, const Buffer &buffer, const size_t offset,
|
StatusCode TestVectorY(const size_t n, const Buffer &buffer, const size_t offset,
|
||||||
const size_t inc, const size_t data_size);
|
const size_t inc, const size_t data_size);
|
||||||
|
|
||||||
// Copies/transposes a matrix and padds/unpads it
|
// Copies/transposes a matrix and pads/unpads it with zeroes. This method is also able to write
|
||||||
|
// to symmetric and triangular matrices through optional arguments.
|
||||||
StatusCode PadCopyTransposeMatrix(const size_t src_one, const size_t src_two,
|
StatusCode PadCopyTransposeMatrix(const size_t src_one, const size_t src_two,
|
||||||
const size_t src_ld, const size_t src_offset,
|
const size_t src_ld, const size_t src_offset,
|
||||||
const Buffer &src,
|
const Buffer &src,
|
||||||
const size_t dest_one, const size_t dest_two,
|
const size_t dest_one, const size_t dest_two,
|
||||||
const size_t dest_ld, const size_t dest_offset,
|
const size_t dest_ld, const size_t dest_offset,
|
||||||
const Buffer &dest,
|
const Buffer &dest,
|
||||||
|
const Program &program, const bool do_pad,
|
||||||
const bool do_transpose, const bool do_conjugate,
|
const bool do_transpose, const bool do_conjugate,
|
||||||
const bool pad, const Program &program);
|
const bool upper = false, const bool lower = false,
|
||||||
|
const bool diagonal_imag_zero = false);
|
||||||
|
|
||||||
// Queries the cache and retrieves either a matching program or a boolean whether a match exists.
|
// Queries the cache and retrieves either a matching program or a boolean whether a match exists.
|
||||||
// The first assumes that the program is available in the cache and will throw an exception
|
// The first assumes that the program is available in the cache and will throw an exception
|
||||||
|
@ -104,6 +101,10 @@ class Routine {
|
||||||
// a derived class.
|
// a derived class.
|
||||||
const Precision precision_;
|
const Precision precision_;
|
||||||
|
|
||||||
|
// The routine's name and its kernel-source in string form
|
||||||
|
const std::string routine_name_;
|
||||||
|
std::string source_string_;
|
||||||
|
|
||||||
// The OpenCL objects, accessible only from derived classes
|
// The OpenCL objects, accessible only from derived classes
|
||||||
CommandQueue queue_;
|
CommandQueue queue_;
|
||||||
Event event_;
|
Event event_;
|
||||||
|
@ -118,7 +119,6 @@ class Routine {
|
||||||
|
|
||||||
// Connection to the database for all the device-specific parameters
|
// Connection to the database for all the device-specific parameters
|
||||||
const Database db_;
|
const Database db_;
|
||||||
const std::vector<std::string> routines_;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
58
include/internal/routines/level3/xhemm.h
Normal file
58
include/internal/routines/level3/xhemm.h
Normal file
|
@ -0,0 +1,58 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xhemm routine. It is based on the generalized matrix multiplication
|
||||||
|
// routine (Xgemm). The implementation is very similar to the Xsymm routine.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#ifndef CLBLAST_ROUTINES_XHEMM_H_
|
||||||
|
#define CLBLAST_ROUTINES_XHEMM_H_
|
||||||
|
|
||||||
|
#include "internal/routines/level3/xgemm.h"
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// See comment at top of file for a description of the class
|
||||||
|
template <typename T>
|
||||||
|
class Xhemm: public Xgemm<T> {
|
||||||
|
public:
|
||||||
|
|
||||||
|
// Uses several variables from the Routine class
|
||||||
|
using Routine::db_;
|
||||||
|
using Routine::context_;
|
||||||
|
|
||||||
|
// Uses several helper functions from the Routine class
|
||||||
|
using Routine::RunKernel;
|
||||||
|
using Routine::ErrorIn;
|
||||||
|
using Routine::TestMatrixA;
|
||||||
|
using Routine::GetProgramFromCache;
|
||||||
|
|
||||||
|
// Uses the regular Xgemm routine
|
||||||
|
using Xgemm<T>::DoGemm;
|
||||||
|
|
||||||
|
// Constructor
|
||||||
|
Xhemm(CommandQueue &queue, Event &event);
|
||||||
|
|
||||||
|
// Templated-precision implementation of the routine
|
||||||
|
StatusCode DoHemm(const Layout layout, const Side side, const Triangle triangle,
|
||||||
|
const size_t m, const size_t n,
|
||||||
|
const T alpha,
|
||||||
|
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
|
const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
|
||||||
|
const T beta,
|
||||||
|
const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
|
||||||
|
};
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
||||||
|
|
||||||
|
// CLBLAST_ROUTINES_XHEMM_H_
|
||||||
|
#endif
|
48
include/internal/routines/level3/xher2k.h
Normal file
48
include/internal/routines/level3/xher2k.h
Normal file
|
@ -0,0 +1,48 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xher2k routine. The precision is implemented using the template argument
|
||||||
|
// 'T', whereas the beta argument is of type 'U' (alpha remains of type 'T'). It is very similar to the
|
||||||
|
// Xsyr2k routine.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#ifndef CLBLAST_ROUTINES_XHER2K_H_
|
||||||
|
#define CLBLAST_ROUTINES_XHER2K_H_
|
||||||
|
|
||||||
|
#include "internal/routine.h"
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// See comment at top of file for a description of the class
|
||||||
|
template <typename T, typename U>
|
||||||
|
class Xher2k: public Routine {
|
||||||
|
public:
|
||||||
|
Xher2k(CommandQueue &queue, Event &event);
|
||||||
|
|
||||||
|
// Templated-precision implementation of the routine
|
||||||
|
StatusCode DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
|
||||||
|
const size_t n, const size_t k,
|
||||||
|
const T alpha,
|
||||||
|
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
|
const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
|
||||||
|
const U beta,
|
||||||
|
const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
|
||||||
|
|
||||||
|
private:
|
||||||
|
// Static variable to get the precision
|
||||||
|
const static Precision precision_;
|
||||||
|
};
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
||||||
|
|
||||||
|
// CLBLAST_ROUTINES_XHER2K_H_
|
||||||
|
#endif
|
47
include/internal/routines/level3/xherk.h
Normal file
47
include/internal/routines/level3/xherk.h
Normal file
|
@ -0,0 +1,47 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xherk routine. The precision is implemented using the template argument
|
||||||
|
// 'T', whereas the alpha/beta arguments are of type 'U'. The implementation is very similar to the
|
||||||
|
// Xsyrk routine.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#ifndef CLBLAST_ROUTINES_XHERK_H_
|
||||||
|
#define CLBLAST_ROUTINES_XHERK_H_
|
||||||
|
|
||||||
|
#include "internal/routine.h"
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// See comment at top of file for a description of the class
|
||||||
|
template <typename T, typename U>
|
||||||
|
class Xherk: public Routine {
|
||||||
|
public:
|
||||||
|
Xherk(CommandQueue &queue, Event &event);
|
||||||
|
|
||||||
|
// Templated-precision implementation of the routine
|
||||||
|
StatusCode DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
|
||||||
|
const size_t n, const size_t k,
|
||||||
|
const U alpha,
|
||||||
|
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
|
const U beta,
|
||||||
|
const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
|
||||||
|
|
||||||
|
private:
|
||||||
|
// Static variable to get the precision
|
||||||
|
const static Precision precision_;
|
||||||
|
};
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
||||||
|
|
||||||
|
// CLBLAST_ROUTINES_XHERK_H_
|
||||||
|
#endif
|
|
@ -17,7 +17,7 @@
|
||||||
#ifndef CLBLAST_ROUTINES_XSYMM_H_
|
#ifndef CLBLAST_ROUTINES_XSYMM_H_
|
||||||
#define CLBLAST_ROUTINES_XSYMM_H_
|
#define CLBLAST_ROUTINES_XSYMM_H_
|
||||||
|
|
||||||
#include "internal/routines/xgemm.h"
|
#include "internal/routines/level3/xgemm.h"
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
48
include/internal/routines/level3/xsyr2k.h
Normal file
48
include/internal/routines/level3/xsyr2k.h
Normal file
|
@ -0,0 +1,48 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xsyr2k routine. The precision is implemented using a template argument.
|
||||||
|
// The implementation is very similar to Xsyrk (see header for details), except for the fact that
|
||||||
|
// the main XgemmUpper/XgemmLower kernel is called twice: C = AB^T + C and C = BA^T + C.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#ifndef CLBLAST_ROUTINES_XSYR2K_H_
|
||||||
|
#define CLBLAST_ROUTINES_XSYR2K_H_
|
||||||
|
|
||||||
|
#include "internal/routine.h"
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// See comment at top of file for a description of the class
|
||||||
|
template <typename T>
|
||||||
|
class Xsyr2k: public Routine {
|
||||||
|
public:
|
||||||
|
Xsyr2k(CommandQueue &queue, Event &event);
|
||||||
|
|
||||||
|
// Templated-precision implementation of the routine
|
||||||
|
StatusCode DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
|
||||||
|
const size_t n, const size_t k,
|
||||||
|
const T alpha,
|
||||||
|
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
|
const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
|
||||||
|
const T beta,
|
||||||
|
const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
|
||||||
|
|
||||||
|
private:
|
||||||
|
// Static variable to get the precision
|
||||||
|
const static Precision precision_;
|
||||||
|
};
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
||||||
|
|
||||||
|
// CLBLAST_ROUTINES_XSYR2K_H_
|
||||||
|
#endif
|
49
include/internal/routines/level3/xsyrk.h
Normal file
49
include/internal/routines/level3/xsyrk.h
Normal file
|
@ -0,0 +1,49 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xsyrk routine. The precision is implemented using a template argument.
|
||||||
|
// The implementation is based on the regular Xgemm routine and kernel, but with two main changes:
|
||||||
|
// 1) The final unpad(transpose) kernel updates only the upper/lower triangular part.
|
||||||
|
// 2) The main Xgemm kernel masks workgroups not contributing to useful data. This is only for
|
||||||
|
// performance reasons, as the actual masking is done later (see the first point).
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#ifndef CLBLAST_ROUTINES_XSYRK_H_
|
||||||
|
#define CLBLAST_ROUTINES_XSYRK_H_
|
||||||
|
|
||||||
|
#include "internal/routine.h"
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// See comment at top of file for a description of the class
|
||||||
|
template <typename T>
|
||||||
|
class Xsyrk: public Routine {
|
||||||
|
public:
|
||||||
|
Xsyrk(CommandQueue &queue, Event &event);
|
||||||
|
|
||||||
|
// Templated-precision implementation of the routine
|
||||||
|
StatusCode DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
|
||||||
|
const size_t n, const size_t k,
|
||||||
|
const T alpha,
|
||||||
|
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
|
const T beta,
|
||||||
|
const Buffer &c_buffer, const size_t c_offset, const size_t c_ld);
|
||||||
|
|
||||||
|
private:
|
||||||
|
// Static variable to get the precision
|
||||||
|
const static Precision precision_;
|
||||||
|
};
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
||||||
|
|
||||||
|
// CLBLAST_ROUTINES_XSYRK_H_
|
||||||
|
#endif
|
58
include/internal/routines/level3/xtrmm.h
Normal file
58
include/internal/routines/level3/xtrmm.h
Normal file
|
@ -0,0 +1,58 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xtrmm routine. The implementation is based on first transforming the
|
||||||
|
// upper/lower unit/non-unit triangular matrix into a regular matrix and then calling the GEMM
|
||||||
|
// routine. Therefore, this class inherits from the Xgemm class.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#ifndef CLBLAST_ROUTINES_XTRMM_H_
|
||||||
|
#define CLBLAST_ROUTINES_XTRMM_H_
|
||||||
|
|
||||||
|
#include "internal/routines/level3/xgemm.h"
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// See comment at top of file for a description of the class
|
||||||
|
template <typename T>
|
||||||
|
class Xtrmm: public Xgemm<T> {
|
||||||
|
public:
|
||||||
|
|
||||||
|
// Uses several variables from the Routine class
|
||||||
|
using Routine::db_;
|
||||||
|
using Routine::context_;
|
||||||
|
|
||||||
|
// Uses several helper functions from the Routine class
|
||||||
|
using Routine::RunKernel;
|
||||||
|
using Routine::ErrorIn;
|
||||||
|
using Routine::TestMatrixA;
|
||||||
|
using Routine::GetProgramFromCache;
|
||||||
|
|
||||||
|
// Uses the regular Xgemm routine
|
||||||
|
using Xgemm<T>::DoGemm;
|
||||||
|
|
||||||
|
// Constructor
|
||||||
|
Xtrmm(CommandQueue &queue, Event &event);
|
||||||
|
|
||||||
|
// Templated-precision implementation of the routine
|
||||||
|
StatusCode DoTrmm(const Layout layout, const Side side, const Triangle triangle,
|
||||||
|
const Transpose a_transpose, const Diagonal diagonal,
|
||||||
|
const size_t m, const size_t n,
|
||||||
|
const T alpha,
|
||||||
|
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
|
const Buffer &b_buffer, const size_t b_offset, const size_t b_ld);
|
||||||
|
};
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
||||||
|
|
||||||
|
// CLBLAST_ROUTINES_XTRMM_H_
|
||||||
|
#endif
|
|
@ -46,6 +46,7 @@ constexpr auto kArgATransp = "transA";
|
||||||
constexpr auto kArgBTransp = "transB";
|
constexpr auto kArgBTransp = "transB";
|
||||||
constexpr auto kArgSide = "side";
|
constexpr auto kArgSide = "side";
|
||||||
constexpr auto kArgTriangle = "triangle";
|
constexpr auto kArgTriangle = "triangle";
|
||||||
|
constexpr auto kArgDiagonal = "diagonal";
|
||||||
constexpr auto kArgXInc = "incx";
|
constexpr auto kArgXInc = "incx";
|
||||||
constexpr auto kArgYInc = "incy";
|
constexpr auto kArgYInc = "incy";
|
||||||
constexpr auto kArgXOffset = "offx";
|
constexpr auto kArgXOffset = "offx";
|
||||||
|
@ -93,6 +94,7 @@ struct Arguments {
|
||||||
Transpose b_transpose = Transpose::kNo;
|
Transpose b_transpose = Transpose::kNo;
|
||||||
Side side = Side::kLeft;
|
Side side = Side::kLeft;
|
||||||
Triangle triangle = Triangle::kUpper;
|
Triangle triangle = Triangle::kUpper;
|
||||||
|
Diagonal diagonal = Diagonal::kUnit;
|
||||||
size_t x_inc = 1;
|
size_t x_inc = 1;
|
||||||
size_t y_inc = 1;
|
size_t y_inc = 1;
|
||||||
size_t x_offset = 0;
|
size_t x_offset = 0;
|
||||||
|
@ -105,6 +107,11 @@ struct Arguments {
|
||||||
size_t c_offset = 0;
|
size_t c_offset = 0;
|
||||||
T alpha = T{1.0};
|
T alpha = T{1.0};
|
||||||
T beta = T{1.0};
|
T beta = T{1.0};
|
||||||
|
size_t x_size = 1;
|
||||||
|
size_t y_size = 1;
|
||||||
|
size_t a_size = 1;
|
||||||
|
size_t b_size = 1;
|
||||||
|
size_t c_size = 1;
|
||||||
// Tuner-specific arguments
|
// Tuner-specific arguments
|
||||||
double fraction = 1.0;
|
double fraction = 1.0;
|
||||||
// Client-specific arguments
|
// Client-specific arguments
|
||||||
|
@ -123,6 +130,15 @@ struct Arguments {
|
||||||
bool no_abbrv = false;
|
bool no_abbrv = false;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Structure containing all possible buffers for test clients
|
||||||
|
struct Buffers {
|
||||||
|
Buffer x_vec;
|
||||||
|
Buffer y_vec;
|
||||||
|
Buffer a_mat;
|
||||||
|
Buffer b_mat;
|
||||||
|
Buffer c_mat;
|
||||||
|
};
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
// Converts a value (e.g. an integer) to a string. This also covers special cases for CLBlast
|
// Converts a value (e.g. an integer) to a string. This also covers special cases for CLBlast
|
||||||
|
|
362
src/clblast.cc
362
src/clblast.cc
|
@ -18,14 +18,20 @@
|
||||||
#include "clblast.h"
|
#include "clblast.h"
|
||||||
|
|
||||||
// BLAS level-1 includes
|
// BLAS level-1 includes
|
||||||
#include "internal/routines/xaxpy.h"
|
#include "internal/routines/level1/xaxpy.h"
|
||||||
|
|
||||||
// BLAS level-2 includes
|
// BLAS level-2 includes
|
||||||
#include "internal/routines/xgemv.h"
|
#include "internal/routines/level2/xgemv.h"
|
||||||
|
|
||||||
// BLAS level-3 includes
|
// BLAS level-3 includes
|
||||||
#include "internal/routines/xgemm.h"
|
#include "internal/routines/level3/xgemm.h"
|
||||||
#include "internal/routines/xsymm.h"
|
#include "internal/routines/level3/xsymm.h"
|
||||||
|
#include "internal/routines/level3/xhemm.h"
|
||||||
|
#include "internal/routines/level3/xsyrk.h"
|
||||||
|
#include "internal/routines/level3/xherk.h"
|
||||||
|
#include "internal/routines/level3/xsyr2k.h"
|
||||||
|
#include "internal/routines/level3/xher2k.h"
|
||||||
|
#include "internal/routines/level3/xtrmm.h"
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -41,10 +47,8 @@ StatusCode Axpy(const size_t n, const T alpha,
|
||||||
auto event_cpp = Event(*event);
|
auto event_cpp = Event(*event);
|
||||||
auto routine = Xaxpy<T>(queue_cpp, event_cpp);
|
auto routine = Xaxpy<T>(queue_cpp, event_cpp);
|
||||||
|
|
||||||
// Loads the kernel source-code as an include (C++11 raw string literal)
|
// Compiles the routine's device kernels
|
||||||
std::string kernel_source =
|
auto status = routine.SetUp();
|
||||||
#include "kernels/xaxpy.opencl"
|
|
||||||
auto status = routine.SetUp(kernel_source);
|
|
||||||
if (status != StatusCode::kSuccess) { return status; }
|
if (status != StatusCode::kSuccess) { return status; }
|
||||||
|
|
||||||
// Runs the routine
|
// Runs the routine
|
||||||
|
@ -74,7 +78,7 @@ template StatusCode Axpy<double2>(const size_t, const double2,
|
||||||
|
|
||||||
// GEMV
|
// GEMV
|
||||||
template <typename T>
|
template <typename T>
|
||||||
StatusCode Gemv(const Layout layout, const Transpose transpose_a,
|
StatusCode Gemv(const Layout layout, const Transpose a_transpose,
|
||||||
const size_t m, const size_t n, const T alpha,
|
const size_t m, const size_t n, const T alpha,
|
||||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta,
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const T beta,
|
||||||
|
@ -85,14 +89,12 @@ StatusCode Gemv(const Layout layout, const Transpose transpose_a,
|
||||||
auto event_cpp = Event(*event);
|
auto event_cpp = Event(*event);
|
||||||
auto routine = Xgemv<T>(queue_cpp, event_cpp);
|
auto routine = Xgemv<T>(queue_cpp, event_cpp);
|
||||||
|
|
||||||
// Loads the kernel source-code as an include (C++11 raw string literal)
|
// Compiles the routine's device kernels
|
||||||
std::string kernel_source =
|
auto status = routine.SetUp();
|
||||||
#include "kernels/xgemv.opencl"
|
|
||||||
auto status = routine.SetUp(kernel_source);
|
|
||||||
if (status != StatusCode::kSuccess) { return status; }
|
if (status != StatusCode::kSuccess) { return status; }
|
||||||
|
|
||||||
// Runs the routine
|
// Runs the routine
|
||||||
return routine.DoGemv(layout, transpose_a, m, n, alpha,
|
return routine.DoGemv(layout, a_transpose, m, n, alpha,
|
||||||
Buffer(a_buffer), a_offset, a_ld,
|
Buffer(a_buffer), a_offset, a_ld,
|
||||||
Buffer(x_buffer), x_offset, x_inc, beta,
|
Buffer(x_buffer), x_offset, x_inc, beta,
|
||||||
Buffer(y_buffer), y_offset, y_inc);
|
Buffer(y_buffer), y_offset, y_inc);
|
||||||
|
@ -127,7 +129,7 @@ template StatusCode Gemv<double2>(const Layout, const Transpose,
|
||||||
|
|
||||||
// GEMM
|
// GEMM
|
||||||
template <typename T>
|
template <typename T>
|
||||||
StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpose transpose_b,
|
StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
|
||||||
const size_t m, const size_t n, const size_t k, const T alpha,
|
const size_t m, const size_t n, const size_t k, const T alpha,
|
||||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta,
|
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta,
|
||||||
|
@ -137,23 +139,12 @@ StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpos
|
||||||
auto event_cpp = Event(*event);
|
auto event_cpp = Event(*event);
|
||||||
auto routine = Xgemm<T>(queue_cpp, event_cpp);
|
auto routine = Xgemm<T>(queue_cpp, event_cpp);
|
||||||
|
|
||||||
// Loads the kernel source-code as an include (C++11 raw string literal)
|
// Compiles the routine's device kernels
|
||||||
std::string common_source1 =
|
auto status = routine.SetUp();
|
||||||
#include "kernels/copy.opencl"
|
|
||||||
std::string common_source2 =
|
|
||||||
#include "kernels/pad.opencl"
|
|
||||||
std::string common_source3 =
|
|
||||||
#include "kernels/transpose.opencl"
|
|
||||||
std::string common_source4 =
|
|
||||||
#include "kernels/padtranspose.opencl"
|
|
||||||
std::string kernel_source =
|
|
||||||
#include "kernels/xgemm.opencl"
|
|
||||||
auto status = routine.SetUp(common_source1 + common_source2 + common_source3 + common_source4 +
|
|
||||||
kernel_source);
|
|
||||||
if (status != StatusCode::kSuccess) { return status; }
|
if (status != StatusCode::kSuccess) { return status; }
|
||||||
|
|
||||||
// Runs the routine
|
// Runs the routine
|
||||||
return routine.DoGemm(layout, transpose_a, transpose_b, m, n, k, alpha,
|
return routine.DoGemm(layout, a_transpose, b_transpose, m, n, k, alpha,
|
||||||
Buffer(a_buffer), a_offset, a_ld,
|
Buffer(a_buffer), a_offset, a_ld,
|
||||||
Buffer(b_buffer), b_offset, b_ld, beta,
|
Buffer(b_buffer), b_offset, b_ld, beta,
|
||||||
Buffer(c_buffer), c_offset, c_ld);
|
Buffer(c_buffer), c_offset, c_ld);
|
||||||
|
@ -197,19 +188,8 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
|
||||||
auto event_cpp = Event(*event);
|
auto event_cpp = Event(*event);
|
||||||
auto routine = Xsymm<T>(queue_cpp, event_cpp);
|
auto routine = Xsymm<T>(queue_cpp, event_cpp);
|
||||||
|
|
||||||
// Loads the kernel source-code as an include (C++11 raw string literal)
|
// Compiles the routine's device kernels
|
||||||
std::string common_source1 =
|
auto status = routine.SetUp();
|
||||||
#include "kernels/copy.opencl"
|
|
||||||
std::string common_source2 =
|
|
||||||
#include "kernels/pad.opencl"
|
|
||||||
std::string common_source3 =
|
|
||||||
#include "kernels/transpose.opencl"
|
|
||||||
std::string common_source4 =
|
|
||||||
#include "kernels/padtranspose.opencl"
|
|
||||||
std::string kernel_source =
|
|
||||||
#include "kernels/xgemm.opencl"
|
|
||||||
auto status = routine.SetUp(common_source1 + common_source2 + common_source3 + common_source4 +
|
|
||||||
kernel_source);
|
|
||||||
if (status != StatusCode::kSuccess) { return status; }
|
if (status != StatusCode::kSuccess) { return status; }
|
||||||
|
|
||||||
// Runs the routine
|
// Runs the routine
|
||||||
|
@ -244,4 +224,302 @@ template StatusCode Symm<double2>(const Layout, const Side, const Triangle,
|
||||||
cl_command_queue*, cl_event*);
|
cl_command_queue*, cl_event*);
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
|
// HEMM
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle,
|
||||||
|
const size_t m, const size_t n, const T alpha,
|
||||||
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
|
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta,
|
||||||
|
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||||
|
cl_command_queue* queue, cl_event* event) {
|
||||||
|
auto queue_cpp = CommandQueue(*queue);
|
||||||
|
auto event_cpp = Event(*event);
|
||||||
|
auto routine = Xhemm<T>(queue_cpp, event_cpp);
|
||||||
|
|
||||||
|
// Compiles the routine's device kernels
|
||||||
|
auto status = routine.SetUp();
|
||||||
|
if (status != StatusCode::kSuccess) { return status; }
|
||||||
|
|
||||||
|
// Runs the routine
|
||||||
|
return routine.DoHemm(layout, side, triangle, m, n, alpha,
|
||||||
|
Buffer(a_buffer), a_offset, a_ld,
|
||||||
|
Buffer(b_buffer), b_offset, b_ld, beta,
|
||||||
|
Buffer(c_buffer), c_offset, c_ld);
|
||||||
|
}
|
||||||
|
template StatusCode Hemm<float2>(const Layout, const Side, const Triangle,
|
||||||
|
const size_t, const size_t, const float2,
|
||||||
|
const cl_mem, const size_t, const size_t,
|
||||||
|
const cl_mem, const size_t, const size_t, const float2,
|
||||||
|
cl_mem, const size_t, const size_t,
|
||||||
|
cl_command_queue*, cl_event*);
|
||||||
|
template StatusCode Hemm<double2>(const Layout, const Side, const Triangle,
|
||||||
|
const size_t, const size_t, const double2,
|
||||||
|
const cl_mem, const size_t, const size_t,
|
||||||
|
const cl_mem, const size_t, const size_t, const double2,
|
||||||
|
cl_mem, const size_t, const size_t,
|
||||||
|
cl_command_queue*, cl_event*);
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// SYRK
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
|
||||||
|
const size_t n, const size_t k, const T alpha,
|
||||||
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const T beta,
|
||||||
|
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||||
|
cl_command_queue* queue, cl_event* event) {
|
||||||
|
auto queue_cpp = CommandQueue(*queue);
|
||||||
|
auto event_cpp = Event(*event);
|
||||||
|
auto routine = Xsyrk<T>(queue_cpp, event_cpp);
|
||||||
|
|
||||||
|
// Compiles the routine's device kernels
|
||||||
|
auto status = routine.SetUp();
|
||||||
|
if (status != StatusCode::kSuccess) { return status; }
|
||||||
|
|
||||||
|
// Runs the routine
|
||||||
|
return routine.DoSyrk(layout, triangle, a_transpose, n, k, alpha,
|
||||||
|
Buffer(a_buffer), a_offset, a_ld, beta,
|
||||||
|
Buffer(c_buffer), c_offset, c_ld);
|
||||||
|
}
|
||||||
|
template StatusCode Syrk<float>(const Layout, const Triangle, const Transpose,
|
||||||
|
const size_t, const size_t, const float,
|
||||||
|
const cl_mem, const size_t, const size_t, const float,
|
||||||
|
cl_mem, const size_t, const size_t,
|
||||||
|
cl_command_queue*, cl_event*);
|
||||||
|
template StatusCode Syrk<double>(const Layout, const Triangle, const Transpose,
|
||||||
|
const size_t, const size_t, const double,
|
||||||
|
const cl_mem, const size_t, const size_t, const double,
|
||||||
|
cl_mem, const size_t, const size_t,
|
||||||
|
cl_command_queue*, cl_event*);
|
||||||
|
template StatusCode Syrk<float2>(const Layout, const Triangle, const Transpose,
|
||||||
|
const size_t, const size_t, const float2,
|
||||||
|
const cl_mem, const size_t, const size_t, const float2,
|
||||||
|
cl_mem, const size_t, const size_t,
|
||||||
|
cl_command_queue*, cl_event*);
|
||||||
|
template StatusCode Syrk<double2>(const Layout, const Triangle, const Transpose,
|
||||||
|
const size_t, const size_t, const double2,
|
||||||
|
const cl_mem, const size_t, const size_t, const double2,
|
||||||
|
cl_mem, const size_t, const size_t,
|
||||||
|
cl_command_queue*, cl_event*);
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// HERK
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
|
||||||
|
const size_t n, const size_t k, const T alpha,
|
||||||
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const T beta,
|
||||||
|
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||||
|
cl_command_queue* queue, cl_event* event) {
|
||||||
|
auto queue_cpp = CommandQueue(*queue);
|
||||||
|
auto event_cpp = Event(*event);
|
||||||
|
auto routine = Xherk<std::complex<T>,T>(queue_cpp, event_cpp);
|
||||||
|
|
||||||
|
// Compiles the routine's device kernels
|
||||||
|
auto status = routine.SetUp();
|
||||||
|
if (status != StatusCode::kSuccess) { return status; }
|
||||||
|
|
||||||
|
// Runs the routine
|
||||||
|
return routine.DoHerk(layout, triangle, a_transpose, n, k, alpha,
|
||||||
|
Buffer(a_buffer), a_offset, a_ld, beta,
|
||||||
|
Buffer(c_buffer), c_offset, c_ld);
|
||||||
|
}
|
||||||
|
template StatusCode Herk<float>(const Layout, const Triangle, const Transpose,
|
||||||
|
const size_t, const size_t, const float,
|
||||||
|
const cl_mem, const size_t, const size_t, const float,
|
||||||
|
cl_mem, const size_t, const size_t,
|
||||||
|
cl_command_queue*, cl_event*);
|
||||||
|
template StatusCode Herk<double>(const Layout, const Triangle, const Transpose,
|
||||||
|
const size_t, const size_t, const double,
|
||||||
|
const cl_mem, const size_t, const size_t, const double,
|
||||||
|
cl_mem, const size_t, const size_t,
|
||||||
|
cl_command_queue*, cl_event*);
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// SYR2K
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
|
||||||
|
const size_t n, const size_t k, const T alpha,
|
||||||
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
|
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta,
|
||||||
|
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||||
|
cl_command_queue* queue, cl_event* event) {
|
||||||
|
auto queue_cpp = CommandQueue(*queue);
|
||||||
|
auto event_cpp = Event(*event);
|
||||||
|
auto routine = Xsyr2k<T>(queue_cpp, event_cpp);
|
||||||
|
|
||||||
|
// Compiles the routine's device kernels
|
||||||
|
auto status = routine.SetUp();
|
||||||
|
if (status != StatusCode::kSuccess) { return status; }
|
||||||
|
|
||||||
|
// Runs the routine
|
||||||
|
return routine.DoSyr2k(layout, triangle, ab_transpose, n, k, alpha,
|
||||||
|
Buffer(a_buffer), a_offset, a_ld,
|
||||||
|
Buffer(b_buffer), b_offset, b_ld, beta,
|
||||||
|
Buffer(c_buffer), c_offset, c_ld);
|
||||||
|
}
|
||||||
|
template StatusCode Syr2k<float>(const Layout, const Triangle, const Transpose,
|
||||||
|
const size_t, const size_t, const float,
|
||||||
|
const cl_mem, const size_t, const size_t,
|
||||||
|
const cl_mem, const size_t, const size_t, const float,
|
||||||
|
cl_mem, const size_t, const size_t,
|
||||||
|
cl_command_queue*, cl_event*);
|
||||||
|
template StatusCode Syr2k<double>(const Layout, const Triangle, const Transpose,
|
||||||
|
const size_t, const size_t, const double,
|
||||||
|
const cl_mem, const size_t, const size_t,
|
||||||
|
const cl_mem, const size_t, const size_t, const double,
|
||||||
|
cl_mem, const size_t, const size_t,
|
||||||
|
cl_command_queue*, cl_event*);
|
||||||
|
template StatusCode Syr2k<float2>(const Layout, const Triangle, const Transpose,
|
||||||
|
const size_t, const size_t, const float2,
|
||||||
|
const cl_mem, const size_t, const size_t,
|
||||||
|
const cl_mem, const size_t, const size_t, const float2,
|
||||||
|
cl_mem, const size_t, const size_t,
|
||||||
|
cl_command_queue*, cl_event*);
|
||||||
|
template StatusCode Syr2k<double2>(const Layout, const Triangle, const Transpose,
|
||||||
|
const size_t, const size_t, const double2,
|
||||||
|
const cl_mem, const size_t, const size_t,
|
||||||
|
const cl_mem, const size_t, const size_t, const double2,
|
||||||
|
cl_mem, const size_t, const size_t,
|
||||||
|
cl_command_queue*, cl_event*);
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// HER2K
|
||||||
|
template <typename T, typename U>
|
||||||
|
StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
|
||||||
|
const size_t n, const size_t k, const T alpha,
|
||||||
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
|
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const U beta,
|
||||||
|
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||||
|
cl_command_queue* queue, cl_event* event) {
|
||||||
|
auto queue_cpp = CommandQueue(*queue);
|
||||||
|
auto event_cpp = Event(*event);
|
||||||
|
auto routine = Xher2k<T,U>(queue_cpp, event_cpp);
|
||||||
|
|
||||||
|
// Compiles the routine's device kernels
|
||||||
|
auto status = routine.SetUp();
|
||||||
|
if (status != StatusCode::kSuccess) { return status; }
|
||||||
|
|
||||||
|
// Runs the routine
|
||||||
|
return routine.DoHer2k(layout, triangle, ab_transpose, n, k, alpha,
|
||||||
|
Buffer(a_buffer), a_offset, a_ld,
|
||||||
|
Buffer(b_buffer), b_offset, b_ld, beta,
|
||||||
|
Buffer(c_buffer), c_offset, c_ld);
|
||||||
|
}
|
||||||
|
template StatusCode Her2k<float2,float>(const Layout, const Triangle, const Transpose,
|
||||||
|
const size_t, const size_t, const float2,
|
||||||
|
const cl_mem, const size_t, const size_t,
|
||||||
|
const cl_mem, const size_t, const size_t, const float,
|
||||||
|
cl_mem, const size_t, const size_t,
|
||||||
|
cl_command_queue*, cl_event*);
|
||||||
|
template StatusCode Her2k<double2,double>(const Layout, const Triangle, const Transpose,
|
||||||
|
const size_t, const size_t, const double2,
|
||||||
|
const cl_mem, const size_t, const size_t,
|
||||||
|
const cl_mem, const size_t, const size_t, const double,
|
||||||
|
cl_mem, const size_t, const size_t,
|
||||||
|
cl_command_queue*, cl_event*);
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// TRMM
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle,
|
||||||
|
const Transpose a_transpose, const Diagonal diagonal,
|
||||||
|
const size_t m, const size_t n,
|
||||||
|
const T alpha,
|
||||||
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
|
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||||
|
cl_command_queue* queue, cl_event* event) {
|
||||||
|
auto queue_cpp = CommandQueue(*queue);
|
||||||
|
auto event_cpp = Event(*event);
|
||||||
|
auto routine = Xtrmm<T>(queue_cpp, event_cpp);
|
||||||
|
|
||||||
|
// Compiles the routine's device kernels
|
||||||
|
auto status = routine.SetUp();
|
||||||
|
if (status != StatusCode::kSuccess) { return status; }
|
||||||
|
|
||||||
|
// Runs the routine
|
||||||
|
return routine.DoTrmm(layout, side, triangle, a_transpose, diagonal, m, n, alpha,
|
||||||
|
Buffer(a_buffer), a_offset, a_ld,
|
||||||
|
Buffer(b_buffer), b_offset, b_ld);
|
||||||
|
}
|
||||||
|
template StatusCode Trmm<float>(const Layout, const Side, const Triangle,
|
||||||
|
const Transpose, const Diagonal,
|
||||||
|
const size_t, const size_t, const float,
|
||||||
|
const cl_mem, const size_t, const size_t,
|
||||||
|
cl_mem, const size_t, const size_t,
|
||||||
|
cl_command_queue*, cl_event*);
|
||||||
|
template StatusCode Trmm<double>(const Layout, const Side, const Triangle,
|
||||||
|
const Transpose, const Diagonal,
|
||||||
|
const size_t, const size_t, const double,
|
||||||
|
const cl_mem, const size_t, const size_t,
|
||||||
|
cl_mem, const size_t, const size_t,
|
||||||
|
cl_command_queue*, cl_event*);
|
||||||
|
template StatusCode Trmm<float2>(const Layout, const Side, const Triangle,
|
||||||
|
const Transpose, const Diagonal,
|
||||||
|
const size_t, const size_t, const float2,
|
||||||
|
const cl_mem, const size_t, const size_t,
|
||||||
|
cl_mem, const size_t, const size_t,
|
||||||
|
cl_command_queue*, cl_event*);
|
||||||
|
template StatusCode Trmm<double2>(const Layout, const Side, const Triangle,
|
||||||
|
const Transpose, const Diagonal,
|
||||||
|
const size_t, const size_t, const double2,
|
||||||
|
const cl_mem, const size_t, const size_t,
|
||||||
|
cl_mem, const size_t, const size_t,
|
||||||
|
cl_command_queue*, cl_event*);
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// TRSM
|
||||||
|
/*
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle,
|
||||||
|
const Transpose a_transpose, const Diagonal diagonal,
|
||||||
|
const size_t m, const size_t n,
|
||||||
|
const T alpha,
|
||||||
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
|
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||||
|
cl_command_queue* queue, cl_event* event) {
|
||||||
|
auto queue_cpp = CommandQueue(*queue);
|
||||||
|
auto event_cpp = Event(*event);
|
||||||
|
auto routine = Xtrsm<T>(queue_cpp, event_cpp);
|
||||||
|
|
||||||
|
// Compiles the routine's device kernels
|
||||||
|
auto status = routine.SetUp();
|
||||||
|
if (status != StatusCode::kSuccess) { return status; }
|
||||||
|
|
||||||
|
// Runs the routine
|
||||||
|
return routine.DoTrsm(layout, side, triangle, a_transpose, diagonal, m, n, alpha,
|
||||||
|
Buffer(a_buffer), a_offset, a_ld,
|
||||||
|
Buffer(b_buffer), b_offset, b_ld);
|
||||||
|
}
|
||||||
|
template StatusCode Trsm<float>(const Layout, const Side, const Triangle,
|
||||||
|
const Transpose, const Diagonal,
|
||||||
|
const size_t, const size_t, const float,
|
||||||
|
const cl_mem, const size_t, const size_t,
|
||||||
|
cl_mem, const size_t, const size_t,
|
||||||
|
cl_command_queue*, cl_event*);
|
||||||
|
template StatusCode Trsm<double>(const Layout, const Side, const Triangle,
|
||||||
|
const Transpose, const Diagonal,
|
||||||
|
const size_t, const size_t, const double,
|
||||||
|
const cl_mem, const size_t, const size_t,
|
||||||
|
cl_mem, const size_t, const size_t,
|
||||||
|
cl_command_queue*, cl_event*);
|
||||||
|
template StatusCode Trsm<float2>(const Layout, const Side, const Triangle,
|
||||||
|
const Transpose, const Diagonal,
|
||||||
|
const size_t, const size_t, const float2,
|
||||||
|
const cl_mem, const size_t, const size_t,
|
||||||
|
cl_mem, const size_t, const size_t,
|
||||||
|
cl_command_queue*, cl_event*);
|
||||||
|
template StatusCode Trsm<double2>(const Layout, const Side, const Triangle,
|
||||||
|
const Transpose, const Diagonal,
|
||||||
|
const size_t, const size_t, const double2,
|
||||||
|
const cl_mem, const size_t, const size_t,
|
||||||
|
cl_mem, const size_t, const size_t,
|
||||||
|
cl_command_queue*, cl_event*);
|
||||||
|
*/
|
||||||
|
// =================================================================================================
|
||||||
} // namespace clblast
|
} // namespace clblast
|
||||||
|
|
|
@ -39,6 +39,7 @@ R"(
|
||||||
typedef float8 real8;
|
typedef float8 real8;
|
||||||
typedef float16 real16;
|
typedef float16 real16;
|
||||||
#define ZERO 0.0f
|
#define ZERO 0.0f
|
||||||
|
#define ONE 1.0f
|
||||||
|
|
||||||
// Double-precision
|
// Double-precision
|
||||||
#elif PRECISION == 64
|
#elif PRECISION == 64
|
||||||
|
@ -48,6 +49,7 @@ R"(
|
||||||
typedef double8 real8;
|
typedef double8 real8;
|
||||||
typedef double16 real16;
|
typedef double16 real16;
|
||||||
#define ZERO 0.0
|
#define ZERO 0.0
|
||||||
|
#define ONE 1.0
|
||||||
|
|
||||||
// Complex single-precision
|
// Complex single-precision
|
||||||
#elif PRECISION == 3232
|
#elif PRECISION == 3232
|
||||||
|
@ -61,6 +63,7 @@ R"(
|
||||||
real s8; real s9; real sA; real sB;
|
real s8; real s9; real sA; real sB;
|
||||||
real sC; real sD; real sE; real sF;} real16;
|
real sC; real sD; real sE; real sF;} real16;
|
||||||
#define ZERO 0.0f
|
#define ZERO 0.0f
|
||||||
|
#define ONE 1.0f
|
||||||
|
|
||||||
// Complex Double-precision
|
// Complex Double-precision
|
||||||
#elif PRECISION == 6464
|
#elif PRECISION == 6464
|
||||||
|
@ -74,12 +77,16 @@ R"(
|
||||||
real s8; real s9; real sA; real sB;
|
real s8; real s9; real sA; real sB;
|
||||||
real sC; real sD; real sE; real sF;} real16;
|
real sC; real sD; real sE; real sF;} real16;
|
||||||
#define ZERO 0.0
|
#define ZERO 0.0
|
||||||
|
#define ONE 1.0
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
// Don't use the non-IEEE754 compliant OpenCL built-in mad() instruction
|
// Don't use the non-IEEE754 compliant OpenCL built-in mad() instruction by default. For specific
|
||||||
#define USE_CL_MAD 0
|
// devices, this is enabled (see src/routine.cc).
|
||||||
|
#ifndef USE_CL_MAD
|
||||||
|
#define USE_CL_MAD 0
|
||||||
|
#endif
|
||||||
|
|
||||||
// Sets a variable to zero
|
// Sets a variable to zero
|
||||||
#if PRECISION == 3232 || PRECISION == 6464
|
#if PRECISION == 3232 || PRECISION == 6464
|
||||||
|
@ -88,6 +95,20 @@ R"(
|
||||||
#define SetToZero(a) a = ZERO
|
#define SetToZero(a) a = ZERO
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// Sets a variable to zero (only the imaginary part)
|
||||||
|
#if PRECISION == 3232 || PRECISION == 6464
|
||||||
|
#define ImagToZero(a) a.y = ZERO
|
||||||
|
#else
|
||||||
|
#define ImagToZero(a)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Sets a variable to one
|
||||||
|
#if PRECISION == 3232 || PRECISION == 6464
|
||||||
|
#define SetToOne(a) a.x = ONE; a.y = ZERO
|
||||||
|
#else
|
||||||
|
#define SetToOne(a) a = ONE
|
||||||
|
#endif
|
||||||
|
|
||||||
// Multiply two complex variables (used in the define below)
|
// Multiply two complex variables (used in the define below)
|
||||||
#if PRECISION == 3232 || PRECISION == 6464
|
#if PRECISION == 3232 || PRECISION == 6464
|
||||||
#define MulReal(a, b) a.x*b.x - a.y*b.y
|
#define MulReal(a, b) a.x*b.x - a.y*b.y
|
||||||
|
@ -122,6 +143,6 @@ R"(
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
// End of the C++11 raw string literal
|
// End of the C++11 raw string literal
|
||||||
)";
|
)"
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
|
@ -68,6 +68,6 @@ __kernel void CopyMatrix(const int ld,
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
// End of the C++11 raw string literal
|
// End of the C++11 raw string literal
|
||||||
)";
|
)"
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
|
@ -86,7 +86,9 @@ __kernel void UnPadMatrix(const int src_one, const int src_two,
|
||||||
__global const real* restrict src,
|
__global const real* restrict src,
|
||||||
const int dest_one, const int dest_two,
|
const int dest_one, const int dest_two,
|
||||||
const int dest_ld, const int dest_offset,
|
const int dest_ld, const int dest_offset,
|
||||||
__global real* dest) {
|
__global real* dest,
|
||||||
|
const int upper, const int lower,
|
||||||
|
const int diagonal_imag_zero) {
|
||||||
|
|
||||||
// Loops over the work per thread in both dimensions
|
// Loops over the work per thread in both dimensions
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
|
@ -95,11 +97,20 @@ __kernel void UnPadMatrix(const int src_one, const int src_two,
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
|
for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
|
||||||
const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
|
const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
|
||||||
if (id_two < dest_two && id_one < dest_one) {
|
|
||||||
|
// Masking in case of triangular matrices: updates only the upper or lower part
|
||||||
|
bool condition = true;
|
||||||
|
if (upper == 1) { condition = (id_two >= id_one); }
|
||||||
|
else if (lower == 1) { condition = (id_two <= id_one); }
|
||||||
|
if (condition) {
|
||||||
|
|
||||||
// Copies the value into the destination matrix. This is always within bounds of the source
|
// Copies the value into the destination matrix. This is always within bounds of the source
|
||||||
// matrix, as we know that the destination matrix is smaller than the source.
|
// matrix, as we know that the destination matrix is smaller than the source.
|
||||||
dest[id_two*dest_ld + id_one + dest_offset] = src[id_two*src_ld + id_one + src_offset];
|
if (id_two < dest_two && id_one < dest_one) {
|
||||||
|
real value = src[id_two*src_ld + id_one + src_offset];
|
||||||
|
if (diagonal_imag_zero == 1 && id_one == id_two) { ImagToZero(value); }
|
||||||
|
dest[id_two*dest_ld + id_one + dest_offset] = value;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -127,15 +138,15 @@ __kernel void SymmLowerToSquared(const int src_dim,
|
||||||
if (id_two < dest_dim && id_one < dest_dim) {
|
if (id_two < dest_dim && id_one < dest_dim) {
|
||||||
|
|
||||||
// Loads data from the lower-symmetric matrix
|
// Loads data from the lower-symmetric matrix
|
||||||
real value;
|
real result;
|
||||||
SetToZero(value);
|
SetToZero(result);
|
||||||
if (id_two < src_dim && id_one < src_dim) {
|
if (id_two < src_dim && id_one < src_dim) {
|
||||||
if (id_two <= id_one) { value = src[id_two*src_ld + id_one + src_offset]; }
|
if (id_two <= id_one) { result = src[id_two*src_ld + id_one + src_offset]; }
|
||||||
else { value = src[id_one*src_ld + id_two + src_offset]; }
|
else { result = src[id_one*src_ld + id_two + src_offset]; }
|
||||||
}
|
}
|
||||||
|
|
||||||
// Stores the value in the destination matrix
|
// Stores the result in the destination matrix
|
||||||
dest[id_two*dest_ld + id_one + dest_offset] = value;
|
dest[id_two*dest_ld + id_one + dest_offset] = result;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -160,15 +171,171 @@ __kernel void SymmUpperToSquared(const int src_dim,
|
||||||
if (id_two < dest_dim && id_one < dest_dim) {
|
if (id_two < dest_dim && id_one < dest_dim) {
|
||||||
|
|
||||||
// Loads data from the upper-symmetric matrix
|
// Loads data from the upper-symmetric matrix
|
||||||
real value;
|
real result;
|
||||||
SetToZero(value);
|
SetToZero(result);
|
||||||
if (id_two < src_dim && id_one < src_dim) {
|
if (id_two < src_dim && id_one < src_dim) {
|
||||||
if (id_one <= id_two) { value = src[id_two*src_ld + id_one + src_offset]; }
|
if (id_one <= id_two) { result = src[id_two*src_ld + id_one + src_offset]; }
|
||||||
else { value = src[id_one*src_ld + id_two + src_offset]; }
|
else { result = src[id_one*src_ld + id_two + src_offset]; }
|
||||||
}
|
}
|
||||||
|
|
||||||
// Stores the value in the destination matrix
|
// Stores the result in the destination matrix
|
||||||
dest[id_two*dest_ld + id_one + dest_offset] = value;
|
dest[id_two*dest_ld + id_one + dest_offset] = result;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
#if PRECISION == 3232 || PRECISION == 6464
|
||||||
|
|
||||||
|
// Kernel to populate a squared hermitian matrix, given that the triangle which holds the data is
|
||||||
|
// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
|
||||||
|
__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
|
||||||
|
__kernel void HermLowerToSquared(const int src_dim,
|
||||||
|
const int src_ld, const int src_offset,
|
||||||
|
__global const real* restrict src,
|
||||||
|
const int dest_dim,
|
||||||
|
const int dest_ld, const int dest_offset,
|
||||||
|
__global real* dest) {
|
||||||
|
|
||||||
|
// Loops over the work per thread in both dimensions
|
||||||
|
#pragma unroll
|
||||||
|
for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
|
||||||
|
const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
|
||||||
|
#pragma unroll
|
||||||
|
for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
|
||||||
|
const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
|
||||||
|
if (id_two < dest_dim && id_one < dest_dim) {
|
||||||
|
|
||||||
|
// Loads data from the lower-hermitian matrix
|
||||||
|
real result;
|
||||||
|
SetToZero(result);
|
||||||
|
if (id_two < src_dim && id_one < src_dim) {
|
||||||
|
if (id_two <= id_one) {
|
||||||
|
result = src[id_two*src_ld + id_one + src_offset];
|
||||||
|
if (id_one == id_two) { result.y = ZERO; }
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
result = src[id_one*src_ld + id_two + src_offset];
|
||||||
|
COMPLEX_CONJUGATE(result);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Stores the result in the destination matrix
|
||||||
|
dest[id_two*dest_ld + id_one + dest_offset] = result;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Same as above, but now the matrix' data is stored in the upper-triangle
|
||||||
|
__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
|
||||||
|
__kernel void HermUpperToSquared(const int src_dim,
|
||||||
|
const int src_ld, const int src_offset,
|
||||||
|
__global const real* restrict src,
|
||||||
|
const int dest_dim,
|
||||||
|
const int dest_ld, const int dest_offset,
|
||||||
|
__global real* dest) {
|
||||||
|
|
||||||
|
// Loops over the work per thread in both dimensions
|
||||||
|
#pragma unroll
|
||||||
|
for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
|
||||||
|
const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
|
||||||
|
#pragma unroll
|
||||||
|
for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
|
||||||
|
const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
|
||||||
|
if (id_two < dest_dim && id_one < dest_dim) {
|
||||||
|
|
||||||
|
// Loads data from the upper-hermitian matrix
|
||||||
|
real result;
|
||||||
|
SetToZero(result);
|
||||||
|
if (id_two < src_dim && id_one < src_dim) {
|
||||||
|
if (id_one <= id_two) {
|
||||||
|
result = src[id_two*src_ld + id_one + src_offset];
|
||||||
|
if (id_one == id_two) { result.y = ZERO; }
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
result = src[id_one*src_ld + id_two + src_offset];
|
||||||
|
COMPLEX_CONJUGATE(result);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Stores the result in the destination matrix
|
||||||
|
dest[id_two*dest_ld + id_one + dest_offset] = result;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Kernel to populate a squared triangular matrix, given that the triangle which holds the data is
|
||||||
|
// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
|
||||||
|
__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
|
||||||
|
__kernel void TrmmLowerToSquared(const int src_dim,
|
||||||
|
const int src_ld, const int src_offset,
|
||||||
|
__global const real* restrict src,
|
||||||
|
const int dest_dim,
|
||||||
|
const int dest_ld, const int dest_offset,
|
||||||
|
__global real* dest,
|
||||||
|
const int unit_diagonal) {
|
||||||
|
|
||||||
|
// Loops over the work per thread in both dimensions
|
||||||
|
#pragma unroll
|
||||||
|
for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
|
||||||
|
const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
|
||||||
|
#pragma unroll
|
||||||
|
for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
|
||||||
|
const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
|
||||||
|
if (id_two < dest_dim && id_one < dest_dim) {
|
||||||
|
|
||||||
|
// Loads data from the lower-triangular matrix
|
||||||
|
real result;
|
||||||
|
SetToZero(result);
|
||||||
|
if (id_two < src_dim && id_one < src_dim) {
|
||||||
|
if (id_two <= id_one) { result = src[id_two*src_ld + id_one + src_offset]; }
|
||||||
|
if (id_two == id_one && unit_diagonal) { SetToOne(result); }
|
||||||
|
// Else: result is zero
|
||||||
|
}
|
||||||
|
|
||||||
|
// Stores the result in the destination matrix
|
||||||
|
dest[id_two*dest_ld + id_one + dest_offset] = result;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Same as above, but now the matrix' data is stored in the upper-triangle
|
||||||
|
__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
|
||||||
|
__kernel void TrmmUpperToSquared(const int src_dim,
|
||||||
|
const int src_ld, const int src_offset,
|
||||||
|
__global const real* restrict src,
|
||||||
|
const int dest_dim,
|
||||||
|
const int dest_ld, const int dest_offset,
|
||||||
|
__global real* dest,
|
||||||
|
const int unit_diagonal) {
|
||||||
|
|
||||||
|
// Loops over the work per thread in both dimensions
|
||||||
|
#pragma unroll
|
||||||
|
for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
|
||||||
|
const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
|
||||||
|
#pragma unroll
|
||||||
|
for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
|
||||||
|
const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
|
||||||
|
if (id_two < dest_dim && id_one < dest_dim) {
|
||||||
|
|
||||||
|
// Loads data from the upper-triangular matrix
|
||||||
|
real result;
|
||||||
|
SetToZero(result);
|
||||||
|
if (id_two < src_dim && id_one < src_dim) {
|
||||||
|
if (id_one <= id_two) { result = src[id_two*src_ld + id_one + src_offset]; }
|
||||||
|
if (id_one == id_two && unit_diagonal) { SetToOne(result); }
|
||||||
|
// Else: result is zero
|
||||||
|
}
|
||||||
|
|
||||||
|
// Stores the result in the destination matrix
|
||||||
|
dest[id_two*dest_ld + id_one + dest_offset] = result;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -177,6 +344,6 @@ __kernel void SymmUpperToSquared(const int src_dim,
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
// End of the C++11 raw string literal
|
// End of the C++11 raw string literal
|
||||||
)";
|
)"
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
|
@ -100,7 +100,9 @@ __kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
|
||||||
__global const real* restrict src,
|
__global const real* restrict src,
|
||||||
const int dest_one, const int dest_two,
|
const int dest_one, const int dest_two,
|
||||||
const int dest_ld, const int dest_offset,
|
const int dest_ld, const int dest_offset,
|
||||||
__global real* dest) {
|
__global real* dest,
|
||||||
|
const int upper, const int lower,
|
||||||
|
const int diagonal_imag_zero) {
|
||||||
|
|
||||||
// Local memory to store a tile of the matrix (for coalescing)
|
// Local memory to store a tile of the matrix (for coalescing)
|
||||||
__local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
|
__local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
|
||||||
|
@ -137,10 +139,18 @@ __kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
|
||||||
const int id_dest_one = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(0);
|
const int id_dest_one = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(0);
|
||||||
const int id_dest_two = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(1);
|
const int id_dest_two = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(1);
|
||||||
|
|
||||||
// Stores the transposed value in the destination matrix
|
// Masking in case of triangular matrices: updates only the upper or lower part
|
||||||
if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
|
bool condition = true;
|
||||||
real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
|
if (upper == 1) { condition = (id_dest_one >= id_dest_two); }
|
||||||
dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
|
else if (lower == 1) { condition = (id_dest_one <= id_dest_two); }
|
||||||
|
if (condition) {
|
||||||
|
|
||||||
|
// Stores the transposed value in the destination matrix
|
||||||
|
if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
|
||||||
|
real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
|
||||||
|
if (diagonal_imag_zero == 1 && id_dest_one == id_dest_two) { ImagToZero(value); }
|
||||||
|
dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -149,6 +159,6 @@ __kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
// End of the C++11 raw string literal
|
// End of the C++11 raw string literal
|
||||||
)";
|
)"
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
|
@ -20,13 +20,16 @@ R"(
|
||||||
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
|
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
|
||||||
// this kernel file is used outside of the CLBlast library.
|
// this kernel file is used outside of the CLBlast library.
|
||||||
#ifndef TRA_DIM
|
#ifndef TRA_DIM
|
||||||
#define TRA_DIM 8 // Number of local threads in the two dimensions (x,y)
|
#define TRA_DIM 8 // Number of local threads in the two dimensions (x,y)
|
||||||
#endif
|
#endif
|
||||||
#ifndef TRA_WPT
|
#ifndef TRA_WPT
|
||||||
#define TRA_WPT 1 // Work per thread in one dimension and vector-width in the other
|
#define TRA_WPT 1 // Work per thread in one dimension and vector-width in the other
|
||||||
#endif
|
#endif
|
||||||
#ifndef TRA_PAD
|
#ifndef TRA_PAD
|
||||||
#define TRA_PAD 0 // Padding of the local memory to avoid bank-conflicts
|
#define TRA_PAD 0 // Padding of the local memory to avoid bank-conflicts
|
||||||
|
#endif
|
||||||
|
#ifndef TRA_SHUFFLE
|
||||||
|
#define TRA_SHUFFLE 0 // Shuffling of the global indices to avoid global memory bank-conflicts
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -53,116 +56,94 @@ __kernel void TransposeMatrix(const int ld,
|
||||||
__global const realT* restrict src,
|
__global const realT* restrict src,
|
||||||
__global realT* dest) {
|
__global realT* dest) {
|
||||||
|
|
||||||
// Local memory to store a tile of the matrix (for coalescing)
|
// Sets the group identifiers. They might be 'shuffled' around to distribute work in a different
|
||||||
__local real tile[TRA_WPT*TRA_DIM][TRA_WPT*TRA_DIM + TRA_PAD];
|
// way over workgroups, breaking memory-bank dependencies.
|
||||||
|
const int gid0 = get_group_id(0);
|
||||||
|
#if TRA_SHUFFLE == 1
|
||||||
|
const int gid1 = (get_group_id(0) + get_group_id(1)) % get_num_groups(0);
|
||||||
|
#else
|
||||||
|
const int gid1 = get_group_id(1);
|
||||||
|
#endif
|
||||||
|
|
||||||
// Loop over the work per thread
|
// Local memory to store a tile of the matrix (for coalescing)
|
||||||
|
__local realT tile[TRA_WPT*TRA_DIM][TRA_DIM + TRA_PAD];
|
||||||
|
|
||||||
|
// Loops over the work per thread
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int w_one=0; w_one<TRA_WPT; ++w_one) {
|
for (int w_one=0; w_one<TRA_WPT; ++w_one) {
|
||||||
|
|
||||||
// Computes the identifiers for the source matrix. Note that the local and global dimensions
|
// Computes the identifiers for the source matrix. Note that the local and global dimensions
|
||||||
// do not correspond to each other!
|
// do not correspond to each other!
|
||||||
const int id_one = get_group_id(1) * TRA_DIM + get_local_id(0);
|
const int id_one = gid1 * TRA_DIM + get_local_id(0);
|
||||||
const int id_two = (get_group_id(0) * TRA_DIM + get_local_id(1))*TRA_WPT + w_one;
|
const int id_two = (gid0 * TRA_DIM + get_local_id(1))*TRA_WPT + w_one;
|
||||||
|
|
||||||
// Loads data into the local memory
|
// Loads data into the local memory
|
||||||
realT value = src[id_two*(ld/TRA_WPT) + id_one];
|
realT value = src[id_two*(ld/TRA_WPT) + id_one];
|
||||||
#if TRA_WPT == 1
|
tile[get_local_id(0)*TRA_WPT + w_one][get_local_id(1)] = value;
|
||||||
tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = value;
|
|
||||||
#elif TRA_WPT == 2
|
|
||||||
tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = value.x;
|
|
||||||
tile[get_local_id(1)*TRA_WPT + 1][get_local_id(0)*TRA_WPT + w_one] = value.y;
|
|
||||||
#elif TRA_WPT == 4
|
|
||||||
tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = value.x;
|
|
||||||
tile[get_local_id(1)*TRA_WPT + 1][get_local_id(0)*TRA_WPT + w_one] = value.y;
|
|
||||||
tile[get_local_id(1)*TRA_WPT + 2][get_local_id(0)*TRA_WPT + w_one] = value.z;
|
|
||||||
tile[get_local_id(1)*TRA_WPT + 3][get_local_id(0)*TRA_WPT + w_one] = value.w;
|
|
||||||
#elif TRA_WPT == 8
|
|
||||||
tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = value.s0;
|
|
||||||
tile[get_local_id(1)*TRA_WPT + 1][get_local_id(0)*TRA_WPT + w_one] = value.s1;
|
|
||||||
tile[get_local_id(1)*TRA_WPT + 2][get_local_id(0)*TRA_WPT + w_one] = value.s2;
|
|
||||||
tile[get_local_id(1)*TRA_WPT + 3][get_local_id(0)*TRA_WPT + w_one] = value.s3;
|
|
||||||
tile[get_local_id(1)*TRA_WPT + 4][get_local_id(0)*TRA_WPT + w_one] = value.s4;
|
|
||||||
tile[get_local_id(1)*TRA_WPT + 5][get_local_id(0)*TRA_WPT + w_one] = value.s5;
|
|
||||||
tile[get_local_id(1)*TRA_WPT + 6][get_local_id(0)*TRA_WPT + w_one] = value.s6;
|
|
||||||
tile[get_local_id(1)*TRA_WPT + 7][get_local_id(0)*TRA_WPT + w_one] = value.s7;
|
|
||||||
#elif TRA_WPT == 16
|
|
||||||
tile[get_local_id(1)*TRA_WPT + 0][get_local_id(0)*TRA_WPT + w_one] = value.s0;
|
|
||||||
tile[get_local_id(1)*TRA_WPT + 1][get_local_id(0)*TRA_WPT + w_one] = value.s1;
|
|
||||||
tile[get_local_id(1)*TRA_WPT + 2][get_local_id(0)*TRA_WPT + w_one] = value.s2;
|
|
||||||
tile[get_local_id(1)*TRA_WPT + 3][get_local_id(0)*TRA_WPT + w_one] = value.s3;
|
|
||||||
tile[get_local_id(1)*TRA_WPT + 4][get_local_id(0)*TRA_WPT + w_one] = value.s4;
|
|
||||||
tile[get_local_id(1)*TRA_WPT + 5][get_local_id(0)*TRA_WPT + w_one] = value.s5;
|
|
||||||
tile[get_local_id(1)*TRA_WPT + 6][get_local_id(0)*TRA_WPT + w_one] = value.s6;
|
|
||||||
tile[get_local_id(1)*TRA_WPT + 7][get_local_id(0)*TRA_WPT + w_one] = value.s7;
|
|
||||||
tile[get_local_id(1)*TRA_WPT + 8][get_local_id(0)*TRA_WPT + w_one] = value.s8;
|
|
||||||
tile[get_local_id(1)*TRA_WPT + 9][get_local_id(0)*TRA_WPT + w_one] = value.s9;
|
|
||||||
tile[get_local_id(1)*TRA_WPT + 10][get_local_id(0)*TRA_WPT + w_one] = value.sA;
|
|
||||||
tile[get_local_id(1)*TRA_WPT + 11][get_local_id(0)*TRA_WPT + w_one] = value.sB;
|
|
||||||
tile[get_local_id(1)*TRA_WPT + 12][get_local_id(0)*TRA_WPT + w_one] = value.sC;
|
|
||||||
tile[get_local_id(1)*TRA_WPT + 13][get_local_id(0)*TRA_WPT + w_one] = value.sD;
|
|
||||||
tile[get_local_id(1)*TRA_WPT + 14][get_local_id(0)*TRA_WPT + w_one] = value.sE;
|
|
||||||
tile[get_local_id(1)*TRA_WPT + 15][get_local_id(0)*TRA_WPT + w_one] = value.sF;
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Synchronizes all threads in a workgroup
|
// Synchronizes all threads in a workgroup
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
// Loop over the work per thread
|
// Loads transposed data from the local memory
|
||||||
|
realT v[TRA_WPT];
|
||||||
|
#pragma unroll
|
||||||
|
for (int w_one=0; w_one<TRA_WPT; ++w_one) {
|
||||||
|
v[w_one] = tile[get_local_id(1)*TRA_WPT + w_one][get_local_id(0)];
|
||||||
|
}
|
||||||
|
|
||||||
|
// Performs the register-level transpose of the vectorized data
|
||||||
|
realT results[TRA_WPT];
|
||||||
|
#if TRA_WPT == 1
|
||||||
|
results[0] = v[0];
|
||||||
|
#elif TRA_WPT == 2
|
||||||
|
results[0] = (realT) (v[0].x, v[1].x);
|
||||||
|
results[1] = (realT) (v[0].y, v[1].y);
|
||||||
|
#elif TRA_WPT == 4
|
||||||
|
results[0] = (realT) (v[0].x, v[1].x, v[2].x, v[3].x);
|
||||||
|
results[1] = (realT) (v[0].y, v[1].y, v[2].y, v[3].y);
|
||||||
|
results[2] = (realT) (v[0].z, v[1].z, v[2].z, v[3].z);
|
||||||
|
results[3] = (realT) (v[0].w, v[1].w, v[2].w, v[3].w);
|
||||||
|
#elif TRA_WPT == 8
|
||||||
|
results[0] = (realT) (v[0].s0, v[1].s0, v[2].s0, v[3].s0, v[4].s0, v[5].s0, v[6].s0, v[7].s0);
|
||||||
|
results[1] = (realT) (v[0].s1, v[1].s1, v[2].s1, v[3].s1, v[4].s1, v[5].s1, v[6].s1, v[7].s1);
|
||||||
|
results[2] = (realT) (v[0].s2, v[1].s2, v[2].s2, v[3].s2, v[4].s2, v[5].s2, v[6].s2, v[7].s2);
|
||||||
|
results[3] = (realT) (v[0].s3, v[1].s3, v[2].s3, v[3].s3, v[4].s3, v[5].s3, v[6].s3, v[7].s3);
|
||||||
|
results[4] = (realT) (v[0].s4, v[1].s4, v[2].s4, v[3].s4, v[4].s4, v[5].s4, v[6].s4, v[7].s4);
|
||||||
|
results[5] = (realT) (v[0].s5, v[1].s5, v[2].s5, v[3].s5, v[4].s5, v[5].s5, v[6].s5, v[7].s5);
|
||||||
|
results[6] = (realT) (v[0].s6, v[1].s6, v[2].s6, v[3].s6, v[4].s6, v[5].s6, v[6].s6, v[7].s6);
|
||||||
|
results[7] = (realT) (v[0].s7, v[1].s7, v[2].s7, v[3].s7, v[4].s7, v[5].s7, v[6].s7, v[7].s7);
|
||||||
|
#elif TRA_WPT == 16
|
||||||
|
results[ 0] = (realT) (v[0].s0, v[1].s0, v[2].s0, v[3].s0, v[4].s0, v[5].s0, v[6].s0, v[7].s0, v[8].s0, v[9].s0, v[10].s0, v[11].s0, v[12].s0, v[13].s0, v[14].s0, v[15].s0);
|
||||||
|
results[ 1] = (realT) (v[0].s1, v[1].s1, v[2].s1, v[3].s1, v[4].s1, v[5].s1, v[6].s1, v[7].s1, v[8].s1, v[9].s1, v[10].s1, v[11].s1, v[12].s1, v[13].s1, v[14].s1, v[15].s1);
|
||||||
|
results[ 2] = (realT) (v[0].s2, v[1].s2, v[2].s2, v[3].s2, v[4].s2, v[5].s2, v[6].s2, v[7].s2, v[8].s2, v[9].s2, v[10].s2, v[11].s2, v[12].s2, v[13].s2, v[14].s2, v[15].s2);
|
||||||
|
results[ 3] = (realT) (v[0].s3, v[1].s3, v[2].s3, v[3].s3, v[4].s3, v[5].s3, v[6].s3, v[7].s3, v[8].s3, v[9].s3, v[10].s3, v[11].s3, v[12].s3, v[13].s3, v[14].s3, v[15].s3);
|
||||||
|
results[ 4] = (realT) (v[0].s4, v[1].s4, v[2].s4, v[3].s4, v[4].s4, v[5].s4, v[6].s4, v[7].s4, v[8].s4, v[9].s4, v[10].s4, v[11].s4, v[12].s4, v[13].s4, v[14].s4, v[15].s4);
|
||||||
|
results[ 5] = (realT) (v[0].s5, v[1].s5, v[2].s5, v[3].s5, v[4].s5, v[5].s5, v[6].s5, v[7].s5, v[8].s5, v[9].s5, v[10].s5, v[11].s5, v[12].s5, v[13].s5, v[14].s5, v[15].s5);
|
||||||
|
results[ 6] = (realT) (v[0].s6, v[1].s6, v[2].s6, v[3].s6, v[4].s6, v[5].s6, v[6].s6, v[7].s6, v[8].s6, v[9].s6, v[10].s6, v[11].s6, v[12].s6, v[13].s6, v[14].s6, v[15].s6);
|
||||||
|
results[ 7] = (realT) (v[0].s7, v[1].s7, v[2].s7, v[3].s7, v[4].s7, v[5].s7, v[6].s7, v[7].s7, v[8].s7, v[9].s7, v[10].s7, v[11].s7, v[12].s7, v[13].s7, v[14].s7, v[15].s7);
|
||||||
|
results[ 8] = (realT) (v[0].s8, v[1].s8, v[2].s8, v[3].s8, v[4].s8, v[5].s8, v[6].s8, v[7].s8, v[8].s8, v[9].s8, v[10].s8, v[11].s8, v[12].s8, v[13].s8, v[14].s8, v[15].s8);
|
||||||
|
results[ 9] = (realT) (v[0].s9, v[1].s9, v[2].s9, v[3].s9, v[4].s9, v[5].s9, v[6].s9, v[7].s9, v[8].s9, v[9].s9, v[10].s9, v[11].s9, v[12].s9, v[13].s9, v[14].s9, v[15].s9);
|
||||||
|
results[10] = (realT) (v[0].sA, v[1].sA, v[2].sA, v[3].sA, v[4].sA, v[5].sA, v[6].sA, v[7].sA, v[8].sA, v[9].sA, v[10].sA, v[11].sA, v[12].sA, v[13].sA, v[14].sA, v[15].sA);
|
||||||
|
results[11] = (realT) (v[0].sB, v[1].sB, v[2].sB, v[3].sB, v[4].sB, v[5].sB, v[6].sB, v[7].sB, v[8].sB, v[9].sB, v[10].sB, v[11].sB, v[12].sB, v[13].sB, v[14].sB, v[15].sB);
|
||||||
|
results[12] = (realT) (v[0].sC, v[1].sC, v[2].sC, v[3].sC, v[4].sC, v[5].sC, v[6].sC, v[7].sC, v[8].sC, v[9].sC, v[10].sC, v[11].sC, v[12].sC, v[13].sC, v[14].sC, v[15].sC);
|
||||||
|
results[13] = (realT) (v[0].sD, v[1].sD, v[2].sD, v[3].sD, v[4].sD, v[5].sD, v[6].sD, v[7].sD, v[8].sD, v[9].sD, v[10].sD, v[11].sD, v[12].sD, v[13].sD, v[14].sD, v[15].sD);
|
||||||
|
results[14] = (realT) (v[0].sE, v[1].sE, v[2].sE, v[3].sE, v[4].sE, v[5].sE, v[6].sE, v[7].sE, v[8].sE, v[9].sE, v[10].sE, v[11].sE, v[12].sE, v[13].sE, v[14].sE, v[15].sE);
|
||||||
|
results[15] = (realT) (v[0].sF, v[1].sF, v[2].sF, v[3].sF, v[4].sF, v[5].sF, v[6].sF, v[7].sF, v[8].sF, v[9].sF, v[10].sF, v[11].sF, v[12].sF, v[13].sF, v[14].sF, v[15].sF);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Stores the results into the destination matrix
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int w_two=0; w_two<TRA_WPT; ++w_two) {
|
for (int w_two=0; w_two<TRA_WPT; ++w_two) {
|
||||||
|
const int id_one = gid0*TRA_DIM + get_local_id(0);
|
||||||
// Computes the identifiers for the destination matrix
|
const int id_two = (gid1*TRA_DIM + get_local_id(1))*TRA_WPT + w_two;
|
||||||
const int id_one = get_global_id(0);
|
dest[id_two*(ld/TRA_WPT) + id_one] = results[w_two];
|
||||||
const int id_two = get_global_id(1)*TRA_WPT + w_two;
|
|
||||||
|
|
||||||
// Stores the transposed value in the destination matrix
|
|
||||||
realT value;
|
|
||||||
#if TRA_WPT == 1
|
|
||||||
value = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
|
|
||||||
#elif TRA_WPT == 2
|
|
||||||
value.x = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
|
|
||||||
value.y = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 1];
|
|
||||||
#elif TRA_WPT == 4
|
|
||||||
value.x = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
|
|
||||||
value.y = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 1];
|
|
||||||
value.z = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 2];
|
|
||||||
value.w = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 3];
|
|
||||||
#elif TRA_WPT == 8
|
|
||||||
value.s0 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
|
|
||||||
value.s1 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 1];
|
|
||||||
value.s2 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 2];
|
|
||||||
value.s3 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 3];
|
|
||||||
value.s4 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 4];
|
|
||||||
value.s5 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 5];
|
|
||||||
value.s6 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 6];
|
|
||||||
value.s7 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 7];
|
|
||||||
#elif TRA_WPT == 16
|
|
||||||
value.s0 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 0];
|
|
||||||
value.s1 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 1];
|
|
||||||
value.s2 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 2];
|
|
||||||
value.s3 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 3];
|
|
||||||
value.s4 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 4];
|
|
||||||
value.s5 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 5];
|
|
||||||
value.s6 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 6];
|
|
||||||
value.s7 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 7];
|
|
||||||
value.s8 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 8];
|
|
||||||
value.s9 = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 9];
|
|
||||||
value.sA = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 10];
|
|
||||||
value.sB = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 11];
|
|
||||||
value.sC = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 12];
|
|
||||||
value.sD = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 13];
|
|
||||||
value.sE = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 14];
|
|
||||||
value.sF = tile[get_local_id(0)*TRA_WPT + w_two][get_local_id(1)*TRA_WPT + 15];
|
|
||||||
#endif
|
|
||||||
dest[id_two*(ld/TRA_WPT) + id_one] = value;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
// End of the C++11 raw string literal
|
// End of the C++11 raw string literal
|
||||||
)";
|
)"
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
|
@ -123,6 +123,6 @@ __kernel void XaxpyFast(const int n, const real alpha,
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
// End of the C++11 raw string literal
|
// End of the C++11 raw string literal
|
||||||
)";
|
)"
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
|
@ -127,6 +127,55 @@ R"(
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Initializes the accumulation registers to zero
|
||||||
|
inline void InitAccRegisters(realM cpm[NWI][MWI/VWM]) {
|
||||||
|
#pragma unroll
|
||||||
|
for (int mi=0; mi<MWI/VWM; ++mi) {
|
||||||
|
#pragma unroll
|
||||||
|
for (int ni=0; ni<NWI; ++ni) {
|
||||||
|
#if VWM == 1
|
||||||
|
SetToZero(cpm[ni][mi]);
|
||||||
|
#elif VWM == 2
|
||||||
|
SetToZero(cpm[ni][mi].x);
|
||||||
|
SetToZero(cpm[ni][mi].y);
|
||||||
|
#elif VWM == 4
|
||||||
|
SetToZero(cpm[ni][mi].x);
|
||||||
|
SetToZero(cpm[ni][mi].y);
|
||||||
|
SetToZero(cpm[ni][mi].z);
|
||||||
|
SetToZero(cpm[ni][mi].w);
|
||||||
|
#elif VWM == 8
|
||||||
|
SetToZero(cpm[ni][mi].s0);
|
||||||
|
SetToZero(cpm[ni][mi].s1);
|
||||||
|
SetToZero(cpm[ni][mi].s2);
|
||||||
|
SetToZero(cpm[ni][mi].s3);
|
||||||
|
SetToZero(cpm[ni][mi].s4);
|
||||||
|
SetToZero(cpm[ni][mi].s5);
|
||||||
|
SetToZero(cpm[ni][mi].s6);
|
||||||
|
SetToZero(cpm[ni][mi].s7);
|
||||||
|
#elif VWM == 16
|
||||||
|
SetToZero(cpm[ni][mi].s0);
|
||||||
|
SetToZero(cpm[ni][mi].s1);
|
||||||
|
SetToZero(cpm[ni][mi].s2);
|
||||||
|
SetToZero(cpm[ni][mi].s3);
|
||||||
|
SetToZero(cpm[ni][mi].s4);
|
||||||
|
SetToZero(cpm[ni][mi].s5);
|
||||||
|
SetToZero(cpm[ni][mi].s6);
|
||||||
|
SetToZero(cpm[ni][mi].s7);
|
||||||
|
SetToZero(cpm[ni][mi].s8);
|
||||||
|
SetToZero(cpm[ni][mi].s9);
|
||||||
|
SetToZero(cpm[ni][mi].sA);
|
||||||
|
SetToZero(cpm[ni][mi].sB);
|
||||||
|
SetToZero(cpm[ni][mi].sC);
|
||||||
|
SetToZero(cpm[ni][mi].sD);
|
||||||
|
SetToZero(cpm[ni][mi].sE);
|
||||||
|
SetToZero(cpm[ni][mi].sF);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
|
// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
|
||||||
// caching the A input matrix.
|
// caching the A input matrix.
|
||||||
#if SA == 1
|
#if SA == 1
|
||||||
|
@ -272,71 +321,6 @@ inline void LocalToPrivateB(__local realN* blm, realN bpm[NWI/VWN], const int kg
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication
|
|
||||||
// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
|
|
||||||
inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int kSizeM,
|
|
||||||
const real alpha, const real beta) {
|
|
||||||
#pragma unroll
|
|
||||||
for (int ni=0; ni<NWI; ++ni) {
|
|
||||||
#pragma unroll
|
|
||||||
for (int mi=0; mi<MWI/VWM; ++mi) {
|
|
||||||
#if STRM == 0
|
|
||||||
int mg = mi + get_local_id(0)*(MWI/VWM);
|
|
||||||
#elif STRM == 1
|
|
||||||
int mg = get_local_id(0) + mi*MDIMC;
|
|
||||||
#endif
|
|
||||||
#if STRN == 0
|
|
||||||
int ng = ni + get_local_id(1)*NWI;
|
|
||||||
#elif STRN == 1
|
|
||||||
int ng = ni%VWN + get_local_id(1)*VWN + (ni/VWN)*VWN*NDIMC;
|
|
||||||
#endif
|
|
||||||
int idm = mg + get_group_id(0)*(MWG/VWM);
|
|
||||||
int idn = ng + get_group_id(1)*NWG;
|
|
||||||
int index = idn*(kSizeM/VWM) + idm;
|
|
||||||
realM cval = cgm[index];
|
|
||||||
#if VWM == 1
|
|
||||||
AXPBY(cgm[index], alpha, cpm[ni][mi], beta, cval);
|
|
||||||
#elif VWM == 2
|
|
||||||
AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);
|
|
||||||
AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
|
|
||||||
#elif VWM == 4
|
|
||||||
AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);
|
|
||||||
AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
|
|
||||||
AXPBY(cgm[index].z, alpha, cpm[ni][mi].z, beta, cval.z);
|
|
||||||
AXPBY(cgm[index].w, alpha, cpm[ni][mi].w, beta, cval.w);
|
|
||||||
#elif VWM == 8
|
|
||||||
AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
|
|
||||||
AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
|
|
||||||
AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
|
|
||||||
AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
|
|
||||||
AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
|
|
||||||
AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
|
|
||||||
AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
|
|
||||||
AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
|
|
||||||
#elif VWM == 16
|
|
||||||
AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
|
|
||||||
AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
|
|
||||||
AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
|
|
||||||
AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
|
|
||||||
AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
|
|
||||||
AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
|
|
||||||
AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
|
|
||||||
AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
|
|
||||||
AXPBY(cgm[index].s8, alpha, cpm[ni][mi].s8, beta, cval.s8);
|
|
||||||
AXPBY(cgm[index].s9, alpha, cpm[ni][mi].s9, beta, cval.s9);
|
|
||||||
AXPBY(cgm[index].sA, alpha, cpm[ni][mi].sA, beta, cval.sA);
|
|
||||||
AXPBY(cgm[index].sB, alpha, cpm[ni][mi].sB, beta, cval.sB);
|
|
||||||
AXPBY(cgm[index].sC, alpha, cpm[ni][mi].sC, beta, cval.sC);
|
|
||||||
AXPBY(cgm[index].sD, alpha, cpm[ni][mi].sD, beta, cval.sD);
|
|
||||||
AXPBY(cgm[index].sE, alpha, cpm[ni][mi].sE, beta, cval.sE);
|
|
||||||
AXPBY(cgm[index].sF, alpha, cpm[ni][mi].sF, beta, cval.sF);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
// The vectorised multiply-add function
|
// The vectorised multiply-add function
|
||||||
inline realM MultiplyAddVector(realM cvec, const realM avec, const real bval) {
|
inline realM MultiplyAddVector(realM cvec, const realM avec, const real bval) {
|
||||||
#if USE_VECTOR_MAD == 1
|
#if USE_VECTOR_MAD == 1
|
||||||
|
@ -432,77 +416,97 @@ inline void MultiplyAccumulate(realM cpm[NWI][MWI/VWM], realM apm[MWI/VWM], real
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
// Main entry of the kernel. This function contains the basic skeleton, the functionality is
|
// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication
|
||||||
// provided by the inlined functions above
|
// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
|
||||||
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
|
inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int kSizeM,
|
||||||
__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
|
const real alpha, const real beta) {
|
||||||
const real alpha, const real beta,
|
#pragma unroll
|
||||||
const __global realM* restrict agm,
|
for (int ni=0; ni<NWI; ++ni) {
|
||||||
const __global realN* restrict bgm,
|
#pragma unroll
|
||||||
__global realM* cgm) {
|
for (int mi=0; mi<MWI/VWM; ++mi) {
|
||||||
|
#if STRM == 0
|
||||||
|
int mg = mi + get_local_id(0)*(MWI/VWM);
|
||||||
|
#elif STRM == 1
|
||||||
|
int mg = get_local_id(0) + mi*MDIMC;
|
||||||
|
#endif
|
||||||
|
#if STRN == 0
|
||||||
|
int ng = ni + get_local_id(1)*NWI;
|
||||||
|
#elif STRN == 1
|
||||||
|
int ng = ni%VWN + get_local_id(1)*VWN + (ni/VWN)*VWN*NDIMC;
|
||||||
|
#endif
|
||||||
|
int idm = mg + get_group_id(0)*(MWG/VWM);
|
||||||
|
int idn = ng + get_group_id(1)*NWG;
|
||||||
|
|
||||||
// Combined thread identifier
|
// The final multiplication with alpha and the addition with beta*C
|
||||||
|
int index = idn*(kSizeM/VWM) + idm;
|
||||||
|
realM cval = cgm[index];
|
||||||
|
#if VWM == 1
|
||||||
|
AXPBY(cgm[index], alpha, cpm[ni][mi], beta, cval);
|
||||||
|
#elif VWM == 2
|
||||||
|
AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);
|
||||||
|
AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
|
||||||
|
#elif VWM == 4
|
||||||
|
AXPBY(cgm[index].x, alpha, cpm[ni][mi].x, beta, cval.x);
|
||||||
|
AXPBY(cgm[index].y, alpha, cpm[ni][mi].y, beta, cval.y);
|
||||||
|
AXPBY(cgm[index].z, alpha, cpm[ni][mi].z, beta, cval.z);
|
||||||
|
AXPBY(cgm[index].w, alpha, cpm[ni][mi].w, beta, cval.w);
|
||||||
|
#elif VWM == 8
|
||||||
|
AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
|
||||||
|
AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
|
||||||
|
AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
|
||||||
|
AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
|
||||||
|
AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
|
||||||
|
AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
|
||||||
|
AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
|
||||||
|
AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
|
||||||
|
#elif VWM == 16
|
||||||
|
AXPBY(cgm[index].s0, alpha, cpm[ni][mi].s0, beta, cval.s0);
|
||||||
|
AXPBY(cgm[index].s1, alpha, cpm[ni][mi].s1, beta, cval.s1);
|
||||||
|
AXPBY(cgm[index].s2, alpha, cpm[ni][mi].s2, beta, cval.s2);
|
||||||
|
AXPBY(cgm[index].s3, alpha, cpm[ni][mi].s3, beta, cval.s3);
|
||||||
|
AXPBY(cgm[index].s4, alpha, cpm[ni][mi].s4, beta, cval.s4);
|
||||||
|
AXPBY(cgm[index].s5, alpha, cpm[ni][mi].s5, beta, cval.s5);
|
||||||
|
AXPBY(cgm[index].s6, alpha, cpm[ni][mi].s6, beta, cval.s6);
|
||||||
|
AXPBY(cgm[index].s7, alpha, cpm[ni][mi].s7, beta, cval.s7);
|
||||||
|
AXPBY(cgm[index].s8, alpha, cpm[ni][mi].s8, beta, cval.s8);
|
||||||
|
AXPBY(cgm[index].s9, alpha, cpm[ni][mi].s9, beta, cval.s9);
|
||||||
|
AXPBY(cgm[index].sA, alpha, cpm[ni][mi].sA, beta, cval.sA);
|
||||||
|
AXPBY(cgm[index].sB, alpha, cpm[ni][mi].sB, beta, cval.sB);
|
||||||
|
AXPBY(cgm[index].sC, alpha, cpm[ni][mi].sC, beta, cval.sC);
|
||||||
|
AXPBY(cgm[index].sD, alpha, cpm[ni][mi].sD, beta, cval.sD);
|
||||||
|
AXPBY(cgm[index].sE, alpha, cpm[ni][mi].sE, beta, cval.sE);
|
||||||
|
AXPBY(cgm[index].sF, alpha, cpm[ni][mi].sF, beta, cval.sF);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Main body of the matrix-multiplication algorithm. It calls the (inlined) functions above.
|
||||||
|
inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||||
|
const __global realM* restrict agm, const __global realN* restrict bgm,
|
||||||
|
__global realM* cgm, realM cpm[NWI][MWI/VWM]
|
||||||
|
#if SA == 1 && SB == 1
|
||||||
|
, __local realM* alm, __local realN* blm
|
||||||
|
#elif SA == 1
|
||||||
|
, __local realM* alm
|
||||||
|
#elif SB == 1
|
||||||
|
, __local realN* blm
|
||||||
|
#endif
|
||||||
|
) {
|
||||||
|
|
||||||
|
// Allocates workitem-private memory (registers)
|
||||||
|
realM apm[MWI/VWM];
|
||||||
|
realN bpm[NWI/VWN];
|
||||||
|
|
||||||
|
// Combined thread identifier (volatile to disable caching)
|
||||||
#if SA == 1 || SB == 1
|
#if SA == 1 || SB == 1
|
||||||
volatile int tid = get_local_id(0) + MDIMC*get_local_id(1);
|
volatile int tid = get_local_id(0) + MDIMC*get_local_id(1);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Allocates workgroup-private memory (local memory)
|
|
||||||
#if SA == 1
|
|
||||||
__local realM alm[KWG * MWG/VWM];
|
|
||||||
#endif
|
|
||||||
#if SB == 1
|
|
||||||
__local realN blm[KWG * NWG/VWN];
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Allocates workitem-private memory (registers)
|
|
||||||
realM apm[MWI/VWM];
|
|
||||||
realN bpm[NWI/VWN];
|
|
||||||
realM cpm[NWI][MWI/VWM];
|
|
||||||
|
|
||||||
// Initializes the accumulation registers
|
// Initializes the accumulation registers
|
||||||
#pragma unroll
|
InitAccRegisters(cpm);
|
||||||
for (int mi=0; mi<MWI/VWM; ++mi) {
|
|
||||||
#pragma unroll
|
|
||||||
for (int ni=0; ni<NWI; ++ni) {
|
|
||||||
#if VWM == 1
|
|
||||||
SetToZero(cpm[ni][mi]);
|
|
||||||
#elif VWM == 2
|
|
||||||
SetToZero(cpm[ni][mi].x);
|
|
||||||
SetToZero(cpm[ni][mi].y);
|
|
||||||
#elif VWM == 4
|
|
||||||
SetToZero(cpm[ni][mi].x);
|
|
||||||
SetToZero(cpm[ni][mi].y);
|
|
||||||
SetToZero(cpm[ni][mi].z);
|
|
||||||
SetToZero(cpm[ni][mi].w);
|
|
||||||
#elif VWM == 8
|
|
||||||
SetToZero(cpm[ni][mi].s0);
|
|
||||||
SetToZero(cpm[ni][mi].s1);
|
|
||||||
SetToZero(cpm[ni][mi].s2);
|
|
||||||
SetToZero(cpm[ni][mi].s3);
|
|
||||||
SetToZero(cpm[ni][mi].s4);
|
|
||||||
SetToZero(cpm[ni][mi].s5);
|
|
||||||
SetToZero(cpm[ni][mi].s6);
|
|
||||||
SetToZero(cpm[ni][mi].s7);
|
|
||||||
#elif VWM == 16
|
|
||||||
SetToZero(cpm[ni][mi].s0);
|
|
||||||
SetToZero(cpm[ni][mi].s1);
|
|
||||||
SetToZero(cpm[ni][mi].s2);
|
|
||||||
SetToZero(cpm[ni][mi].s3);
|
|
||||||
SetToZero(cpm[ni][mi].s4);
|
|
||||||
SetToZero(cpm[ni][mi].s5);
|
|
||||||
SetToZero(cpm[ni][mi].s6);
|
|
||||||
SetToZero(cpm[ni][mi].s7);
|
|
||||||
SetToZero(cpm[ni][mi].s8);
|
|
||||||
SetToZero(cpm[ni][mi].s9);
|
|
||||||
SetToZero(cpm[ni][mi].sA);
|
|
||||||
SetToZero(cpm[ni][mi].sB);
|
|
||||||
SetToZero(cpm[ni][mi].sC);
|
|
||||||
SetToZero(cpm[ni][mi].sD);
|
|
||||||
SetToZero(cpm[ni][mi].sE);
|
|
||||||
SetToZero(cpm[ni][mi].sF);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Loops over all workgroup tiles
|
// Loops over all workgroup tiles
|
||||||
for (int kwg=0; kwg<kSizeK; kwg+=KWG) {
|
for (int kwg=0; kwg<kSizeK; kwg+=KWG) {
|
||||||
|
@ -515,8 +519,6 @@ __kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||||
#if SB == 1
|
#if SB == 1
|
||||||
GlobalToLocalB(bgm, blm, kSizeN, tid, kwg);
|
GlobalToLocalB(bgm, blm, kSizeN, tid, kwg);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Synchronizes all threads in a workgroup
|
|
||||||
#if SA == 1 || SB == 1
|
#if SA == 1 || SB == 1
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
#endif
|
#endif
|
||||||
|
@ -552,20 +554,130 @@ __kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||||
MultiplyAccumulate(cpm, apm, bpm);
|
MultiplyAccumulate(cpm, apm, bpm);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Synchronizes all threads in a workgroup
|
|
||||||
#if SA == 1 || SB == 1
|
#if SA == 1 || SB == 1
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
// Stores an MWG * NWG tile of results and perform the multiplication with alpha and beta
|
|
||||||
StoreResults(cgm, cpm, kSizeM, alpha, beta);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
// The upper-triangular and lower-triangular kernels are only used in special cases
|
||||||
|
#if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K)
|
||||||
|
|
||||||
// End of the C++11 raw string literal
|
// Main entry point of the kernel. This is the upper-triangular version.
|
||||||
)";
|
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
|
||||||
|
__kernel void XgemmUpper(const int kSizeN, const int kSizeK,
|
||||||
|
const real alpha, const real beta,
|
||||||
|
const __global realM* restrict agm,
|
||||||
|
const __global realN* restrict bgm,
|
||||||
|
__global realM* cgm) {
|
||||||
|
|
||||||
|
// Skip this workgroup if none of its threads contribute to the upper-triangle
|
||||||
|
if (get_group_id(1)*NWG < get_group_id(0)*MWG) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Allocates workgroup-private memory (local memory)
|
||||||
|
#if SA == 1
|
||||||
|
__local realM alm[KWG * MWG/VWM];
|
||||||
|
#endif
|
||||||
|
#if SB == 1
|
||||||
|
__local realN blm[KWG * NWG/VWN];
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Computes the matrix-multiplication and stores the result in register memory
|
||||||
|
realM cpm[NWI][MWI/VWM];
|
||||||
|
#if SA == 1 && SB == 1
|
||||||
|
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
|
||||||
|
#elif SA == 1
|
||||||
|
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
|
||||||
|
#elif SB == 1
|
||||||
|
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
|
||||||
|
#else
|
||||||
|
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
|
||||||
|
StoreResults(cgm, cpm, kSizeN, alpha, beta);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Main entry point of the kernel. This is the lower-triangular version.
|
||||||
|
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
|
||||||
|
__kernel void XgemmLower(const int kSizeN, const int kSizeK,
|
||||||
|
const real alpha, const real beta,
|
||||||
|
const __global realM* restrict agm,
|
||||||
|
const __global realN* restrict bgm,
|
||||||
|
__global realM* cgm) {
|
||||||
|
|
||||||
|
// Skip this workgroup if none of its threads contribute to the lower-triangle
|
||||||
|
if (get_group_id(1)*NWG > get_group_id(0)*MWG) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Allocates workgroup-private memory (local memory)
|
||||||
|
#if SA == 1
|
||||||
|
__local realM alm[KWG * MWG/VWM];
|
||||||
|
#endif
|
||||||
|
#if SB == 1
|
||||||
|
__local realN blm[KWG * NWG/VWN];
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Computes the matrix-multiplication and stores the result in register memory
|
||||||
|
realM cpm[NWI][MWI/VWM];
|
||||||
|
#if SA == 1 && SB == 1
|
||||||
|
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
|
||||||
|
#elif SA == 1
|
||||||
|
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
|
||||||
|
#elif SB == 1
|
||||||
|
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
|
||||||
|
#else
|
||||||
|
XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
|
||||||
|
StoreResults(cgm, cpm, kSizeN, alpha, beta);
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// If not using a triangular version, include the regular kernel
|
||||||
|
#else
|
||||||
|
|
||||||
|
// Main entry point of the kernel. This is the regular full version.
|
||||||
|
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
|
||||||
|
__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||||
|
const real alpha, const real beta,
|
||||||
|
const __global realM* restrict agm,
|
||||||
|
const __global realN* restrict bgm,
|
||||||
|
__global realM* cgm) {
|
||||||
|
|
||||||
|
// Allocates workgroup-private memory (local memory)
|
||||||
|
#if SA == 1
|
||||||
|
__local realM alm[KWG * MWG/VWM];
|
||||||
|
#endif
|
||||||
|
#if SB == 1
|
||||||
|
__local realN blm[KWG * NWG/VWN];
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Computes the matrix-multiplication and stores the result in register memory
|
||||||
|
realM cpm[NWI][MWI/VWM];
|
||||||
|
#if SA == 1 && SB == 1
|
||||||
|
XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
|
||||||
|
#elif SA == 1
|
||||||
|
XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
|
||||||
|
#elif SB == 1
|
||||||
|
XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
|
||||||
|
#else
|
||||||
|
XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
|
||||||
|
StoreResults(cgm, cpm, kSizeM, alpha, beta);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// End of the C++11 raw string literal
|
||||||
|
)"
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
|
@ -368,6 +368,6 @@ __kernel void XgemvFastRot(const int m, const int n, const real alpha, const rea
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
// End of the C++11 raw string literal
|
// End of the C++11 raw string literal
|
||||||
)";
|
)"
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
|
@ -22,9 +22,10 @@ namespace clblast {
|
||||||
std::vector<Routine::ProgramCache> Routine::program_cache_;
|
std::vector<Routine::ProgramCache> Routine::program_cache_;
|
||||||
|
|
||||||
// Constructor: not much here, because no status codes can be returned
|
// Constructor: not much here, because no status codes can be returned
|
||||||
Routine::Routine(CommandQueue &queue, Event &event,
|
Routine::Routine(CommandQueue &queue, Event &event, const std::string &name,
|
||||||
const std::vector<std::string> &routines, const Precision precision):
|
const std::vector<std::string> &routines, const Precision precision):
|
||||||
precision_(precision),
|
precision_(precision),
|
||||||
|
routine_name_(name),
|
||||||
queue_(queue),
|
queue_(queue),
|
||||||
event_(event),
|
event_(event),
|
||||||
context_(queue_.GetContext()),
|
context_(queue_.GetContext()),
|
||||||
|
@ -33,14 +34,13 @@ Routine::Routine(CommandQueue &queue, Event &event,
|
||||||
max_work_item_dimensions_(device_.MaxWorkItemDimensions()),
|
max_work_item_dimensions_(device_.MaxWorkItemDimensions()),
|
||||||
max_work_item_sizes_(device_.MaxWorkItemSizes()),
|
max_work_item_sizes_(device_.MaxWorkItemSizes()),
|
||||||
max_work_group_size_(device_.MaxWorkGroupSize()),
|
max_work_group_size_(device_.MaxWorkGroupSize()),
|
||||||
db_(queue_, routines, precision_),
|
db_(queue_, routines, precision_) {
|
||||||
routines_(routines) {
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
// Separate set-up function to allow for status codes to be returned
|
// Separate set-up function to allow for status codes to be returned
|
||||||
StatusCode Routine::SetUp(const std::string &routine_source) {
|
StatusCode Routine::SetUp() {
|
||||||
|
|
||||||
// Queries the cache to see whether or not the compiled kernel is already there. If not, it will
|
// Queries the cache to see whether or not the compiled kernel is already there. If not, it will
|
||||||
// be built and added to the cache.
|
// be built and added to the cache.
|
||||||
|
@ -63,12 +63,24 @@ StatusCode Routine::SetUp(const std::string &routine_source) {
|
||||||
|
|
||||||
// Loads the common header (typedefs and defines and such)
|
// Loads the common header (typedefs and defines and such)
|
||||||
std::string common_header =
|
std::string common_header =
|
||||||
#include "kernels/common.opencl"
|
#include "kernels/common.opencl"
|
||||||
|
;
|
||||||
|
|
||||||
// Collects the parameters for this device in the form of defines, and adds the precision
|
// Collects the parameters for this device in the form of defines, and adds the precision
|
||||||
auto defines = db_.GetDefines();
|
auto defines = db_.GetDefines();
|
||||||
defines += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";
|
defines += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";
|
||||||
auto source_string = defines + common_header + routine_source;
|
|
||||||
|
// Adds the name of the routine as a define
|
||||||
|
defines += "#define ROUTINE_"+routine_name_+"\n";
|
||||||
|
|
||||||
|
// For specific devices, use the non-IEEE754 compliant OpenCL mad() instruction. This can improve
|
||||||
|
// performance, but might result in a reduced accuracy.
|
||||||
|
if (device_.Vendor() == "AMD") {
|
||||||
|
defines += "#define USE_CL_MAD 1\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
// Combines everything together into a single source string
|
||||||
|
auto source_string = defines + common_header + source_string_;
|
||||||
|
|
||||||
// Compiles the kernel
|
// Compiles the kernel
|
||||||
try {
|
try {
|
||||||
|
@ -85,7 +97,7 @@ StatusCode Routine::SetUp(const std::string &routine_source) {
|
||||||
if (status == CL_INVALID_BINARY) { return StatusCode::kInvalidBinary; }
|
if (status == CL_INVALID_BINARY) { return StatusCode::kInvalidBinary; }
|
||||||
|
|
||||||
// Store the compiled program in the cache
|
// Store the compiled program in the cache
|
||||||
program_cache_.push_back({program, device_name_, precision_, routines_});
|
program_cache_.push_back({program, device_name_, precision_, routine_name_});
|
||||||
} catch (...) { return StatusCode::kBuildProgramFailure; }
|
} catch (...) { return StatusCode::kBuildProgramFailure; }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -202,19 +214,22 @@ StatusCode Routine::TestVectorY(const size_t n, const Buffer &buffer, const size
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
// Copies a matrix and pads it with zeros
|
// Copies or transposes a matrix and pads/unpads it with zeros
|
||||||
StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t src_two,
|
StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t src_two,
|
||||||
const size_t src_ld, const size_t src_offset,
|
const size_t src_ld, const size_t src_offset,
|
||||||
const Buffer &src,
|
const Buffer &src,
|
||||||
const size_t dest_one, const size_t dest_two,
|
const size_t dest_one, const size_t dest_two,
|
||||||
const size_t dest_ld, const size_t dest_offset,
|
const size_t dest_ld, const size_t dest_offset,
|
||||||
const Buffer &dest,
|
const Buffer &dest,
|
||||||
|
const Program &program, const bool do_pad,
|
||||||
const bool do_transpose, const bool do_conjugate,
|
const bool do_transpose, const bool do_conjugate,
|
||||||
const bool pad, const Program &program) {
|
const bool upper, const bool lower,
|
||||||
|
const bool diagonal_imag_zero) {
|
||||||
|
|
||||||
// Determines whether or not the fast-version could potentially be used
|
// Determines whether or not the fast-version could potentially be used
|
||||||
auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) &&
|
auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) &&
|
||||||
(src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld);
|
(src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld) &&
|
||||||
|
(upper == false) && (lower == false) && (diagonal_imag_zero == false);
|
||||||
|
|
||||||
// Determines the right kernel
|
// Determines the right kernel
|
||||||
auto kernel_name = std::string{};
|
auto kernel_name = std::string{};
|
||||||
|
@ -227,7 +242,7 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
use_fast_kernel = false;
|
use_fast_kernel = false;
|
||||||
kernel_name = (pad) ? "PadTransposeMatrix" : "UnPadTransposeMatrix";
|
kernel_name = (do_pad) ? "PadTransposeMatrix" : "UnPadTransposeMatrix";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
|
@ -239,7 +254,7 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
use_fast_kernel = false;
|
use_fast_kernel = false;
|
||||||
kernel_name = (pad) ? "PadMatrix" : "UnPadMatrix";
|
kernel_name = (do_pad) ? "PadMatrix" : "UnPadMatrix";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -264,9 +279,14 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
|
||||||
kernel.SetArgument(7, static_cast<int>(dest_ld));
|
kernel.SetArgument(7, static_cast<int>(dest_ld));
|
||||||
kernel.SetArgument(8, static_cast<int>(dest_offset));
|
kernel.SetArgument(8, static_cast<int>(dest_offset));
|
||||||
kernel.SetArgument(9, dest());
|
kernel.SetArgument(9, dest());
|
||||||
if (pad) {
|
if (do_pad) {
|
||||||
kernel.SetArgument(10, static_cast<int>(do_conjugate));
|
kernel.SetArgument(10, static_cast<int>(do_conjugate));
|
||||||
}
|
}
|
||||||
|
else {
|
||||||
|
kernel.SetArgument(10, static_cast<int>(upper));
|
||||||
|
kernel.SetArgument(11, static_cast<int>(lower));
|
||||||
|
kernel.SetArgument(12, static_cast<int>(diagonal_imag_zero));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Launches the kernel and returns the error code. Uses global and local thread sizes based on
|
// Launches the kernel and returns the error code. Uses global and local thread sizes based on
|
||||||
|
@ -310,7 +330,7 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
|
||||||
// otherwise.
|
// otherwise.
|
||||||
const Program& Routine::GetProgramFromCache() const {
|
const Program& Routine::GetProgramFromCache() const {
|
||||||
for (auto &cached_program: program_cache_) {
|
for (auto &cached_program: program_cache_) {
|
||||||
if (cached_program.MatchInCache(device_name_, precision_, routines_)) {
|
if (cached_program.MatchInCache(device_name_, precision_, routine_name_)) {
|
||||||
return cached_program.program;
|
return cached_program.program;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -320,7 +340,7 @@ const Program& Routine::GetProgramFromCache() const {
|
||||||
// Queries the cache to see whether or not the compiled kernel is already there
|
// Queries the cache to see whether or not the compiled kernel is already there
|
||||||
bool Routine::ProgramIsInCache() const {
|
bool Routine::ProgramIsInCache() const {
|
||||||
for (auto &cached_program: program_cache_) {
|
for (auto &cached_program: program_cache_) {
|
||||||
if (cached_program.MatchInCache(device_name_, precision_, routines_)) { return true; }
|
if (cached_program.MatchInCache(device_name_, precision_, routine_name_)) { return true; }
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
|
@ -11,7 +11,7 @@
|
||||||
//
|
//
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
#include "internal/routines/xaxpy.h"
|
#include "internal/routines/level1/xaxpy.h"
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
@ -30,7 +30,10 @@ template <> const Precision Xaxpy<double2>::precision_ = Precision::kComplexDoub
|
||||||
// Constructor: forwards to base class constructor
|
// Constructor: forwards to base class constructor
|
||||||
template <typename T>
|
template <typename T>
|
||||||
Xaxpy<T>::Xaxpy(CommandQueue &queue, Event &event):
|
Xaxpy<T>::Xaxpy(CommandQueue &queue, Event &event):
|
||||||
Routine(queue, event, {"Xaxpy"}, precision_) {
|
Routine(queue, event, "AXPY", {"Xaxpy"}, precision_) {
|
||||||
|
source_string_ =
|
||||||
|
#include "../../kernels/xaxpy.opencl"
|
||||||
|
;
|
||||||
}
|
}
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
|
@ -11,7 +11,7 @@
|
||||||
//
|
//
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
#include "internal/routines/xgemv.h"
|
#include "internal/routines/level2/xgemv.h"
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
@ -30,7 +30,10 @@ template <> const Precision Xgemv<double2>::precision_ = Precision::kComplexDoub
|
||||||
// Constructor: forwards to base class constructor
|
// Constructor: forwards to base class constructor
|
||||||
template <typename T>
|
template <typename T>
|
||||||
Xgemv<T>::Xgemv(CommandQueue &queue, Event &event):
|
Xgemv<T>::Xgemv(CommandQueue &queue, Event &event):
|
||||||
Routine(queue, event, {"Xgemv"}, precision_) {
|
Routine(queue, event, "GEMV", {"Xgemv"}, precision_) {
|
||||||
|
source_string_ =
|
||||||
|
#include "../../kernels/xgemv.opencl"
|
||||||
|
;
|
||||||
}
|
}
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
|
@ -11,7 +11,7 @@
|
||||||
//
|
//
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
#include "internal/routines/xgemm.h"
|
#include "internal/routines/level3/xgemm.h"
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
@ -30,7 +30,14 @@ template <> const Precision Xgemm<double2>::precision_ = Precision::kComplexDoub
|
||||||
// Constructor: forwards to base class constructor
|
// Constructor: forwards to base class constructor
|
||||||
template <typename T>
|
template <typename T>
|
||||||
Xgemm<T>::Xgemm(CommandQueue &queue, Event &event):
|
Xgemm<T>::Xgemm(CommandQueue &queue, Event &event):
|
||||||
Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) {
|
Routine(queue, event, "GEMM", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
|
||||||
|
source_string_ =
|
||||||
|
#include "../../kernels/copy.opencl"
|
||||||
|
#include "../../kernels/pad.opencl"
|
||||||
|
#include "../../kernels/transpose.opencl"
|
||||||
|
#include "../../kernels/padtranspose.opencl"
|
||||||
|
#include "../../kernels/xgemm.opencl"
|
||||||
|
;
|
||||||
}
|
}
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -95,31 +102,48 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
|
||||||
auto n_ceiled = Ceil(n, db_["NWG"]);
|
auto n_ceiled = Ceil(n, db_["NWG"]);
|
||||||
auto k_ceiled = Ceil(k, db_["KWG"]);
|
auto k_ceiled = Ceil(k, db_["KWG"]);
|
||||||
|
|
||||||
// Allocates space on the device for padded and/or transposed input and output matrices.
|
// The padded/transposed input/output matrices: if memory allocation fails, throw an exception
|
||||||
try {
|
try {
|
||||||
auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*m_ceiled*sizeof(T));
|
|
||||||
auto temp_b = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
|
|
||||||
auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, m_ceiled*n_ceiled*sizeof(T));
|
|
||||||
|
|
||||||
// Loads the program from the database
|
// Loads the program from the database
|
||||||
auto& program = GetProgramFromCache();
|
auto& program = GetProgramFromCache();
|
||||||
|
|
||||||
// Runs the pre-processing kernels. This transposes the matrices, but also pads zeros to fill
|
// Determines whether or not temporary matrices are needed
|
||||||
// them up until they reach a certain multiple of size (kernel parameter dependent).
|
auto a_no_temp = a_one == m_ceiled && a_two == k_ceiled && a_ld == m_ceiled && a_offset == 0 &&
|
||||||
status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
|
a_do_transpose == false && a_conjugate == false;
|
||||||
m_ceiled, k_ceiled, m_ceiled, 0, temp_a,
|
auto b_no_temp = b_one == n_ceiled && b_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
|
||||||
a_do_transpose, a_conjugate, true, program);
|
b_do_transpose == false && b_conjugate == false;
|
||||||
if (ErrorIn(status)) { return status; }
|
auto c_no_temp = c_one == m_ceiled && c_two == n_ceiled && c_ld == m_ceiled && c_offset == 0 &&
|
||||||
status = PadCopyTransposeMatrix(b_one, b_two, b_ld, b_offset, b_buffer,
|
c_do_transpose == false;
|
||||||
n_ceiled, k_ceiled, n_ceiled, 0, temp_b,
|
|
||||||
b_do_transpose, b_conjugate, true, program);
|
|
||||||
if (ErrorIn(status)) { return status; }
|
|
||||||
|
|
||||||
// Only necessary for matrix C if it used both as input and output
|
// Creates the temporary matrices
|
||||||
if (beta != static_cast<T>(0)) {
|
auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*m_ceiled*sizeof(T));
|
||||||
|
auto b_temp = (b_no_temp) ? b_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
|
||||||
|
auto c_temp = (c_no_temp) ? c_buffer : Buffer(context_, CL_MEM_READ_WRITE, m_ceiled*n_ceiled*sizeof(T));
|
||||||
|
|
||||||
|
// Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
|
||||||
|
// to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
|
||||||
|
// case nothing has to be done, these kernels can be skipped.
|
||||||
|
if (!a_no_temp) {
|
||||||
|
status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
|
||||||
|
m_ceiled, k_ceiled, m_ceiled, 0, a_temp,
|
||||||
|
program, true, a_do_transpose, a_conjugate);
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
}
|
||||||
|
|
||||||
|
// As above, but now for matrix B
|
||||||
|
if (!b_no_temp) {
|
||||||
|
status = PadCopyTransposeMatrix(b_one, b_two, b_ld, b_offset, b_buffer,
|
||||||
|
n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
|
||||||
|
program, true, b_do_transpose, b_conjugate);
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
}
|
||||||
|
|
||||||
|
// As above, but now for matrix C. This is only necessary if C is used both as input and output.
|
||||||
|
if (!c_no_temp && beta != static_cast<T>(0)) {
|
||||||
status = PadCopyTransposeMatrix(c_one, c_two, c_ld, c_offset, c_buffer,
|
status = PadCopyTransposeMatrix(c_one, c_two, c_ld, c_offset, c_buffer,
|
||||||
m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
|
m_ceiled, n_ceiled, m_ceiled, 0, c_temp,
|
||||||
c_do_transpose, false, true, program);
|
program, true, c_do_transpose, false);
|
||||||
if (ErrorIn(status)) { return status; }
|
if (ErrorIn(status)) { return status; }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -133,9 +157,9 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
|
||||||
kernel.SetArgument(2, static_cast<int>(k_ceiled));
|
kernel.SetArgument(2, static_cast<int>(k_ceiled));
|
||||||
kernel.SetArgument(3, alpha);
|
kernel.SetArgument(3, alpha);
|
||||||
kernel.SetArgument(4, beta);
|
kernel.SetArgument(4, beta);
|
||||||
kernel.SetArgument(5, temp_a());
|
kernel.SetArgument(5, a_temp());
|
||||||
kernel.SetArgument(6, temp_b());
|
kernel.SetArgument(6, b_temp());
|
||||||
kernel.SetArgument(7, temp_c());
|
kernel.SetArgument(7, c_temp());
|
||||||
|
|
||||||
// Computes the global and local thread sizes
|
// Computes the global and local thread sizes
|
||||||
auto global = std::vector<size_t>{
|
auto global = std::vector<size_t>{
|
||||||
|
@ -148,11 +172,13 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
|
||||||
status = RunKernel(kernel, global, local);
|
status = RunKernel(kernel, global, local);
|
||||||
if (ErrorIn(status)) { return status; }
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
// Runs the post-processing kernel
|
// Runs the post-processing kernel if needed
|
||||||
status = PadCopyTransposeMatrix(m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
|
if (!c_no_temp) {
|
||||||
c_one, c_two, c_ld, c_offset, c_buffer,
|
status = PadCopyTransposeMatrix(m_ceiled, n_ceiled, m_ceiled, 0, c_temp,
|
||||||
c_do_transpose, false, false, program);
|
c_one, c_two, c_ld, c_offset, c_buffer,
|
||||||
if (ErrorIn(status)) { return status; }
|
program, false, c_do_transpose, false);
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
}
|
||||||
|
|
||||||
// Successfully finished the computation
|
// Successfully finished the computation
|
||||||
return StatusCode::kSuccess;
|
return StatusCode::kSuccess;
|
130
src/routines/level3/xhemm.cc
Normal file
130
src/routines/level3/xhemm.cc
Normal file
|
@ -0,0 +1,130 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xhemm class (see the header for information about the class).
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#include "internal/routines/level3/xhemm.h"
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Constructor: forwards to base class constructor
|
||||||
|
template <typename T>
|
||||||
|
Xhemm<T>::Xhemm(CommandQueue &queue, Event &event):
|
||||||
|
Xgemm<T>(queue, event) {
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// The main routine
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle triangle,
|
||||||
|
const size_t m, const size_t n,
|
||||||
|
const T alpha,
|
||||||
|
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
|
const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
|
||||||
|
const T beta,
|
||||||
|
const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
|
||||||
|
|
||||||
|
// Makes sure all dimensions are larger than zero
|
||||||
|
if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; }
|
||||||
|
|
||||||
|
// Computes the k dimension. This is based on whether or not the hermitian matrix is A (on the
|
||||||
|
// left) or B (on the right) in the Xgemm routine.
|
||||||
|
auto k = (side == Side::kLeft) ? m : n;
|
||||||
|
|
||||||
|
// Checks for validity of the square A matrix
|
||||||
|
auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T));
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
|
// Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
|
||||||
|
// default) and on whether we are dealing with an upper or lower triangle of the hermitian matrix
|
||||||
|
bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
|
||||||
|
(triangle == Triangle::kLower && layout == Layout::kRowMajor));
|
||||||
|
auto kernel_name = (is_upper) ? "HermUpperToSquared" : "HermLowerToSquared";
|
||||||
|
|
||||||
|
// Temporary buffer for a copy of the hermitian matrix
|
||||||
|
try {
|
||||||
|
auto temp_herm = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T));
|
||||||
|
|
||||||
|
// Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm
|
||||||
|
// routine afterwards
|
||||||
|
try {
|
||||||
|
auto& program = GetProgramFromCache();
|
||||||
|
auto kernel = Kernel(program, kernel_name);
|
||||||
|
|
||||||
|
// Sets the arguments for the hermitian-to-squared kernel
|
||||||
|
kernel.SetArgument(0, static_cast<int>(k));
|
||||||
|
kernel.SetArgument(1, static_cast<int>(a_ld));
|
||||||
|
kernel.SetArgument(2, static_cast<int>(a_offset));
|
||||||
|
kernel.SetArgument(3, a_buffer());
|
||||||
|
kernel.SetArgument(4, static_cast<int>(k));
|
||||||
|
kernel.SetArgument(5, static_cast<int>(k));
|
||||||
|
kernel.SetArgument(6, static_cast<int>(0));
|
||||||
|
kernel.SetArgument(7, temp_herm());
|
||||||
|
|
||||||
|
// Uses the common padding kernel's thread configuration. This is allowed, since the
|
||||||
|
// hermitian-to-squared kernel uses the same parameters.
|
||||||
|
auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
|
||||||
|
Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
|
||||||
|
auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
|
||||||
|
status = RunKernel(kernel, global, local);
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
|
// Runs the regular Xgemm code with either "C := AB+C" or ...
|
||||||
|
if (side == Side::kLeft) {
|
||||||
|
status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
|
||||||
|
m, n, k,
|
||||||
|
alpha,
|
||||||
|
temp_herm, 0, k,
|
||||||
|
b_buffer, b_offset, b_ld,
|
||||||
|
beta,
|
||||||
|
c_buffer, c_offset, c_ld);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ... with "C := BA+C". Note that A and B are now reversed.
|
||||||
|
else {
|
||||||
|
status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
|
||||||
|
m, n, k,
|
||||||
|
alpha,
|
||||||
|
b_buffer, b_offset, b_ld,
|
||||||
|
temp_herm, 0, k,
|
||||||
|
beta,
|
||||||
|
c_buffer, c_offset, c_ld);
|
||||||
|
|
||||||
|
// A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
|
||||||
|
switch(status) {
|
||||||
|
case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break;
|
||||||
|
case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break;
|
||||||
|
case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break;
|
||||||
|
case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break;
|
||||||
|
case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break;
|
||||||
|
case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Return the status of the Xgemm routine
|
||||||
|
return status;
|
||||||
|
} catch (...) { return StatusCode::kInvalidKernel; }
|
||||||
|
} catch (...) { return StatusCode::kTempBufferAllocFailure; }
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Compiles the templated class
|
||||||
|
template class Xhemm<float2>;
|
||||||
|
template class Xhemm<double2>;
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
207
src/routines/level3/xher2k.cc
Normal file
207
src/routines/level3/xher2k.cc
Normal file
|
@ -0,0 +1,207 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xher2k class (see the header for information about the class).
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#include "internal/routines/level3/xher2k.h"
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Specific implementations to get the memory-type based on a template argument
|
||||||
|
template <> const Precision Xher2k<float2,float>::precision_ = Precision::kComplexSingle;
|
||||||
|
template <> const Precision Xher2k<double2,double>::precision_ = Precision::kComplexDouble;
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Constructor: forwards to base class constructor
|
||||||
|
template <typename T, typename U>
|
||||||
|
Xher2k<T,U>::Xher2k(CommandQueue &queue, Event &event):
|
||||||
|
Routine(queue, event, "HER2K", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
|
||||||
|
source_string_ =
|
||||||
|
#include "../../kernels/copy.opencl"
|
||||||
|
#include "../../kernels/pad.opencl"
|
||||||
|
#include "../../kernels/transpose.opencl"
|
||||||
|
#include "../../kernels/padtranspose.opencl"
|
||||||
|
#include "../../kernels/xgemm.opencl"
|
||||||
|
;
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// The main routine
|
||||||
|
template <typename T, typename U>
|
||||||
|
StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
|
||||||
|
const size_t n, const size_t k,
|
||||||
|
const T alpha,
|
||||||
|
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
|
const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
|
||||||
|
const U beta,
|
||||||
|
const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
|
||||||
|
|
||||||
|
// Makes sure all dimensions are larger than zero
|
||||||
|
if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
|
||||||
|
|
||||||
|
// Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or
|
||||||
|
// to matrix A (argument: conjugate transpose)
|
||||||
|
auto ab_conjugate = (ab_transpose != Transpose::kNo);
|
||||||
|
|
||||||
|
// Computes whether or not the matrices are transposed in memory. This is based on their layout
|
||||||
|
// (row or column-major) and whether or not they are requested to be pre-transposed.
|
||||||
|
auto ab_rotated = (layout == Layout::kColMajor && ab_conjugate) ||
|
||||||
|
(layout == Layout::kRowMajor && !ab_conjugate);
|
||||||
|
auto c_rotated = (layout == Layout::kRowMajor);
|
||||||
|
|
||||||
|
// Computes the first and second dimensions of the A and B matrices taking the layout into account
|
||||||
|
auto ab_one = (ab_rotated) ? k : n;
|
||||||
|
auto ab_two = (ab_rotated) ? n : k;
|
||||||
|
|
||||||
|
// Tests the matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and
|
||||||
|
// their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
|
||||||
|
// OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
|
||||||
|
// space. Also tests that the leading dimensions of:
|
||||||
|
// matrix A cannot be less than K when rotated, or less than N when not-rotated
|
||||||
|
// matrix B cannot be less than K when rotated, or less than N when not-rotated
|
||||||
|
// matrix C cannot be less than N
|
||||||
|
auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld, sizeof(T));
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld, sizeof(T));
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
|
// Calculates the ceiled versions of n and k
|
||||||
|
auto n_ceiled = Ceil(n, db_["NWG"]);
|
||||||
|
auto k_ceiled = Ceil(k, db_["KWG"]);
|
||||||
|
|
||||||
|
// Decides which kernel to run: the upper-triangular or lower-triangular version
|
||||||
|
auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
|
||||||
|
|
||||||
|
// The padded/transposed input/output matrices: if memory allocation fails, throw an exception
|
||||||
|
try {
|
||||||
|
|
||||||
|
// Loads the program from the database
|
||||||
|
auto& program = GetProgramFromCache();
|
||||||
|
|
||||||
|
// Determines whether or not temporary matrices are needed
|
||||||
|
auto a1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
|
||||||
|
ab_rotated == false && ab_conjugate == false;
|
||||||
|
auto a2_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
|
||||||
|
ab_rotated == false && ab_conjugate == true;
|
||||||
|
auto b1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
|
||||||
|
ab_rotated == false && ab_conjugate == false;
|
||||||
|
auto b2_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
|
||||||
|
ab_rotated == false && ab_conjugate == true;
|
||||||
|
|
||||||
|
// Creates the temporary matrices
|
||||||
|
auto a1_temp = (a1_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
|
||||||
|
auto a2_temp = (a2_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
|
||||||
|
auto b1_temp = (b1_no_temp) ? b_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
|
||||||
|
auto b2_temp = (b2_no_temp) ? b_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
|
||||||
|
auto c_temp = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
|
||||||
|
|
||||||
|
// Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to
|
||||||
|
// fill them up until they reach a certain multiple of size (kernel parameter dependent). In
|
||||||
|
// case nothing has to be done, these kernels can be skipped.
|
||||||
|
if (!a1_no_temp) {
|
||||||
|
status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer,
|
||||||
|
n_ceiled, k_ceiled, n_ceiled, 0, a1_temp,
|
||||||
|
program, true, ab_rotated, ab_conjugate);
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
}
|
||||||
|
if (!a2_no_temp) {
|
||||||
|
status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer,
|
||||||
|
n_ceiled, k_ceiled, n_ceiled, 0, a2_temp,
|
||||||
|
program, true, ab_rotated, !ab_conjugate);
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
}
|
||||||
|
if (!b1_no_temp) {
|
||||||
|
status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer,
|
||||||
|
n_ceiled, k_ceiled, n_ceiled, 0, b1_temp,
|
||||||
|
program, true, ab_rotated, ab_conjugate);
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
}
|
||||||
|
if (!b2_no_temp) {
|
||||||
|
status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer,
|
||||||
|
n_ceiled, k_ceiled, n_ceiled, 0, b2_temp,
|
||||||
|
program, true, ab_rotated, !ab_conjugate);
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
}
|
||||||
|
|
||||||
|
// Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
|
||||||
|
// modify the other triangle.
|
||||||
|
status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
|
||||||
|
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
|
||||||
|
program, true, c_rotated, false);
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
|
// Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
|
||||||
|
try {
|
||||||
|
auto kernel = Kernel(program, kernel_name);
|
||||||
|
|
||||||
|
// Sets the kernel arguments
|
||||||
|
auto complex_beta = T{beta, static_cast<U>(0.0)};
|
||||||
|
kernel.SetArgument(0, static_cast<int>(n_ceiled));
|
||||||
|
kernel.SetArgument(1, static_cast<int>(k_ceiled));
|
||||||
|
kernel.SetArgument(2, alpha);
|
||||||
|
kernel.SetArgument(3, complex_beta);
|
||||||
|
kernel.SetArgument(4, a1_temp());
|
||||||
|
kernel.SetArgument(5, b2_temp());
|
||||||
|
kernel.SetArgument(6, c_temp());
|
||||||
|
|
||||||
|
// Computes the global and local thread sizes
|
||||||
|
auto global = std::vector<size_t>{
|
||||||
|
(n_ceiled * db_["MDIMC"]) / db_["MWG"],
|
||||||
|
(n_ceiled * db_["NDIMC"]) / db_["NWG"]
|
||||||
|
};
|
||||||
|
auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
|
||||||
|
|
||||||
|
// Launches the kernel
|
||||||
|
status = RunKernel(kernel, global, local);
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
|
// Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugates alpha
|
||||||
|
auto conjugate_alpha = T{alpha.real(), -alpha.imag()};
|
||||||
|
auto complex_one = T{static_cast<U>(1.0), static_cast<U>(0.0)};
|
||||||
|
kernel.SetArgument(2, conjugate_alpha);
|
||||||
|
kernel.SetArgument(3, complex_one);
|
||||||
|
kernel.SetArgument(4, b1_temp());
|
||||||
|
kernel.SetArgument(5, a2_temp());
|
||||||
|
|
||||||
|
// Runs the kernel again
|
||||||
|
status = RunKernel(kernel, global, local);
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
|
// Runs the post-processing kernel
|
||||||
|
auto upper = (triangle == Triangle::kUpper);
|
||||||
|
auto lower = (triangle == Triangle::kLower);
|
||||||
|
status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
|
||||||
|
n, n, c_ld, c_offset, c_buffer,
|
||||||
|
program, false, c_rotated, false, upper, lower, true);
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
|
// Successfully finished the computation
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
} catch (...) { return StatusCode::kInvalidKernel; }
|
||||||
|
} catch (...) { return StatusCode::kTempBufferAllocFailure; }
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Compiles the templated class
|
||||||
|
template class Xher2k<float2,float>;
|
||||||
|
template class Xher2k<double2,double>;
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
175
src/routines/level3/xherk.cc
Normal file
175
src/routines/level3/xherk.cc
Normal file
|
@ -0,0 +1,175 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xherk class (see the header for information about the class).
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#include "internal/routines/level3/xherk.h"
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Specific implementations to get the memory-type based on a template argument
|
||||||
|
template <> const Precision Xherk<float2,float>::precision_ = Precision::kComplexSingle;
|
||||||
|
template <> const Precision Xherk<double2,double>::precision_ = Precision::kComplexDouble;
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Constructor: forwards to base class constructor
|
||||||
|
template <typename T, typename U>
|
||||||
|
Xherk<T,U>::Xherk(CommandQueue &queue, Event &event):
|
||||||
|
Routine(queue, event, "HERK", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
|
||||||
|
source_string_ =
|
||||||
|
#include "../../kernels/copy.opencl"
|
||||||
|
#include "../../kernels/pad.opencl"
|
||||||
|
#include "../../kernels/transpose.opencl"
|
||||||
|
#include "../../kernels/padtranspose.opencl"
|
||||||
|
#include "../../kernels/xgemm.opencl"
|
||||||
|
;
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// The main routine
|
||||||
|
template <typename T, typename U>
|
||||||
|
StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
|
||||||
|
const size_t n, const size_t k,
|
||||||
|
const U alpha,
|
||||||
|
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
|
const U beta,
|
||||||
|
const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
|
||||||
|
|
||||||
|
// Makes sure all dimensions are larger than zero
|
||||||
|
if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
|
||||||
|
|
||||||
|
// Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or
|
||||||
|
// to matrix A (argument: conjugate transpose)
|
||||||
|
auto a_conjugate = (a_transpose != Transpose::kNo);
|
||||||
|
auto b_conjugate = (a_transpose == Transpose::kNo);
|
||||||
|
|
||||||
|
// Computes whether or not the matrices are transposed in memory. This is based on their layout
|
||||||
|
// (row or column-major) and whether or not they are requested to be pre-transposed.
|
||||||
|
auto a_rotated = (layout == Layout::kColMajor && a_conjugate) ||
|
||||||
|
(layout == Layout::kRowMajor && !a_conjugate);
|
||||||
|
auto c_rotated = (layout == Layout::kRowMajor);
|
||||||
|
|
||||||
|
// Computes the first and second dimensions of the A matrix taking the layout into account
|
||||||
|
auto a_one = (a_rotated) ? k : n;
|
||||||
|
auto a_two = (a_rotated) ? n : k;
|
||||||
|
|
||||||
|
// Tests the two matrices (A, C) for validity, first from a perspective of the OpenCL buffers and
|
||||||
|
// their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
|
||||||
|
// OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
|
||||||
|
// space. Also tests that the leading dimensions of:
|
||||||
|
// matrix A cannot be less than N when rotated, or less than K when not-rotated
|
||||||
|
// matrix C cannot be less than N
|
||||||
|
auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
|
// Calculates the ceiled versions of n and k
|
||||||
|
auto n_ceiled = Ceil(n, db_["NWG"]);
|
||||||
|
auto k_ceiled = Ceil(k, db_["KWG"]);
|
||||||
|
|
||||||
|
// Decides which kernel to run: the upper-triangular or lower-triangular version
|
||||||
|
auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
|
||||||
|
|
||||||
|
// The padded/transposed input/output matrices: exceptions raised here (e.g. a failed memory
// allocation) are caught by the outer handler, which returns kTempBufferAllocFailure
|
||||||
|
try {
|
||||||
|
|
||||||
|
// Loads the program from the cache
|
||||||
|
auto& program = GetProgramFromCache();
|
||||||
|
|
||||||
|
// Determines whether or not temporary matrices are needed
|
||||||
|
auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
|
||||||
|
a_rotated == false && a_conjugate == false;
|
||||||
|
auto b_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
|
||||||
|
a_rotated == false && b_conjugate == false;
|
||||||
|
|
||||||
|
// Creates the temporary matrices
|
||||||
|
auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
|
||||||
|
auto b_temp = (b_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
|
||||||
|
auto c_temp = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
|
||||||
|
|
||||||
|
// Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
|
||||||
|
// to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
|
||||||
|
// case nothing has to be done, these kernels can be skipped. Two copies are created.
|
||||||
|
if (!a_no_temp) {
|
||||||
|
status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
|
||||||
|
n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
|
||||||
|
program, true, a_rotated, a_conjugate);
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
}
|
||||||
|
if (!b_no_temp) {
|
||||||
|
status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
|
||||||
|
n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
|
||||||
|
program, true, a_rotated, b_conjugate);
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
}
|
||||||
|
|
||||||
|
// Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
|
||||||
|
// modify the other triangle.
|
||||||
|
status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
|
||||||
|
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
|
||||||
|
program, true, c_rotated, false);
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
|
// Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
|
||||||
|
try {
|
||||||
|
auto kernel = Kernel(program, kernel_name);
|
||||||
|
|
||||||
|
// Sets the kernel arguments
|
||||||
|
auto complex_alpha = T{alpha, static_cast<U>(0.0)};
|
||||||
|
auto complex_beta = T{beta, static_cast<U>(0.0)};
|
||||||
|
kernel.SetArgument(0, static_cast<int>(n_ceiled));
|
||||||
|
kernel.SetArgument(1, static_cast<int>(k_ceiled));
|
||||||
|
kernel.SetArgument(2, complex_alpha);
|
||||||
|
kernel.SetArgument(3, complex_beta);
|
||||||
|
kernel.SetArgument(4, a_temp());
|
||||||
|
kernel.SetArgument(5, b_temp());
|
||||||
|
kernel.SetArgument(6, c_temp());
|
||||||
|
|
||||||
|
// Computes the global and local thread sizes
|
||||||
|
auto global = std::vector<size_t>{
|
||||||
|
(n_ceiled * db_["MDIMC"]) / db_["MWG"],
|
||||||
|
(n_ceiled * db_["NDIMC"]) / db_["NWG"]
|
||||||
|
};
|
||||||
|
auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
|
||||||
|
|
||||||
|
// Launches the kernel
|
||||||
|
status = RunKernel(kernel, global, local);
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
|
// Runs the post-processing kernel
|
||||||
|
auto upper = (triangle == Triangle::kUpper);
|
||||||
|
auto lower = (triangle == Triangle::kLower);
|
||||||
|
status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
|
||||||
|
n, n, c_ld, c_offset, c_buffer,
|
||||||
|
program, false, c_rotated, false, upper, lower, true);
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
|
// Successfully finished the computation
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
} catch (...) { return StatusCode::kInvalidKernel; }
|
||||||
|
} catch (...) { return StatusCode::kTempBufferAllocFailure; }
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Compiles the templated class
|
||||||
|
template class Xherk<float2,float>;
|
||||||
|
template class Xherk<double2,double>;
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
|
@ -11,7 +11,7 @@
|
||||||
//
|
//
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
#include "internal/routines/xsymm.h"
|
#include "internal/routines/level3/xsymm.h"
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
@ -42,14 +42,14 @@ StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle
|
||||||
|
|
||||||
// Computes the k dimension. This is based on whether or not the symmetric matrix is A (on the
|
// Computes the k dimension. This is based on whether or not the symmetric matrix is A (on the
|
||||||
// left) or B (on the right) in the Xgemm routine.
|
// left) or B (on the right) in the Xgemm routine.
|
||||||
size_t k = (side == Side::kLeft) ? m : n;
|
auto k = (side == Side::kLeft) ? m : n;
|
||||||
|
|
||||||
// Checks for validity of the squared A matrix
|
// Checks for validity of the squared A matrix
|
||||||
auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T));
|
auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T));
|
||||||
if (ErrorIn(status)) { return status; }
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
// Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
|
// Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
|
||||||
// default) and on whether we are dealing with an upper or lower triangle of the symmetrix matrix
|
// default) and on whether we are dealing with an upper or lower triangle of the symmetric matrix
|
||||||
bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
|
bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
|
||||||
(triangle == Triangle::kLower && layout == Layout::kRowMajor));
|
(triangle == Triangle::kLower && layout == Layout::kRowMajor));
|
||||||
auto kernel_name = (is_upper) ? "SymmUpperToSquared" : "SymmLowerToSquared";
|
auto kernel_name = (is_upper) ? "SymmUpperToSquared" : "SymmLowerToSquared";
|
||||||
|
@ -75,7 +75,7 @@ StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle
|
||||||
kernel.SetArgument(7, temp_symm());
|
kernel.SetArgument(7, temp_symm());
|
||||||
|
|
||||||
// Uses the common padding kernel's thread configuration. This is allowed, since the
|
// Uses the common padding kernel's thread configuration. This is allowed, since the
|
||||||
// symmetry-to-squared kernel uses the same parameters.
|
// symmetric-to-squared kernel uses the same parameters.
|
||||||
auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
|
auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
|
||||||
Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
|
Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
|
||||||
auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
|
auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
|
186
src/routines/level3/xsyr2k.cc
Normal file
186
src/routines/level3/xsyr2k.cc
Normal file
|
@ -0,0 +1,186 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xsyr2k class (see the header for information about the class).
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#include "internal/routines/level3/xsyr2k.h"
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Specific implementations to get the memory-type based on a template argument
|
||||||
|
template <> const Precision Xsyr2k<float>::precision_ = Precision::kSingle;
|
||||||
|
template <> const Precision Xsyr2k<double>::precision_ = Precision::kDouble;
|
||||||
|
template <> const Precision Xsyr2k<float2>::precision_ = Precision::kComplexSingle;
|
||||||
|
template <> const Precision Xsyr2k<double2>::precision_ = Precision::kComplexDouble;
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Constructor: forwards to base class constructor
|
||||||
|
template <typename T>
|
||||||
|
Xsyr2k<T>::Xsyr2k(CommandQueue &queue, Event &event):
|
||||||
|
Routine(queue, event, "SYR2K", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
|
||||||
|
// Forwards the queue, the event, the routine name "SYR2K", a list of five names and the
// precision to the Routine base class, then assembles source_string_ from the five kernel
// files included below.
// NOTE(review): each .opencl file presumably expands to a string literal so that the adjacent
// literals concatenate into one source string - confirm against the kernel files.
source_string_ =
|
||||||
|
#include "../../kernels/copy.opencl"
|
||||||
|
#include "../../kernels/pad.opencl"
|
||||||
|
#include "../../kernels/transpose.opencl"
|
||||||
|
#include "../../kernels/padtranspose.opencl"
|
||||||
|
#include "../../kernels/xgemm.opencl"
|
||||||
|
;
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// The main routine
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
|
||||||
|
const size_t n, const size_t k,
|
||||||
|
const T alpha,
|
||||||
|
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
|
const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
|
||||||
|
const T beta,
|
||||||
|
const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
|
||||||
|
|
||||||
|
// Makes sure all dimensions are larger than zero
|
||||||
|
if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
|
||||||
|
|
||||||
|
// Computes whether or not the matrices are transposed in memory. This is based on their layout
|
||||||
|
// (row or column-major) and whether or not they are requested to be pre-transposed.
|
||||||
|
auto ab_rotated = (layout == Layout::kColMajor && ab_transpose != Transpose::kNo) ||
|
||||||
|
(layout == Layout::kRowMajor && ab_transpose == Transpose::kNo);
|
||||||
|
auto c_rotated = (layout == Layout::kRowMajor);
|
||||||
|
|
||||||
|
// Computes the first and second dimensions of the A and B matrices taking the layout into account
|
||||||
|
auto ab_one = (ab_rotated) ? k : n;
|
||||||
|
auto ab_two = (ab_rotated) ? n : k;
|
||||||
|
|
||||||
|
// Tests the matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and
|
||||||
|
// their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
|
||||||
|
// OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
|
||||||
|
// space. Also tests that the leading dimensions of:
|
||||||
|
// matrix A cannot be less than N when rotated, or less than K when not-rotated
|
||||||
|
// matrix B cannot be less than N when rotated, or less than K when not-rotated
|
||||||
|
// matrix C cannot be less than N
|
||||||
|
auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld, sizeof(T));
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld, sizeof(T));
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
|
// Calculates the ceiled versions of n and k
|
||||||
|
auto n_ceiled = Ceil(n, db_["NWG"]);
|
||||||
|
auto k_ceiled = Ceil(k, db_["KWG"]);
|
||||||
|
|
||||||
|
// Decides which kernel to run: the upper-triangular or lower-triangular version
|
||||||
|
auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
|
||||||
|
|
||||||
|
// The padded/transposed input/output matrices: exceptions raised here (e.g. a failed memory
// allocation) are caught by the outer handler, which returns kTempBufferAllocFailure
|
||||||
|
try {
|
||||||
|
|
||||||
|
// Loads the program from the cache
|
||||||
|
auto& program = GetProgramFromCache();
|
||||||
|
|
||||||
|
// Determines whether or not temporary matrices are needed
|
||||||
|
auto a_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
|
||||||
|
ab_rotated == false;
|
||||||
|
auto b_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
|
||||||
|
ab_rotated == false;
|
||||||
|
|
||||||
|
// Creates the temporary matrices
|
||||||
|
auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
|
||||||
|
auto b_temp = (b_no_temp) ? b_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
|
||||||
|
auto c_temp = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
|
||||||
|
|
||||||
|
// Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to
|
||||||
|
// fill them up until they reach a certain multiple of size (kernel parameter dependent). In
|
||||||
|
// case nothing has to be done, these kernels can be skipped.
|
||||||
|
if (!a_no_temp) {
|
||||||
|
status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer,
|
||||||
|
n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
|
||||||
|
program, true, ab_rotated, false);
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
}
|
||||||
|
if (!b_no_temp) {
|
||||||
|
status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer,
|
||||||
|
n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
|
||||||
|
program, true, ab_rotated, false);
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
}
|
||||||
|
|
||||||
|
// Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
|
||||||
|
// modify the other triangle.
|
||||||
|
status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
|
||||||
|
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
|
||||||
|
program, true, c_rotated, false);
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
|
// Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
|
||||||
|
try {
|
||||||
|
auto kernel = Kernel(program, kernel_name);
|
||||||
|
|
||||||
|
// Sets the kernel arguments
|
||||||
|
kernel.SetArgument(0, static_cast<int>(n_ceiled));
|
||||||
|
kernel.SetArgument(1, static_cast<int>(k_ceiled));
|
||||||
|
kernel.SetArgument(2, alpha);
|
||||||
|
kernel.SetArgument(3, beta);
|
||||||
|
kernel.SetArgument(4, a_temp());
|
||||||
|
kernel.SetArgument(5, b_temp());
|
||||||
|
kernel.SetArgument(6, c_temp());
|
||||||
|
|
||||||
|
// Computes the global and local thread sizes
|
||||||
|
auto global = std::vector<size_t>{
|
||||||
|
(n_ceiled * db_["MDIMC"]) / db_["MWG"],
|
||||||
|
(n_ceiled * db_["NDIMC"]) / db_["NWG"]
|
||||||
|
};
|
||||||
|
auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
|
||||||
|
|
||||||
|
// Launches the kernel
|
||||||
|
status = RunKernel(kernel, global, local);
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
|
// Swaps the arguments for matrices A and B, and sets 'beta' to 1
|
||||||
|
auto one = static_cast<T>(1);
|
||||||
|
kernel.SetArgument(3, one);
|
||||||
|
kernel.SetArgument(4, b_temp());
|
||||||
|
kernel.SetArgument(5, a_temp());
|
||||||
|
|
||||||
|
// Runs the kernel again
|
||||||
|
status = RunKernel(kernel, global, local);
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
|
// Runs the post-processing kernel
|
||||||
|
auto upper = (triangle == Triangle::kUpper);
|
||||||
|
auto lower = (triangle == Triangle::kLower);
|
||||||
|
status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
|
||||||
|
n, n, c_ld, c_offset, c_buffer,
|
||||||
|
program, false, c_rotated, false, upper, lower, false);
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
|
// Successfully finished the computation
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
} catch (...) { return StatusCode::kInvalidKernel; }
|
||||||
|
} catch (...) { return StatusCode::kTempBufferAllocFailure; }
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Compiles the templated class
|
||||||
|
template class Xsyr2k<float>;
|
||||||
|
template class Xsyr2k<double>;
|
||||||
|
template class Xsyr2k<float2>;
|
||||||
|
template class Xsyr2k<double2>;
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
163
src/routines/level3/xsyrk.cc
Normal file
163
src/routines/level3/xsyrk.cc
Normal file
|
@ -0,0 +1,163 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xsyrk class (see the header for information about the class).
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#include "internal/routines/level3/xsyrk.h"
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Specific implementations to get the memory-type based on a template argument
|
||||||
|
template <> const Precision Xsyrk<float>::precision_ = Precision::kSingle;
|
||||||
|
template <> const Precision Xsyrk<double>::precision_ = Precision::kDouble;
|
||||||
|
template <> const Precision Xsyrk<float2>::precision_ = Precision::kComplexSingle;
|
||||||
|
template <> const Precision Xsyrk<double2>::precision_ = Precision::kComplexDouble;
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Constructor: forwards to base class constructor
|
||||||
|
template <typename T>
|
||||||
|
Xsyrk<T>::Xsyrk(CommandQueue &queue, Event &event):
|
||||||
|
Routine(queue, event, "SYRK", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
|
||||||
|
// Forwards the queue, the event, the routine name "SYRK", a list of five names and the
// precision to the Routine base class, then assembles source_string_ from the five kernel
// files included below.
// NOTE(review): each .opencl file presumably expands to a string literal so that the adjacent
// literals concatenate into one source string - confirm against the kernel files.
source_string_ =
|
||||||
|
#include "../../kernels/copy.opencl"
|
||||||
|
#include "../../kernels/pad.opencl"
|
||||||
|
#include "../../kernels/transpose.opencl"
|
||||||
|
#include "../../kernels/padtranspose.opencl"
|
||||||
|
#include "../../kernels/xgemm.opencl"
|
||||||
|
;
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// The main routine
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
|
||||||
|
const size_t n, const size_t k,
|
||||||
|
const T alpha,
|
||||||
|
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
|
const T beta,
|
||||||
|
const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
|
||||||
|
|
||||||
|
// Makes sure all dimensions are larger than zero
|
||||||
|
if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
|
||||||
|
|
||||||
|
// Computes whether or not the matrices are transposed in memory. This is based on their layout
|
||||||
|
// (row or column-major) and whether or not they are requested to be pre-transposed.
|
||||||
|
auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) ||
|
||||||
|
(layout == Layout::kRowMajor && a_transpose == Transpose::kNo);
|
||||||
|
auto c_rotated = (layout == Layout::kRowMajor);
|
||||||
|
|
||||||
|
// Computes the first and second dimensions of the A matrix taking the layout into account
|
||||||
|
auto a_one = (a_rotated) ? k : n;
|
||||||
|
auto a_two = (a_rotated) ? n : k;
|
||||||
|
|
||||||
|
// Tests the two matrices (A, C) for validity, first from a perspective of the OpenCL buffers and
|
||||||
|
// their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
|
||||||
|
// OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
|
||||||
|
// space. Also tests that the leading dimensions of:
|
||||||
|
// matrix A cannot be less than N when rotated, or less than K when not-rotated
|
||||||
|
// matrix C cannot be less than N
|
||||||
|
auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
|
// Calculates the ceiled versions of n and k
|
||||||
|
auto n_ceiled = Ceil(n, db_["NWG"]);
|
||||||
|
auto k_ceiled = Ceil(k, db_["KWG"]);
|
||||||
|
|
||||||
|
// Decides which kernel to run: the upper-triangular or lower-triangular version
|
||||||
|
auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
|
||||||
|
|
||||||
|
// The padded/transposed input/output matrices: exceptions raised here (e.g. a failed memory
// allocation) are caught by the outer handler, which returns kTempBufferAllocFailure
|
||||||
|
try {
|
||||||
|
|
||||||
|
// Loads the program from the cache
|
||||||
|
auto& program = GetProgramFromCache();
|
||||||
|
|
||||||
|
// Determines whether or not temporary matrices are needed
|
||||||
|
auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
|
||||||
|
a_rotated == false;
|
||||||
|
|
||||||
|
// Creates the temporary matrices
|
||||||
|
auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
|
||||||
|
auto c_temp = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
|
||||||
|
|
||||||
|
// Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
|
||||||
|
// to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
|
||||||
|
// case nothing has to be done, these kernels can be skipped.
|
||||||
|
if (!a_no_temp) {
|
||||||
|
status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
|
||||||
|
n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
|
||||||
|
program, true, a_rotated, false);
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
}
|
||||||
|
|
||||||
|
// Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
|
||||||
|
// modify the other triangle.
|
||||||
|
status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
|
||||||
|
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
|
||||||
|
program, true, c_rotated, false);
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
|
// Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
|
||||||
|
try {
|
||||||
|
auto kernel = Kernel(program, kernel_name);
|
||||||
|
|
||||||
|
// Sets the kernel arguments
|
||||||
|
kernel.SetArgument(0, static_cast<int>(n_ceiled));
|
||||||
|
kernel.SetArgument(1, static_cast<int>(k_ceiled));
|
||||||
|
kernel.SetArgument(2, alpha);
|
||||||
|
kernel.SetArgument(3, beta);
|
||||||
|
kernel.SetArgument(4, a_temp());
|
||||||
|
kernel.SetArgument(5, a_temp());
|
||||||
|
kernel.SetArgument(6, c_temp());
|
||||||
|
|
||||||
|
// Computes the global and local thread sizes
|
||||||
|
auto global = std::vector<size_t>{
|
||||||
|
(n_ceiled * db_["MDIMC"]) / db_["MWG"],
|
||||||
|
(n_ceiled * db_["NDIMC"]) / db_["NWG"]
|
||||||
|
};
|
||||||
|
auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
|
||||||
|
|
||||||
|
// Launches the kernel
|
||||||
|
status = RunKernel(kernel, global, local);
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
|
// Runs the post-processing kernel
|
||||||
|
auto upper = (triangle == Triangle::kUpper);
|
||||||
|
auto lower = (triangle == Triangle::kLower);
|
||||||
|
status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
|
||||||
|
n, n, c_ld, c_offset, c_buffer,
|
||||||
|
program, false, c_rotated, false, upper, lower, false);
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
|
// Successfully finished the computation
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
} catch (...) { return StatusCode::kInvalidKernel; }
|
||||||
|
} catch (...) { return StatusCode::kTempBufferAllocFailure; }
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Compiles the templated class
|
||||||
|
template class Xsyrk<float>;
|
||||||
|
template class Xsyrk<double>;
|
||||||
|
template class Xsyrk<float2>;
|
||||||
|
template class Xsyrk<double2>;
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
135
src/routines/level3/xtrmm.cc
Normal file
135
src/routines/level3/xtrmm.cc
Normal file
|
@ -0,0 +1,135 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xtrmm class (see the header for information about the class).
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#include "internal/routines/level3/xtrmm.h"
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Constructor: forwards to base class constructor
|
||||||
|
template <typename T>
|
||||||
|
Xtrmm<T>::Xtrmm(CommandQueue &queue, Event &event):
|
||||||
|
Xgemm<T>(queue, event) {
|
||||||
|
// Empty body: construction is forwarded entirely to the Xgemm<T> base-class constructor
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// The main routine
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle triangle,
|
||||||
|
const Transpose a_transpose, const Diagonal diagonal,
|
||||||
|
const size_t m, const size_t n,
|
||||||
|
const T alpha,
|
||||||
|
const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
|
const Buffer &b_buffer, const size_t b_offset, const size_t b_ld) {
|
||||||
|
|
||||||
|
// Makes sure all dimensions are larger than zero
|
||||||
|
if ((m == 0) || (n == 0)) { return StatusCode::kInvalidDimension; }
|
||||||
|
|
||||||
|
// Computes the k dimension. This is based on whether the triangular matrix is A (on the left)
|
||||||
|
// or B (on the right) in the Xgemm routine.
|
||||||
|
auto k = (side == Side::kLeft) ? m : n;
|
||||||
|
|
||||||
|
// Checks for validity of the triangular A matrix
|
||||||
|
auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T));
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
|
// Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
|
||||||
|
// default) and on whether we are dealing with an upper or lower triangle of the triangular matrix
|
||||||
|
bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
|
||||||
|
(triangle == Triangle::kLower && layout == Layout::kRowMajor));
|
||||||
|
auto kernel_name = (is_upper) ? "TrmmUpperToSquared" : "TrmmLowerToSquared";
|
||||||
|
|
||||||
|
// Determines whether or not the triangular matrix is unit-diagonal
|
||||||
|
auto unit_diagonal = (diagonal == Diagonal::kUnit) ? true : false;
|
||||||
|
|
||||||
|
// Temporary buffer for a copy of the triangular matrix
|
||||||
|
try {
|
||||||
|
auto temp_triangular = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T));
|
||||||
|
|
||||||
|
// Creates a general matrix from the triangular matrix to be able to run the regular Xgemm
|
||||||
|
// routine afterwards
|
||||||
|
try {
|
||||||
|
auto& program = GetProgramFromCache();
|
||||||
|
auto kernel = Kernel(program, kernel_name);
|
||||||
|
|
||||||
|
// Sets the arguments for the triangular-to-squared kernel
|
||||||
|
kernel.SetArgument(0, static_cast<int>(k));
|
||||||
|
kernel.SetArgument(1, static_cast<int>(a_ld));
|
||||||
|
kernel.SetArgument(2, static_cast<int>(a_offset));
|
||||||
|
kernel.SetArgument(3, a_buffer());
|
||||||
|
kernel.SetArgument(4, static_cast<int>(k));
|
||||||
|
kernel.SetArgument(5, static_cast<int>(k));
|
||||||
|
kernel.SetArgument(6, static_cast<int>(0));
|
||||||
|
kernel.SetArgument(7, temp_triangular());
|
||||||
|
kernel.SetArgument(8, static_cast<int>(unit_diagonal));
|
||||||
|
|
||||||
|
// Uses the common padding kernel's thread configuration. This is allowed, since the
|
||||||
|
// triangular-to-squared kernel uses the same parameters.
|
||||||
|
auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
|
||||||
|
Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
|
||||||
|
auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
|
||||||
|
status = RunKernel(kernel, global, local);
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
|
// Runs the regular Xgemm code with either "B := alpha*A*B" or ...
|
||||||
|
if (side == Side::kLeft) {
|
||||||
|
status = DoGemm(layout, a_transpose, Transpose::kNo,
|
||||||
|
m, n, k,
|
||||||
|
alpha,
|
||||||
|
temp_triangular, 0, k,
|
||||||
|
b_buffer, b_offset, b_ld,
|
||||||
|
static_cast<T>(0.0),
|
||||||
|
b_buffer, b_offset, b_ld);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ... with "B := alpha*B*A". Note that A and B are now reversed.
|
||||||
|
else {
|
||||||
|
status = DoGemm(layout, Transpose::kNo, a_transpose,
|
||||||
|
m, n, k,
|
||||||
|
alpha,
|
||||||
|
b_buffer, b_offset, b_ld,
|
||||||
|
temp_triangular, 0, k,
|
||||||
|
static_cast<T>(0.0),
|
||||||
|
b_buffer, b_offset, b_ld);
|
||||||
|
|
||||||
|
// A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
|
||||||
|
switch(status) {
|
||||||
|
case StatusCode::kInvalidMatrixA: status = StatusCode::kInvalidMatrixB; break;
|
||||||
|
case StatusCode::kInvalidMatrixB: status = StatusCode::kInvalidMatrixA; break;
|
||||||
|
case StatusCode::kInvalidLeadDimA: status = StatusCode::kInvalidLeadDimB; break;
|
||||||
|
case StatusCode::kInvalidLeadDimB: status = StatusCode::kInvalidLeadDimA; break;
|
||||||
|
case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break;
|
||||||
|
case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Return the status of the Xgemm routine
|
||||||
|
return status;
|
||||||
|
} catch (...) { return StatusCode::kInvalidKernel; }
|
||||||
|
} catch (...) { return StatusCode::kTempBufferAllocFailure; }
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Explicit instantiations: compiles the templated Xtrmm class for each of the four element types
// listed below (float, double, float2 and double2)
|
||||||
|
template class Xtrmm<float>;
|
||||||
|
template class Xtrmm<double>;
|
||||||
|
template class Xtrmm<float2>;
|
||||||
|
template class Xtrmm<double2>;
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
|
@ -30,11 +30,10 @@ void CopyTune(const Arguments<T> &args,
|
||||||
// This points to the CopyMatrix kernel as found in the CLBlast library. This is just one example
|
// This points to the CopyMatrix kernel as found in the CLBlast library. This is just one example
|
||||||
// of a copy kernel. However, all copy-kernels use the same tuning parameters, so one has to be
|
// of a copy kernel. However, all copy-kernels use the same tuning parameters, so one has to be
|
||||||
// chosen as a representative.
|
// chosen as a representative.
|
||||||
std::string common_source =
|
std::string sources =
|
||||||
#include "../src/kernels/common.opencl"
|
#include "../src/kernels/common.opencl"
|
||||||
std::string kernel_source =
|
#include "../src/kernels/copy.opencl"
|
||||||
#include "../src/kernels/copy.opencl"
|
;
|
||||||
auto sources = common_source + kernel_source;
|
|
||||||
auto id = tuner.AddKernelFromString(sources, "CopyMatrix", {args.m, args.n}, {1, 1});
|
auto id = tuner.AddKernelFromString(sources, "CopyMatrix", {args.m, args.n}, {1, 1});
|
||||||
tuner.SetReferenceFromString(sources, "CopyMatrix", {args.m, args.n}, {8, 8});
|
tuner.SetReferenceFromString(sources, "CopyMatrix", {args.m, args.n}, {8, 8});
|
||||||
|
|
||||||
|
|
|
@ -30,11 +30,10 @@ void PadTune(const Arguments<T> &args,
|
||||||
// This points to the PadMatrix kernel as found in the CLBlast library. This is just one
|
// This points to the PadMatrix kernel as found in the CLBlast library. This is just one
|
||||||
// example of a pad kernel. However, all pad-kernels use the same tuning parameters, so one has
|
// example of a pad kernel. However, all pad-kernels use the same tuning parameters, so one has
|
||||||
// to be chosen as a representative.
|
// to be chosen as a representative.
|
||||||
std::string common_source =
|
std::string sources =
|
||||||
#include "../src/kernels/common.opencl"
|
#include "../src/kernels/common.opencl"
|
||||||
std::string kernel_source =
|
#include "../src/kernels/pad.opencl"
|
||||||
#include "../src/kernels/pad.opencl"
|
;
|
||||||
auto sources = common_source + kernel_source;
|
|
||||||
auto id = tuner.AddKernelFromString(sources, "PadMatrix", {args.m, args.n}, {1, 1});
|
auto id = tuner.AddKernelFromString(sources, "PadMatrix", {args.m, args.n}, {1, 1});
|
||||||
tuner.SetReferenceFromString(sources, "PadMatrix", {args.m, args.n}, {8, 8});
|
tuner.SetReferenceFromString(sources, "PadMatrix", {args.m, args.n}, {8, 8});
|
||||||
|
|
||||||
|
|
|
@ -30,11 +30,10 @@ void PadTransposeTune(const Arguments<T> &args,
|
||||||
// This points to the PadTransposeMatrix kernel as found in the CLBlast library. This is just one
|
// This points to the PadTransposeMatrix kernel as found in the CLBlast library. This is just one
|
||||||
// example of a transpose kernel. However, all kernels use the same tuning parameters, so one has
|
// example of a transpose kernel. However, all kernels use the same tuning parameters, so one has
|
||||||
// to be chosen as a representative.
|
// to be chosen as a representative.
|
||||||
std::string common_source =
|
std::string sources =
|
||||||
#include "../src/kernels/common.opencl"
|
#include "../src/kernels/common.opencl"
|
||||||
std::string kernel_source =
|
#include "../src/kernels/padtranspose.opencl"
|
||||||
#include "../src/kernels/padtranspose.opencl"
|
;
|
||||||
auto sources = common_source + kernel_source;
|
|
||||||
auto id = tuner.AddKernelFromString(sources, "PadTransposeMatrix", {args.m, args.n}, {1, 1});
|
auto id = tuner.AddKernelFromString(sources, "PadTransposeMatrix", {args.m, args.n}, {1, 1});
|
||||||
tuner.SetReferenceFromString(sources, "PadTransposeMatrix", {args.m, args.n}, {8, 8});
|
tuner.SetReferenceFromString(sources, "PadTransposeMatrix", {args.m, args.n}, {8, 8});
|
||||||
|
|
||||||
|
|
|
@ -30,11 +30,10 @@ void TransposeTune(const Arguments<T> &args,
|
||||||
// This points to the TransposeMatrix kernel as found in the CLBlast library. This is just one
|
// This points to the TransposeMatrix kernel as found in the CLBlast library. This is just one
|
||||||
// example of a transpose kernel. However, all kernels use the same tuning parameters, so one has
|
// example of a transpose kernel. However, all kernels use the same tuning parameters, so one has
|
||||||
// to be chosen as a representative.
|
// to be chosen as a representative.
|
||||||
std::string common_source =
|
std::string sources =
|
||||||
#include "../src/kernels/common.opencl"
|
#include "../src/kernels/common.opencl"
|
||||||
std::string kernel_source =
|
#include "../src/kernels/transpose.opencl"
|
||||||
#include "../src/kernels/transpose.opencl"
|
;
|
||||||
auto sources = common_source + kernel_source;
|
|
||||||
auto id = tuner.AddKernelFromString(sources, "TransposeMatrix", {args.m, args.n}, {1, 1});
|
auto id = tuner.AddKernelFromString(sources, "TransposeMatrix", {args.m, args.n}, {1, 1});
|
||||||
tuner.SetReferenceFromString(sources, "TransposeMatrix", {args.m, args.n}, {8, 8});
|
tuner.SetReferenceFromString(sources, "TransposeMatrix", {args.m, args.n}, {8, 8});
|
||||||
|
|
||||||
|
@ -42,6 +41,7 @@ void TransposeTune(const Arguments<T> &args,
|
||||||
tuner.AddParameter(id, "TRA_DIM", {4, 8, 16, 32, 64});
|
tuner.AddParameter(id, "TRA_DIM", {4, 8, 16, 32, 64});
|
||||||
tuner.AddParameter(id, "TRA_WPT", {1, 2, 4, 8, 16});
|
tuner.AddParameter(id, "TRA_WPT", {1, 2, 4, 8, 16});
|
||||||
tuner.AddParameter(id, "TRA_PAD", {0, 1});
|
tuner.AddParameter(id, "TRA_PAD", {0, 1});
|
||||||
|
tuner.AddParameter(id, "TRA_SHUFFLE", {0, 1});
|
||||||
|
|
||||||
// Tests for a specific precision
|
// Tests for a specific precision
|
||||||
tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)});
|
tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)});
|
||||||
|
|
|
@ -34,11 +34,10 @@ void XaxpyTune(const Arguments<T> &args,
|
||||||
}
|
}
|
||||||
|
|
||||||
// This points to the XaxpyFast kernel as found in the CLBlast library
|
// This points to the XaxpyFast kernel as found in the CLBlast library
|
||||||
std::string common_source =
|
std::string sources =
|
||||||
#include "../src/kernels/common.opencl"
|
#include "../src/kernels/common.opencl"
|
||||||
std::string kernel_source =
|
#include "../src/kernels/xaxpy.opencl"
|
||||||
#include "../src/kernels/xaxpy.opencl"
|
;
|
||||||
auto sources = common_source + kernel_source;
|
|
||||||
auto id = tuner.AddKernelFromString(sources, "XaxpyFast", {args.n}, {1});
|
auto id = tuner.AddKernelFromString(sources, "XaxpyFast", {args.n}, {1});
|
||||||
tuner.SetReferenceFromString(sources, "XaxpyFast", {args.n}, {64});
|
tuner.SetReferenceFromString(sources, "XaxpyFast", {args.n}, {64});
|
||||||
|
|
||||||
|
|
|
@ -30,11 +30,10 @@ void XgemmTune(const Arguments<T> &args,
|
||||||
cltune::Tuner &tuner) {
|
cltune::Tuner &tuner) {
|
||||||
|
|
||||||
// This points to the Xgemm kernel as found in the CLBlast library and its golden reference
|
// This points to the Xgemm kernel as found in the CLBlast library and its golden reference
|
||||||
std::string common_source =
|
std::string sources =
|
||||||
#include "../src/kernels/common.opencl"
|
#include "../src/kernels/common.opencl"
|
||||||
std::string kernel_source =
|
#include "../src/kernels/xgemm.opencl"
|
||||||
#include "../src/kernels/xgemm.opencl"
|
;
|
||||||
auto sources = common_source + kernel_source;
|
|
||||||
auto id = tuner.AddKernelFromString(sources, "Xgemm", {args.m, args.n}, {1, 1});
|
auto id = tuner.AddKernelFromString(sources, "Xgemm", {args.m, args.n}, {1, 1});
|
||||||
tuner.SetReferenceFromString(sources, "Xgemm", {args.m, args.n}, {8, 8});
|
tuner.SetReferenceFromString(sources, "Xgemm", {args.m, args.n}, {8, 8});
|
||||||
|
|
||||||
|
|
|
@ -36,11 +36,10 @@ void XgemvTune(const Arguments<T> &args, const size_t variation,
|
||||||
auto a_rotated = (variation == 3) ? 1 : 0;
|
auto a_rotated = (variation == 3) ? 1 : 0;
|
||||||
|
|
||||||
// This points to the Xgemv kernel as found in the CLBlast library
|
// This points to the Xgemv kernel as found in the CLBlast library
|
||||||
std::string common_source =
|
std::string sources =
|
||||||
#include "../src/kernels/common.opencl"
|
#include "../src/kernels/common.opencl"
|
||||||
std::string kernel_source =
|
#include "../src/kernels/xgemv.opencl"
|
||||||
#include "../src/kernels/xgemv.opencl"
|
;
|
||||||
auto sources = common_source + kernel_source;
|
|
||||||
auto id = tuner.AddKernelFromString(sources, kernel_name, {args.m}, {1});
|
auto id = tuner.AddKernelFromString(sources, kernel_name, {args.m}, {1});
|
||||||
tuner.SetReferenceFromString(sources, "Xgemv", {args.m}, {64});
|
tuner.SetReferenceFromString(sources, "Xgemv", {args.m}, {64});
|
||||||
|
|
||||||
|
|
|
@ -79,6 +79,13 @@ std::string ToString(Triangle value) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
template <>
|
template <>
|
||||||
|
// Specialization for the Diagonal enumeration: produces the integer value of the enumerator
// followed by a human-readable label, e.g. "<int> (unit)" or "<int> (non-unit)".
std::string ToString(Diagonal value) {
  switch(value) {
    case Diagonal::kUnit: return ToString(static_cast<int>(value))+" (unit)";
    case Diagonal::kNonUnit: return ToString(static_cast<int>(value))+" (non-unit)";
  }

  // Only reached when 'value' holds something other than the enumerators handled above (for
  // example an integer cast to Diagonal). Without this return, control would flow off the end of
  // a value-returning function, which is undefined behaviour. The return is placed after the
  // switch rather than in a 'default:' label so that compilers can still warn about enumerators
  // missing from the switch.
  return ToString(static_cast<int>(value))+" (unknown)";
}
|
||||||
|
template <>
|
||||||
std::string ToString(Precision value) {
|
std::string ToString(Precision value) {
|
||||||
switch(value) {
|
switch(value) {
|
||||||
case Precision::kHalf: return ToString(static_cast<int>(value))+" (half)";
|
case Precision::kHalf: return ToString(static_cast<int>(value))+" (half)";
|
||||||
|
@ -143,6 +150,7 @@ template Layout GetArgument<Layout>(const int, char **, std::string&, const std:
|
||||||
template Transpose GetArgument<Transpose>(const int, char **, std::string&, const std::string&, const Transpose);
|
template Transpose GetArgument<Transpose>(const int, char **, std::string&, const std::string&, const Transpose);
|
||||||
template Side GetArgument<Side>(const int, char **, std::string&, const std::string&, const Side);
|
template Side GetArgument<Side>(const int, char **, std::string&, const std::string&, const Side);
|
||||||
template Triangle GetArgument<Triangle>(const int, char **, std::string&, const std::string&, const Triangle);
|
template Triangle GetArgument<Triangle>(const int, char **, std::string&, const std::string&, const Triangle);
|
||||||
|
template Diagonal GetArgument<Diagonal>(const int, char **, std::string&, const std::string&, const Diagonal);
|
||||||
template Precision GetArgument<Precision>(const int, char **, std::string&, const std::string&, const Precision);
|
template Precision GetArgument<Precision>(const int, char **, std::string&, const std::string&, const Precision);
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
81
test/correctness/routines/level1/xaxpy.cc
Normal file
81
test/correctness/routines/level1/xaxpy.cc
Normal file
|
@ -0,0 +1,81 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the tests for the Xaxpy routine.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#include "correctness/testblas.h"
|
||||||
|
#include "routines/level1/xaxpy.h"
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// The correctness tester
|
||||||
|
template <typename T>
|
||||||
|
void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
|
||||||
|
|
||||||
|
// Creates a tester
|
||||||
|
TestBlas<T,T> tester{argc, argv, silent, name, TestXaxpy<T>::GetOptions(),
|
||||||
|
TestXaxpy<T>::RunRoutine, TestXaxpy<T>::RunReference,
|
||||||
|
TestXaxpy<T>::DownloadResult, TestXaxpy<T>::GetResultIndex,
|
||||||
|
TestXaxpy<T>::ResultID1, TestXaxpy<T>::ResultID2};
|
||||||
|
|
||||||
|
// This variable holds the arguments relevant for this routine
|
||||||
|
auto args = Arguments<T>{};
|
||||||
|
|
||||||
|
// Creates the arguments vector for the regular tests
|
||||||
|
auto regular_test_vector = std::vector<Arguments<T>>{};
|
||||||
|
for (auto &n: tester.kVectorDims) { args.n = n;
|
||||||
|
for (auto &x_inc: tester.kIncrements) { args.x_inc = x_inc;
|
||||||
|
for (auto &x_offset: tester.kOffsets) { args.x_offset = x_offset;
|
||||||
|
for (auto &y_inc: tester.kIncrements) { args.y_inc = y_inc;
|
||||||
|
for (auto &y_offset: tester.kOffsets) { args.y_offset = y_offset;
|
||||||
|
for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
|
||||||
|
args.x_size = TestXaxpy<T>::GetSizeX(args);
|
||||||
|
args.y_size = TestXaxpy<T>::GetSizeY(args);
|
||||||
|
if (args.x_size<1 || args.y_size<1) { continue; }
|
||||||
|
regular_test_vector.push_back(args);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Creates the arguments vector for the invalid-buffer tests
|
||||||
|
auto invalid_test_vector = std::vector<Arguments<T>>{};
|
||||||
|
args.n = tester.kBufferSize;
|
||||||
|
args.x_inc = args.y_inc = 1;
|
||||||
|
args.x_offset = args.y_offset = 0;
|
||||||
|
for (auto &x_size: tester.kVecSizes) { args.x_size = x_size;
|
||||||
|
for (auto &y_size: tester.kVecSizes) { args.y_size = y_size;
|
||||||
|
invalid_test_vector.push_back(args);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Runs the tests
|
||||||
|
const auto case_name = "default";
|
||||||
|
tester.TestRegular(regular_test_vector, case_name);
|
||||||
|
tester.TestInvalid(invalid_test_vector, case_name);
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
||||||
|
|
||||||
|
// Main function (not within the clblast namespace)
|
||||||
|
int main(int argc, char *argv[]) {
|
||||||
|
clblast::RunTest<float>(argc, argv, false, "SAXPY");
|
||||||
|
clblast::RunTest<double>(argc, argv, true, "DAXPY");
|
||||||
|
clblast::RunTest<clblast::float2>(argc, argv, true, "CAXPY");
|
||||||
|
clblast::RunTest<clblast::double2>(argc, argv, true, "ZAXPY");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
99
test/correctness/routines/level2/xgemv.cc
Normal file
99
test/correctness/routines/level2/xgemv.cc
Normal file
|
@ -0,0 +1,99 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the tests for the Xgemv routine.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#include "correctness/testblas.h"
|
||||||
|
#include "routines/level2/xgemv.h"
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// The correctness tester
|
||||||
|
template <typename T>
|
||||||
|
void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
|
||||||
|
|
||||||
|
// Creates a tester
|
||||||
|
TestBlas<T,T> tester{argc, argv, silent, name, TestXgemv<T>::GetOptions(),
|
||||||
|
TestXgemv<T>::RunRoutine, TestXgemv<T>::RunReference,
|
||||||
|
TestXgemv<T>::DownloadResult, TestXgemv<T>::GetResultIndex,
|
||||||
|
TestXgemv<T>::ResultID1, TestXgemv<T>::ResultID2};
|
||||||
|
|
||||||
|
// This variable holds the arguments relevant for this routine
|
||||||
|
auto args = Arguments<T>{};
|
||||||
|
|
||||||
|
// Loops over the test-cases from a data-layout point of view
|
||||||
|
for (auto &layout: tester.kLayouts) { args.layout = layout;
|
||||||
|
for (auto &a_transpose: tester.kTransposes) { args.a_transpose = a_transpose;
|
||||||
|
|
||||||
|
// Creates the arguments vector for the regular tests
|
||||||
|
auto regular_test_vector = std::vector<Arguments<T>>{};
|
||||||
|
for (auto &m: tester.kMatrixVectorDims) { args.m = m;
|
||||||
|
for (auto &n: tester.kMatrixVectorDims) { args.n = n;
|
||||||
|
for (auto &a_ld: tester.kMatrixVectorDims) { args.a_ld = a_ld;
|
||||||
|
for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
|
||||||
|
for (auto &x_inc: tester.kIncrements) { args.x_inc = x_inc;
|
||||||
|
for (auto &x_offset: tester.kOffsets) { args.x_offset = x_offset;
|
||||||
|
for (auto &y_inc: tester.kIncrements) { args.y_inc = y_inc;
|
||||||
|
for (auto &y_offset: tester.kOffsets) { args.y_offset = y_offset;
|
||||||
|
for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
|
||||||
|
for (auto &beta: tester.kBetaValues) { args.beta = beta;
|
||||||
|
args.a_size = TestXgemv<T>::GetSizeA(args);
|
||||||
|
args.x_size = TestXgemv<T>::GetSizeX(args);
|
||||||
|
args.y_size = TestXgemv<T>::GetSizeY(args);
|
||||||
|
if (args.a_size<1 || args.x_size<1 || args.y_size<1) { continue; }
|
||||||
|
regular_test_vector.push_back(args);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Creates the arguments vector for the invalid-buffer tests
|
||||||
|
auto invalid_test_vector = std::vector<Arguments<T>>{};
|
||||||
|
args.m = args.n = tester.kBufferSize;
|
||||||
|
args.a_ld = tester.kBufferSize;
|
||||||
|
args.x_inc = args.y_inc = 1;
|
||||||
|
args.a_offset = args.x_offset = args.y_offset = 0;
|
||||||
|
for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
|
||||||
|
for (auto &x_size: tester.kVecSizes) { args.x_size = x_size;
|
||||||
|
for (auto &y_size: tester.kVecSizes) { args.y_size = y_size;
|
||||||
|
invalid_test_vector.push_back(args);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Runs the tests
|
||||||
|
const auto case_name = ToString(layout)+" "+ToString(a_transpose);
|
||||||
|
tester.TestRegular(regular_test_vector, case_name);
|
||||||
|
tester.TestInvalid(invalid_test_vector, case_name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
||||||
|
|
||||||
|
// Main function (not within the clblast namespace)
|
||||||
|
int main(int argc, char *argv[]) {
|
||||||
|
clblast::RunTest<float>(argc, argv, false, "SGEMV");
|
||||||
|
clblast::RunTest<double>(argc, argv, true, "DGEMV");
|
||||||
|
clblast::RunTest<clblast::float2>(argc, argv, true, "CGEMV");
|
||||||
|
clblast::RunTest<clblast::double2>(argc, argv, true, "ZGEMV");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
102
test/correctness/routines/level3/xgemm.cc
Normal file
102
test/correctness/routines/level3/xgemm.cc
Normal file
|
@ -0,0 +1,102 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the tests for the Xgemm routine.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#include "correctness/testblas.h"
|
||||||
|
#include "routines/level3/xgemm.h"
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// The correctness tester
|
||||||
|
template <typename T>
|
||||||
|
void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
|
||||||
|
|
||||||
|
// Creates a tester
|
||||||
|
TestBlas<T,T> tester{argc, argv, silent, name, TestXgemm<T>::GetOptions(),
|
||||||
|
TestXgemm<T>::RunRoutine, TestXgemm<T>::RunReference,
|
||||||
|
TestXgemm<T>::DownloadResult, TestXgemm<T>::GetResultIndex,
|
||||||
|
TestXgemm<T>::ResultID1, TestXgemm<T>::ResultID2};
|
||||||
|
|
||||||
|
// This variable holds the arguments relevant for this routine
|
||||||
|
auto args = Arguments<T>{};
|
||||||
|
|
||||||
|
// Loops over the test-cases from a data-layout point of view
|
||||||
|
for (auto &layout: tester.kLayouts) { args.layout = layout;
|
||||||
|
for (auto &a_transpose: tester.kTransposes) { args.a_transpose = a_transpose;
|
||||||
|
for (auto &b_transpose: tester.kTransposes) { args.b_transpose = b_transpose;
|
||||||
|
|
||||||
|
// Creates the arguments vector for the regular tests
|
||||||
|
auto regular_test_vector = std::vector<Arguments<T>>{};
|
||||||
|
for (auto &m: tester.kMatrixDims) { args.m = m;
|
||||||
|
for (auto &n: tester.kMatrixDims) { args.n = n;
|
||||||
|
for (auto &k: tester.kMatrixDims) { args.k = k;
|
||||||
|
for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
|
||||||
|
for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
|
||||||
|
for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
|
||||||
|
for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
|
||||||
|
for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
|
||||||
|
for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
|
||||||
|
for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
|
||||||
|
for (auto &beta: tester.kBetaValues) { args.beta = beta;
|
||||||
|
args.a_size = TestXgemm<T>::GetSizeA(args);
|
||||||
|
args.b_size = TestXgemm<T>::GetSizeB(args);
|
||||||
|
args.c_size = TestXgemm<T>::GetSizeC(args);
|
||||||
|
if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; }
|
||||||
|
regular_test_vector.push_back(args);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Creates the arguments vector for the invalid-buffer tests
|
||||||
|
auto invalid_test_vector = std::vector<Arguments<T>>{};
|
||||||
|
args.m = args.n = args.k = tester.kBufferSize;
|
||||||
|
args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize;
|
||||||
|
args.a_offset = args.b_offset = args.c_offset = 0;
|
||||||
|
for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
|
||||||
|
for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
|
||||||
|
for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
|
||||||
|
invalid_test_vector.push_back(args);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Runs the tests
|
||||||
|
const auto case_name = ToString(layout)+" "+ToString(a_transpose)+" "+ToString(b_transpose);
|
||||||
|
tester.TestRegular(regular_test_vector, case_name);
|
||||||
|
tester.TestInvalid(invalid_test_vector, case_name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
||||||
|
|
||||||
|
// Main function (not within the clblast namespace)
|
||||||
|
int main(int argc, char *argv[]) {
|
||||||
|
clblast::RunTest<float>(argc, argv, false, "SGEMM");
|
||||||
|
clblast::RunTest<double>(argc, argv, true, "DGEMM");
|
||||||
|
clblast::RunTest<clblast::float2>(argc, argv, true, "CGEMM");
|
||||||
|
clblast::RunTest<clblast::double2>(argc, argv, true, "ZGEMM");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
98
test/correctness/routines/level3/xhemm.cc
Normal file
98
test/correctness/routines/level3/xhemm.cc
Normal file
|
@ -0,0 +1,98 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the tests for the Xhemm routine.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#include "correctness/testblas.h"
|
||||||
|
#include "routines/level3/xhemm.h"
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// The correctness tester
|
||||||
|
template <typename T>
|
||||||
|
void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
|
||||||
|
|
||||||
|
// Creates a tester
|
||||||
|
TestBlas<T,T> tester{argc, argv, silent, name, TestXhemm<T>::GetOptions(),
|
||||||
|
TestXhemm<T>::RunRoutine, TestXhemm<T>::RunReference,
|
||||||
|
TestXhemm<T>::DownloadResult, TestXhemm<T>::GetResultIndex,
|
||||||
|
TestXhemm<T>::ResultID1, TestXhemm<T>::ResultID2};
|
||||||
|
|
||||||
|
// This variable holds the arguments relevant for this routine
|
||||||
|
auto args = Arguments<T>{};
|
||||||
|
|
||||||
|
// Loops over the test-cases from a data-layout point of view
|
||||||
|
for (auto &layout: tester.kLayouts) { args.layout = layout;
|
||||||
|
for (auto &side: tester.kSides) { args.side = side;
|
||||||
|
for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
|
||||||
|
|
||||||
|
// Creates the arguments vector for the regular tests
|
||||||
|
auto regular_test_vector = std::vector<Arguments<T>>{};
|
||||||
|
for (auto &m: tester.kMatrixDims) { args.m = m;
|
||||||
|
for (auto &n: tester.kMatrixDims) { args.n = n;
|
||||||
|
for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
|
||||||
|
for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
|
||||||
|
for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
|
||||||
|
for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
|
||||||
|
for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
|
||||||
|
for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
|
||||||
|
for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
|
||||||
|
for (auto &beta: tester.kBetaValues) { args.beta = beta;
|
||||||
|
args.a_size = TestXhemm<T>::GetSizeA(args);
|
||||||
|
args.b_size = TestXhemm<T>::GetSizeB(args);
|
||||||
|
args.c_size = TestXhemm<T>::GetSizeC(args);
|
||||||
|
if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; }
|
||||||
|
regular_test_vector.push_back(args);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Creates the arguments vector for the invalid-buffer tests
|
||||||
|
auto invalid_test_vector = std::vector<Arguments<T>>{};
|
||||||
|
args.m = args.n = tester.kBufferSize;
|
||||||
|
args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize;
|
||||||
|
args.a_offset = args.b_offset = args.c_offset = 0;
|
||||||
|
for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
|
||||||
|
for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
|
||||||
|
for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
|
||||||
|
invalid_test_vector.push_back(args);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Runs the tests
|
||||||
|
const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle);
|
||||||
|
tester.TestRegular(regular_test_vector, case_name);
|
||||||
|
tester.TestInvalid(invalid_test_vector, case_name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
||||||
|
|
||||||
|
// Main function (not within the clblast namespace)
|
||||||
|
int main(int argc, char *argv[]) {
|
||||||
|
clblast::RunTest<clblast::float2>(argc, argv, true, "CHEMM");
|
||||||
|
clblast::RunTest<clblast::double2>(argc, argv, true, "ZHEMM");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
100
test/correctness/routines/level3/xher2k.cc
Normal file
100
test/correctness/routines/level3/xher2k.cc
Normal file
|
@ -0,0 +1,100 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the tests for the Xher2k routine.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#include "correctness/testblas.h"
|
||||||
|
#include "routines/level3/xher2k.h"
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// The correctness tester
|
||||||
|
template <typename T, typename U>
|
||||||
|
void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
|
||||||
|
|
||||||
|
// Creates a tester
|
||||||
|
TestBlas<T,U> tester{argc, argv, silent, name, TestXher2k<T,U>::GetOptions(),
|
||||||
|
TestXher2k<T,U>::RunRoutine, TestXher2k<T,U>::RunReference,
|
||||||
|
TestXher2k<T,U>::DownloadResult, TestXher2k<T,U>::GetResultIndex,
|
||||||
|
TestXher2k<T,U>::ResultID1, TestXher2k<T,U>::ResultID2};
|
||||||
|
|
||||||
|
// This variable holds the arguments relevant for this routine
|
||||||
|
auto args = Arguments<U>{};
|
||||||
|
|
||||||
|
// Loops over the test-cases from a data-layout point of view
|
||||||
|
for (auto &layout: tester.kLayouts) { args.layout = layout;
|
||||||
|
for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
|
||||||
|
for (auto &ab_transpose: {Transpose::kNo, Transpose::kConjugate}) { // Regular transpose not a
|
||||||
|
args.a_transpose = ab_transpose; // valid BLAS option
|
||||||
|
args.b_transpose = ab_transpose;
|
||||||
|
|
||||||
|
// Creates the arguments vector for the regular tests
|
||||||
|
auto regular_test_vector = std::vector<Arguments<U>>{};
|
||||||
|
for (auto &n: tester.kMatrixDims) { args.n = n;
|
||||||
|
for (auto &k: tester.kMatrixDims) { args.k = k;
|
||||||
|
for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
|
||||||
|
for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
|
||||||
|
for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
|
||||||
|
for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
|
||||||
|
for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
|
||||||
|
for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
|
||||||
|
for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
|
||||||
|
for (auto &beta: tester.kBetaValues) { args.beta = beta;
|
||||||
|
args.a_size = TestXher2k<T,U>::GetSizeA(args);
|
||||||
|
args.b_size = TestXher2k<T,U>::GetSizeB(args);
|
||||||
|
args.c_size = TestXher2k<T,U>::GetSizeC(args);
|
||||||
|
if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; }
|
||||||
|
regular_test_vector.push_back(args);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Creates the arguments vector for the invalid-buffer tests
|
||||||
|
auto invalid_test_vector = std::vector<Arguments<U>>{};
|
||||||
|
args.n = args.k = tester.kBufferSize;
|
||||||
|
args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize;
|
||||||
|
args.a_offset = args.b_offset = args.c_offset = 0;
|
||||||
|
for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
|
||||||
|
for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
|
||||||
|
for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
|
||||||
|
invalid_test_vector.push_back(args);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Runs the tests
|
||||||
|
const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(ab_transpose);
|
||||||
|
tester.TestRegular(regular_test_vector, case_name);
|
||||||
|
tester.TestInvalid(invalid_test_vector, case_name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
||||||
|
|
||||||
|
// Main function (not within the clblast namespace)
|
||||||
|
int main(int argc, char *argv[]) {
|
||||||
|
clblast::RunTest<clblast::float2,float>(argc, argv, false, "CHER2K");
|
||||||
|
clblast::RunTest<clblast::double2,double>(argc, argv, true, "ZHER2K");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
92
test/correctness/routines/level3/xherk.cc
Normal file
92
test/correctness/routines/level3/xherk.cc
Normal file
|
@ -0,0 +1,92 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the tests for the Xherk routine.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#include "correctness/testblas.h"
|
||||||
|
#include "routines/level3/xherk.h"
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// The correctness tester
|
||||||
|
template <typename T, typename U>
|
||||||
|
void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
|
||||||
|
|
||||||
|
// Creates a tester
|
||||||
|
TestBlas<T,U> tester{argc, argv, silent, name, TestXherk<T,U>::GetOptions(),
|
||||||
|
TestXherk<T,U>::RunRoutine, TestXherk<T,U>::RunReference,
|
||||||
|
TestXherk<T,U>::DownloadResult, TestXherk<T,U>::GetResultIndex,
|
||||||
|
TestXherk<T,U>::ResultID1, TestXherk<T,U>::ResultID2};
|
||||||
|
|
||||||
|
// This variable holds the arguments relevant for this routine
|
||||||
|
auto args = Arguments<U>{};
|
||||||
|
|
||||||
|
// Loops over the test-cases from a data-layout point of view
|
||||||
|
for (auto &layout: tester.kLayouts) { args.layout = layout;
|
||||||
|
for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
|
||||||
|
for (auto &a_transpose: {Transpose::kNo, Transpose::kConjugate}) { // Regular transpose not a
|
||||||
|
args.a_transpose = a_transpose; // valid BLAS option
|
||||||
|
|
||||||
|
// Creates the arguments vector for the regular tests
|
||||||
|
auto regular_test_vector = std::vector<Arguments<U>>{};
|
||||||
|
for (auto &n: tester.kMatrixDims) { args.n = n;
|
||||||
|
for (auto &k: tester.kMatrixDims) { args.k = k;
|
||||||
|
for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
|
||||||
|
for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
|
||||||
|
for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
|
||||||
|
for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
|
||||||
|
for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
|
||||||
|
for (auto &beta: tester.kBetaValues) { args.beta = beta;
|
||||||
|
args.a_size = TestXherk<T,U>::GetSizeA(args);
|
||||||
|
args.c_size = TestXherk<T,U>::GetSizeC(args);
|
||||||
|
if (args.a_size<1 || args.c_size<1) { continue; }
|
||||||
|
regular_test_vector.push_back(args);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Creates the arguments vector for the invalid-buffer tests
|
||||||
|
auto invalid_test_vector = std::vector<Arguments<U>>{};
|
||||||
|
args.n = args.k = tester.kBufferSize;
|
||||||
|
args.a_ld = args.c_ld = tester.kBufferSize;
|
||||||
|
args.a_offset = args.c_offset = 0;
|
||||||
|
for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
|
||||||
|
for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
|
||||||
|
invalid_test_vector.push_back(args);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Runs the tests
|
||||||
|
const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(a_transpose);
|
||||||
|
tester.TestRegular(regular_test_vector, case_name);
|
||||||
|
tester.TestInvalid(invalid_test_vector, case_name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
||||||
|
|
||||||
|
// Main function (not within the clblast namespace)
|
||||||
|
int main(int argc, char *argv[]) {
|
||||||
|
clblast::RunTest<clblast::float2,float>(argc, argv, false, "CHERK");
|
||||||
|
clblast::RunTest<clblast::double2,double>(argc, argv, true, "ZHERK");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
100
test/correctness/routines/level3/xsymm.cc
Normal file
100
test/correctness/routines/level3/xsymm.cc
Normal file
|
@ -0,0 +1,100 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the tests for the Xsymm routine.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#include "correctness/testblas.h"
|
||||||
|
#include "routines/level3/xsymm.h"
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// The correctness tester
|
||||||
|
template <typename T>
|
||||||
|
void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
|
||||||
|
|
||||||
|
// Creates a tester
|
||||||
|
TestBlas<T,T> tester{argc, argv, silent, name, TestXsymm<T>::GetOptions(),
|
||||||
|
TestXsymm<T>::RunRoutine, TestXsymm<T>::RunReference,
|
||||||
|
TestXsymm<T>::DownloadResult, TestXsymm<T>::GetResultIndex,
|
||||||
|
TestXsymm<T>::ResultID1, TestXsymm<T>::ResultID2};
|
||||||
|
|
||||||
|
// This variable holds the arguments relevant for this routine
|
||||||
|
auto args = Arguments<T>{};
|
||||||
|
|
||||||
|
// Loops over the test-cases from a data-layout point of view
|
||||||
|
for (auto &layout: tester.kLayouts) { args.layout = layout;
|
||||||
|
for (auto &side: tester.kSides) { args.side = side;
|
||||||
|
for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
|
||||||
|
|
||||||
|
// Creates the arguments vector for the regular tests
|
||||||
|
auto regular_test_vector = std::vector<Arguments<T>>{};
|
||||||
|
for (auto &m: tester.kMatrixDims) { args.m = m;
|
||||||
|
for (auto &n: tester.kMatrixDims) { args.n = n;
|
||||||
|
for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
|
||||||
|
for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
|
||||||
|
for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
|
||||||
|
for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
|
||||||
|
for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
|
||||||
|
for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
|
||||||
|
for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
|
||||||
|
for (auto &beta: tester.kBetaValues) { args.beta = beta;
|
||||||
|
args.a_size = TestXsymm<T>::GetSizeA(args);
|
||||||
|
args.b_size = TestXsymm<T>::GetSizeB(args);
|
||||||
|
args.c_size = TestXsymm<T>::GetSizeC(args);
|
||||||
|
if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; }
|
||||||
|
regular_test_vector.push_back(args);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Creates the arguments vector for the invalid-buffer tests
|
||||||
|
auto invalid_test_vector = std::vector<Arguments<T>>{};
|
||||||
|
args.m = args.n = tester.kBufferSize;
|
||||||
|
args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize;
|
||||||
|
args.a_offset = args.b_offset = args.c_offset = 0;
|
||||||
|
for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
|
||||||
|
for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
|
||||||
|
for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
|
||||||
|
invalid_test_vector.push_back(args);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Runs the tests
|
||||||
|
const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle);
|
||||||
|
tester.TestRegular(regular_test_vector, case_name);
|
||||||
|
tester.TestInvalid(invalid_test_vector, case_name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
||||||
|
|
||||||
|
// Main function (not within the clblast namespace)
|
||||||
|
int main(int argc, char *argv[]) {
|
||||||
|
clblast::RunTest<float>(argc, argv, false, "SSYMM");
|
||||||
|
clblast::RunTest<double>(argc, argv, true, "DSYMM");
|
||||||
|
clblast::RunTest<clblast::float2>(argc, argv, true, "CSYMM");
|
||||||
|
clblast::RunTest<clblast::double2>(argc, argv, true, "ZSYMM");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
102
test/correctness/routines/level3/xsyr2k.cc
Normal file
102
test/correctness/routines/level3/xsyr2k.cc
Normal file
|
@ -0,0 +1,102 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the tests for the Xsyr2k routine.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#include "correctness/testblas.h"
|
||||||
|
#include "routines/level3/xsyr2k.h"
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// The correctness tester
|
||||||
|
template <typename T>
|
||||||
|
void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
|
||||||
|
|
||||||
|
// Creates a tester
|
||||||
|
TestBlas<T,T> tester{argc, argv, silent, name, TestXsyr2k<T>::GetOptions(),
|
||||||
|
TestXsyr2k<T>::RunRoutine, TestXsyr2k<T>::RunReference,
|
||||||
|
TestXsyr2k<T>::DownloadResult, TestXsyr2k<T>::GetResultIndex,
|
||||||
|
TestXsyr2k<T>::ResultID1, TestXsyr2k<T>::ResultID2};
|
||||||
|
|
||||||
|
// This variable holds the arguments relevant for this routine
|
||||||
|
auto args = Arguments<T>{};
|
||||||
|
|
||||||
|
// Loops over the test-cases from a data-layout point of view
|
||||||
|
for (auto &layout: tester.kLayouts) { args.layout = layout;
|
||||||
|
for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
|
||||||
|
for (auto &ab_transpose: {Transpose::kNo, Transpose::kYes}) { // No conjugate here since it
|
||||||
|
args.a_transpose = ab_transpose; // is not supported by clBLAS
|
||||||
|
args.b_transpose = ab_transpose;
|
||||||
|
|
||||||
|
// Creates the arguments vector for the regular tests
|
||||||
|
auto regular_test_vector = std::vector<Arguments<T>>{};
|
||||||
|
for (auto &n: tester.kMatrixDims) { args.n = n;
|
||||||
|
for (auto &k: tester.kMatrixDims) { args.k = k;
|
||||||
|
for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
|
||||||
|
for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
|
||||||
|
for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
|
||||||
|
for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
|
||||||
|
for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
|
||||||
|
for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
|
||||||
|
for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
|
||||||
|
for (auto &beta: tester.kBetaValues) { args.beta = beta;
|
||||||
|
args.a_size = TestXsyr2k<T>::GetSizeA(args);
|
||||||
|
args.b_size = TestXsyr2k<T>::GetSizeB(args);
|
||||||
|
args.c_size = TestXsyr2k<T>::GetSizeC(args);
|
||||||
|
if (args.a_size<1 || args.b_size<1 || args.c_size<1) { continue; }
|
||||||
|
regular_test_vector.push_back(args);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Creates the arguments vector for the invalid-buffer tests
|
||||||
|
auto invalid_test_vector = std::vector<Arguments<T>>{};
|
||||||
|
args.n = args.k = tester.kBufferSize;
|
||||||
|
args.a_ld = args.b_ld = args.c_ld = tester.kBufferSize;
|
||||||
|
args.a_offset = args.b_offset = args.c_offset = 0;
|
||||||
|
for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
|
||||||
|
for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
|
||||||
|
for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
|
||||||
|
invalid_test_vector.push_back(args);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Runs the tests
|
||||||
|
const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(ab_transpose);
|
||||||
|
tester.TestRegular(regular_test_vector, case_name);
|
||||||
|
tester.TestInvalid(invalid_test_vector, case_name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
||||||
|
|
||||||
|
// Main function (not within the clblast namespace)
|
||||||
|
int main(int argc, char *argv[]) {
|
||||||
|
clblast::RunTest<float>(argc, argv, false, "SSYR2K");
|
||||||
|
clblast::RunTest<double>(argc, argv, true, "DSYR2K");
|
||||||
|
clblast::RunTest<clblast::float2>(argc, argv, true, "CSYR2K");
|
||||||
|
clblast::RunTest<clblast::double2>(argc, argv, true, "ZSYR2K");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
94
test/correctness/routines/level3/xsyrk.cc
Normal file
94
test/correctness/routines/level3/xsyrk.cc
Normal file
|
@ -0,0 +1,94 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the tests for the Xsyrk routine.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#include "correctness/testblas.h"
|
||||||
|
#include "routines/level3/xsyrk.h"
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// The correctness tester
|
||||||
|
template <typename T>
|
||||||
|
void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
|
||||||
|
|
||||||
|
// Creates a tester
|
||||||
|
TestBlas<T,T> tester{argc, argv, silent, name, TestXsyrk<T>::GetOptions(),
|
||||||
|
TestXsyrk<T>::RunRoutine, TestXsyrk<T>::RunReference,
|
||||||
|
TestXsyrk<T>::DownloadResult, TestXsyrk<T>::GetResultIndex,
|
||||||
|
TestXsyrk<T>::ResultID1, TestXsyrk<T>::ResultID2};
|
||||||
|
|
||||||
|
// This variable holds the arguments relevant for this routine
|
||||||
|
auto args = Arguments<T>{};
|
||||||
|
|
||||||
|
// Loops over the test-cases from a data-layout point of view
|
||||||
|
for (auto &layout: tester.kLayouts) { args.layout = layout;
|
||||||
|
for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
|
||||||
|
for (auto &a_transpose: {Transpose::kNo, Transpose::kYes}) { // No conjugate here since it
|
||||||
|
args.a_transpose = a_transpose; // is not supported by clBLAS
|
||||||
|
|
||||||
|
// Creates the arguments vector for the regular tests
|
||||||
|
auto regular_test_vector = std::vector<Arguments<T>>{};
|
||||||
|
for (auto &n: tester.kMatrixDims) { args.n = n;
|
||||||
|
for (auto &k: tester.kMatrixDims) { args.k = k;
|
||||||
|
for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
|
||||||
|
for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
|
||||||
|
for (auto &c_ld: tester.kMatrixDims) { args.c_ld = c_ld;
|
||||||
|
for (auto &c_offset: tester.kOffsets) { args.c_offset = c_offset;
|
||||||
|
for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
|
||||||
|
for (auto &beta: tester.kBetaValues) { args.beta = beta;
|
||||||
|
args.a_size = TestXsyrk<T>::GetSizeA(args);
|
||||||
|
args.c_size = TestXsyrk<T>::GetSizeC(args);
|
||||||
|
if (args.a_size<1 || args.c_size<1) { continue; }
|
||||||
|
regular_test_vector.push_back(args);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Creates the arguments vector for the invalid-buffer tests
|
||||||
|
auto invalid_test_vector = std::vector<Arguments<T>>{};
|
||||||
|
args.n = args.k = tester.kBufferSize;
|
||||||
|
args.a_ld = args.c_ld = tester.kBufferSize;
|
||||||
|
args.a_offset = args.c_offset = 0;
|
||||||
|
for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
|
||||||
|
for (auto &c_size: tester.kMatSizes) { args.c_size = c_size;
|
||||||
|
invalid_test_vector.push_back(args);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Runs the tests
|
||||||
|
const auto case_name = ToString(layout)+" "+ToString(triangle)+" "+ToString(a_transpose);
|
||||||
|
tester.TestRegular(regular_test_vector, case_name);
|
||||||
|
tester.TestInvalid(invalid_test_vector, case_name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
||||||
|
|
||||||
|
// Main function (not within the clblast namespace)
|
||||||
|
int main(int argc, char *argv[]) {
|
||||||
|
clblast::RunTest<float>(argc, argv, false, "SSYRK");
|
||||||
|
clblast::RunTest<double>(argc, argv, true, "DSYRK");
|
||||||
|
clblast::RunTest<clblast::float2>(argc, argv, true, "CSYRK");
|
||||||
|
clblast::RunTest<clblast::double2>(argc, argv, true, "ZSYRK");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
96
test/correctness/routines/level3/xtrmm.cc
Normal file
96
test/correctness/routines/level3/xtrmm.cc
Normal file
|
@ -0,0 +1,96 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the tests for the Xtrmm routine.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#include "correctness/testblas.h"
|
||||||
|
#include "routines/level3/xtrmm.h"
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// The correctness tester
|
||||||
|
template <typename T>
|
||||||
|
void RunTest(int argc, char *argv[], const bool silent, const std::string &name) {
|
||||||
|
|
||||||
|
// Creates a tester
|
||||||
|
TestBlas<T,T> tester{argc, argv, silent, name, TestXtrmm<T>::GetOptions(),
|
||||||
|
TestXtrmm<T>::RunRoutine, TestXtrmm<T>::RunReference,
|
||||||
|
TestXtrmm<T>::DownloadResult, TestXtrmm<T>::GetResultIndex,
|
||||||
|
TestXtrmm<T>::ResultID1, TestXtrmm<T>::ResultID2};
|
||||||
|
|
||||||
|
// This variable holds the arguments relevant for this routine
|
||||||
|
auto args = Arguments<T>{};
|
||||||
|
|
||||||
|
// Loops over the test-cases from a data-layout point of view
|
||||||
|
for (auto &layout: tester.kLayouts) { args.layout = layout;
|
||||||
|
for (auto &side: tester.kSides) { args.side = side;
|
||||||
|
for (auto &triangle: tester.kTriangles) { args.triangle = triangle;
|
||||||
|
for (auto &a_transpose: tester.kTransposes) { args.a_transpose = a_transpose;
|
||||||
|
for (auto &diagonal: tester.kDiagonals) { args.diagonal = diagonal;
|
||||||
|
|
||||||
|
// Creates the arguments vector for the regular tests
|
||||||
|
auto regular_test_vector = std::vector<Arguments<T>>{};
|
||||||
|
for (auto &m: tester.kMatrixDims) { args.m = m;
|
||||||
|
for (auto &n: tester.kMatrixDims) { args.n = n;
|
||||||
|
for (auto &a_ld: tester.kMatrixDims) { args.a_ld = a_ld;
|
||||||
|
for (auto &a_offset: tester.kOffsets) { args.a_offset = a_offset;
|
||||||
|
for (auto &b_ld: tester.kMatrixDims) { args.b_ld = b_ld;
|
||||||
|
for (auto &b_offset: tester.kOffsets) { args.b_offset = b_offset;
|
||||||
|
for (auto &alpha: tester.kAlphaValues) { args.alpha = alpha;
|
||||||
|
args.a_size = TestXtrmm<T>::GetSizeA(args);
|
||||||
|
args.b_size = TestXtrmm<T>::GetSizeB(args);
|
||||||
|
if (args.a_size<1 || args.b_size<1) { continue; }
|
||||||
|
regular_test_vector.push_back(args);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Creates the arguments vector for the invalid-buffer tests
|
||||||
|
auto invalid_test_vector = std::vector<Arguments<T>>{};
|
||||||
|
args.m = args.n = tester.kBufferSize;
|
||||||
|
args.a_ld = args.b_ld = tester.kBufferSize;
|
||||||
|
args.a_offset = args.b_offset = 0;
|
||||||
|
for (auto &a_size: tester.kMatSizes) { args.a_size = a_size;
|
||||||
|
for (auto &b_size: tester.kMatSizes) { args.b_size = b_size;
|
||||||
|
invalid_test_vector.push_back(args);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Runs the tests
|
||||||
|
const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle)+" "+
|
||||||
|
ToString(a_transpose)+" "+ToString(diagonal);
|
||||||
|
tester.TestRegular(regular_test_vector, case_name);
|
||||||
|
tester.TestInvalid(invalid_test_vector, case_name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
||||||
|
|
||||||
|
// Main function (not within the clblast namespace)
|
||||||
|
int main(int argc, char *argv[]) {
|
||||||
|
clblast::RunTest<float>(argc, argv, false, "STRMM");
|
||||||
|
clblast::RunTest<double>(argc, argv, true, "DTRMM");
|
||||||
|
clblast::RunTest<clblast::float2>(argc, argv, true, "CTRMM");
|
||||||
|
clblast::RunTest<clblast::double2>(argc, argv, true, "ZTRMM");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
|
@ -1,75 +0,0 @@
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
|
|
||||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
|
||||||
// width of 100 characters per line.
|
|
||||||
//
|
|
||||||
// Author(s):
|
|
||||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
|
||||||
//
|
|
||||||
// This file implements the tests for the Xaxpy routine. It is based on the TestXY class.
|
|
||||||
//
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
|
||||||
#include "correctness/testxy.h"
|
|
||||||
|
|
||||||
namespace clblast {
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
// The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison.
|
|
||||||
template <typename T>
|
|
||||||
void XaxpyTest(int argc, char *argv[], const bool silent, const std::string &name) {
|
|
||||||
|
|
||||||
// Creates the CLBlast lambda
|
|
||||||
auto clblast_lambda = [](const Arguments<T> &args,
|
|
||||||
const Buffer &x_vec, const Buffer &y_vec,
|
|
||||||
CommandQueue &queue) -> StatusCode {
|
|
||||||
auto queue_plain = queue();
|
|
||||||
auto event = cl_event{};
|
|
||||||
return Axpy(args.n, args.alpha,
|
|
||||||
x_vec(), args.x_offset, args.x_inc,
|
|
||||||
y_vec(), args.y_offset, args.y_inc,
|
|
||||||
&queue_plain, &event);
|
|
||||||
};
|
|
||||||
|
|
||||||
// Creates the clBLAS lambda (for comparison)
|
|
||||||
auto clblas_lambda = [](const Arguments<T> &args,
|
|
||||||
const Buffer &x_vec, const Buffer &y_vec,
|
|
||||||
CommandQueue &queue) -> StatusCode {
|
|
||||||
auto queue_plain = queue();
|
|
||||||
auto event = cl_event{};
|
|
||||||
auto status = clblasXaxpy(args.n, args.alpha,
|
|
||||||
x_vec(), args.x_offset, args.x_inc,
|
|
||||||
y_vec(), args.y_offset, args.y_inc,
|
|
||||||
1, &queue_plain, 0, nullptr, &event);
|
|
||||||
return static_cast<StatusCode>(status);
|
|
||||||
};
|
|
||||||
|
|
||||||
// Initializes the arguments relevant for this routine
|
|
||||||
auto args = Arguments<T>{};
|
|
||||||
const auto options = std::vector<std::string>{kArgN, kArgXInc, kArgYInc,
|
|
||||||
kArgXOffset, kArgYOffset, kArgAlpha};
|
|
||||||
|
|
||||||
// Creates a tester
|
|
||||||
TestXY<T> tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda};
|
|
||||||
|
|
||||||
// Runs the tests
|
|
||||||
const auto case_name = "default";
|
|
||||||
tester.TestRegular(args, case_name);
|
|
||||||
tester.TestInvalidBufferSizes(args, case_name);
|
|
||||||
}
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
} // namespace clblast
|
|
||||||
|
|
||||||
// Main function (not within the clblast namespace)
|
|
||||||
int main(int argc, char *argv[]) {
|
|
||||||
clblast::XaxpyTest<float>(argc, argv, false, "SAXPY");
|
|
||||||
clblast::XaxpyTest<double>(argc, argv, true, "DAXPY");
|
|
||||||
clblast::XaxpyTest<clblast::float2>(argc, argv, true, "CAXPY");
|
|
||||||
clblast::XaxpyTest<clblast::double2>(argc, argv, true, "ZAXPY");
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
|
@ -1,98 +0,0 @@
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
|
|
||||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
|
||||||
// width of 100 characters per line.
|
|
||||||
//
|
|
||||||
// Author(s):
|
|
||||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
|
||||||
//
|
|
||||||
// This file implements the tests for the Xgemm routine. It is based on the TestABC class.
|
|
||||||
//
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
|
||||||
#include "correctness/testabc.h"
|
|
||||||
|
|
||||||
namespace clblast {
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
// The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison.
|
|
||||||
template <typename T>
|
|
||||||
void XgemmTest(int argc, char *argv[], const bool silent, const std::string &name) {
|
|
||||||
|
|
||||||
// Creates the CLBlast lambda
|
|
||||||
auto clblast_lambda = [](const Arguments<T> &args,
|
|
||||||
const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
|
|
||||||
CommandQueue &queue) -> StatusCode {
|
|
||||||
auto queue_plain = queue();
|
|
||||||
auto event = cl_event{};
|
|
||||||
return Gemm(args.layout, args.a_transpose, args.b_transpose,
|
|
||||||
args.m, args.n, args.k,
|
|
||||||
args.alpha,
|
|
||||||
a_mat(), args.a_offset, args.a_ld,
|
|
||||||
b_mat(), args.b_offset, args.b_ld,
|
|
||||||
args.beta,
|
|
||||||
c_mat(), args.c_offset, args.c_ld,
|
|
||||||
&queue_plain, &event);
|
|
||||||
};
|
|
||||||
|
|
||||||
// Creates the clBLAS lambda (for comparison)
|
|
||||||
auto clblas_lambda = [](const Arguments<T> &args,
|
|
||||||
const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
|
|
||||||
CommandQueue &queue) -> StatusCode {
|
|
||||||
auto queue_plain = queue();
|
|
||||||
auto event = cl_event{};
|
|
||||||
auto status = clblasXgemm(static_cast<clblasOrder>(args.layout),
|
|
||||||
static_cast<clblasTranspose>(args.a_transpose),
|
|
||||||
static_cast<clblasTranspose>(args.b_transpose),
|
|
||||||
args.m, args.n, args.k,
|
|
||||||
args.alpha,
|
|
||||||
a_mat(), args.a_offset, args.a_ld,
|
|
||||||
b_mat(), args.b_offset, args.b_ld,
|
|
||||||
args.beta,
|
|
||||||
c_mat(), args.c_offset, args.c_ld,
|
|
||||||
1, &queue_plain, 0, nullptr, &event);
|
|
||||||
return static_cast<StatusCode>(status);
|
|
||||||
};
|
|
||||||
|
|
||||||
// Initializes the arguments relevant for this routine
|
|
||||||
auto args = Arguments<T>{};
|
|
||||||
const auto options = std::vector<std::string>{kArgM, kArgN, kArgK, kArgLayout,
|
|
||||||
kArgATransp, kArgBTransp,
|
|
||||||
kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
|
|
||||||
kArgAOffset, kArgBOffset, kArgCOffset};
|
|
||||||
|
|
||||||
// Creates a tester
|
|
||||||
TestABC<T> tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda};
|
|
||||||
|
|
||||||
// Loops over the test-cases from a data-layout point of view
|
|
||||||
for (auto &layout: tester.kLayouts) {
|
|
||||||
args.layout = layout;
|
|
||||||
for (auto &a_transpose: tester.kTransposes) {
|
|
||||||
args.a_transpose = a_transpose;
|
|
||||||
for (auto &b_transpose: tester.kTransposes) {
|
|
||||||
args.b_transpose = b_transpose;
|
|
||||||
const auto case_name = ToString(layout)+" "+ToString(a_transpose)+" "+ToString(b_transpose);
|
|
||||||
|
|
||||||
// Runs the tests
|
|
||||||
tester.TestRegular(args, case_name);
|
|
||||||
tester.TestInvalidBufferSizes(args, case_name);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
} // namespace clblast
|
|
||||||
|
|
||||||
// Main function (not within the clblast namespace)
|
|
||||||
int main(int argc, char *argv[]) {
|
|
||||||
clblast::XgemmTest<float>(argc, argv, false, "SGEMM");
|
|
||||||
clblast::XgemmTest<double>(argc, argv, true, "DGEMM");
|
|
||||||
clblast::XgemmTest<clblast::float2>(argc, argv, true, "CGEMM");
|
|
||||||
clblast::XgemmTest<clblast::double2>(argc, argv, true, "ZGEMM");
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
|
@ -1,88 +0,0 @@
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
|
|
||||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
|
||||||
// width of 100 characters per line.
|
|
||||||
//
|
|
||||||
// Author(s):
|
|
||||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
|
||||||
//
|
|
||||||
// This file implements the tests for the Xgemv routine. It is based on the TestAXY class.
|
|
||||||
//
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
|
||||||
#include "correctness/testaxy.h"
|
|
||||||
|
|
||||||
namespace clblast {
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
// The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison.
|
|
||||||
template <typename T>
|
|
||||||
void XgemvTest(int argc, char *argv[], const bool silent, const std::string &name) {
|
|
||||||
|
|
||||||
// Creates the CLBlast lambda
|
|
||||||
auto clblast_lambda = [](const Arguments<T> &args,
|
|
||||||
const Buffer &a_mat, const Buffer &x_vec, const Buffer &y_vec,
|
|
||||||
CommandQueue &queue) -> StatusCode {
|
|
||||||
auto queue_plain = queue();
|
|
||||||
auto event = cl_event{};
|
|
||||||
return Gemv(args.layout, args.a_transpose, args.m, args.n, args.alpha,
|
|
||||||
a_mat(), args.a_offset, args.a_ld,
|
|
||||||
x_vec(), args.x_offset, args.x_inc, args.beta,
|
|
||||||
y_vec(), args.y_offset, args.y_inc,
|
|
||||||
&queue_plain, &event);
|
|
||||||
};
|
|
||||||
|
|
||||||
// Creates the clBLAS lambda (for comparison)
|
|
||||||
auto clblas_lambda = [](const Arguments<T> &args,
|
|
||||||
const Buffer &a_mat, const Buffer &x_vec, const Buffer &y_vec,
|
|
||||||
CommandQueue &queue) -> StatusCode {
|
|
||||||
auto queue_plain = queue();
|
|
||||||
auto event = cl_event{};
|
|
||||||
auto status = clblasXgemv(static_cast<clblasOrder>(args.layout),
|
|
||||||
static_cast<clblasTranspose>(args.a_transpose),
|
|
||||||
args.m, args.n, args.alpha,
|
|
||||||
a_mat(), args.a_offset, args.a_ld,
|
|
||||||
x_vec(), args.x_offset, args.x_inc, args.beta,
|
|
||||||
y_vec(), args.y_offset, args.y_inc,
|
|
||||||
1, &queue_plain, 0, nullptr, &event);
|
|
||||||
return static_cast<StatusCode>(status);
|
|
||||||
};
|
|
||||||
|
|
||||||
// Initializes the arguments relevant for this routine
|
|
||||||
auto args = Arguments<T>{};
|
|
||||||
const auto options = std::vector<std::string>{kArgM, kArgN, kArgLayout, kArgATransp,
|
|
||||||
kArgALeadDim, kArgXInc, kArgYInc,
|
|
||||||
kArgAOffset, kArgXOffset, kArgYOffset};
|
|
||||||
|
|
||||||
// Creates a tester
|
|
||||||
TestAXY<T> tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda};
|
|
||||||
|
|
||||||
// Loops over the test-cases from a data-layout point of view
|
|
||||||
for (auto &layout: tester.kLayouts) {
|
|
||||||
args.layout = layout;
|
|
||||||
for (auto &a_transpose: tester.kTransposes) {
|
|
||||||
args.a_transpose = a_transpose;
|
|
||||||
const auto case_name = ToString(layout)+" "+ToString(a_transpose);
|
|
||||||
|
|
||||||
// Runs the tests
|
|
||||||
tester.TestRegular(args, case_name);
|
|
||||||
tester.TestInvalidBufferSizes(args, case_name);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
} // namespace clblast
|
|
||||||
|
|
||||||
// Main function (not within the clblast namespace)
|
|
||||||
int main(int argc, char *argv[]) {
|
|
||||||
clblast::XgemvTest<float>(argc, argv, false, "SGEMV");
|
|
||||||
clblast::XgemvTest<double>(argc, argv, true, "DGEMV");
|
|
||||||
clblast::XgemvTest<clblast::float2>(argc, argv, true, "CGEMV");
|
|
||||||
clblast::XgemvTest<clblast::double2>(argc, argv, true, "ZGEMV");
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
|
@ -1,98 +0,0 @@
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
|
|
||||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
|
||||||
// width of 100 characters per line.
|
|
||||||
//
|
|
||||||
// Author(s):
|
|
||||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
|
||||||
//
|
|
||||||
// This file implements the tests for the Xsymm routine. It is based on the TestABC class.
|
|
||||||
//
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
#include "wrapper_clblas.h"
|
|
||||||
#include "correctness/testabc.h"
|
|
||||||
|
|
||||||
namespace clblast {
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
// The correctness tester, containing the function calls to CLBlast and to clBLAS for comparison.
|
|
||||||
template <typename T>
|
|
||||||
void XsymmTest(int argc, char *argv[], const bool silent, const std::string &name) {
|
|
||||||
|
|
||||||
// Creates the CLBlast lambda
|
|
||||||
auto clblast_lambda = [](const Arguments<T> &args,
|
|
||||||
const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
|
|
||||||
CommandQueue &queue) -> StatusCode {
|
|
||||||
auto queue_plain = queue();
|
|
||||||
auto event = cl_event{};
|
|
||||||
return Symm(args.layout, args.side, args.triangle,
|
|
||||||
args.m, args.n,
|
|
||||||
args.alpha,
|
|
||||||
a_mat(), args.a_offset, args.a_ld,
|
|
||||||
b_mat(), args.b_offset, args.b_ld,
|
|
||||||
args.beta,
|
|
||||||
c_mat(), args.c_offset, args.c_ld,
|
|
||||||
&queue_plain, &event);
|
|
||||||
};
|
|
||||||
|
|
||||||
// Creates the clBLAS lambda (for comparison)
|
|
||||||
auto clblas_lambda = [](const Arguments<T> &args,
|
|
||||||
const Buffer &a_mat, const Buffer &b_mat, const Buffer &c_mat,
|
|
||||||
CommandQueue &queue) -> StatusCode {
|
|
||||||
auto queue_plain = queue();
|
|
||||||
auto event = cl_event{};
|
|
||||||
auto status = clblasXsymm(static_cast<clblasOrder>(args.layout),
|
|
||||||
static_cast<clblasSide>(args.side),
|
|
||||||
static_cast<clblasUplo>(args.triangle),
|
|
||||||
args.m, args.n,
|
|
||||||
args.alpha,
|
|
||||||
a_mat(), args.a_offset, args.a_ld,
|
|
||||||
b_mat(), args.b_offset, args.b_ld,
|
|
||||||
args.beta,
|
|
||||||
c_mat(), args.c_offset, args.c_ld,
|
|
||||||
1, &queue_plain, 0, nullptr, &event);
|
|
||||||
return static_cast<StatusCode>(status);
|
|
||||||
};
|
|
||||||
|
|
||||||
// Initializes the arguments relevant for this routine
|
|
||||||
auto args = Arguments<T>{};
|
|
||||||
const auto options = std::vector<std::string>{kArgM, kArgN, kArgLayout,
|
|
||||||
kArgSide, kArgTriangle,
|
|
||||||
kArgALeadDim, kArgBLeadDim, kArgCLeadDim,
|
|
||||||
kArgAOffset, kArgBOffset, kArgCOffset};
|
|
||||||
|
|
||||||
// Creates a tester
|
|
||||||
TestABC<T> tester{argc, argv, silent, name, options, clblast_lambda, clblas_lambda};
|
|
||||||
|
|
||||||
// Loops over the test-cases from a data-layout point of view
|
|
||||||
for (auto &layout: tester.kLayouts) {
|
|
||||||
args.layout = layout;
|
|
||||||
for (auto &side: {Side::kLeft, Side::kRight}) {
|
|
||||||
args.side = side;
|
|
||||||
for (auto &triangle: {Triangle::kUpper, Triangle::kLower}) {
|
|
||||||
args.triangle = triangle;
|
|
||||||
const auto case_name = ToString(layout)+" "+ToString(side)+" "+ToString(triangle);
|
|
||||||
|
|
||||||
// Runs the tests
|
|
||||||
tester.TestRegular(args, case_name);
|
|
||||||
tester.TestInvalidBufferSizes(args, case_name);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
} // namespace clblast
|
|
||||||
|
|
||||||
// Main function (not within the clblast namespace)
|
|
||||||
int main(int argc, char *argv[]) {
|
|
||||||
clblast::XsymmTest<float>(argc, argv, false, "SSYMM");
|
|
||||||
clblast::XsymmTest<double>(argc, argv, true, "DSYMM");
|
|
||||||
clblast::XsymmTest<clblast::float2>(argc, argv, true, "CSYMM");
|
|
||||||
clblast::XsymmTest<clblast::double2>(argc, argv, true, "ZSYMM");
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
|
@ -1,217 +0,0 @@
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
|
|
||||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
|
||||||
// width of 100 characters per line.
|
|
||||||
//
|
|
||||||
// Author(s):
|
|
||||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
|
||||||
//
|
|
||||||
// This file implements the TestABC class (see the header for information about the class).
|
|
||||||
//
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
#include <algorithm>
|
|
||||||
|
|
||||||
#include "correctness/testabc.h"
|
|
||||||
|
|
||||||
namespace clblast {
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
// Constructor, initializes the base class tester and input data
|
|
||||||
template <typename T>
|
|
||||||
TestABC<T>::TestABC(int argc, char *argv[], const bool silent,
|
|
||||||
const std::string &name, const std::vector<std::string> &options,
|
|
||||||
const Routine clblast_lambda, const Routine clblas_lambda):
|
|
||||||
Tester<T>{argc, argv, silent, name, options},
|
|
||||||
clblast_lambda_(clblast_lambda),
|
|
||||||
clblas_lambda_(clblas_lambda) {
|
|
||||||
|
|
||||||
// Computes the maximum sizes. This allows for a single set of input/output buffers.
|
|
||||||
auto max_dim = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
|
|
||||||
auto max_ld = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
|
|
||||||
auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
|
|
||||||
|
|
||||||
// Creates test input data
|
|
||||||
a_source_.resize(max_dim*max_ld + max_offset);
|
|
||||||
b_source_.resize(max_dim*max_ld + max_offset);
|
|
||||||
c_source_.resize(max_dim*max_ld + max_offset);
|
|
||||||
PopulateVector(a_source_);
|
|
||||||
PopulateVector(b_source_);
|
|
||||||
PopulateVector(c_source_);
|
|
||||||
}
|
|
||||||
|
|
||||||
// ===============================================================================================
|
|
||||||
|
|
||||||
// Tests the routine for a wide variety of parameters
|
|
||||||
template <typename T>
|
|
||||||
void TestABC<T>::TestRegular(Arguments<T> &args, const std::string &name) {
|
|
||||||
if (!PrecisionSupported()) { return; }
|
|
||||||
TestStart("regular behaviour", name);
|
|
||||||
|
|
||||||
// Computes whether or not the matrices are transposed. Note that we assume a default of
|
|
||||||
// column-major and no-transpose. If one of them is different (but not both), then rotated
|
|
||||||
// is considered true.
|
|
||||||
auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
|
|
||||||
(args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
|
|
||||||
auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose != Transpose::kNo) ||
|
|
||||||
(args.layout == Layout::kRowMajor && args.b_transpose == Transpose::kNo);
|
|
||||||
auto c_rotated = (args.layout == Layout::kRowMajor);
|
|
||||||
|
|
||||||
// Iterates over the matrix dimensions
|
|
||||||
for (auto &m: kMatrixDims) {
|
|
||||||
args.m = m;
|
|
||||||
for (auto &n: kMatrixDims) {
|
|
||||||
args.n = n;
|
|
||||||
for (auto &k: kMatrixDims) {
|
|
||||||
args.k = k;
|
|
||||||
|
|
||||||
// Computes the second dimensions of the matrices taking the rotation into account
|
|
||||||
auto a_two = (a_rotated) ? m : k;
|
|
||||||
auto b_two = (b_rotated) ? k : n;
|
|
||||||
auto c_two = (c_rotated) ? m : n;
|
|
||||||
|
|
||||||
// Iterates over the leading-dimension values and the offsets
|
|
||||||
for (auto &a_ld: kMatrixDims) {
|
|
||||||
args.a_ld = a_ld;
|
|
||||||
for (auto &a_offset: kOffsets) {
|
|
||||||
args.a_offset = a_offset;
|
|
||||||
for (auto &b_ld: kMatrixDims) {
|
|
||||||
args.b_ld = b_ld;
|
|
||||||
for (auto &b_offset: kOffsets) {
|
|
||||||
args.b_offset = b_offset;
|
|
||||||
for (auto &c_ld: kMatrixDims) {
|
|
||||||
args.c_ld = c_ld;
|
|
||||||
for (auto &c_offset: kOffsets) {
|
|
||||||
args.c_offset = c_offset;
|
|
||||||
|
|
||||||
// Computes the buffer sizes
|
|
||||||
auto a_size = a_two * a_ld + a_offset;
|
|
||||||
auto b_size = b_two * b_ld + b_offset;
|
|
||||||
auto c_size = c_two * c_ld + c_offset;
|
|
||||||
if (a_size < 1 || b_size < 1 || c_size < 1) { continue; }
|
|
||||||
|
|
||||||
// Creates the OpenCL buffers
|
|
||||||
auto a_mat = Buffer(context_, CL_MEM_READ_WRITE, a_size*sizeof(T));
|
|
||||||
auto b_mat = Buffer(context_, CL_MEM_READ_WRITE, b_size*sizeof(T));
|
|
||||||
auto r_mat = Buffer(context_, CL_MEM_READ_WRITE, c_size*sizeof(T));
|
|
||||||
auto s_mat = Buffer(context_, CL_MEM_READ_WRITE, c_size*sizeof(T));
|
|
||||||
|
|
||||||
// Iterates over the values for alpha and beta
|
|
||||||
for (auto &alpha: kAlphaValues) {
|
|
||||||
args.alpha = alpha;
|
|
||||||
for (auto &beta: kBetaValues) {
|
|
||||||
args.beta = beta;
|
|
||||||
|
|
||||||
// Runs the reference clBLAS code
|
|
||||||
a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
|
|
||||||
b_mat.WriteBuffer(queue_, b_size*sizeof(T), b_source_);
|
|
||||||
r_mat.WriteBuffer(queue_, c_size*sizeof(T), c_source_);
|
|
||||||
auto status1 = clblas_lambda_(args, a_mat, b_mat, r_mat, queue_);
|
|
||||||
|
|
||||||
// Runs the CLBlast code
|
|
||||||
a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
|
|
||||||
b_mat.WriteBuffer(queue_, b_size*sizeof(T), b_source_);
|
|
||||||
s_mat.WriteBuffer(queue_, c_size*sizeof(T), c_source_);
|
|
||||||
auto status2 = clblast_lambda_(args, a_mat, b_mat, s_mat, queue_);
|
|
||||||
|
|
||||||
// Tests for equality of the two status codes
|
|
||||||
if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
|
|
||||||
TestErrorCodes(status1, status2, args);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Downloads the results
|
|
||||||
std::vector<T> r_result(c_size, static_cast<T>(0));
|
|
||||||
std::vector<T> s_result(c_size, static_cast<T>(0));
|
|
||||||
r_mat.ReadBuffer(queue_, c_size*sizeof(T), r_result);
|
|
||||||
s_mat.ReadBuffer(queue_, c_size*sizeof(T), s_result);
|
|
||||||
|
|
||||||
// Checks for differences in the output
|
|
||||||
auto errors = size_t{0};
|
|
||||||
for (auto idm=size_t{0}; idm<m; ++idm) {
|
|
||||||
for (auto idn=size_t{0}; idn<n; ++idn) {
|
|
||||||
auto index = (args.layout == Layout::kRowMajor) ?
|
|
||||||
idm*args.c_ld + idn + args.c_offset:
|
|
||||||
idn*args.c_ld + idm + args.c_offset;
|
|
||||||
if (!TestSimilarity(r_result[index], s_result[index])) {
|
|
||||||
errors++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Tests the error count (should be zero)
|
|
||||||
TestErrorCount(errors, m*n, args);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
TestEnd();
|
|
||||||
}
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
// Tests the routine for cases with invalid OpenCL memory buffer sizes. Tests only on return-types,
|
|
||||||
// does not test for results (if any).
|
|
||||||
template <typename T>
|
|
||||||
void TestABC<T>::TestInvalidBufferSizes(Arguments<T> &args, const std::string &name) {
|
|
||||||
if (!PrecisionSupported()) { return; }
|
|
||||||
TestStart("invalid buffer sizes", name);
|
|
||||||
|
|
||||||
// Sets example test parameters
|
|
||||||
args.m = kBufferSize;
|
|
||||||
args.n = kBufferSize;
|
|
||||||
args.k = kBufferSize;
|
|
||||||
args.a_ld = kBufferSize;
|
|
||||||
args.b_ld = kBufferSize;
|
|
||||||
args.c_ld = kBufferSize;
|
|
||||||
args.a_offset = 0;
|
|
||||||
args.b_offset = 0;
|
|
||||||
args.c_offset = 0;
|
|
||||||
|
|
||||||
// Iterates over test buffer sizes
|
|
||||||
const std::vector<size_t> kBufferSizes = {0, kBufferSize*kBufferSize-1, kBufferSize*kBufferSize};
|
|
||||||
for (auto &a_size: kBufferSizes) {
|
|
||||||
for (auto &b_size: kBufferSizes) {
|
|
||||||
for (auto &c_size: kBufferSizes) {
|
|
||||||
|
|
||||||
// Creates the OpenCL buffers. Note: we are not using the C++ version since we explicitly
|
|
||||||
// want to be able to create invalid buffers (no error checking here).
|
|
||||||
auto a = clCreateBuffer(context_(), CL_MEM_READ_WRITE, a_size*sizeof(T), nullptr, nullptr);
|
|
||||||
auto a_mat = Buffer(a);
|
|
||||||
auto b = clCreateBuffer(context_(), CL_MEM_READ_WRITE, b_size*sizeof(T), nullptr, nullptr);
|
|
||||||
auto b_mat = Buffer(b);
|
|
||||||
auto r = clCreateBuffer(context_(), CL_MEM_READ_WRITE, c_size*sizeof(T), nullptr, nullptr);
|
|
||||||
auto r_mat = Buffer(r);
|
|
||||||
auto s = clCreateBuffer(context_(), CL_MEM_READ_WRITE, c_size*sizeof(T), nullptr, nullptr);
|
|
||||||
auto s_mat = Buffer(s);
|
|
||||||
|
|
||||||
// Runs the two routines
|
|
||||||
auto status1 = clblas_lambda_(args, a_mat, b_mat, r_mat, queue_);
|
|
||||||
auto status2 = clblast_lambda_(args, a_mat, b_mat, s_mat, queue_);
|
|
||||||
|
|
||||||
// Tests for equality of the two status codes
|
|
||||||
TestErrorCodes(status1, status2, args);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
TestEnd();
|
|
||||||
}
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
// Compiles the templated class
|
|
||||||
template class TestABC<float>;
|
|
||||||
template class TestABC<double>;
|
|
||||||
template class TestABC<float2>;
|
|
||||||
template class TestABC<double2>;
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
} // namespace clblast
|
|
|
@ -1,86 +0,0 @@
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
|
|
||||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
|
||||||
// width of 100 characters per line.
|
|
||||||
//
|
|
||||||
// Author(s):
|
|
||||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
|
||||||
//
|
|
||||||
// This file tests any mat-mat-mat (A,B,C) routine. It contains two types of tests: one testing
|
|
||||||
// all sorts of input combinations, and one deliberately testing with invalid values.
|
|
||||||
//
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
#ifndef CLBLAST_TEST_CORRECTNESS_TESTABC_H_
|
|
||||||
#define CLBLAST_TEST_CORRECTNESS_TESTABC_H_
|
|
||||||
|
|
||||||
#include <vector>
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
#include "correctness/tester.h"
|
|
||||||
|
|
||||||
namespace clblast {
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
// See comment at top of file for a description of the class
|
|
||||||
template <typename T>
|
|
||||||
class TestABC: public Tester<T> {
|
|
||||||
public:
|
|
||||||
|
|
||||||
// Uses several variables from the Tester class
|
|
||||||
using Tester<T>::context_;
|
|
||||||
using Tester<T>::queue_;
|
|
||||||
using Tester<T>::kLayouts;
|
|
||||||
using Tester<T>::kTransposes;
|
|
||||||
|
|
||||||
// Uses several helper functions from the Tester class
|
|
||||||
using Tester<T>::TestStart;
|
|
||||||
using Tester<T>::TestEnd;
|
|
||||||
using Tester<T>::TestSimilarity;
|
|
||||||
using Tester<T>::TestErrorCount;
|
|
||||||
using Tester<T>::TestErrorCodes;
|
|
||||||
using Tester<T>::GetExampleScalars;
|
|
||||||
using Tester<T>::GetOffsets;
|
|
||||||
using Tester<T>::PrecisionSupported;
|
|
||||||
|
|
||||||
// Test settings for the regular test. Append to this list in case more tests are required.
|
|
||||||
const std::vector<size_t> kMatrixDims = { 7, 64 };
|
|
||||||
const std::vector<size_t> kOffsets = GetOffsets();
|
|
||||||
const std::vector<T> kAlphaValues = GetExampleScalars();
|
|
||||||
const std::vector<T> kBetaValues = GetExampleScalars();
|
|
||||||
|
|
||||||
// Test settings for the invalid test
|
|
||||||
const size_t kBufferSize = 64;
|
|
||||||
|
|
||||||
// Shorthand for a BLAS routine
|
|
||||||
using Routine = std::function<StatusCode(const Arguments<T>&,
|
|
||||||
const Buffer&, const Buffer&, const Buffer&,
|
|
||||||
CommandQueue&)>;
|
|
||||||
|
|
||||||
// Constructor, initializes the base class tester and input data
|
|
||||||
TestABC(int argc, char *argv[], const bool silent,
|
|
||||||
const std::string &name, const std::vector<std::string> &options,
|
|
||||||
const Routine clblast_lambda, const Routine clblas_lambda);
|
|
||||||
|
|
||||||
// The test functions, taking no inputs
|
|
||||||
void TestRegular(Arguments<T> &args, const std::string &name);
|
|
||||||
void TestInvalidBufferSizes(Arguments<T> &args, const std::string &name);
|
|
||||||
|
|
||||||
private:
|
|
||||||
|
|
||||||
// Source data to test with
|
|
||||||
std::vector<T> a_source_;
|
|
||||||
std::vector<T> b_source_;
|
|
||||||
std::vector<T> c_source_;
|
|
||||||
|
|
||||||
// The routines to test
|
|
||||||
Routine clblast_lambda_;
|
|
||||||
Routine clblas_lambda_;
|
|
||||||
};
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
} // namespace clblast
|
|
||||||
|
|
||||||
// CLBLAST_TEST_CORRECTNESS_TESTABC_H_
|
|
||||||
#endif
|
|
|
@ -1,213 +0,0 @@
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
|
|
||||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
|
||||||
// width of 100 characters per line.
|
|
||||||
//
|
|
||||||
// Author(s):
|
|
||||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
|
||||||
//
|
|
||||||
// This file implements the TestAXY class (see the header for information about the class).
|
|
||||||
//
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
#include <algorithm>
|
|
||||||
|
|
||||||
#include "correctness/testaxy.h"
|
|
||||||
|
|
||||||
namespace clblast {
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
// Constructor, initializes the base class tester and input data
|
|
||||||
template <typename T>
|
|
||||||
TestAXY<T>::TestAXY(int argc, char *argv[], const bool silent,
|
|
||||||
const std::string &name, const std::vector<std::string> &options,
|
|
||||||
const Routine clblast_lambda, const Routine clblas_lambda):
|
|
||||||
Tester<T>{argc, argv, silent, name, options},
|
|
||||||
clblast_lambda_(clblast_lambda),
|
|
||||||
clblas_lambda_(clblas_lambda) {
|
|
||||||
|
|
||||||
// Computes the maximum sizes. This allows for a single set of input/output buffers.
|
|
||||||
auto max_dim = *std::max_element(kMatrixVectorDims.begin(), kMatrixVectorDims.end());
|
|
||||||
auto max_ld = *std::max_element(kMatrixVectorDims.begin(), kMatrixVectorDims.end());
|
|
||||||
auto max_inc = *std::max_element(kIncrements.begin(), kIncrements.end());
|
|
||||||
auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
|
|
||||||
|
|
||||||
// Creates test input data
|
|
||||||
a_source_.resize(max_dim*max_ld + max_offset);
|
|
||||||
x_source_.resize(max_dim*max_inc + max_offset);
|
|
||||||
y_source_.resize(max_dim*max_inc + max_offset);
|
|
||||||
PopulateVector(a_source_);
|
|
||||||
PopulateVector(x_source_);
|
|
||||||
PopulateVector(y_source_);
|
|
||||||
}
|
|
||||||
|
|
||||||
// ===============================================================================================
|
|
||||||
|
|
||||||
// Tests the routine for a wide variety of parameters
|
|
||||||
template <typename T>
|
|
||||||
void TestAXY<T>::TestRegular(Arguments<T> &args, const std::string &name) {
|
|
||||||
if (!PrecisionSupported()) { return; }
|
|
||||||
TestStart("regular behaviour", name);
|
|
||||||
|
|
||||||
// Iterates over the dimension for the matrix and vectors
|
|
||||||
for (auto &m: kMatrixVectorDims) {
|
|
||||||
args.m = m;
|
|
||||||
for (auto &n: kMatrixVectorDims) {
|
|
||||||
args.n = n;
|
|
||||||
|
|
||||||
// Computes the second dimension of the matrix taking the rotation into account
|
|
||||||
auto a_two = (args.layout == Layout::kRowMajor) ? args.m : args.n;
|
|
||||||
|
|
||||||
// Computes the vector sizes in case the matrix is transposed
|
|
||||||
auto a_transposed = (args.a_transpose == Transpose::kYes);
|
|
||||||
auto m_real = (a_transposed) ? n : m;
|
|
||||||
auto n_real = (a_transposed) ? m : n;
|
|
||||||
|
|
||||||
// Iterates over the leading-dimension values and the offsets of the matrix
|
|
||||||
for (auto &a_ld: kMatrixVectorDims) {
|
|
||||||
args.a_ld = a_ld;
|
|
||||||
for (auto &a_offset: kOffsets) {
|
|
||||||
args.a_offset = a_offset;
|
|
||||||
|
|
||||||
// Iterates over the increment-values and the offsets of the vectors
|
|
||||||
for (auto &x_inc: kIncrements) {
|
|
||||||
args.x_inc = x_inc;
|
|
||||||
for (auto &x_offset: kOffsets) {
|
|
||||||
args.x_offset = x_offset;
|
|
||||||
for (auto &y_inc: kIncrements) {
|
|
||||||
args.y_inc = y_inc;
|
|
||||||
for (auto &y_offset: kOffsets) {
|
|
||||||
args.y_offset = y_offset;
|
|
||||||
|
|
||||||
// Computes the buffer sizes
|
|
||||||
auto a_size = a_two * a_ld + a_offset;
|
|
||||||
auto x_size = n_real * x_inc + x_offset;
|
|
||||||
auto y_size = m_real * y_inc + y_offset;
|
|
||||||
if (a_size < 1 || x_size < 1 || y_size < 1) { continue; }
|
|
||||||
|
|
||||||
// Creates the OpenCL buffers
|
|
||||||
auto a_mat = Buffer(context_, CL_MEM_READ_WRITE, a_size*sizeof(T));
|
|
||||||
auto x_vec = Buffer(context_, CL_MEM_READ_WRITE, x_size*sizeof(T));
|
|
||||||
auto r_vec = Buffer(context_, CL_MEM_READ_WRITE, y_size*sizeof(T));
|
|
||||||
auto s_vec = Buffer(context_, CL_MEM_READ_WRITE, y_size*sizeof(T));
|
|
||||||
|
|
||||||
// Iterates over the values for alpha and beta
|
|
||||||
for (auto &alpha: kAlphaValues) {
|
|
||||||
args.alpha = alpha;
|
|
||||||
for (auto &beta: kBetaValues) {
|
|
||||||
args.beta = beta;
|
|
||||||
|
|
||||||
// Runs the reference clBLAS code
|
|
||||||
a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
|
|
||||||
x_vec.WriteBuffer(queue_, x_size*sizeof(T), x_source_);
|
|
||||||
r_vec.WriteBuffer(queue_, y_size*sizeof(T), y_source_);
|
|
||||||
auto status1 = clblas_lambda_(args, a_mat, x_vec, r_vec, queue_);
|
|
||||||
|
|
||||||
// Runs the CLBlast code
|
|
||||||
a_mat.WriteBuffer(queue_, a_size*sizeof(T), a_source_);
|
|
||||||
x_vec.WriteBuffer(queue_, x_size*sizeof(T), x_source_);
|
|
||||||
s_vec.WriteBuffer(queue_, y_size*sizeof(T), y_source_);
|
|
||||||
auto status2 = clblast_lambda_(args, a_mat, x_vec, s_vec, queue_);
|
|
||||||
|
|
||||||
// Tests for equality of the two status codes
|
|
||||||
if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
|
|
||||||
TestErrorCodes(status1, status2, args);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Downloads the results
|
|
||||||
std::vector<T> r_result(y_size, static_cast<T>(0));
|
|
||||||
std::vector<T> s_result(y_size, static_cast<T>(0));
|
|
||||||
r_vec.ReadBuffer(queue_, y_size*sizeof(T), r_result);
|
|
||||||
s_vec.ReadBuffer(queue_, y_size*sizeof(T), s_result);
|
|
||||||
|
|
||||||
// Checks for differences in the output
|
|
||||||
auto errors = size_t{0};
|
|
||||||
for (auto idm=size_t{0}; idm<m_real; ++idm) {
|
|
||||||
auto index = idm*y_inc + y_offset;
|
|
||||||
if (!TestSimilarity(r_result[index], s_result[index])) {
|
|
||||||
errors++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Tests the error count (should be zero)
|
|
||||||
TestErrorCount(errors, m_real, args);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
TestEnd();
|
|
||||||
}
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
// Tests the routine for cases with invalid OpenCL memory buffer sizes. Tests only on return-types,
|
|
||||||
// does not test for results (if any).
|
|
||||||
template <typename T>
|
|
||||||
void TestAXY<T>::TestInvalidBufferSizes(Arguments<T> &args, const std::string &name) {
|
|
||||||
if (!PrecisionSupported()) { return; }
|
|
||||||
TestStart("invalid buffer sizes", name);
|
|
||||||
|
|
||||||
// Sets example test parameters
|
|
||||||
args.m = kBufferSize;
|
|
||||||
args.n = kBufferSize;
|
|
||||||
args.a_ld = kBufferSize;
|
|
||||||
args.a_offset = 0;
|
|
||||||
args.x_offset = 0;
|
|
||||||
args.y_offset = 0;
|
|
||||||
|
|
||||||
// Iterates over test buffer sizes
|
|
||||||
const std::vector<size_t> kMatrixSizes = {0, kBufferSize*kBufferSize-1, kBufferSize*kBufferSize};
|
|
||||||
const std::vector<size_t> kVectorSizes = {0, kBufferSize - 1, kBufferSize};
|
|
||||||
for (auto &a_size: kMatrixSizes) {
|
|
||||||
for (auto &x_size: kVectorSizes) {
|
|
||||||
for (auto &y_size: kVectorSizes) {
|
|
||||||
|
|
||||||
// Iterates over test increments
|
|
||||||
for (auto &x_inc: kInvalidIncrements) {
|
|
||||||
args.x_inc = x_inc;
|
|
||||||
for (auto &y_inc: kInvalidIncrements) {
|
|
||||||
args.y_inc = y_inc;
|
|
||||||
|
|
||||||
// Creates the OpenCL buffers. Note: we are not using the C++ version since we
|
|
||||||
// explicitly want to be able to create invalid buffers (no error checking here).
|
|
||||||
auto a = clCreateBuffer(context_(), CL_MEM_READ_WRITE, a_size*sizeof(T), nullptr, nullptr);
|
|
||||||
auto a_mat = Buffer(a);
|
|
||||||
auto x = clCreateBuffer(context_(), CL_MEM_READ_WRITE, x_size*sizeof(T), nullptr, nullptr);
|
|
||||||
auto x_vec = Buffer(x);
|
|
||||||
auto r = clCreateBuffer(context_(), CL_MEM_READ_WRITE, y_size*sizeof(T), nullptr, nullptr);
|
|
||||||
auto r_vec = Buffer(r);
|
|
||||||
auto s = clCreateBuffer(context_(), CL_MEM_READ_WRITE, y_size*sizeof(T), nullptr, nullptr);
|
|
||||||
auto s_vec = Buffer(s);
|
|
||||||
|
|
||||||
// Runs the two routines
|
|
||||||
auto status1 = clblas_lambda_(args, a_mat, x_vec, r_vec, queue_);
|
|
||||||
auto status2 = clblast_lambda_(args, a_mat, x_vec, s_vec, queue_);
|
|
||||||
|
|
||||||
// Tests for equality of the two status codes
|
|
||||||
TestErrorCodes(status1, status2, args);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
TestEnd();
|
|
||||||
}
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
// Compiles the templated class
|
|
||||||
template class TestAXY<float>;
|
|
||||||
template class TestAXY<double>;
|
|
||||||
template class TestAXY<float2>;
|
|
||||||
template class TestAXY<double2>;
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
} // namespace clblast
|
|
|
@ -1,88 +0,0 @@
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
|
|
||||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
|
||||||
// width of 100 characters per line.
|
|
||||||
//
|
|
||||||
// Author(s):
|
|
||||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
|
||||||
//
|
|
||||||
// This file tests any mat-vec-vec (A,X,Y) routine. It contains two types of tests: one testing
|
|
||||||
// all sorts of input combinations, and one deliberately testing with invalid values.
|
|
||||||
//
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
#ifndef CLBLAST_TEST_CORRECTNESS_TESTAXY_H_
|
|
||||||
#define CLBLAST_TEST_CORRECTNESS_TESTAXY_H_
|
|
||||||
|
|
||||||
#include <vector>
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
#include "correctness/tester.h"
|
|
||||||
|
|
||||||
namespace clblast {
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
// See comment at top of file for a description of the class
|
|
||||||
template <typename T>
|
|
||||||
class TestAXY: public Tester<T> {
|
|
||||||
public:
|
|
||||||
|
|
||||||
// Uses several variables from the Tester class
|
|
||||||
using Tester<T>::context_;
|
|
||||||
using Tester<T>::queue_;
|
|
||||||
using Tester<T>::kLayouts;
|
|
||||||
using Tester<T>::kTransposes;
|
|
||||||
|
|
||||||
// Uses several helper functions from the Tester class
|
|
||||||
using Tester<T>::TestStart;
|
|
||||||
using Tester<T>::TestEnd;
|
|
||||||
using Tester<T>::TestSimilarity;
|
|
||||||
using Tester<T>::TestErrorCount;
|
|
||||||
using Tester<T>::TestErrorCodes;
|
|
||||||
using Tester<T>::GetExampleScalars;
|
|
||||||
using Tester<T>::GetOffsets;
|
|
||||||
using Tester<T>::PrecisionSupported;
|
|
||||||
|
|
||||||
// Test settings for the regular test. Append to this list in case more tests are required.
|
|
||||||
const std::vector<size_t> kMatrixVectorDims = { 61, 512 };
|
|
||||||
const std::vector<size_t> kOffsets = GetOffsets();
|
|
||||||
const std::vector<size_t> kIncrements = { 1, 2 };
|
|
||||||
const std::vector<T> kAlphaValues = GetExampleScalars();
|
|
||||||
const std::vector<T> kBetaValues = GetExampleScalars();
|
|
||||||
|
|
||||||
// Test settings for the invalid test
|
|
||||||
const std::vector<size_t> kInvalidIncrements = { 0, 1 };
|
|
||||||
const size_t kBufferSize = 64;
|
|
||||||
|
|
||||||
// Shorthand for a BLAS routine
|
|
||||||
using Routine = std::function<StatusCode(const Arguments<T>&,
|
|
||||||
const Buffer&, const Buffer&, const Buffer&,
|
|
||||||
CommandQueue&)>;
|
|
||||||
|
|
||||||
// Constructor, initializes the base class tester and input data
|
|
||||||
TestAXY(int argc, char *argv[], const bool silent,
|
|
||||||
const std::string &name, const std::vector<std::string> &options,
|
|
||||||
const Routine clblast_lambda, const Routine clblas_lambda);
|
|
||||||
|
|
||||||
// The test functions, taking the routine arguments and the name of the tested configuration
|
|
||||||
void TestRegular(Arguments<T> &args, const std::string &name);
|
|
||||||
void TestInvalidBufferSizes(Arguments<T> &args, const std::string &name);
|
|
||||||
|
|
||||||
private:
|
|
||||||
|
|
||||||
// Source data to test with
|
|
||||||
std::vector<T> a_source_;
|
|
||||||
std::vector<T> x_source_;
|
|
||||||
std::vector<T> y_source_;
|
|
||||||
|
|
||||||
// The routines to test
|
|
||||||
Routine clblast_lambda_;
|
|
||||||
Routine clblas_lambda_;
|
|
||||||
};
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
} // namespace clblast
|
|
||||||
|
|
||||||
// CLBLAST_TEST_CORRECTNESS_TESTAXY_H_
|
|
||||||
#endif
|
|
189
test/correctness/testblas.cc
Normal file
189
test/correctness/testblas.cc
Normal file
|
@ -0,0 +1,189 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the TestBlas class (see the header for information about the class).
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
|
||||||
|
#include "correctness/testblas.h"
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// The transpose-options to test with (data-type dependent)
|
||||||
|
template <> const std::vector<Transpose> TestBlas<float,float>::kTransposes = {Transpose::kNo, Transpose::kYes};
|
||||||
|
template <> const std::vector<Transpose> TestBlas<double,double>::kTransposes = {Transpose::kNo, Transpose::kYes};
|
||||||
|
template <> const std::vector<Transpose> TestBlas<float2,float2>::kTransposes = {Transpose::kNo, Transpose::kYes, Transpose::kConjugate};
|
||||||
|
template <> const std::vector<Transpose> TestBlas<double2,double2>::kTransposes = {Transpose::kNo, Transpose::kYes, Transpose::kConjugate};
|
||||||
|
template <> const std::vector<Transpose> TestBlas<float2,float>::kTransposes = {Transpose::kNo, Transpose::kConjugate};
|
||||||
|
template <> const std::vector<Transpose> TestBlas<double2,double>::kTransposes = {Transpose::kNo, Transpose::kConjugate};
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Constructor, initializes the base class tester and input data
|
||||||
|
template <typename T, typename U>
|
||||||
|
TestBlas<T,U>::TestBlas(int argc, char *argv[], const bool silent,
|
||||||
|
const std::string &name, const std::vector<std::string> &options,
|
||||||
|
const Routine run_routine, const Routine run_reference,
|
||||||
|
const ResultGet get_result, const ResultIndex get_index,
|
||||||
|
const ResultIterator get_id1, const ResultIterator get_id2):
|
||||||
|
Tester<T,U>{argc, argv, silent, name, options},
|
||||||
|
run_routine_(run_routine),
|
||||||
|
run_reference_(run_reference),
|
||||||
|
get_result_(get_result),
|
||||||
|
get_index_(get_index),
|
||||||
|
get_id1_(get_id1),
|
||||||
|
get_id2_(get_id2) {
|
||||||
|
|
||||||
|
// Computes the maximum sizes. This allows for a single set of input/output buffers.
|
||||||
|
auto max_vec = *std::max_element(kVectorDims.begin(), kVectorDims.end());
|
||||||
|
auto max_inc = *std::max_element(kIncrements.begin(), kIncrements.end());
|
||||||
|
auto max_mat = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
|
||||||
|
auto max_ld = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
|
||||||
|
auto max_matvec = *std::max_element(kMatrixVectorDims.begin(), kMatrixVectorDims.end());
|
||||||
|
auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
|
||||||
|
|
||||||
|
// Creates test input data
|
||||||
|
x_source_.resize(std::max(max_vec, max_matvec)*max_inc + max_offset);
|
||||||
|
y_source_.resize(std::max(max_vec, max_matvec)*max_inc + max_offset);
|
||||||
|
a_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
|
||||||
|
b_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
|
||||||
|
c_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
|
||||||
|
PopulateVector(x_source_);
|
||||||
|
PopulateVector(y_source_);
|
||||||
|
PopulateVector(a_source_);
|
||||||
|
PopulateVector(b_source_);
|
||||||
|
PopulateVector(c_source_);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ===============================================================================================
|
||||||
|
|
||||||
|
// Tests the routine for a wide variety of parameters
|
||||||
|
template <typename T, typename U>
|
||||||
|
void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const std::string &name) {
|
||||||
|
if (!PrecisionSupported<T>(device_)) { return; }
|
||||||
|
TestStart("regular behaviour", name);
|
||||||
|
|
||||||
|
// Iterates over all the to-be-tested combinations of arguments
|
||||||
|
for (auto &args: test_vector) {
|
||||||
|
|
||||||
|
// Runs the reference clBLAS code
|
||||||
|
auto x_vec1 = Buffer(context_, CL_MEM_READ_WRITE, args.x_size*sizeof(T));
|
||||||
|
auto y_vec1 = Buffer(context_, CL_MEM_READ_WRITE, args.y_size*sizeof(T));
|
||||||
|
auto a_mat1 = Buffer(context_, CL_MEM_READ_WRITE, args.a_size*sizeof(T));
|
||||||
|
auto b_mat1 = Buffer(context_, CL_MEM_READ_WRITE, args.b_size*sizeof(T));
|
||||||
|
auto c_mat1 = Buffer(context_, CL_MEM_READ_WRITE, args.c_size*sizeof(T));
|
||||||
|
x_vec1.WriteBuffer(queue_, args.x_size*sizeof(T), x_source_);
|
||||||
|
y_vec1.WriteBuffer(queue_, args.y_size*sizeof(T), y_source_);
|
||||||
|
a_mat1.WriteBuffer(queue_, args.a_size*sizeof(T), a_source_);
|
||||||
|
b_mat1.WriteBuffer(queue_, args.b_size*sizeof(T), b_source_);
|
||||||
|
c_mat1.WriteBuffer(queue_, args.c_size*sizeof(T), c_source_);
|
||||||
|
auto buffers1 = Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1};
|
||||||
|
auto status1 = run_reference_(args, buffers1, queue_);
|
||||||
|
|
||||||
|
// Runs the CLBlast code
|
||||||
|
auto x_vec2 = Buffer(context_, CL_MEM_READ_WRITE, args.x_size*sizeof(T));
|
||||||
|
auto y_vec2 = Buffer(context_, CL_MEM_READ_WRITE, args.y_size*sizeof(T));
|
||||||
|
auto a_mat2 = Buffer(context_, CL_MEM_READ_WRITE, args.a_size*sizeof(T));
|
||||||
|
auto b_mat2 = Buffer(context_, CL_MEM_READ_WRITE, args.b_size*sizeof(T));
|
||||||
|
auto c_mat2 = Buffer(context_, CL_MEM_READ_WRITE, args.c_size*sizeof(T));
|
||||||
|
x_vec2.WriteBuffer(queue_, args.x_size*sizeof(T), x_source_);
|
||||||
|
y_vec2.WriteBuffer(queue_, args.y_size*sizeof(T), y_source_);
|
||||||
|
a_mat2.WriteBuffer(queue_, args.a_size*sizeof(T), a_source_);
|
||||||
|
b_mat2.WriteBuffer(queue_, args.b_size*sizeof(T), b_source_);
|
||||||
|
c_mat2.WriteBuffer(queue_, args.c_size*sizeof(T), c_source_);
|
||||||
|
auto buffers2 = Buffers{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2};
|
||||||
|
auto status2 = run_routine_(args, buffers2, queue_);
|
||||||
|
|
||||||
|
// Tests for equality of the two status codes
|
||||||
|
if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
|
||||||
|
TestErrorCodes(status1, status2, args);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Downloads the results
|
||||||
|
auto result1 = get_result_(args, buffers1, queue_);
|
||||||
|
auto result2 = get_result_(args, buffers2, queue_);
|
||||||
|
|
||||||
|
// Checks for differences in the output
|
||||||
|
auto errors = size_t{0};
|
||||||
|
for (auto id1=size_t{0}; id1<get_id1_(args); ++id1) {
|
||||||
|
for (auto id2=size_t{0}; id2<get_id2_(args); ++id2) {
|
||||||
|
auto index = get_index_(args, id1, id2);
|
||||||
|
if (!TestSimilarity(result1[index], result2[index])) {
|
||||||
|
errors++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Tests the error count (should be zero)
|
||||||
|
TestErrorCount(errors, get_id1_(args)*get_id2_(args), args);
|
||||||
|
}
|
||||||
|
TestEnd();
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Tests the routine for cases with invalid OpenCL memory buffer sizes. Tests only on return-types,
|
||||||
|
// does not test for results (if any).
|
||||||
|
template <typename T, typename U>
|
||||||
|
void TestBlas<T,U>::TestInvalid(std::vector<Arguments<U>> &test_vector, const std::string &name) {
|
||||||
|
if (!PrecisionSupported<T>(device_)) { return; }
|
||||||
|
TestStart("invalid buffer sizes", name);
|
||||||
|
|
||||||
|
// Iterates over all the to-be-tested combinations of arguments
|
||||||
|
for (auto &args: test_vector) {
|
||||||
|
|
||||||
|
// Creates the OpenCL buffers. Note: we are not using the C++ version since we explicitly
|
||||||
|
// want to be able to create invalid buffers (no error checking here).
|
||||||
|
auto x1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr);
|
||||||
|
auto y1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr);
|
||||||
|
auto a1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr);
|
||||||
|
auto b1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr);
|
||||||
|
auto c1 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr);
|
||||||
|
auto x_vec1 = Buffer(x1);
|
||||||
|
auto y_vec1 = Buffer(y1);
|
||||||
|
auto a_mat1 = Buffer(a1);
|
||||||
|
auto b_mat1 = Buffer(b1);
|
||||||
|
auto c_mat1 = Buffer(c1);
|
||||||
|
auto x2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.x_size*sizeof(T), nullptr,nullptr);
|
||||||
|
auto y2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.y_size*sizeof(T), nullptr,nullptr);
|
||||||
|
auto a2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.a_size*sizeof(T), nullptr,nullptr);
|
||||||
|
auto b2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.b_size*sizeof(T), nullptr,nullptr);
|
||||||
|
auto c2 = clCreateBuffer(context_(), CL_MEM_READ_WRITE, args.c_size*sizeof(T), nullptr,nullptr);
|
||||||
|
auto x_vec2 = Buffer(x2);
|
||||||
|
auto y_vec2 = Buffer(y2);
|
||||||
|
auto a_mat2 = Buffer(a2);
|
||||||
|
auto b_mat2 = Buffer(b2);
|
||||||
|
auto c_mat2 = Buffer(c2);
|
||||||
|
|
||||||
|
// Runs the two routines
|
||||||
|
auto status1 = run_reference_(args, Buffers{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1}, queue_);
|
||||||
|
auto status2 = run_routine_(args, Buffers{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2}, queue_);
|
||||||
|
|
||||||
|
// Tests for equality of the two status codes
|
||||||
|
TestErrorCodes(status1, status2, args);
|
||||||
|
}
|
||||||
|
TestEnd();
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Compiles the templated class
|
||||||
|
template class TestBlas<float, float>;
|
||||||
|
template class TestBlas<double, double>;
|
||||||
|
template class TestBlas<float2, float2>;
|
||||||
|
template class TestBlas<double2, double2>;
|
||||||
|
template class TestBlas<float2, float>;
|
||||||
|
template class TestBlas<double2, double>;
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
106
test/correctness/testblas.h
Normal file
106
test/correctness/testblas.h
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file tests any CLBlast routine. It contains two types of tests: one testing all sorts of
|
||||||
|
// input combinations, and one deliberately testing with invalid values.
|
||||||
|
// Typename T: the data-type of the routine's memory buffers (==precision)
|
||||||
|
// Typename U: the data-type of the alpha and beta arguments
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#ifndef CLBLAST_TEST_CORRECTNESS_TESTBLAS_H_
|
||||||
|
#define CLBLAST_TEST_CORRECTNESS_TESTBLAS_H_
|
||||||
|
|
||||||
|
#include <vector>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
#include "correctness/tester.h"
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// See comment at top of file for a description of the class
|
||||||
|
template <typename T, typename U>
|
||||||
|
class TestBlas: public Tester<T,U> {
|
||||||
|
public:
|
||||||
|
|
||||||
|
// Uses several variables from the Tester class
|
||||||
|
using Tester<T,U>::context_;
|
||||||
|
using Tester<T,U>::queue_;
|
||||||
|
using Tester<T,U>::full_test_;
|
||||||
|
using Tester<T,U>::device_;
|
||||||
|
|
||||||
|
// Uses several helper functions from the Tester class
|
||||||
|
using Tester<T,U>::TestStart;
|
||||||
|
using Tester<T,U>::TestEnd;
|
||||||
|
using Tester<T,U>::TestErrorCount;
|
||||||
|
using Tester<T,U>::TestErrorCodes;
|
||||||
|
using Tester<T,U>::GetOffsets;
|
||||||
|
|
||||||
|
// Test settings for the regular test. Append to these lists in case more tests are required.
|
||||||
|
const std::vector<size_t> kVectorDims = { 7, 93, 4096 };
|
||||||
|
const std::vector<size_t> kIncrements = { 1, 2, 7 };
|
||||||
|
const std::vector<size_t> kMatrixDims = { 7, 64 };
|
||||||
|
const std::vector<size_t> kMatrixVectorDims = { 61, 512 };
|
||||||
|
const std::vector<size_t> kOffsets = GetOffsets();
|
||||||
|
const std::vector<U> kAlphaValues = GetExampleScalars<U>(full_test_);
|
||||||
|
const std::vector<U> kBetaValues = GetExampleScalars<U>(full_test_);
|
||||||
|
|
||||||
|
// Test settings for the invalid tests
|
||||||
|
const std::vector<size_t> kInvalidIncrements = { 0, 1 };
|
||||||
|
const size_t kBufferSize = 64;
|
||||||
|
const std::vector<size_t> kMatSizes = {0, kBufferSize*kBufferSize-1, kBufferSize*kBufferSize};
|
||||||
|
const std::vector<size_t> kVecSizes = {0, kBufferSize - 1, kBufferSize};
|
||||||
|
|
||||||
|
// The layout/transpose/triangle options to test with
|
||||||
|
const std::vector<Layout> kLayouts = {Layout::kRowMajor, Layout::kColMajor};
|
||||||
|
const std::vector<Triangle> kTriangles = {Triangle::kUpper, Triangle::kLower};
|
||||||
|
const std::vector<Side> kSides = {Side::kLeft, Side::kRight};
|
||||||
|
const std::vector<Diagonal> kDiagonals = {Diagonal::kUnit, Diagonal::kNonUnit};
|
||||||
|
static const std::vector<Transpose> kTransposes; // Data-type dependent, see .cc-file
|
||||||
|
|
||||||
|
// Shorthand for the routine-specific functions passed to the tester
|
||||||
|
using Routine = std::function<StatusCode(const Arguments<U>&, const Buffers&, CommandQueue&)>;
|
||||||
|
using ResultGet = std::function<std::vector<T>(const Arguments<U>&, Buffers&, CommandQueue&)>;
|
||||||
|
using ResultIndex = std::function<size_t(const Arguments<U>&, const size_t, const size_t)>;
|
||||||
|
using ResultIterator = std::function<size_t(const Arguments<U>&)>;
|
||||||
|
|
||||||
|
// Constructor, initializes the base class tester and input data
|
||||||
|
TestBlas(int argc, char *argv[], const bool silent,
|
||||||
|
const std::string &name, const std::vector<std::string> &options,
|
||||||
|
const Routine run_routine, const Routine run_reference, const ResultGet get_result,
|
||||||
|
const ResultIndex get_index, const ResultIterator get_id1, const ResultIterator get_id2);
|
||||||
|
|
||||||
|
// The test functions, taking the argument combinations to test and the configuration name
|
||||||
|
void TestRegular(std::vector<Arguments<U>> &test_vector, const std::string &name);
|
||||||
|
void TestInvalid(std::vector<Arguments<U>> &test_vector, const std::string &name);
|
||||||
|
|
||||||
|
private:
|
||||||
|
|
||||||
|
// Source data to test with
|
||||||
|
std::vector<T> x_source_;
|
||||||
|
std::vector<T> y_source_;
|
||||||
|
std::vector<T> a_source_;
|
||||||
|
std::vector<T> b_source_;
|
||||||
|
std::vector<T> c_source_;
|
||||||
|
|
||||||
|
// The routine-specific functions passed to the tester
|
||||||
|
Routine run_routine_;
|
||||||
|
Routine run_reference_;
|
||||||
|
ResultGet get_result_;
|
||||||
|
ResultIndex get_index_;
|
||||||
|
ResultIterator get_id1_;
|
||||||
|
ResultIterator get_id2_;
|
||||||
|
};
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
||||||
|
|
||||||
|
// CLBLAST_TEST_CORRECTNESS_TESTBLAS_H_
|
||||||
|
#endif
|
|
@ -21,21 +21,11 @@
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
// The layouts and transpose-options to test with (data-type dependent)
|
|
||||||
template <typename T>
|
|
||||||
const std::vector<Layout> Tester<T>::kLayouts = {Layout::kRowMajor, Layout::kColMajor};
|
|
||||||
template <> const std::vector<Transpose> Tester<float>::kTransposes = {Transpose::kNo, Transpose::kYes};
|
|
||||||
template <> const std::vector<Transpose> Tester<double>::kTransposes = {Transpose::kNo, Transpose::kYes};
|
|
||||||
template <> const std::vector<Transpose> Tester<float2>::kTransposes = {Transpose::kNo, Transpose::kYes, Transpose::kConjugate};
|
|
||||||
template <> const std::vector<Transpose> Tester<double2>::kTransposes = {Transpose::kNo, Transpose::kYes, Transpose::kConjugate};
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
// General constructor for all CLBlast testers. It prints out the test header to stdout and sets-up
|
// General constructor for all CLBlast testers. It prints out the test header to stdout and sets-up
|
||||||
// the clBLAS library for reference.
|
// the clBLAS library for reference.
|
||||||
template <typename T>
|
template <typename T, typename U>
|
||||||
Tester<T>::Tester(int argc, char *argv[], const bool silent,
|
Tester<T,U>::Tester(int argc, char *argv[], const bool silent,
|
||||||
const std::string &name, const std::vector<std::string> &options):
|
const std::string &name, const std::vector<std::string> &options):
|
||||||
help_("Options given/available:\n"),
|
help_("Options given/available:\n"),
|
||||||
platform_(Platform(GetArgument(argc, argv, help_, kArgPlatform, size_t{0}))),
|
platform_(Platform(GetArgument(argc, argv, help_, kArgPlatform, size_t{0}))),
|
||||||
device_(Device(platform_, kDeviceType, GetArgument(argc, argv, help_, kArgDevice, size_t{0}))),
|
device_(Device(platform_, kDeviceType, GetArgument(argc, argv, help_, kArgDevice, size_t{0}))),
|
||||||
|
@ -61,7 +51,7 @@ Tester<T>::Tester(int argc, char *argv[], const bool silent,
|
||||||
kPrintMessage.c_str(), name.c_str(), kPrintEnd.c_str());
|
kPrintMessage.c_str(), name.c_str(), kPrintEnd.c_str());
|
||||||
|
|
||||||
// Checks whether the precision is supported
|
// Checks whether the precision is supported
|
||||||
if (!PrecisionSupported()) {
|
if (!PrecisionSupported<T>(device_)) {
|
||||||
fprintf(stdout, "\n* All tests skipped: %sUnsupported precision%s\n",
|
fprintf(stdout, "\n* All tests skipped: %sUnsupported precision%s\n",
|
||||||
kPrintWarning.c_str(), kPrintEnd.c_str());
|
kPrintWarning.c_str(), kPrintEnd.c_str());
|
||||||
return;
|
return;
|
||||||
|
@ -86,9 +76,9 @@ Tester<T>::Tester(int argc, char *argv[], const bool silent,
|
||||||
}
|
}
|
||||||
|
|
||||||
// Destructor prints the summary of the test cases and cleans-up the clBLAS library
|
// Destructor prints the summary of the test cases and cleans-up the clBLAS library
|
||||||
template <typename T>
|
template <typename T, typename U>
|
||||||
Tester<T>::~Tester() {
|
Tester<T,U>::~Tester() {
|
||||||
if (PrecisionSupported()) {
|
if (PrecisionSupported<T>(device_)) {
|
||||||
fprintf(stdout, "* Completed all test-cases for this routine. Results:\n");
|
fprintf(stdout, "* Completed all test-cases for this routine. Results:\n");
|
||||||
fprintf(stdout, " %lu test(s) passed\n", tests_passed_);
|
fprintf(stdout, " %lu test(s) passed\n", tests_passed_);
|
||||||
if (tests_skipped_ > 0) { fprintf(stdout, "%s", kPrintWarning.c_str()); }
|
if (tests_skipped_ > 0) { fprintf(stdout, "%s", kPrintWarning.c_str()); }
|
||||||
|
@ -104,8 +94,8 @@ Tester<T>::~Tester() {
|
||||||
|
|
||||||
// Function called at the start of each test. This prints a header with information about the
|
// Function called at the start of each test. This prints a header with information about the
|
||||||
// test and re-initializes all test data-structures.
|
// test and re-initializes all test data-structures.
|
||||||
template <typename T>
|
template <typename T, typename U>
|
||||||
void Tester<T>::TestStart(const std::string &test_name, const std::string &test_configuration) {
|
void Tester<T,U>::TestStart(const std::string &test_name, const std::string &test_configuration) {
|
||||||
|
|
||||||
// Prints the header
|
// Prints the header
|
||||||
fprintf(stdout, "* Testing %s'%s'%s for %s'%s'%s:\n",
|
fprintf(stdout, "* Testing %s'%s'%s for %s'%s'%s:\n",
|
||||||
|
@ -123,8 +113,8 @@ void Tester<T>::TestStart(const std::string &test_name, const std::string &test_
|
||||||
|
|
||||||
// Function called at the end of each test. This prints errors if any occurred. It also prints a
|
// Function called at the end of each test. This prints errors if any occurred. It also prints a
|
||||||
// summary of the number of sub-tests passed/failed.
|
// summary of the number of sub-tests passed/failed.
|
||||||
template <typename T>
|
template <typename T, typename U>
|
||||||
void Tester<T>::TestEnd() {
|
void Tester<T,U>::TestEnd() {
|
||||||
fprintf(stdout, "\n");
|
fprintf(stdout, "\n");
|
||||||
tests_passed_ += num_passed_;
|
tests_passed_ += num_passed_;
|
||||||
tests_failed_ += num_skipped_;
|
tests_failed_ += num_skipped_;
|
||||||
|
@ -147,6 +137,7 @@ void Tester<T>::TestEnd() {
|
||||||
if (o == kArgBTransp) { fprintf(stdout, "%s=%d ", kArgBTransp, entry.args.b_transpose);}
|
if (o == kArgBTransp) { fprintf(stdout, "%s=%d ", kArgBTransp, entry.args.b_transpose);}
|
||||||
if (o == kArgSide) { fprintf(stdout, "%s=%d ", kArgSide, entry.args.side);}
|
if (o == kArgSide) { fprintf(stdout, "%s=%d ", kArgSide, entry.args.side);}
|
||||||
if (o == kArgTriangle) { fprintf(stdout, "%s=%d ", kArgTriangle, entry.args.triangle);}
|
if (o == kArgTriangle) { fprintf(stdout, "%s=%d ", kArgTriangle, entry.args.triangle);}
|
||||||
|
if (o == kArgDiagonal) { fprintf(stdout, "%s=%d ", kArgDiagonal, entry.args.diagonal);}
|
||||||
if (o == kArgXInc) { fprintf(stdout, "%s=%lu ", kArgXInc, entry.args.x_inc);}
|
if (o == kArgXInc) { fprintf(stdout, "%s=%lu ", kArgXInc, entry.args.x_inc);}
|
||||||
if (o == kArgYInc) { fprintf(stdout, "%s=%lu ", kArgYInc, entry.args.y_inc);}
|
if (o == kArgYInc) { fprintf(stdout, "%s=%lu ", kArgYInc, entry.args.y_inc);}
|
||||||
if (o == kArgXOffset) { fprintf(stdout, "%s=%lu ", kArgXOffset, entry.args.x_offset);}
|
if (o == kArgXOffset) { fprintf(stdout, "%s=%lu ", kArgXOffset, entry.args.x_offset);}
|
||||||
|
@ -181,45 +172,9 @@ void Tester<T>::TestEnd() {
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
// Compares two floating point values and returns whether they are within an acceptable error
|
|
||||||
// margin. This replaces GTest's EXPECT_NEAR().
|
|
||||||
template <typename T>
|
|
||||||
bool Tester<T>::TestSimilarity(const T val1, const T val2) {
|
|
||||||
const auto difference = std::fabs(val1 - val2);
|
|
||||||
|
|
||||||
// Shortcut, handles infinities
|
|
||||||
if (val1 == val2) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
// The values are zero or very small: the relative error is less meaningful
|
|
||||||
else if (val1 == 0 || val2 == 0 || difference < static_cast<T>(kErrorMarginAbsolute)) {
|
|
||||||
return (difference < static_cast<T>(kErrorMarginAbsolute));
|
|
||||||
}
|
|
||||||
// Use relative error
|
|
||||||
else {
|
|
||||||
return (difference / (std::fabs(val1)+std::fabs(val2))) < static_cast<T>(kErrorMarginRelative);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Specialisations for complex data-types
|
|
||||||
template <>
|
|
||||||
bool Tester<float2>::TestSimilarity(const float2 val1, const float2 val2) {
|
|
||||||
auto real = Tester<float>::TestSimilarity(val1.real(), val2.real());
|
|
||||||
auto imag = Tester<float>::TestSimilarity(val1.imag(), val2.imag());
|
|
||||||
return (real && imag);
|
|
||||||
}
|
|
||||||
template <>
|
|
||||||
bool Tester<double2>::TestSimilarity(const double2 val1, const double2 val2) {
|
|
||||||
auto real = Tester<double>::TestSimilarity(val1.real(), val2.real());
|
|
||||||
auto imag = Tester<double>::TestSimilarity(val1.imag(), val2.imag());
|
|
||||||
return (real && imag);
|
|
||||||
}
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
// Handles a 'pass' or 'error' depending on whether there are any errors
|
// Handles a 'pass' or 'error' depending on whether there are any errors
|
||||||
template <typename T>
|
template <typename T, typename U>
|
||||||
void Tester<T>::TestErrorCount(const size_t errors, const size_t size, const Arguments<T> &args) {
|
void Tester<T,U>::TestErrorCount(const size_t errors, const size_t size, const Arguments<U> &args) {
|
||||||
|
|
||||||
// Finished successfully
|
// Finished successfully
|
||||||
if (errors == 0) {
|
if (errors == 0) {
|
||||||
|
@ -237,9 +192,9 @@ void Tester<T>::TestErrorCount(const size_t errors, const size_t size, const Arg
|
||||||
|
|
||||||
// Compares two status codes for equality. The outcome can be a pass (they are the same), a warning
|
// Compares two status codes for equality. The outcome can be a pass (they are the same), a warning
|
||||||
// (CLBlast reported a compilation error), or an error (they are different).
|
// (CLBlast reported a compilation error), or an error (they are different).
|
||||||
template <typename T>
|
template <typename T, typename U>
|
||||||
void Tester<T>::TestErrorCodes(const StatusCode clblas_status, const StatusCode clblast_status,
|
void Tester<T,U>::TestErrorCodes(const StatusCode clblas_status, const StatusCode clblast_status,
|
||||||
const Arguments<T> &args) {
|
const Arguments<U> &args) {
|
||||||
|
|
||||||
// Finished successfully
|
// Finished successfully
|
||||||
if (clblas_status == clblast_status) {
|
if (clblas_status == clblast_status) {
|
||||||
|
@ -270,62 +225,26 @@ void Tester<T>::TestErrorCodes(const StatusCode clblas_status, const StatusCode
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
// Retrieves a list of example scalar values, used for the alpha and beta arguments for the various
|
|
||||||
// routines. This function is specialised for the different data-types.
|
|
||||||
template <>
|
|
||||||
const std::vector<float> Tester<float>::GetExampleScalars() {
|
|
||||||
if (full_test_) { return {0.0f, 1.0f, 3.14f}; }
|
|
||||||
else { return {3.14f}; }
|
|
||||||
}
|
|
||||||
template <>
|
|
||||||
const std::vector<double> Tester<double>::GetExampleScalars() {
|
|
||||||
if (full_test_) { return {0.0, 1.0, 3.14}; }
|
|
||||||
else { return {3.14}; }
|
|
||||||
}
|
|
||||||
template <>
|
|
||||||
const std::vector<float2> Tester<float2>::GetExampleScalars() {
|
|
||||||
if (full_test_) { return {{0.0f, 0.0f}, {1.0f, 1.3f}, {2.42f, 3.14f}}; }
|
|
||||||
else { return {{2.42f, 3.14f}}; }
|
|
||||||
}
|
|
||||||
template <>
|
|
||||||
const std::vector<double2> Tester<double2>::GetExampleScalars() {
|
|
||||||
if (full_test_) { return {{0.0, 0.0}, {1.0, 1.3}, {2.42, 3.14}}; }
|
|
||||||
else { return {{2.42, 3.14}}; }
|
|
||||||
}
|
|
||||||
|
|
||||||
// Retrieves the offset values to test with
|
// Retrieves the offset values to test with
|
||||||
template <typename T>
|
template <typename T, typename U>
|
||||||
const std::vector<size_t> Tester<T>::GetOffsets() {
|
const std::vector<size_t> Tester<T,U>::GetOffsets() const {
|
||||||
if (full_test_) { return {0, 10}; }
|
if (full_test_) { return {0, 10}; }
|
||||||
else { return {0}; }
|
else { return {0}; }
|
||||||
}
|
}
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
template <> bool Tester<float>::PrecisionSupported() const { return true; }
|
|
||||||
template <> bool Tester<float2>::PrecisionSupported() const { return true; }
|
|
||||||
template <> bool Tester<double>::PrecisionSupported() const {
|
|
||||||
auto extensions = device_.Extensions();
|
|
||||||
return (extensions.find(kKhronosDoublePrecision) == std::string::npos) ? false : true;
|
|
||||||
}
|
|
||||||
template <> bool Tester<double2>::PrecisionSupported() const {
|
|
||||||
auto extensions = device_.Extensions();
|
|
||||||
return (extensions.find(kKhronosDoublePrecision) == std::string::npos) ? false : true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
// A test can either pass, be skipped, or fail
|
// A test can either pass, be skipped, or fail
|
||||||
template <typename T>
|
template <typename T, typename U>
|
||||||
void Tester<T>::ReportPass() {
|
void Tester<T,U>::ReportPass() {
|
||||||
num_passed_++;
|
num_passed_++;
|
||||||
}
|
}
|
||||||
template <typename T>
|
template <typename T, typename U>
|
||||||
void Tester<T>::ReportSkipped() {
|
void Tester<T,U>::ReportSkipped() {
|
||||||
num_skipped_++;
|
num_skipped_++;
|
||||||
}
|
}
|
||||||
template <typename T>
|
template <typename T, typename U>
|
||||||
void Tester<T>::ReportError(const ErrorLogEntry &error_log_entry) {
|
void Tester<T,U>::ReportError(const ErrorLogEntry &error_log_entry) {
|
||||||
error_log_.push_back(error_log_entry);
|
error_log_.push_back(error_log_entry);
|
||||||
num_failed_++;
|
num_failed_++;
|
||||||
}
|
}
|
||||||
|
@ -334,8 +253,8 @@ void Tester<T>::ReportError(const ErrorLogEntry &error_log_entry) {
|
||||||
|
|
||||||
// Prints the test-result symbol to screen. This function limits the maximum number of symbols per
|
// Prints the test-result symbol to screen. This function limits the maximum number of symbols per
|
||||||
// line by printing newlines once every so many calls.
|
// line by printing newlines once every so many calls.
|
||||||
template <typename T>
|
template <typename T, typename U>
|
||||||
void Tester<T>::PrintTestResult(const std::string &message) {
|
void Tester<T,U>::PrintTestResult(const std::string &message) {
|
||||||
if (print_count_ == kResultsPerLine) {
|
if (print_count_ == kResultsPerLine) {
|
||||||
print_count_ = 0;
|
print_count_ = 0;
|
||||||
fprintf(stdout, "\n ");
|
fprintf(stdout, "\n ");
|
||||||
|
@ -345,13 +264,98 @@ void Tester<T>::PrintTestResult(const std::string &message) {
|
||||||
print_count_++;
|
print_count_++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// Below are the non-member functions (separated because of otherwise required partial class
|
||||||
|
// template specialization)
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Compares two floating point values and returns whether they are within an acceptable error
|
||||||
|
// margin. This replaces GTest's EXPECT_NEAR().
|
||||||
|
template <typename T>
|
||||||
|
bool TestSimilarity(const T val1, const T val2) {
|
||||||
|
const auto difference = std::fabs(val1 - val2);
|
||||||
|
|
||||||
|
// Set the allowed error margin for floating-point comparisons
|
||||||
|
constexpr auto kErrorMarginRelative = 1.0e-2;
|
||||||
|
constexpr auto kErrorMarginAbsolute = 1.0e-10;
|
||||||
|
|
||||||
|
// Shortcut, handles infinities
|
||||||
|
if (val1 == val2) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
// The values are zero or very small: the relative error is less meaningful
|
||||||
|
else if (val1 == 0 || val2 == 0 || difference < static_cast<T>(kErrorMarginAbsolute)) {
|
||||||
|
return (difference < static_cast<T>(kErrorMarginAbsolute));
|
||||||
|
}
|
||||||
|
// Use relative error
|
||||||
|
else {
|
||||||
|
const auto absolute_sum = std::fabs(val1) + std::fabs(val2);
|
||||||
|
return (difference / absolute_sum) < static_cast<T>(kErrorMarginRelative);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compiles the default case for non-complex data-types
|
||||||
|
template bool TestSimilarity<float>(const float, const float);
|
||||||
|
template bool TestSimilarity<double>(const double, const double);
|
||||||
|
|
||||||
|
// Specialisations for complex data-types
|
||||||
|
template <>
|
||||||
|
bool TestSimilarity(const float2 val1, const float2 val2) {
|
||||||
|
auto real = TestSimilarity(val1.real(), val2.real());
|
||||||
|
auto imag = TestSimilarity(val1.imag(), val2.imag());
|
||||||
|
return (real && imag);
|
||||||
|
}
|
||||||
|
template <>
|
||||||
|
bool TestSimilarity(const double2 val1, const double2 val2) {
|
||||||
|
auto real = TestSimilarity(val1.real(), val2.real());
|
||||||
|
auto imag = TestSimilarity(val1.imag(), val2.imag());
|
||||||
|
return (real && imag);
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Retrieves a list of example scalar values, used for the alpha and beta arguments for the various
|
||||||
|
// routines. This function is specialised for the different data-types.
|
||||||
|
template <> const std::vector<float> GetExampleScalars(const bool full_test) {
|
||||||
|
if (full_test) { return {0.0f, 1.0f, 3.14f}; }
|
||||||
|
else { return {3.14f}; }
|
||||||
|
}
|
||||||
|
template <> const std::vector<double> GetExampleScalars(const bool full_test) {
|
||||||
|
if (full_test) { return {0.0, 1.0, 3.14}; }
|
||||||
|
else { return {3.14}; }
|
||||||
|
}
|
||||||
|
template <> const std::vector<float2> GetExampleScalars(const bool full_test) {
|
||||||
|
if (full_test) { return {{0.0f, 0.0f}, {1.0f, 1.3f}, {2.42f, 3.14f}}; }
|
||||||
|
else { return {{2.42f, 3.14f}}; }
|
||||||
|
}
|
||||||
|
template <> const std::vector<double2> GetExampleScalars(const bool full_test) {
|
||||||
|
if (full_test) { return {{0.0, 0.0}, {1.0, 1.3}, {2.42, 3.14}}; }
|
||||||
|
else { return {{2.42, 3.14}}; }
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Returns false if this precision is not supported by the device
|
||||||
|
template <> bool PrecisionSupported<float>(const Device &) { return true; }
|
||||||
|
template <> bool PrecisionSupported<float2>(const Device &) { return true; }
|
||||||
|
template <> bool PrecisionSupported<double>(const Device &device) {
|
||||||
|
auto extensions = device.Extensions();
|
||||||
|
return (extensions.find(kKhronosDoublePrecision) == std::string::npos) ? false : true;
|
||||||
|
}
|
||||||
|
template <> bool PrecisionSupported<double2>(const Device &device) {
|
||||||
|
auto extensions = device.Extensions();
|
||||||
|
return (extensions.find(kKhronosDoublePrecision) == std::string::npos) ? false : true;
|
||||||
|
}
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
// Compiles the templated class
|
// Compiles the templated class
|
||||||
template class Tester<float>;
|
template class Tester<float, float>;
|
||||||
template class Tester<double>;
|
template class Tester<double, double>;
|
||||||
template class Tester<float2>;
|
template class Tester<float2, float2>;
|
||||||
template class Tester<double2>;
|
template class Tester<double2, double2>;
|
||||||
|
template class Tester<float2, float>;
|
||||||
|
template class Tester<double2, double>;
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
} // namespace clblast
|
} // namespace clblast
|
||||||
|
|
|
@ -10,6 +10,8 @@
|
||||||
// This file implements the Tester class, providing a test-framework. GTest was used before, but
|
// This file implements the Tester class, providing a test-framework. GTest was used before, but
|
||||||
// was not able to handle certain cases (e.g. template type + parameters). This is its (basic)
|
// was not able to handle certain cases (e.g. template type + parameters). This is its (basic)
|
||||||
// custom replacement.
|
// custom replacement.
|
||||||
|
// Typename T: the data-type of the routine's memory buffers (==precision)
|
||||||
|
// Typename U: the data-type of the alpha and beta arguments
|
||||||
//
|
//
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
|
@ -30,7 +32,7 @@ namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
// See comment at top of file for a description of the class
|
// See comment at top of file for a description of the class
|
||||||
template <typename T>
|
template <typename T, typename U>
|
||||||
class Tester {
|
class Tester {
|
||||||
public:
|
public:
|
||||||
|
|
||||||
|
@ -43,10 +45,6 @@ class Tester {
|
||||||
// Error percentage is not applicable: error was caused by an incorrect status
|
// Error percentage is not applicable: error was caused by an incorrect status
|
||||||
static constexpr auto kStatusError = -1.0f;
|
static constexpr auto kStatusError = -1.0f;
|
||||||
|
|
||||||
// Set the allowed error margin for floating-point comparisons
|
|
||||||
static constexpr auto kErrorMarginRelative = 1.0e-2;
|
|
||||||
static constexpr auto kErrorMarginAbsolute = 1.0e-10;
|
|
||||||
|
|
||||||
// Constants holding start and end strings for terminal-output in colour
|
// Constants holding start and end strings for terminal-output in colour
|
||||||
const std::string kPrintError{"\x1b[31m"};
|
const std::string kPrintError{"\x1b[31m"};
|
||||||
const std::string kPrintSuccess{"\x1b[32m"};
|
const std::string kPrintSuccess{"\x1b[32m"};
|
||||||
|
@ -62,16 +60,12 @@ class Tester {
|
||||||
const std::string kSkippedCompilation{kPrintWarning + "\\" + kPrintEnd};
|
const std::string kSkippedCompilation{kPrintWarning + "\\" + kPrintEnd};
|
||||||
const std::string kUnsupportedPrecision{kPrintWarning + "o" + kPrintEnd};
|
const std::string kUnsupportedPrecision{kPrintWarning + "o" + kPrintEnd};
|
||||||
|
|
||||||
// The layouts and transpose-options to test with
|
|
||||||
static const std::vector<Layout> kLayouts;
|
|
||||||
static const std::vector<Transpose> kTransposes;
|
|
||||||
|
|
||||||
// This structure combines the above log-entry with a status code and an error percentage
|
// This structure combines the above log-entry with a status code and an error percentage
|
||||||
struct ErrorLogEntry {
|
struct ErrorLogEntry {
|
||||||
StatusCode status_expect;
|
StatusCode status_expect;
|
||||||
StatusCode status_found;
|
StatusCode status_found;
|
||||||
float error_percentage;
|
float error_percentage;
|
||||||
Arguments<T> args;
|
Arguments<U> args;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Creates an instance of the tester, running on a particular OpenCL platform and device. It
|
// Creates an instance of the tester, running on a particular OpenCL platform and device. It
|
||||||
|
@ -84,25 +78,13 @@ class Tester {
|
||||||
void TestStart(const std::string &test_name, const std::string &test_configuration);
|
void TestStart(const std::string &test_name, const std::string &test_configuration);
|
||||||
void TestEnd();
|
void TestEnd();
|
||||||
|
|
||||||
// Compares two floating point values for similarity. Allows for a certain relative error margin.
|
|
||||||
static bool TestSimilarity(const T val1, const T val2);
|
|
||||||
|
|
||||||
// Tests either an error count (should be zero) or two error codes (must match)
|
// Tests either an error count (should be zero) or two error codes (must match)
|
||||||
void TestErrorCount(const size_t errors, const size_t size, const Arguments<T> &args);
|
void TestErrorCount(const size_t errors, const size_t size, const Arguments<U> &args);
|
||||||
void TestErrorCodes(const StatusCode clblas_status, const StatusCode clblast_status,
|
void TestErrorCodes(const StatusCode clblas_status, const StatusCode clblast_status,
|
||||||
const Arguments<T> &args);
|
const Arguments<U> &args);
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
|
|
||||||
// Retrieves a list of example scalars of the right type
|
|
||||||
const std::vector<T> GetExampleScalars();
|
|
||||||
|
|
||||||
// Retrieves a list of offset values to test
|
|
||||||
const std::vector<size_t> GetOffsets();
|
|
||||||
|
|
||||||
// Returns false if this precision is not supported by the device
|
|
||||||
bool PrecisionSupported() const;
|
|
||||||
|
|
||||||
// The help-message
|
// The help-message
|
||||||
std::string help_;
|
std::string help_;
|
||||||
|
|
||||||
|
@ -112,6 +94,12 @@ class Tester {
|
||||||
Context context_;
|
Context context_;
|
||||||
CommandQueue queue_;
|
CommandQueue queue_;
|
||||||
|
|
||||||
|
// Whether or not to run the full test-suite or just a smoke test
|
||||||
|
bool full_test_;
|
||||||
|
|
||||||
|
// Retrieves the offset values to test with
|
||||||
|
const std::vector<size_t> GetOffsets() const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
// Internal methods to report a passed, skipped, or failed test
|
// Internal methods to report a passed, skipped, or failed test
|
||||||
|
@ -122,9 +110,6 @@ class Tester {
|
||||||
// Prints the error or success symbol to screen
|
// Prints the error or success symbol to screen
|
||||||
void PrintTestResult(const std::string &message);
|
void PrintTestResult(const std::string &message);
|
||||||
|
|
||||||
// Whether or not to run the full test-suite or just a smoke test
|
|
||||||
bool full_test_;
|
|
||||||
|
|
||||||
// Logging and counting occurrences of errors
|
// Logging and counting occurrences of errors
|
||||||
std::vector<ErrorLogEntry> error_log_;
|
std::vector<ErrorLogEntry> error_log_;
|
||||||
size_t num_passed_;
|
size_t num_passed_;
|
||||||
|
@ -143,6 +128,25 @@ class Tester {
|
||||||
std::vector<std::string> options_;
|
std::vector<std::string> options_;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// Below are the non-member functions (separated because of otherwise required partial class
|
||||||
|
// template specialization)
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Compares two floating point values and returns whether they are within an acceptable error
|
||||||
|
// margin. This replaces GTest's EXPECT_NEAR().
|
||||||
|
template <typename T>
|
||||||
|
bool TestSimilarity(const T val1, const T val2);
|
||||||
|
|
||||||
|
// Retrieves a list of example scalar values, used for the alpha and beta arguments for the various
|
||||||
|
// routines. This function is specialised for the different data-types.
|
||||||
|
template <typename T>
|
||||||
|
const std::vector<T> GetExampleScalars(const bool full_test);
|
||||||
|
|
||||||
|
// Returns false if this precision is not supported by the device
|
||||||
|
template <typename T>
|
||||||
|
bool PrecisionSupported(const Device &device);
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
} // namespace clblast
|
} // namespace clblast
|
||||||
|
|
||||||
|
|
|
@ -1,176 +0,0 @@
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
|
|
||||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
|
||||||
// width of 100 characters per line.
|
|
||||||
//
|
|
||||||
// Author(s):
|
|
||||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
|
||||||
//
|
|
||||||
// This file implements the TestXY class (see the header for information about the class).
|
|
||||||
//
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
#include <algorithm>
|
|
||||||
|
|
||||||
#include "correctness/testxy.h"
|
|
||||||
|
|
||||||
namespace clblast {
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
// Constructor, initializes the base class tester and input data
|
|
||||||
template <typename T>
|
|
||||||
TestXY<T>::TestXY(int argc, char *argv[], const bool silent,
|
|
||||||
const std::string &name, const std::vector<std::string> &options,
|
|
||||||
const Routine clblast_lambda, const Routine clblas_lambda):
|
|
||||||
Tester<T>{argc, argv, silent, name, options},
|
|
||||||
clblast_lambda_(clblast_lambda),
|
|
||||||
clblas_lambda_(clblas_lambda) {
|
|
||||||
|
|
||||||
// Computes the maximum sizes. This allows for a single set of input/output buffers.
|
|
||||||
auto max_dim = *std::max_element(kVectorDims.begin(), kVectorDims.end());
|
|
||||||
auto max_inc = *std::max_element(kIncrements.begin(), kIncrements.end());
|
|
||||||
auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
|
|
||||||
|
|
||||||
// Creates test input data
|
|
||||||
x_source_.resize(max_dim*max_inc + max_offset);
|
|
||||||
y_source_.resize(max_dim*max_inc + max_offset);
|
|
||||||
PopulateVector(x_source_);
|
|
||||||
PopulateVector(y_source_);
|
|
||||||
}
|
|
||||||
|
|
||||||
// ===============================================================================================
|
|
||||||
|
|
||||||
// Tests the routine for a wide variety of parameters
|
|
||||||
template <typename T>
|
|
||||||
void TestXY<T>::TestRegular(Arguments<T> &args, const std::string &name) {
|
|
||||||
if (!PrecisionSupported()) { return; }
|
|
||||||
TestStart("regular behaviour", name);
|
|
||||||
|
|
||||||
// Iterates over the vector dimension
|
|
||||||
for (auto &n: kVectorDims) {
|
|
||||||
args.n = n;
|
|
||||||
|
|
||||||
// Iterates over the increment-values and the offsets
|
|
||||||
for (auto &x_inc: kIncrements) {
|
|
||||||
args.x_inc = x_inc;
|
|
||||||
for (auto &x_offset: kOffsets) {
|
|
||||||
args.x_offset = x_offset;
|
|
||||||
for (auto &y_inc: kIncrements) {
|
|
||||||
args.y_inc = y_inc;
|
|
||||||
for (auto &y_offset: kOffsets) {
|
|
||||||
args.y_offset = y_offset;
|
|
||||||
|
|
||||||
// Computes the buffer sizes
|
|
||||||
auto x_size = n * x_inc + x_offset;
|
|
||||||
auto y_size = n * y_inc + y_offset;
|
|
||||||
if (x_size < 1 || y_size < 1) { continue; }
|
|
||||||
|
|
||||||
// Creates the OpenCL buffers
|
|
||||||
auto x_vec = Buffer(context_, CL_MEM_READ_WRITE, x_size*sizeof(T));
|
|
||||||
auto r_vec = Buffer(context_, CL_MEM_READ_WRITE, y_size*sizeof(T));
|
|
||||||
auto s_vec = Buffer(context_, CL_MEM_READ_WRITE, y_size*sizeof(T));
|
|
||||||
|
|
||||||
// Iterates over the values for alpha
|
|
||||||
for (auto &alpha: kAlphaValues) {
|
|
||||||
args.alpha = alpha;
|
|
||||||
|
|
||||||
// Runs the reference clBLAS code
|
|
||||||
x_vec.WriteBuffer(queue_, x_size*sizeof(T), x_source_);
|
|
||||||
r_vec.WriteBuffer(queue_, y_size*sizeof(T), y_source_);
|
|
||||||
auto status1 = clblas_lambda_(args, x_vec, r_vec, queue_);
|
|
||||||
|
|
||||||
// Runs the CLBlast code
|
|
||||||
x_vec.WriteBuffer(queue_, x_size*sizeof(T), x_source_);
|
|
||||||
s_vec.WriteBuffer(queue_, y_size*sizeof(T), y_source_);
|
|
||||||
auto status2 = clblast_lambda_(args, x_vec, s_vec, queue_);
|
|
||||||
|
|
||||||
// Tests for equality of the two status codes
|
|
||||||
if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
|
|
||||||
TestErrorCodes(status1, status2, args);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Downloads the results
|
|
||||||
std::vector<T> r_result(y_size, static_cast<T>(0));
|
|
||||||
std::vector<T> s_result(y_size, static_cast<T>(0));
|
|
||||||
r_vec.ReadBuffer(queue_, y_size*sizeof(T), r_result);
|
|
||||||
s_vec.ReadBuffer(queue_, y_size*sizeof(T), s_result);
|
|
||||||
|
|
||||||
// Checks for differences in the output
|
|
||||||
auto errors = size_t{0};
|
|
||||||
for (auto idn=size_t{0}; idn<n; ++idn) {
|
|
||||||
auto index = idn*y_inc + y_offset;
|
|
||||||
if (!TestSimilarity(r_result[index], s_result[index])) {
|
|
||||||
errors++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Tests the error count (should be zero)
|
|
||||||
TestErrorCount(errors, n, args);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
TestEnd();
|
|
||||||
}
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
// Tests the routine for cases with invalid OpenCL memory buffer sizes. Tests only on return-types,
|
|
||||||
// does not test for results (if any).
|
|
||||||
template <typename T>
|
|
||||||
void TestXY<T>::TestInvalidBufferSizes(Arguments<T> &args, const std::string &name) {
|
|
||||||
if (!PrecisionSupported()) { return; }
|
|
||||||
TestStart("invalid buffer sizes", name);
|
|
||||||
|
|
||||||
// Sets example test parameters
|
|
||||||
args.n = kBufferSize;
|
|
||||||
args.x_offset = 0;
|
|
||||||
args.y_offset = 0;
|
|
||||||
|
|
||||||
// Iterates over test buffer sizes
|
|
||||||
const std::vector<size_t> kBufferSizes = {0, kBufferSize - 1, kBufferSize};
|
|
||||||
for (auto &x_size: kBufferSizes) {
|
|
||||||
for (auto &y_size: kBufferSizes) {
|
|
||||||
|
|
||||||
// Iterates over test increments
|
|
||||||
for (auto &x_inc: kInvalidIncrements) {
|
|
||||||
args.x_inc = x_inc;
|
|
||||||
for (auto &y_inc: kInvalidIncrements) {
|
|
||||||
args.y_inc = y_inc;
|
|
||||||
|
|
||||||
// Creates the OpenCL buffers. Note: we are not using the C++ version since we explicitly
|
|
||||||
// want to be able to create invalid buffers (no error checking here).
|
|
||||||
auto x = clCreateBuffer(context_(), CL_MEM_READ_WRITE, x_size*sizeof(T), nullptr, nullptr);
|
|
||||||
auto x_vec = Buffer(x);
|
|
||||||
auto r = clCreateBuffer(context_(), CL_MEM_READ_WRITE, y_size*sizeof(T), nullptr, nullptr);
|
|
||||||
auto r_vec = Buffer(r);
|
|
||||||
auto s = clCreateBuffer(context_(), CL_MEM_READ_WRITE, y_size*sizeof(T), nullptr, nullptr);
|
|
||||||
auto s_vec = Buffer(s);
|
|
||||||
|
|
||||||
// Runs the two routines
|
|
||||||
auto status1 = clblas_lambda_(args, x_vec, r_vec, queue_);
|
|
||||||
auto status2 = clblast_lambda_(args, x_vec, s_vec, queue_);
|
|
||||||
|
|
||||||
// Tests for equality of the two status codes
|
|
||||||
TestErrorCodes(status1, status2, args);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
TestEnd();
|
|
||||||
}
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
// Compiles the templated class
|
|
||||||
template class TestXY<float>;
|
|
||||||
template class TestXY<double>;
|
|
||||||
template class TestXY<float2>;
|
|
||||||
template class TestXY<double2>;
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
} // namespace clblast
|
|
|
@ -1,84 +0,0 @@
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
// This file is part of the CLBlast project. The project is licensed under the MIT license. This
|
|
||||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
|
||||||
// width of 100 characters per line.
|
|
||||||
//
|
|
||||||
// Author(s):
|
|
||||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
|
||||||
//
|
|
||||||
// This file tests any vector-vector (X,Y) routine. It contains two types of tests: one testing
|
|
||||||
// all sorts of input combinations, and one deliberately testing with invalid values.
|
|
||||||
//
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
#ifndef CLBLAST_TEST_CORRECTNESS_TESTXY_H_
|
|
||||||
#define CLBLAST_TEST_CORRECTNESS_TESTXY_H_
|
|
||||||
|
|
||||||
#include <vector>
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
#include "correctness/tester.h"
|
|
||||||
|
|
||||||
namespace clblast {
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
// See comment at top of file for a description of the class
|
|
||||||
template <typename T>
|
|
||||||
class TestXY: public Tester<T> {
|
|
||||||
public:
|
|
||||||
|
|
||||||
// Uses several variables from the Tester class
|
|
||||||
using Tester<T>::context_;
|
|
||||||
using Tester<T>::queue_;
|
|
||||||
|
|
||||||
// Uses several helper functions from the Tester class
|
|
||||||
using Tester<T>::TestStart;
|
|
||||||
using Tester<T>::TestEnd;
|
|
||||||
using Tester<T>::TestSimilarity;
|
|
||||||
using Tester<T>::TestErrorCount;
|
|
||||||
using Tester<T>::TestErrorCodes;
|
|
||||||
using Tester<T>::GetExampleScalars;
|
|
||||||
using Tester<T>::GetOffsets;
|
|
||||||
using Tester<T>::PrecisionSupported;
|
|
||||||
|
|
||||||
// Test settings for the regular test. Append to this list in case more tests are required.
|
|
||||||
const std::vector<size_t> kVectorDims = { 7, 93, 4096 };
|
|
||||||
const std::vector<size_t> kOffsets = GetOffsets();
|
|
||||||
const std::vector<size_t> kIncrements = { 1, 2, 7 };
|
|
||||||
const std::vector<T> kAlphaValues = GetExampleScalars();
|
|
||||||
|
|
||||||
// Test settings for the invalid test
|
|
||||||
const std::vector<size_t> kInvalidIncrements = { 0, 1 };
|
|
||||||
const size_t kBufferSize = 512;
|
|
||||||
|
|
||||||
// Shorthand for a BLAS routine
|
|
||||||
using Routine = std::function<StatusCode(const Arguments<T>&,
|
|
||||||
const Buffer&, const Buffer&,
|
|
||||||
CommandQueue&)>;
|
|
||||||
|
|
||||||
// Constructor, initializes the base class tester and input data
|
|
||||||
TestXY(int argc, char *argv[], const bool silent,
|
|
||||||
const std::string &name, const std::vector<std::string> &options,
|
|
||||||
const Routine clblast_lambda, const Routine clblas_lambda);
|
|
||||||
|
|
||||||
// The test functions, taking no inputs
|
|
||||||
void TestRegular(Arguments<T> &args, const std::string &name);
|
|
||||||
void TestInvalidBufferSizes(Arguments<T> &args, const std::string &name);
|
|
||||||
|
|
||||||
private:
|
|
||||||
|
|
||||||
// Source data to test with
|
|
||||||
std::vector<T> x_source_;
|
|
||||||
std::vector<T> y_source_;
|
|
||||||
|
|
||||||
// The routines to test
|
|
||||||
Routine clblast_lambda_;
|
|
||||||
Routine clblas_lambda_;
|
|
||||||
};
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
} // namespace clblast
|
|
||||||
|
|
||||||
// CLBLAST_TEST_CORRECTNESS_TESTXY_H_
|
|
||||||
#endif
|
|
|
@ -21,249 +21,36 @@
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
// This is the vector-vector variant of the set-up/tear-down client routine.
|
// Constructor
|
||||||
template <typename T>
|
template <typename T, typename U>
|
||||||
void ClientXY(int argc, char *argv[], Routine2<T> client_routine,
|
Client<T,U>::Client(const Routine run_routine, const Routine run_reference,
|
||||||
const std::vector<std::string> &options) {
|
const std::vector<std::string> &options,
|
||||||
|
const GetMetric get_flops, const GetMetric get_bytes):
|
||||||
// Function to determine how to find the default value of the leading dimension of matrix A.
|
run_routine_(run_routine),
|
||||||
// Note: this is not relevant for this client but given anyway.
|
run_reference_(run_reference),
|
||||||
auto default_ld_a = [](const Arguments<T> args) { return args.n; };
|
options_(options),
|
||||||
|
get_flops_(get_flops),
|
||||||
// Simple command line argument parser with defaults
|
get_bytes_(get_bytes) {
|
||||||
auto args = ParseArguments<T>(argc, argv, options, default_ld_a);
|
|
||||||
if (args.print_help) { return; }
|
|
||||||
|
|
||||||
// Prints the header of the output table
|
|
||||||
PrintTableHeader(args.silent, options);
|
|
||||||
|
|
||||||
// Initializes OpenCL and the libraries
|
|
||||||
auto platform = Platform(args.platform_id);
|
|
||||||
auto device = Device(platform, kDeviceType, args.device_id);
|
|
||||||
auto context = Context(device);
|
|
||||||
auto queue = CommandQueue(context, device);
|
|
||||||
if (args.compare_clblas) { clblasSetup(); }
|
|
||||||
|
|
||||||
// Iterates over all "num_step" values jumping by "step" each time
|
|
||||||
auto s = size_t{0};
|
|
||||||
while(true) {
|
|
||||||
|
|
||||||
// Computes the data sizes
|
|
||||||
auto x_size = args.n*args.x_inc + args.x_offset;
|
|
||||||
auto y_size = args.n*args.y_inc + args.y_offset;
|
|
||||||
|
|
||||||
// Populates input host vectors with random data
|
|
||||||
std::vector<T> x_source(x_size);
|
|
||||||
std::vector<T> y_source(y_size);
|
|
||||||
PopulateVector(x_source);
|
|
||||||
PopulateVector(y_source);
|
|
||||||
|
|
||||||
// Creates the vectors on the device
|
|
||||||
auto x_buffer = Buffer(context, CL_MEM_READ_WRITE, x_size*sizeof(T));
|
|
||||||
auto y_buffer = Buffer(context, CL_MEM_READ_WRITE, y_size*sizeof(T));
|
|
||||||
x_buffer.WriteBuffer(queue, x_size*sizeof(T), x_source);
|
|
||||||
y_buffer.WriteBuffer(queue, y_size*sizeof(T), y_source);
|
|
||||||
|
|
||||||
// Runs the routine-specific code
|
|
||||||
client_routine(args, x_buffer, y_buffer, queue);
|
|
||||||
|
|
||||||
// Makes the jump to the next step
|
|
||||||
++s;
|
|
||||||
if (s >= args.num_steps) { break; }
|
|
||||||
args.n += args.step;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Cleans-up and returns
|
|
||||||
if (args.compare_clblas) { clblasTeardown(); }
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Compiles the above function
|
|
||||||
template void ClientXY<float>(int, char **, Routine2<float>, const std::vector<std::string>&);
|
|
||||||
template void ClientXY<double>(int, char **, Routine2<double>, const std::vector<std::string>&);
|
|
||||||
template void ClientXY<float2>(int, char **, Routine2<float2>, const std::vector<std::string>&);
|
|
||||||
template void ClientXY<double2>(int, char **, Routine2<double2>, const std::vector<std::string>&);
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
// This is the matrix-vector-vector variant of the set-up/tear-down client routine.
|
|
||||||
template <typename T>
|
|
||||||
void ClientAXY(int argc, char *argv[], Routine3<T> client_routine,
|
|
||||||
const std::vector<std::string> &options) {
|
|
||||||
|
|
||||||
// Function to determine how to find the default value of the leading dimension of matrix A
|
|
||||||
auto default_ld_a = [](const Arguments<T> args) { return args.n; };
|
|
||||||
|
|
||||||
// Simple command line argument parser with defaults
|
|
||||||
auto args = ParseArguments<T>(argc, argv, options, default_ld_a);
|
|
||||||
if (args.print_help) { return; }
|
|
||||||
|
|
||||||
// Prints the header of the output table
|
|
||||||
PrintTableHeader(args.silent, options);
|
|
||||||
|
|
||||||
// Initializes OpenCL and the libraries
|
|
||||||
auto platform = Platform(args.platform_id);
|
|
||||||
auto device = Device(platform, kDeviceType, args.device_id);
|
|
||||||
auto context = Context(device);
|
|
||||||
auto queue = CommandQueue(context, device);
|
|
||||||
if (args.compare_clblas) { clblasSetup(); }
|
|
||||||
|
|
||||||
// Iterates over all "num_step" values jumping by "step" each time
|
|
||||||
auto s = size_t{0};
|
|
||||||
while(true) {
|
|
||||||
|
|
||||||
// Computes the second dimension of the matrix taking the rotation into account
|
|
||||||
auto a_two = (args.layout == Layout::kRowMajor) ? args.m : args.n;
|
|
||||||
|
|
||||||
// Computes the vector sizes in case the matrix is transposed
|
|
||||||
auto a_transposed = (args.a_transpose == Transpose::kYes);
|
|
||||||
auto m_real = (a_transposed) ? args.n : args.m;
|
|
||||||
auto n_real = (a_transposed) ? args.m : args.n;
|
|
||||||
|
|
||||||
// Computes the data sizes
|
|
||||||
auto a_size = a_two * args.a_ld + args.a_offset;
|
|
||||||
auto x_size = n_real*args.x_inc + args.x_offset;
|
|
||||||
auto y_size = m_real*args.y_inc + args.y_offset;
|
|
||||||
|
|
||||||
// Populates input host vectors with random data
|
|
||||||
std::vector<T> a_source(a_size);
|
|
||||||
std::vector<T> x_source(x_size);
|
|
||||||
std::vector<T> y_source(y_size);
|
|
||||||
PopulateVector(a_source);
|
|
||||||
PopulateVector(x_source);
|
|
||||||
PopulateVector(y_source);
|
|
||||||
|
|
||||||
// Creates the vectors on the device
|
|
||||||
auto a_buffer = Buffer(context, CL_MEM_READ_WRITE, a_size*sizeof(T));
|
|
||||||
auto x_buffer = Buffer(context, CL_MEM_READ_WRITE, x_size*sizeof(T));
|
|
||||||
auto y_buffer = Buffer(context, CL_MEM_READ_WRITE, y_size*sizeof(T));
|
|
||||||
a_buffer.WriteBuffer(queue, a_size*sizeof(T), a_source);
|
|
||||||
x_buffer.WriteBuffer(queue, x_size*sizeof(T), x_source);
|
|
||||||
y_buffer.WriteBuffer(queue, y_size*sizeof(T), y_source);
|
|
||||||
|
|
||||||
// Runs the routine-specific code
|
|
||||||
client_routine(args, a_buffer, x_buffer, y_buffer, queue);
|
|
||||||
|
|
||||||
// Makes the jump to the next step
|
|
||||||
++s;
|
|
||||||
if (s >= args.num_steps) { break; }
|
|
||||||
args.m += args.step;
|
|
||||||
args.n += args.step;
|
|
||||||
args.a_ld += args.step;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Cleans-up and returns
|
|
||||||
if (args.compare_clblas) { clblasTeardown(); }
|
|
||||||
}
|
|
||||||
|
|
||||||
// Compiles the above function
|
|
||||||
template void ClientAXY<float>(int, char **, Routine3<float>, const std::vector<std::string>&);
|
|
||||||
template void ClientAXY<double>(int, char **, Routine3<double>, const std::vector<std::string>&);
|
|
||||||
template void ClientAXY<float2>(int, char **, Routine3<float2>, const std::vector<std::string>&);
|
|
||||||
template void ClientAXY<double2>(int, char **, Routine3<double2>, const std::vector<std::string>&);
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
// This is the matrix-matrix-matrix variant of the set-up/tear-down client routine.
|
|
||||||
template <typename T>
|
|
||||||
void ClientABC(int argc, char *argv[], Routine3<T> client_routine,
|
|
||||||
const std::vector<std::string> &options) {
|
|
||||||
|
|
||||||
// Function to determine how to find the default value of the leading dimension of matrix A
|
|
||||||
auto default_ld_a = [](const Arguments<T> args) { return args.m; };
|
|
||||||
|
|
||||||
// Simple command line argument parser with defaults
|
|
||||||
auto args = ParseArguments<T>(argc, argv, options, default_ld_a);
|
|
||||||
if (args.print_help) { return; }
|
|
||||||
|
|
||||||
// Prints the header of the output table
|
|
||||||
PrintTableHeader(args.silent, options);
|
|
||||||
|
|
||||||
// Initializes OpenCL and the libraries
|
|
||||||
auto platform = Platform(args.platform_id);
|
|
||||||
auto device = Device(platform, kDeviceType, args.device_id);
|
|
||||||
auto context = Context(device);
|
|
||||||
auto queue = CommandQueue(context, device);
|
|
||||||
if (args.compare_clblas) { clblasSetup(); }
|
|
||||||
|
|
||||||
// Computes whether or not the matrices are transposed. Note that we assume a default of
|
|
||||||
// column-major and no-transpose. If one of them is different (but not both), then rotated
|
|
||||||
// is considered true.
|
|
||||||
auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose == Transpose::kYes) ||
|
|
||||||
(args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
|
|
||||||
auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose == Transpose::kYes) ||
|
|
||||||
(args.layout == Layout::kRowMajor && args.b_transpose == Transpose::kNo);
|
|
||||||
auto c_rotated = (args.layout == Layout::kRowMajor);
|
|
||||||
|
|
||||||
// Iterates over all "num_step" values jumping by "step" each time
|
|
||||||
auto s = size_t{0};
|
|
||||||
while(true) {
|
|
||||||
|
|
||||||
// Computes the data sizes
|
|
||||||
auto a_two = (a_rotated) ? args.m : args.k;
|
|
||||||
auto b_two = (b_rotated) ? args.k : args.n;
|
|
||||||
auto c_two = (c_rotated) ? args.m : args.n;
|
|
||||||
auto a_size = a_two * args.a_ld + args.a_offset;
|
|
||||||
auto b_size = b_two * args.b_ld + args.b_offset;
|
|
||||||
auto c_size = c_two * args.c_ld + args.c_offset;
|
|
||||||
|
|
||||||
// Populates input host matrices with random data
|
|
||||||
std::vector<T> a_source(a_size);
|
|
||||||
std::vector<T> b_source(b_size);
|
|
||||||
std::vector<T> c_source(c_size);
|
|
||||||
PopulateVector(a_source);
|
|
||||||
PopulateVector(b_source);
|
|
||||||
PopulateVector(c_source);
|
|
||||||
|
|
||||||
// Creates the matrices on the device
|
|
||||||
auto a_buffer = Buffer(context, CL_MEM_READ_WRITE, a_size*sizeof(T));
|
|
||||||
auto b_buffer = Buffer(context, CL_MEM_READ_WRITE, b_size*sizeof(T));
|
|
||||||
auto c_buffer = Buffer(context, CL_MEM_READ_WRITE, c_size*sizeof(T));
|
|
||||||
a_buffer.WriteBuffer(queue, a_size*sizeof(T), a_source);
|
|
||||||
b_buffer.WriteBuffer(queue, b_size*sizeof(T), b_source);
|
|
||||||
c_buffer.WriteBuffer(queue, c_size*sizeof(T), c_source);
|
|
||||||
|
|
||||||
// Runs the routine-specific code
|
|
||||||
client_routine(args, a_buffer, b_buffer, c_buffer, queue);
|
|
||||||
|
|
||||||
// Makes the jump to the next step
|
|
||||||
++s;
|
|
||||||
if (s >= args.num_steps) { break; }
|
|
||||||
args.m += args.step;
|
|
||||||
args.n += args.step;
|
|
||||||
args.k += args.step;
|
|
||||||
args.a_ld += args.step;
|
|
||||||
args.b_ld += args.step;
|
|
||||||
args.c_ld += args.step;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Cleans-up and returns
|
|
||||||
if (args.compare_clblas) { clblasTeardown(); }
|
|
||||||
}
|
|
||||||
|
|
||||||
// Compiles the above function
|
|
||||||
template void ClientABC<float>(int, char **, Routine3<float>, const std::vector<std::string>&);
|
|
||||||
template void ClientABC<double>(int, char **, Routine3<double>, const std::vector<std::string>&);
|
|
||||||
template void ClientABC<float2>(int, char **, Routine3<float2>, const std::vector<std::string>&);
|
|
||||||
template void ClientABC<double2>(int, char **, Routine3<double2>, const std::vector<std::string>&);
|
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
// Parses all arguments available for the CLBlast client testers. Some arguments might not be
|
// Parses all arguments available for the CLBlast client testers. Some arguments might not be
|
||||||
// applicable, but are searched for anyway to be able to create one common argument parser. All
|
// applicable, but are searched for anyway to be able to create one common argument parser. All
|
||||||
// arguments have a default value in case they are not found.
|
// arguments have a default value in case they are not found.
|
||||||
template <typename T>
|
template <typename T, typename U>
|
||||||
Arguments<T> ParseArguments(int argc, char *argv[], const std::vector<std::string> &options,
|
Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const GetMetric default_a_ld,
|
||||||
const std::function<size_t(const Arguments<T>)> default_ld_a) {
|
const GetMetric default_b_ld, const GetMetric default_c_ld) {
|
||||||
auto args = Arguments<T>{};
|
auto args = Arguments<U>{};
|
||||||
auto help = std::string{"Options given/available:\n"};
|
auto help = std::string{"Options given/available:\n"};
|
||||||
|
|
||||||
// These are the options which are not for every client: they are optional
|
// These are the options which are not for every client: they are optional
|
||||||
for (auto &o: options) {
|
for (auto &o: options_) {
|
||||||
|
|
||||||
// Data-sizes
|
// Data-sizes
|
||||||
if (o == kArgM) { args.m = args.k = GetArgument(argc, argv, help, kArgM, 512UL); }
|
if (o == kArgM) { args.m = GetArgument(argc, argv, help, kArgM, 512UL); }
|
||||||
if (o == kArgN) { args.n = GetArgument(argc, argv, help, kArgN, 512UL); }
|
if (o == kArgN) { args.n = GetArgument(argc, argv, help, kArgN, 512UL); }
|
||||||
if (o == kArgK) { args.k = GetArgument(argc, argv, help, kArgK, 512UL); }
|
if (o == kArgK) { args.k = GetArgument(argc, argv, help, kArgK, 512UL); }
|
||||||
|
|
||||||
// Data-layouts
|
// Data-layouts
|
||||||
if (o == kArgLayout) { args.layout = GetArgument(argc, argv, help, kArgLayout, Layout::kRowMajor); }
|
if (o == kArgLayout) { args.layout = GetArgument(argc, argv, help, kArgLayout, Layout::kRowMajor); }
|
||||||
|
@ -271,6 +58,7 @@ Arguments<T> ParseArguments(int argc, char *argv[], const std::vector<std::strin
|
||||||
if (o == kArgBTransp) { args.b_transpose = GetArgument(argc, argv, help, kArgBTransp, Transpose::kNo); }
|
if (o == kArgBTransp) { args.b_transpose = GetArgument(argc, argv, help, kArgBTransp, Transpose::kNo); }
|
||||||
if (o == kArgSide) { args.side = GetArgument(argc, argv, help, kArgSide, Side::kLeft); }
|
if (o == kArgSide) { args.side = GetArgument(argc, argv, help, kArgSide, Side::kLeft); }
|
||||||
if (o == kArgTriangle) { args.triangle = GetArgument(argc, argv, help, kArgTriangle, Triangle::kUpper); }
|
if (o == kArgTriangle) { args.triangle = GetArgument(argc, argv, help, kArgTriangle, Triangle::kUpper); }
|
||||||
|
if (o == kArgDiagonal) { args.diagonal = GetArgument(argc, argv, help, kArgDiagonal, Diagonal::kUnit); }
|
||||||
|
|
||||||
// Vector arguments
|
// Vector arguments
|
||||||
if (o == kArgXInc) { args.x_inc = GetArgument(argc, argv, help, kArgXInc, size_t{1}); }
|
if (o == kArgXInc) { args.x_inc = GetArgument(argc, argv, help, kArgXInc, size_t{1}); }
|
||||||
|
@ -279,16 +67,16 @@ Arguments<T> ParseArguments(int argc, char *argv[], const std::vector<std::strin
|
||||||
if (o == kArgYOffset) { args.y_offset = GetArgument(argc, argv, help, kArgYOffset, size_t{0}); }
|
if (o == kArgYOffset) { args.y_offset = GetArgument(argc, argv, help, kArgYOffset, size_t{0}); }
|
||||||
|
|
||||||
// Matrix arguments
|
// Matrix arguments
|
||||||
if (o == kArgALeadDim) { args.a_ld = GetArgument(argc, argv, help, kArgALeadDim, default_ld_a(args)); }
|
if (o == kArgALeadDim) { args.a_ld = GetArgument(argc, argv, help, kArgALeadDim, default_a_ld(args)); }
|
||||||
if (o == kArgBLeadDim) { args.b_ld = GetArgument(argc, argv, help, kArgBLeadDim, args.n); }
|
if (o == kArgBLeadDim) { args.b_ld = GetArgument(argc, argv, help, kArgBLeadDim, default_b_ld(args)); }
|
||||||
if (o == kArgCLeadDim) { args.c_ld = GetArgument(argc, argv, help, kArgCLeadDim, args.n); }
|
if (o == kArgCLeadDim) { args.c_ld = GetArgument(argc, argv, help, kArgCLeadDim, default_c_ld(args)); }
|
||||||
if (o == kArgAOffset) { args.a_offset = GetArgument(argc, argv, help, kArgAOffset, size_t{0}); }
|
if (o == kArgAOffset) { args.a_offset = GetArgument(argc, argv, help, kArgAOffset, size_t{0}); }
|
||||||
if (o == kArgBOffset) { args.b_offset = GetArgument(argc, argv, help, kArgBOffset, size_t{0}); }
|
if (o == kArgBOffset) { args.b_offset = GetArgument(argc, argv, help, kArgBOffset, size_t{0}); }
|
||||||
if (o == kArgCOffset) { args.c_offset = GetArgument(argc, argv, help, kArgCOffset, size_t{0}); }
|
if (o == kArgCOffset) { args.c_offset = GetArgument(argc, argv, help, kArgCOffset, size_t{0}); }
|
||||||
|
|
||||||
// Scalar values
|
// Scalar values
|
||||||
if (o == kArgAlpha) { args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar<T>()); }
|
if (o == kArgAlpha) { args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar<U>()); }
|
||||||
if (o == kArgBeta) { args.beta = GetArgument(argc, argv, help, kArgBeta, GetScalar<T>()); }
|
if (o == kArgBeta) { args.beta = GetArgument(argc, argv, help, kArgBeta, GetScalar<U>()); }
|
||||||
}
|
}
|
||||||
|
|
||||||
// These are the options common to all routines
|
// These are the options common to all routines
|
||||||
|
@ -313,16 +101,92 @@ Arguments<T> ParseArguments(int argc, char *argv[], const std::vector<std::strin
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
|
// This is the main performance tester
|
||||||
|
template <typename T, typename U>
|
||||||
|
void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes) {
|
||||||
|
|
||||||
|
// Prints the header of the output table
|
||||||
|
PrintTableHeader(args.silent, options_);
|
||||||
|
|
||||||
|
// Initializes OpenCL and the libraries
|
||||||
|
auto platform = Platform(args.platform_id);
|
||||||
|
auto device = Device(platform, kDeviceType, args.device_id);
|
||||||
|
auto context = Context(device);
|
||||||
|
auto queue = CommandQueue(context, device);
|
||||||
|
if (args.compare_clblas) { clblasSetup(); }
|
||||||
|
|
||||||
|
// Iterates over all "num_step" values jumping by "step" each time
|
||||||
|
auto s = size_t{0};
|
||||||
|
while(true) {
|
||||||
|
|
||||||
|
// Sets the buffer sizes (routine-specific)
|
||||||
|
set_sizes(args);
|
||||||
|
|
||||||
|
// Populates input host matrices with random data
|
||||||
|
std::vector<T> x_source(args.x_size);
|
||||||
|
std::vector<T> y_source(args.y_size);
|
||||||
|
std::vector<T> a_source(args.a_size);
|
||||||
|
std::vector<T> b_source(args.b_size);
|
||||||
|
std::vector<T> c_source(args.c_size);
|
||||||
|
PopulateVector(x_source);
|
||||||
|
PopulateVector(y_source);
|
||||||
|
PopulateVector(a_source);
|
||||||
|
PopulateVector(b_source);
|
||||||
|
PopulateVector(c_source);
|
||||||
|
|
||||||
|
// Creates the matrices on the device
|
||||||
|
auto x_vec = Buffer(context, CL_MEM_READ_WRITE, args.x_size*sizeof(T));
|
||||||
|
auto y_vec = Buffer(context, CL_MEM_READ_WRITE, args.y_size*sizeof(T));
|
||||||
|
auto a_mat = Buffer(context, CL_MEM_READ_WRITE, args.a_size*sizeof(T));
|
||||||
|
auto b_mat = Buffer(context, CL_MEM_READ_WRITE, args.b_size*sizeof(T));
|
||||||
|
auto c_mat = Buffer(context, CL_MEM_READ_WRITE, args.c_size*sizeof(T));
|
||||||
|
x_vec.WriteBuffer(queue, args.x_size*sizeof(T), x_source);
|
||||||
|
y_vec.WriteBuffer(queue, args.y_size*sizeof(T), y_source);
|
||||||
|
a_mat.WriteBuffer(queue, args.a_size*sizeof(T), a_source);
|
||||||
|
b_mat.WriteBuffer(queue, args.b_size*sizeof(T), b_source);
|
||||||
|
c_mat.WriteBuffer(queue, args.c_size*sizeof(T), c_source);
|
||||||
|
auto buffers = Buffers{x_vec, y_vec, a_mat, b_mat, c_mat};
|
||||||
|
|
||||||
|
// Runs the routines and collects the timings
|
||||||
|
auto ms_clblast = TimedExecution(args.num_runs, args, buffers, queue, run_routine_, "CLBlast");
|
||||||
|
auto ms_clblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference_, "clBLAS");
|
||||||
|
|
||||||
|
// Prints the performance of both libraries
|
||||||
|
PrintTableRow(args, ms_clblast, ms_clblas);
|
||||||
|
|
||||||
|
// Makes the jump to the next step
|
||||||
|
++s;
|
||||||
|
if (s >= args.num_steps) { break; }
|
||||||
|
args.m += args.step;
|
||||||
|
args.n += args.step;
|
||||||
|
args.k += args.step;
|
||||||
|
args.a_ld += args.step;
|
||||||
|
args.b_ld += args.step;
|
||||||
|
args.c_ld += args.step;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Cleans-up and returns
|
||||||
|
if (args.compare_clblas) { clblasTeardown(); }
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
// Creates a vector of timing results, filled with execution times of the 'main computation'. The
|
// Creates a vector of timing results, filled with execution times of the 'main computation'. The
|
||||||
// timing is performed using the milliseconds chrono functions. The function returns the minimum
|
// timing is performed using the milliseconds chrono functions. The function returns the minimum
|
||||||
// value found in the vector of timing results. The return value is in milliseconds.
|
// value found in the vector of timing results. The return value is in milliseconds.
|
||||||
double TimedExecution(const size_t num_runs, std::function<void()> main_computation) {
|
template <typename T, typename U>
|
||||||
|
double Client<T,U>::TimedExecution(const size_t num_runs, const Arguments<U> &args,
|
||||||
|
const Buffers &buffers, CommandQueue &queue,
|
||||||
|
Routine run_blas, const std::string &library_name) {
|
||||||
auto timings = std::vector<double>(num_runs);
|
auto timings = std::vector<double>(num_runs);
|
||||||
for (auto &timing: timings) {
|
for (auto &timing: timings) {
|
||||||
auto start_time = std::chrono::steady_clock::now();
|
auto start_time = std::chrono::steady_clock::now();
|
||||||
|
|
||||||
// Executes the main computation
|
// Executes the main computation
|
||||||
main_computation();
|
auto status = run_blas(args, buffers, queue);
|
||||||
|
if (status != StatusCode::kSuccess) {
|
||||||
|
throw std::runtime_error(library_name+" error: "+ToString(static_cast<int>(status)));
|
||||||
|
}
|
||||||
|
|
||||||
// Records and stores the end-time
|
// Records and stores the end-time
|
||||||
auto elapsed_time = std::chrono::steady_clock::now() - start_time;
|
auto elapsed_time = std::chrono::steady_clock::now() - start_time;
|
||||||
|
@ -334,7 +198,8 @@ double TimedExecution(const size_t num_runs, std::function<void()> main_computat
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
// Prints the header of the performance table
|
// Prints the header of the performance table
|
||||||
void PrintTableHeader(const bool silent, const std::vector<std::string> &args) {
|
template <typename T, typename U>
|
||||||
|
void Client<T,U>::PrintTableHeader(const bool silent, const std::vector<std::string> &args) {
|
||||||
if (!silent) {
|
if (!silent) {
|
||||||
for (auto i=size_t{0}; i<args.size(); ++i) { fprintf(stdout, "%9s ", ""); }
|
for (auto i=size_t{0}; i<args.size(); ++i) { fprintf(stdout, "%9s ", ""); }
|
||||||
fprintf(stdout, " | <-- CLBlast --> | <-- clBLAS --> |\n");
|
fprintf(stdout, " | <-- CLBlast --> | <-- clBLAS --> |\n");
|
||||||
|
@ -345,29 +210,60 @@ void PrintTableHeader(const bool silent, const std::vector<std::string> &args) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Print a performance-result row
|
// Print a performance-result row
|
||||||
void PrintTableRow(const std::vector<size_t> &args_int, const std::vector<std::string> &args_string,
|
template <typename T, typename U>
|
||||||
const bool no_abbrv, const double ms_clblast, const double ms_clblas,
|
void Client<T,U>::PrintTableRow(const Arguments<U>& args, const double ms_clblast,
|
||||||
const unsigned long long flops, const unsigned long long bytes) {
|
const double ms_clblas) {
|
||||||
|
|
||||||
|
// Creates a vector of relevant variables
|
||||||
|
auto integers = std::vector<size_t>{};
|
||||||
|
for (auto &o: options_) {
|
||||||
|
if (o == kArgM) { integers.push_back(args.m); }
|
||||||
|
if (o == kArgN) { integers.push_back(args.n); }
|
||||||
|
else if (o == kArgK) { integers.push_back(args.k); }
|
||||||
|
else if (o == kArgLayout) { integers.push_back(static_cast<size_t>(args.layout)); }
|
||||||
|
else if (o == kArgSide) { integers.push_back(static_cast<size_t>(args.side)); }
|
||||||
|
else if (o == kArgTriangle) { integers.push_back(static_cast<size_t>(args.triangle)); }
|
||||||
|
else if (o == kArgATransp) { integers.push_back(static_cast<size_t>(args.a_transpose)); }
|
||||||
|
else if (o == kArgBTransp) { integers.push_back(static_cast<size_t>(args.b_transpose)); }
|
||||||
|
else if (o == kArgDiagonal) { integers.push_back(static_cast<size_t>(args.diagonal)); }
|
||||||
|
else if (o == kArgXInc) { integers.push_back(args.x_inc); }
|
||||||
|
else if (o == kArgYInc) { integers.push_back(args.y_inc); }
|
||||||
|
else if (o == kArgXOffset) { integers.push_back(args.x_offset); }
|
||||||
|
else if (o == kArgYOffset) { integers.push_back(args.y_offset); }
|
||||||
|
else if (o == kArgALeadDim) { integers.push_back(args.a_ld); }
|
||||||
|
else if (o == kArgBLeadDim) { integers.push_back(args.b_ld); }
|
||||||
|
else if (o == kArgCLeadDim) { integers.push_back(args.c_ld); }
|
||||||
|
else if (o == kArgAOffset) { integers.push_back(args.a_offset); }
|
||||||
|
else if (o == kArgBOffset) { integers.push_back(args.b_offset); }
|
||||||
|
else if (o == kArgCOffset) { integers.push_back(args.c_offset); }
|
||||||
|
}
|
||||||
|
auto strings = std::vector<std::string>{};
|
||||||
|
for (auto &o: options_) {
|
||||||
|
if (o == kArgAlpha) { strings.push_back(ToString(args.alpha)); }
|
||||||
|
else if (o == kArgBeta) { strings.push_back(ToString(args.beta)); }
|
||||||
|
}
|
||||||
|
|
||||||
// Computes the GFLOPS and GB/s metrics
|
// Computes the GFLOPS and GB/s metrics
|
||||||
|
auto flops = get_flops_(args);
|
||||||
|
auto bytes = get_bytes_(args);
|
||||||
auto gflops_clblast = (ms_clblast != 0.0) ? (flops*1e-6)/ms_clblast : 0;
|
auto gflops_clblast = (ms_clblast != 0.0) ? (flops*1e-6)/ms_clblast : 0;
|
||||||
auto gflops_clblas = (ms_clblas != 0.0) ? (flops*1e-6)/ms_clblas: 0;
|
auto gflops_clblas = (ms_clblas != 0.0) ? (flops*1e-6)/ms_clblas: 0;
|
||||||
auto gbs_clblast = (ms_clblast != 0.0) ? (bytes*1e-6)/ms_clblast : 0;
|
auto gbs_clblast = (ms_clblast != 0.0) ? (bytes*1e-6)/ms_clblast : 0;
|
||||||
auto gbs_clblas = (ms_clblas != 0.0) ? (bytes*1e-6)/ms_clblas: 0;
|
auto gbs_clblas = (ms_clblas != 0.0) ? (bytes*1e-6)/ms_clblas: 0;
|
||||||
|
|
||||||
// Outputs the argument values
|
// Outputs the argument values
|
||||||
for (auto &argument: args_int) {
|
for (auto &argument: integers) {
|
||||||
if (!no_abbrv && argument >= 1024*1024 && IsMultiple(argument, 1024*1024)) {
|
if (!args.no_abbrv && argument >= 1024*1024 && IsMultiple(argument, 1024*1024)) {
|
||||||
fprintf(stdout, "%8luM;", argument/(1024*1024));
|
fprintf(stdout, "%8luM;", argument/(1024*1024));
|
||||||
}
|
}
|
||||||
else if (!no_abbrv && argument >= 1024 && IsMultiple(argument, 1024)) {
|
else if (!args.no_abbrv && argument >= 1024 && IsMultiple(argument, 1024)) {
|
||||||
fprintf(stdout, "%8luK;", argument/1024);
|
fprintf(stdout, "%8luK;", argument/1024);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
fprintf(stdout, "%9lu;", argument);
|
fprintf(stdout, "%9lu;", argument);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (auto &argument: args_string) {
|
for (auto &argument: strings) {
|
||||||
fprintf(stdout, "%9s;", argument.c_str());
|
fprintf(stdout, "%9s;", argument.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -377,5 +273,15 @@ void PrintTableRow(const std::vector<size_t> &args_int, const std::vector<std::s
|
||||||
ms_clblas, gflops_clblas, gbs_clblas);
|
ms_clblas, gflops_clblas, gbs_clblas);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Compiles the templated class
|
||||||
|
template class Client<float,float>;
|
||||||
|
template class Client<double,double>;
|
||||||
|
template class Client<float2,float2>;
|
||||||
|
template class Client<double2,double2>;
|
||||||
|
template class Client<float2,float>;
|
||||||
|
template class Client<double2,double>;
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
} // namespace clblast
|
} // namespace clblast
|
||||||
|
|
|
@ -7,7 +7,14 @@
|
||||||
// Author(s):
|
// Author(s):
|
||||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
//
|
//
|
||||||
// This file provides common function declarations to be used with the test clients.
|
// This class implements the performance-test client. It is generic for all CLBlast routines by
|
||||||
|
// taking a number of routine-specific functions as arguments, such as how to compute buffer sizes
|
||||||
|
// or how to get the FLOPS count.
|
||||||
|
// Typename T: the data-type of the routine's memory buffers (==precision)
|
||||||
|
// Typename U: the data-type of the alpha and beta arguments
|
||||||
|
//
|
||||||
|
// This file also provides the common interface to the performance client (see the 'RunClient'
|
||||||
|
// function for details).
|
||||||
//
|
//
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
|
@ -26,61 +33,71 @@
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
// Types of devices to consider
|
// See comment at top of file for a description of the class
|
||||||
const cl_device_type kDeviceType = CL_DEVICE_TYPE_ALL;
|
template <typename T, typename U>
|
||||||
|
class Client {
|
||||||
|
public:
|
||||||
|
|
||||||
|
// Types of devices to consider
|
||||||
|
const cl_device_type kDeviceType = CL_DEVICE_TYPE_ALL;
|
||||||
|
|
||||||
|
// Shorthand for the routine-specific functions passed to the tester
|
||||||
|
using Routine = std::function<StatusCode(const Arguments<U>&, const Buffers&, CommandQueue&)>;
|
||||||
|
using SetMetric = std::function<void(Arguments<U>&)>;
|
||||||
|
using GetMetric = std::function<size_t(const Arguments<U>&)>;
|
||||||
|
|
||||||
|
// The constructor
|
||||||
|
Client(const Routine run_routine, const Routine run_reference,
|
||||||
|
const std::vector<std::string> &options,
|
||||||
|
const GetMetric get_flops, const GetMetric get_bytes);
|
||||||
|
|
||||||
|
// Parses all command-line arguments, filling in the arguments structure. If no command-line
|
||||||
|
// argument is given for a particular argument, it is filled in with a default value.
|
||||||
|
Arguments<U> ParseArguments(int argc, char *argv[], const GetMetric default_a_ld,
|
||||||
|
const GetMetric default_b_ld, const GetMetric default_c_ld);
|
||||||
|
|
||||||
|
// The main client function, setting-up arguments, matrices, OpenCL buffers, etc. After set-up, it
|
||||||
|
// calls the client routines.
|
||||||
|
void PerformanceTest(Arguments<U> &args, const SetMetric set_sizes);
|
||||||
|
|
||||||
|
private:
|
||||||
|
|
||||||
|
// Runs a function a given number of times and returns the execution time of the shortest instance
|
||||||
|
double TimedExecution(const size_t num_runs, const Arguments<U> &args, const Buffers &buffers,
|
||||||
|
CommandQueue &queue, Routine run_blas, const std::string &library_name);
|
||||||
|
|
||||||
|
// Prints the header of a performance-data table
|
||||||
|
void PrintTableHeader(const bool silent, const std::vector<std::string> &args);
|
||||||
|
|
||||||
|
// Prints a row of performance data, including results of two libraries
|
||||||
|
void PrintTableRow(const Arguments<U>& args, const double ms_clblast, const double ms_clblas);
|
||||||
|
|
||||||
|
// The routine-specific functions passed to the tester
|
||||||
|
const Routine run_routine_;
|
||||||
|
const Routine run_reference_;
|
||||||
|
const std::vector<std::string> options_;
|
||||||
|
const GetMetric get_flops_;
|
||||||
|
const GetMetric get_bytes_;
|
||||||
|
};
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
// Shorthand for a BLAS routine with 2 or 3 OpenCL buffers as argument
|
// The interface to the performance client. This is a separate function in the header such that it
|
||||||
template <typename T>
|
// is automatically compiled for each routine, templated by the parameter "C".
|
||||||
using Routine2 = std::function<void(const Arguments<T>&,
|
template <typename C, typename T, typename U>
|
||||||
const Buffer&, const Buffer&,
|
void RunClient(int argc, char *argv[]) {
|
||||||
CommandQueue&)>;
|
|
||||||
template <typename T>
|
|
||||||
using Routine3 = std::function<void(const Arguments<T>&,
|
|
||||||
const Buffer&, const Buffer&, const Buffer&,
|
|
||||||
CommandQueue&)>;
|
|
||||||
|
|
||||||
// =================================================================================================
|
// Creates a new client
|
||||||
|
auto client = Client<T,U>(C::RunRoutine, C::RunReference, C::GetOptions(),
|
||||||
|
C::GetFlops, C::GetBytes);
|
||||||
|
|
||||||
// These are the main client functions, setting-up arguments, matrices, OpenCL buffers, etc. After
|
// Simple command line argument parser with defaults
|
||||||
// set-up, they call the client routine, passed as argument to this function.
|
auto args = client.ParseArguments(argc, argv, C::DefaultLDA, C::DefaultLDB, C::DefaultLDC);
|
||||||
template <typename T>
|
if (args.print_help) { return; }
|
||||||
void ClientXY(int argc, char *argv[], Routine2<T> client_routine,
|
|
||||||
const std::vector<std::string> &options);
|
|
||||||
template <typename T>
|
|
||||||
void ClientAXY(int argc, char *argv[], Routine3<T> client_routine,
|
|
||||||
const std::vector<std::string> &options);
|
|
||||||
template <typename T>
|
|
||||||
void ClientABC(int argc, char *argv[], Routine3<T> client_routine,
|
|
||||||
const std::vector<std::string> &options);
|
|
||||||
|
|
||||||
// =================================================================================================
|
// Runs the client
|
||||||
|
client.PerformanceTest(args, C::SetSizes);
|
||||||
// Parses all command-line arguments, filling in the arguments structure. If no command-line
|
}
|
||||||
// argument is given for a particular argument, it is filled in with a default value.
|
|
||||||
template <typename T>
|
|
||||||
Arguments<T> ParseArguments(int argc, char *argv[], const std::vector<std::string> &options,
|
|
||||||
const std::function<size_t(const Arguments<T>)> default_ld_a);
|
|
||||||
|
|
||||||
// Retrieves only the precision command-line argument, since the above function is templated based
|
|
||||||
// on the precision
|
|
||||||
Precision GetPrecision(int argc, char *argv[]);
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
// Runs a function a given number of times and returns the execution time of the shortest instance
|
|
||||||
double TimedExecution(const size_t num_runs, std::function<void()> main_computation);
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
// Prints the header of a performance-data table
|
|
||||||
void PrintTableHeader(const bool silent, const std::vector<std::string> &args);
|
|
||||||
|
|
||||||
// Prints a row of performance data, including results of two libraries
|
|
||||||
void PrintTableRow(const std::vector<size_t> &args_int, const std::vector<std::string> &args_string,
|
|
||||||
const bool abbreviations, const double ms_clblast, const double ms_clblas,
|
|
||||||
const unsigned long long flops, const unsigned long long bytes);
|
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
} // namespace clblast
|
} // namespace clblast
|
||||||
|
|
|
@ -83,7 +83,16 @@ main <- function(routine_name, precision, test_names, test_values,
|
||||||
params_string <- paste(parameters, params_values[[command_id]], collapse=" ")
|
params_string <- paste(parameters, params_values[[command_id]], collapse=" ")
|
||||||
arguments <- paste(devices_string, params_string, options_string, sep=" ")
|
arguments <- paste(devices_string, params_string, options_string, sep=" ")
|
||||||
print(paste("Running", executable, arguments, sep=" "))
|
print(paste("Running", executable, arguments, sep=" "))
|
||||||
result_string <- system2(command=executable, args=arguments, stdout=TRUE)
|
raw_result_string <- system2(command=executable, args=arguments, stdout=TRUE)
|
||||||
|
|
||||||
|
# Filter the string: only lines containing a ";" can be valid lines
|
||||||
|
result_string <- c()
|
||||||
|
for (line in raw_result_string) {
|
||||||
|
if (grepl(";",line)) {
|
||||||
|
result_string <-
|
||||||
|
c(result_string, line)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
# Reads the result into a dataframe
|
# Reads the result into a dataframe
|
||||||
command_db <- read.csv(text=result_string, sep=";")
|
command_db <- read.csv(text=result_string, sep=";")
|
||||||
|
|
|
@ -35,10 +35,10 @@ test_names <- list(
|
||||||
|
|
||||||
# Defines the test-cases
|
# Defines the test-cases
|
||||||
test_values <- list(
|
test_values <- list(
|
||||||
list(c(128, 128, 128, 0, 0, 0, 16, 128, num_runs, precision)),
|
list(c( 128, 128, 128, 1, 0, 0, 16, 128, num_runs, precision)),
|
||||||
list(c(129, 129, 129, 0, 0, 0, 16, 128, num_runs, precision)),
|
list(c( 129, 129, 129, 1, 0, 0, 16, 128, num_runs, precision)),
|
||||||
list(c(512, 512, 512, 0, 0, 0, 16, 1, num_runs, precision)),
|
list(c( 512, 512, 512, 1, 0, 0, 16, 1, num_runs, precision)),
|
||||||
list(c(2048, 2048, 2048, 0, 0, 0, 16, 1, num_runs, precision)),
|
list(c(2048, 2048, 2048, 1, 0, 0, 16, 1, num_runs, precision)),
|
||||||
list(
|
list(
|
||||||
c(1024, 1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
|
c(1024, 1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
|
||||||
c(1024, 1024, 1024, 0, 0, 1, 1, 0, num_runs, precision),
|
c(1024, 1024, 1024, 0, 0, 1, 1, 0, num_runs, precision),
|
||||||
|
@ -50,17 +50,17 @@ test_values <- list(
|
||||||
c(1024, 1024, 1024, 1, 1, 1, 1, 0, num_runs, precision)
|
c(1024, 1024, 1024, 1, 1, 1, 1, 0, num_runs, precision)
|
||||||
),
|
),
|
||||||
list(
|
list(
|
||||||
c(8, 8, 8, 0, 0, 0, 1, 0, num_runs, precision),
|
c( 8, 8, 8, 1, 0, 0, 1, 0, num_runs, precision),
|
||||||
c(16, 16, 16, 0, 0, 0, 1, 0, num_runs, precision),
|
c( 16, 16, 16, 1, 0, 0, 1, 0, num_runs, precision),
|
||||||
c(32, 32, 32, 0, 0, 0, 1, 0, num_runs, precision),
|
c( 32, 32, 32, 1, 0, 0, 1, 0, num_runs, precision),
|
||||||
c(64, 64, 64, 0, 0, 0, 1, 0, num_runs, precision),
|
c( 64, 64, 64, 1, 0, 0, 1, 0, num_runs, precision),
|
||||||
c(128, 128, 128, 0, 0, 0, 1, 0, num_runs, precision),
|
c( 128, 128, 128, 1, 0, 0, 1, 0, num_runs, precision),
|
||||||
c(256, 256, 256, 0, 0, 0, 1, 0, num_runs, precision),
|
c( 256, 256, 256, 1, 0, 0, 1, 0, num_runs, precision),
|
||||||
c(512, 512, 512, 0, 0, 0, 1, 0, num_runs, precision),
|
c( 512, 512, 512, 1, 0, 0, 1, 0, num_runs, precision),
|
||||||
c(1024, 1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
|
c(1024, 1024, 1024, 1, 0, 0, 1, 0, num_runs, precision),
|
||||||
c(2048, 2048, 2048, 0, 0, 0, 1, 0, num_runs, precision),
|
c(2048, 2048, 2048, 1, 0, 0, 1, 0, num_runs, precision),
|
||||||
c(4096, 4096, 4096, 0, 0, 0, 1, 0, num_runs, precision),
|
c(4096, 4096, 4096, 1, 0, 0, 1, 0, num_runs, precision),
|
||||||
c(8192, 8192, 8192, 0, 0, 0, 1, 0, num_runs, precision)
|
c(8192, 8192, 8192, 1, 0, 0, 1, 0, num_runs, precision)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
@ -19,7 +19,7 @@ source(file.path(dirname(thisfile), "common.r"))
|
||||||
|
|
||||||
# Settings
|
# Settings
|
||||||
routine_name <- "xsymm"
|
routine_name <- "xsymm"
|
||||||
parameters <- c("-m","-n","-layout","-triangle","-side",
|
parameters <- c("-m","-n","-layout","-side","-triangle",
|
||||||
"-num_steps","-step","-runs","-precision")
|
"-num_steps","-step","-runs","-precision")
|
||||||
precision <- 32
|
precision <- 32
|
||||||
|
|
||||||
|
@ -29,16 +29,16 @@ test_names <- list(
|
||||||
"multiples of 128 (+1)",
|
"multiples of 128 (+1)",
|
||||||
"around m=n=512",
|
"around m=n=512",
|
||||||
"around m=n=2048",
|
"around m=n=2048",
|
||||||
"layouts and triangle/side (m=n=1024)",
|
"layouts and side/triangle (m=n=1024)",
|
||||||
"powers of 2"
|
"powers of 2"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Defines the test-cases
|
# Defines the test-cases
|
||||||
test_values <- list(
|
test_values <- list(
|
||||||
list(c(128, 128, 0, 0, 0, 16, 128, num_runs, precision)),
|
list(c( 128, 128, 1, 0, 0, 16, 128, num_runs, precision)),
|
||||||
list(c(129, 129, 0, 0, 0, 16, 128, num_runs, precision)),
|
list(c( 129, 129, 1, 0, 0, 16, 128, num_runs, precision)),
|
||||||
list(c(512, 512, 0, 0, 0, 16, 1, num_runs, precision)),
|
list(c( 512, 512, 1, 0, 0, 16, 1, num_runs, precision)),
|
||||||
list(c(2048, 2048, 0, 0, 0, 16, 1, num_runs, precision)),
|
list(c(2048, 2048, 1, 0, 0, 16, 1, num_runs, precision)),
|
||||||
list(
|
list(
|
||||||
c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
|
c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
|
||||||
c(1024, 1024, 0, 0, 1, 1, 0, num_runs, precision),
|
c(1024, 1024, 0, 0, 1, 1, 0, num_runs, precision),
|
||||||
|
@ -50,17 +50,17 @@ test_values <- list(
|
||||||
c(1024, 1024, 1, 1, 1, 1, 0, num_runs, precision)
|
c(1024, 1024, 1, 1, 1, 1, 0, num_runs, precision)
|
||||||
),
|
),
|
||||||
list(
|
list(
|
||||||
c(8, 8, 0, 0, 0, 1, 0, num_runs, precision),
|
c( 8, 8, 1, 0, 0, 1, 0, num_runs, precision),
|
||||||
c(16, 16, 0, 0, 0, 1, 0, num_runs, precision),
|
c( 16, 16, 1, 0, 0, 1, 0, num_runs, precision),
|
||||||
c(32, 32, 0, 0, 0, 1, 0, num_runs, precision),
|
c( 32, 32, 1, 0, 0, 1, 0, num_runs, precision),
|
||||||
c(64, 64, 0, 0, 0, 1, 0, num_runs, precision),
|
c( 64, 64, 1, 0, 0, 1, 0, num_runs, precision),
|
||||||
c(128, 128, 0, 0, 0, 1, 0, num_runs, precision),
|
c( 128, 128, 1, 0, 0, 1, 0, num_runs, precision),
|
||||||
c(256, 256, 0, 0, 0, 1, 0, num_runs, precision),
|
c( 256, 256, 1, 0, 0, 1, 0, num_runs, precision),
|
||||||
c(512, 512, 0, 0, 0, 1, 0, num_runs, precision),
|
c( 512, 512, 1, 0, 0, 1, 0, num_runs, precision),
|
||||||
c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
|
c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision),
|
||||||
c(2048, 2048, 0, 0, 0, 1, 0, num_runs, precision),
|
c(2048, 2048, 1, 0, 0, 1, 0, num_runs, precision),
|
||||||
c(4096, 4096, 0, 0, 0, 1, 0, num_runs, precision),
|
c(4096, 4096, 1, 0, 0, 1, 0, num_runs, precision),
|
||||||
c(8192, 8192, 0, 0, 0, 1, 0, num_runs, precision)
|
c(8192, 8192, 1, 0, 0, 1, 0, num_runs, precision)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -70,7 +70,7 @@ test_xlabels <- list(
|
||||||
"matrix sizes (m=n)",
|
"matrix sizes (m=n)",
|
||||||
"matrix sizes (m=n)",
|
"matrix sizes (m=n)",
|
||||||
"matrix sizes (m=n)",
|
"matrix sizes (m=n)",
|
||||||
"layout (row/col), triangle (up/lo), side (l/r)",
|
"layout (row/col), side (l/r), triangle (up/lo)",
|
||||||
"matrix sizes (m=n)"
|
"matrix sizes (m=n)"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -80,8 +80,8 @@ test_xaxis <- list(
|
||||||
c("m", ""),
|
c("m", ""),
|
||||||
c("m", ""),
|
c("m", ""),
|
||||||
c("m", ""),
|
c("m", ""),
|
||||||
list(1:8, c("row,up,l", "row,up,r", "row,lo,l", "row,lo,r",
|
list(1:8, c("row,l,up", "row,r,up", "row,l,lo", "row,r,lo",
|
||||||
"col,up,l", "col,up,r", "col,lo,l", "col,lo,r")),
|
"col,l,up", "col,r,up", "col,l,lo", "col,r,lo")),
|
||||||
c("m", "x")
|
c("m", "x")
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
94
test/performance/graphs/xsyr2k.r
Normal file
94
test/performance/graphs/xsyr2k.r
Normal file
|
@ -0,0 +1,94 @@
|
||||||
|
|
||||||
|
# ==================================================================================================
|
||||||
|
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
# project uses a tab-size of two spaces and a max-width of 100 characters per line.
|
||||||
|
#
|
||||||
|
# Author(s):
|
||||||
|
# Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
#
|
||||||
|
# This file implements the performance script for the Xsyr2k routine
|
||||||
|
#
|
||||||
|
# ==================================================================================================
|
||||||
|
|
||||||
|
# Includes the common functions
|
||||||
|
args <- commandArgs(trailingOnly = FALSE)
|
||||||
|
thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)])))
|
||||||
|
source(file.path(dirname(thisfile), "common.r"))
|
||||||
|
|
||||||
|
# ==================================================================================================
|
||||||
|
|
||||||
|
# Settings
|
||||||
|
routine_name <- "xsyr2k"
|
||||||
|
parameters <- c("-n","-k","-layout","-triangle","-transA",
|
||||||
|
"-num_steps","-step","-runs","-precision")
|
||||||
|
precision <- 32
|
||||||
|
|
||||||
|
# Sets the names of the test-cases
|
||||||
|
test_names <- list(
|
||||||
|
"multiples of 128",
|
||||||
|
"multiples of 128 (+1)",
|
||||||
|
"around n=k=512",
|
||||||
|
"around n=k=1536",
|
||||||
|
"layouts and transposing (n=k=1024)",
|
||||||
|
"powers of 2"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Defines the test-cases
|
||||||
|
test_values <- list(
|
||||||
|
list(c( 128, 128, 1, 0, 0, 16, 128, num_runs, precision)),
|
||||||
|
list(c( 129, 129, 1, 0, 0, 16, 128, num_runs, precision)),
|
||||||
|
list(c( 512, 512, 1, 0, 0, 16, 1, num_runs, precision)),
|
||||||
|
list(c(1536, 1536, 1, 0, 0, 16, 1, num_runs, precision)),
|
||||||
|
list(
|
||||||
|
c(1024, 1024, 0, 0, 0, 1, 0, num_runs, precision),
|
||||||
|
c(1024, 1024, 0, 0, 1, 1, 0, num_runs, precision),
|
||||||
|
c(1024, 1024, 0, 1, 0, 1, 0, num_runs, precision),
|
||||||
|
c(1024, 1024, 0, 1, 1, 1, 0, num_runs, precision),
|
||||||
|
c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision),
|
||||||
|
c(1024, 1024, 1, 0, 1, 1, 0, num_runs, precision),
|
||||||
|
c(1024, 1024, 1, 1, 0, 1, 0, num_runs, precision),
|
||||||
|
c(1024, 1024, 1, 1, 1, 1, 0, num_runs, precision)
|
||||||
|
),
|
||||||
|
list(
|
||||||
|
c( 8, 8, 1, 0, 0, 1, 0, num_runs, precision),
|
||||||
|
c( 16, 16, 1, 0, 0, 1, 0, num_runs, precision),
|
||||||
|
c( 32, 32, 1, 0, 0, 1, 0, num_runs, precision),
|
||||||
|
c( 64, 64, 1, 0, 0, 1, 0, num_runs, precision),
|
||||||
|
c( 128, 128, 1, 0, 0, 1, 0, num_runs, precision),
|
||||||
|
c( 256, 256, 1, 0, 0, 1, 0, num_runs, precision),
|
||||||
|
c( 512, 512, 1, 0, 0, 1, 0, num_runs, precision),
|
||||||
|
c(1024, 1024, 1, 0, 0, 1, 0, num_runs, precision),
|
||||||
|
c(2048, 2048, 1, 0, 0, 1, 0, num_runs, precision),
|
||||||
|
c(4096, 4096, 1, 0, 0, 1, 0, num_runs, precision),
|
||||||
|
c(8192, 8192, 1, 0, 0, 1, 0, num_runs, precision)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Defines the x-labels corresponding to the test-cases
|
||||||
|
test_xlabels <- list(
|
||||||
|
"matrix sizes (n=k)",
|
||||||
|
"matrix sizes (n=k)",
|
||||||
|
"matrix sizes (n=k)",
|
||||||
|
"matrix sizes (n=k)",
|
||||||
|
"layout (row/col), triangle (u/l), transA (n/y)",
|
||||||
|
"matrix sizes (n=k)"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Defines the x-axis of the test-cases
|
||||||
|
test_xaxis <- list(
|
||||||
|
c("n", ""),
|
||||||
|
c("n", ""),
|
||||||
|
c("n", ""),
|
||||||
|
c("n", ""),
|
||||||
|
list(1:8, c("row,u,n", "row,u,y", "row,l,n", "row,l,y",
|
||||||
|
"col,u,n", "col,u,y", "col,l,n", "col,l,y")),
|
||||||
|
c("n", "x")
|
||||||
|
)
|
||||||
|
|
||||||
|
# ==================================================================================================
|
||||||
|
|
||||||
|
# Start the script
|
||||||
|
main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values,
|
||||||
|
test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE)
|
||||||
|
|
||||||
|
# ==================================================================================================
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Reference in a new issue