mirror of
https://github.com/CNugteren/CLBlast.git
synced 2024-07-07 12:23:46 +02:00
Merge pull request #138 from CNugteren/triangular_solvers
Added the triangular solvers (TRSV/TRSM)
This commit is contained in:
commit
7de7e7d8ed
|
@ -8,6 +8,10 @@ Development version (next release)
|
|||
- Added the OverrideParameters function to the API to be able to supply custom tuning parmeters
|
||||
- Various minor fixes and enhancements
|
||||
- Added tuned parameters for various devices (see README)
|
||||
- Added level-2 routines:
|
||||
* STRSV/DTRSV/CTRSV/ZTRSV (experimental, un-optimized)
|
||||
- Added level-3 routines:
|
||||
* STRSM/DTRSM/CTRSM/ZTRSM (experimental, un-optimized)
|
||||
|
||||
Version 0.10.0
|
||||
- Updated to version 8.0 of the CLCudaAPI C++11 OpenCL header
|
||||
|
|
|
@ -156,9 +156,9 @@ if(NETLIB)
|
|||
set(SAMPLE_PROGRAMS_C ${SAMPLE_PROGRAMS_C} sgemm_netlib)
|
||||
endif()
|
||||
set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
|
||||
set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv
|
||||
set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xtrsv
|
||||
xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
|
||||
set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm)
|
||||
set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm xtrsm)
|
||||
set(LEVELX_ROUTINES xomatcopy)
|
||||
set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES} ${LEVELX_ROUTINES})
|
||||
set(PRECISIONS 32 64 3232 6464 16)
|
||||
|
@ -175,6 +175,7 @@ set(SOURCES
|
|||
src/clblast.cpp
|
||||
src/clblast_c.cpp
|
||||
src/routine.cpp
|
||||
src/routines/levelx/xinvert.cpp # only source, don't include it as a test
|
||||
)
|
||||
if(NETLIB)
|
||||
set(SOURCES ${SOURCES} src/clblast_netlib_c.cpp)
|
||||
|
|
|
@ -258,6 +258,7 @@ CLBlast supports almost all the Netlib BLAS routines plus a couple of extra non-
|
|||
| xSPR | ✔ | ✔ | - | - | ✔ |
|
||||
| xSYR2 | ✔ | ✔ | - | - | ✔ |
|
||||
| xSPR2 | ✔ | ✔ | - | - | ✔ |
|
||||
| xTRSV | ✔ | ✔ | ✔ | ✔ | | (experimental, un-optimized)
|
||||
|
||||
| Level-3 | S | D | C | Z | H |
|
||||
| ---------|---|---|---|---|---|
|
||||
|
@ -269,6 +270,7 @@ CLBlast supports almost all the Netlib BLAS routines plus a couple of extra non-
|
|||
| xSYR2K | ✔ | ✔ | ✔ | ✔ | ✔ |
|
||||
| xHER2K | - | - | ✔ | ✔ | - |
|
||||
| xTRMM | ✔ | ✔ | ✔ | ✔ | ✔ |
|
||||
| xTRSM | ✔ | ✔ | ✔ | ✔ | | (experimental, un-optimized)
|
||||
|
||||
In addition, some extra non-BLAS routines are also supported by CLBlast, classified as level-X. They are experimental and should be used with care:
|
||||
|
||||
|
@ -279,7 +281,7 @@ In addition, some extra non-BLAS routines are also supported by CLBlast, classif
|
|||
| IxMIN | ✔ | ✔ | ✔ | ✔ | ✔ |
|
||||
| xOMATCOPY | ✔ | ✔ | ✔ | ✔ | ✔ |
|
||||
|
||||
Some less commonly used BLAS routines are not yet supported yet by CLBlast. They are xROTG, xROTMG, xROT, xROTM, xTRSV, xTBSV, xTPSV, and xTRSM.
|
||||
Some less commonly used BLAS routines are not yet supported yet by CLBlast. They are xROTG, xROTMG, xROT, xROTM, xTBSV, and xTPSV.
|
||||
|
||||
|
||||
Half precision (fp16)
|
||||
|
|
122
doc/clblast.md
122
doc/clblast.md
|
@ -1445,6 +1445,63 @@ Arguments to TPMV:
|
|||
|
||||
|
||||
|
||||
xTRSV: Solves a triangular system of equations
|
||||
-------------
|
||||
|
||||
|
||||
|
||||
C++ API:
|
||||
```
|
||||
template <typename T>
|
||||
StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t n,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
```
|
||||
|
||||
C API:
|
||||
```
|
||||
CLBlastStatusCode CLBlastStrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
|
||||
const size_t n,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
CLBlastStatusCode CLBlastDtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
|
||||
const size_t n,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
CLBlastStatusCode CLBlastCtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
|
||||
const size_t n,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
CLBlastStatusCode CLBlastZtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
|
||||
const size_t n,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
```
|
||||
|
||||
Arguments to TRSV:
|
||||
|
||||
* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
|
||||
* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
|
||||
* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
|
||||
* `const Diagonal diagonal`: The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal.
|
||||
* `const size_t n`: Integer size argument. This value must be positive.
|
||||
* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
|
||||
* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
|
||||
* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
|
||||
* `cl_mem x_buffer`: OpenCL buffer to store the output x vector.
|
||||
* `const size_t x_offset`: The offset in elements from the start of the output x vector.
|
||||
* `const size_t x_inc`: Stride/increment of the output x vector. This value must be greater than 0.
|
||||
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
|
||||
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
|
||||
|
||||
|
||||
|
||||
xGER: General rank-1 matrix update
|
||||
-------------
|
||||
|
||||
|
@ -2708,6 +2765,71 @@ Requirements for TRMM:
|
|||
|
||||
|
||||
|
||||
xTRSM: Solves a triangular system of equations
|
||||
-------------
|
||||
|
||||
Solves the equation _A * X = alpha * B_ for the unknown _m_ by _n_ matrix X, in which _A_ is an _n_ by _n_ unit or non-unit triangular matrix and B is an _m_ by _n_ matrix. The matrix _B_ is overwritten by the solution _X_.
|
||||
|
||||
C++ API:
|
||||
```
|
||||
template <typename T>
|
||||
StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t m, const size_t n,
|
||||
const T alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
```
|
||||
|
||||
C API:
|
||||
```
|
||||
CLBlastStatusCode CLBlastStrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
|
||||
const size_t m, const size_t n,
|
||||
const float alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
CLBlastStatusCode CLBlastDtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
|
||||
const size_t m, const size_t n,
|
||||
const double alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
CLBlastStatusCode CLBlastCtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
|
||||
const size_t m, const size_t n,
|
||||
const cl_float2 alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
CLBlastStatusCode CLBlastZtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
|
||||
const size_t m, const size_t n,
|
||||
const cl_double2 alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
```
|
||||
|
||||
Arguments to TRSM:
|
||||
|
||||
* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
|
||||
* `const Side side`: The position of the triangular matrix in the operation, either on the `Side::kLeft` (141) or `Side::kRight` (142).
|
||||
* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
|
||||
* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
|
||||
* `const Diagonal diagonal`: The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal.
|
||||
* `const size_t m`: Integer size argument. This value must be positive.
|
||||
* `const size_t n`: Integer size argument. This value must be positive.
|
||||
* `const T alpha`: Input scalar constant.
|
||||
* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
|
||||
* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
|
||||
* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
|
||||
* `cl_mem b_buffer`: OpenCL buffer to store the output B matrix.
|
||||
* `const size_t b_offset`: The offset in elements from the start of the output B matrix.
|
||||
* `const size_t b_ld`: Leading dimension of the output B matrix. This value must be greater than 0.
|
||||
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
|
||||
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
|
||||
|
||||
|
||||
|
||||
xOMATCOPY: Scaling and out-place transpose/copy (non-BLAS function)
|
||||
-------------
|
||||
|
||||
|
|
|
@ -587,7 +587,7 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, c
|
|||
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
cl_command_queue* queue, cl_event* event = nullptr);
|
||||
|
||||
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM
|
||||
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
|
||||
template <typename T>
|
||||
StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t m, const size_t n,
|
||||
|
|
|
@ -1265,7 +1265,7 @@ CLBlastStatusCode PUBLIC_API CLBlastHtrmm(const CLBlastLayout layout, const CLBl
|
|||
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM
|
||||
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
|
||||
CLBlastStatusCode PUBLIC_API CLBlastStrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
|
||||
const size_t m, const size_t n,
|
||||
const float alpha,
|
||||
|
@ -1290,12 +1290,6 @@ CLBlastStatusCode PUBLIC_API CLBlastZtrsm(const CLBlastLayout layout, const CLBl
|
|||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
CLBlastStatusCode PUBLIC_API CLBlastHtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
|
||||
const size_t m, const size_t n,
|
||||
const cl_half alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// =================================================================================================
|
||||
// Extra non-BLAS routines (level-X)
|
||||
|
|
|
@ -862,7 +862,7 @@ void PUBLIC_API cblas_ztrmm(const CLBlastLayout layout, const CLBlastSide side,
|
|||
const void* a, const int a_ld,
|
||||
void* b, const int b_ld);
|
||||
|
||||
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM
|
||||
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
|
||||
void PUBLIC_API cblas_strsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
|
||||
const int m, const int n,
|
||||
const float alpha,
|
||||
|
|
|
@ -41,8 +41,8 @@ FILES = [
|
|||
"/include/clblast_netlib_c.h",
|
||||
"/src/clblast_netlib_c.cpp",
|
||||
]
|
||||
HEADER_LINES = [121, 73, 125, 23, 29, 41, 65, 32]
|
||||
FOOTER_LINES = [25, 139, 27, 38, 6, 6, 9, 2]
|
||||
HEADER_LINES = [121, 75, 125, 23, 29, 41, 65, 32]
|
||||
FOOTER_LINES = [25, 138, 27, 38, 6, 6, 9, 2]
|
||||
HEADER_LINES_DOC = 0
|
||||
FOOTER_LINES_DOC = 63
|
||||
|
||||
|
@ -131,7 +131,7 @@ ROUTINES = [
|
|||
Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]),
|
||||
Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]),
|
||||
Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [apn,xn], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []),
|
||||
Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a triangular system of equations", "", []),
|
||||
Routine(True, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a triangular system of equations", "", []),
|
||||
Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]),
|
||||
Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [apn,xn], [], "", "Solves a packed triangular system of equations", "", []),
|
||||
# Level 2: matrix update
|
||||
|
@ -156,7 +156,7 @@ ROUTINES = [
|
|||
Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], [ankab,bnkab,cn],["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
|
||||
Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], [ankab,bnkab,cn],["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
|
||||
Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]),
|
||||
Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Solves a triangular system of equations", "", []),
|
||||
Routine(True, True, "3", "trsm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Solves a triangular system of equations", "Solves the equation _A * X = alpha * B_ for the unknown _m_ by _n_ matrix X, in which _A_ is an _n_ by _n_ unit or non-unit triangular matrix and B is an _m_ by _n_ matrix. The matrix _B_ is overwritten by the solution _X_.", []),
|
||||
],
|
||||
[ # Level X: extra routines (not part of BLAS)
|
||||
Routine(True, True, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], [amn,bnma], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]),
|
||||
|
|
|
@ -45,6 +45,7 @@
|
|||
#include "routines/level2/xtrmv.hpp"
|
||||
#include "routines/level2/xtbmv.hpp"
|
||||
#include "routines/level2/xtpmv.hpp"
|
||||
#include "routines/level2/xtrsv.hpp"
|
||||
#include "routines/level2/xger.hpp"
|
||||
#include "routines/level2/xgeru.hpp"
|
||||
#include "routines/level2/xgerc.hpp"
|
||||
|
@ -66,6 +67,7 @@
|
|||
#include "routines/level3/xsyr2k.hpp"
|
||||
#include "routines/level3/xher2k.hpp"
|
||||
#include "routines/level3/xtrmm.hpp"
|
||||
#include "routines/level3/xtrsm.hpp"
|
||||
|
||||
// Level-x includes (non-BLAS)
|
||||
#include "routines/levelx/xomatcopy.hpp"
|
||||
|
@ -1145,12 +1147,20 @@ template StatusCode PUBLIC_API Tpmv<half>(const Layout, const Triangle, const Tr
|
|||
|
||||
// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV
|
||||
template <typename T>
|
||||
StatusCode Trsv(const Layout, const Triangle, const Transpose, const Diagonal,
|
||||
const size_t,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*) {
|
||||
return StatusCode::kNotImplemented;
|
||||
StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t n,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
try {
|
||||
auto queue_cpp = Queue(*queue);
|
||||
auto routine = Xtrsv<T>(queue_cpp, event);
|
||||
routine.DoTrsv(layout, triangle, a_transpose, diagonal,
|
||||
n,
|
||||
Buffer<T>(a_buffer), a_offset, a_ld,
|
||||
Buffer<T>(x_buffer), x_offset, x_inc);
|
||||
return StatusCode::kSuccess;
|
||||
} catch (...) { return DispatchException(); }
|
||||
}
|
||||
template StatusCode PUBLIC_API Trsv<float>(const Layout, const Triangle, const Transpose, const Diagonal,
|
||||
const size_t,
|
||||
|
@ -2065,15 +2075,24 @@ template StatusCode PUBLIC_API Trmm<half>(const Layout, const Side, const Triang
|
|||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM
|
||||
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
|
||||
template <typename T>
|
||||
StatusCode Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal,
|
||||
const size_t, const size_t,
|
||||
const T,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*) {
|
||||
return StatusCode::kNotImplemented;
|
||||
StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t m, const size_t n,
|
||||
const T alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
try {
|
||||
auto queue_cpp = Queue(*queue);
|
||||
auto routine = Xtrsm<T>(queue_cpp, event);
|
||||
routine.DoTrsm(layout, side, triangle, a_transpose, diagonal,
|
||||
m, n,
|
||||
alpha,
|
||||
Buffer<T>(a_buffer), a_offset, a_ld,
|
||||
Buffer<T>(b_buffer), b_offset, b_ld);
|
||||
return StatusCode::kSuccess;
|
||||
} catch (...) { return DispatchException(); }
|
||||
}
|
||||
template StatusCode PUBLIC_API Trsm<float>(const Layout, const Side, const Triangle, const Transpose, const Diagonal,
|
||||
const size_t, const size_t,
|
||||
|
@ -2099,12 +2118,6 @@ template StatusCode PUBLIC_API Trsm<double2>(const Layout, const Side, const Tri
|
|||
const cl_mem, const size_t, const size_t,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
template StatusCode PUBLIC_API Trsm<half>(const Layout, const Side, const Triangle, const Transpose, const Diagonal,
|
||||
const size_t, const size_t,
|
||||
const half,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// =================================================================================================
|
||||
// Extra non-BLAS routines (level-X)
|
||||
|
@ -2159,7 +2172,6 @@ template StatusCode PUBLIC_API Omatcopy<half>(const Layout, const Transpose,
|
|||
const cl_mem, const size_t, const size_t,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Clears the cache of stored binaries
|
||||
|
|
|
@ -3350,27 +3350,6 @@ CLBlastStatusCode CLBlastZtrsm(const CLBlastLayout layout, const CLBlastSide sid
|
|||
);
|
||||
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
|
||||
}
|
||||
CLBlastStatusCode CLBlastHtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
|
||||
const size_t m, const size_t n,
|
||||
const cl_half alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
try {
|
||||
return static_cast<CLBlastStatusCode>(
|
||||
clblast::Trsm(static_cast<clblast::Layout>(layout),
|
||||
static_cast<clblast::Side>(side),
|
||||
static_cast<clblast::Triangle>(triangle),
|
||||
static_cast<clblast::Transpose>(a_transpose),
|
||||
static_cast<clblast::Diagonal>(diagonal),
|
||||
m, n,
|
||||
alpha,
|
||||
a_buffer, a_offset, a_ld,
|
||||
b_buffer, b_offset, b_ld,
|
||||
queue, event)
|
||||
);
|
||||
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
// Extra non-BLAS routines (level-X)
|
||||
|
|
|
@ -20,12 +20,14 @@
|
|||
#include "database/kernels/xgemv_fast.hpp"
|
||||
#include "database/kernels/xgemv_fast_rot.hpp"
|
||||
#include "database/kernels/xger.hpp"
|
||||
#include "database/kernels/xtrsv.hpp"
|
||||
#include "database/kernels/xgemm.hpp"
|
||||
#include "database/kernels/xgemm_direct.hpp"
|
||||
#include "database/kernels/copy.hpp"
|
||||
#include "database/kernels/pad.hpp"
|
||||
#include "database/kernels/transpose.hpp"
|
||||
#include "database/kernels/padtranspose.hpp"
|
||||
#include "database/kernels/invert.hpp"
|
||||
#include "database/kernel_selection.hpp"
|
||||
|
||||
namespace clblast {
|
||||
|
@ -39,12 +41,14 @@ const std::vector<const Database::DatabaseEntry*> Database::database = {
|
|||
&database::XgemvFastHalf, &database::XgemvFastSingle, &database::XgemvFastDouble, &database::XgemvFastComplexSingle, &database::XgemvFastComplexDouble,
|
||||
&database::XgemvFastRotHalf, &database::XgemvFastRotSingle, &database::XgemvFastRotDouble, &database::XgemvFastRotComplexSingle, &database::XgemvFastRotComplexDouble,
|
||||
&database::XgerHalf, &database::XgerSingle, &database::XgerDouble, &database::XgerComplexSingle, &database::XgerComplexDouble,
|
||||
&database::XtrsvHalf, &database::XtrsvSingle, &database::XtrsvDouble, &database::XtrsvComplexSingle, &database::XtrsvComplexDouble,
|
||||
&database::XgemmHalf, &database::XgemmSingle, &database::XgemmDouble, &database::XgemmComplexSingle, &database::XgemmComplexDouble,
|
||||
&database::XgemmDirectHalf, &database::XgemmDirectSingle, &database::XgemmDirectDouble, &database::XgemmDirectComplexSingle, &database::XgemmDirectComplexDouble,
|
||||
&database::CopyHalf, &database::CopySingle, &database::CopyDouble, &database::CopyComplexSingle, &database::CopyComplexDouble,
|
||||
&database::PadHalf, &database::PadSingle, &database::PadDouble, &database::PadComplexSingle, &database::PadComplexDouble,
|
||||
&database::TransposeHalf, &database::TransposeSingle, &database::TransposeDouble, &database::TransposeComplexSingle, &database::TransposeComplexDouble,
|
||||
&database::PadtransposeHalf, &database::PadtransposeSingle, &database::PadtransposeDouble, &database::PadtransposeComplexSingle, &database::PadtransposeComplexDouble,
|
||||
&database::InvertHalf, &database::InvertSingle, &database::InvertDouble, &database::InvertComplexSingle, &database::InvertComplexDouble,
|
||||
&database::KernelSelectionHalf, &database::KernelSelectionSingle, &database::KernelSelectionDouble, &database::KernelSelectionComplexSingle, &database::KernelSelectionComplexDouble
|
||||
};
|
||||
|
||||
|
|
78
src/database/kernels/invert.hpp
Normal file
78
src/database/kernels/invert.hpp
Normal file
|
@ -0,0 +1,78 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// Tuning parameters for the diagonal matrix inversion kernels
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
namespace clblast {
|
||||
namespace database {
|
||||
// =================================================================================================
|
||||
|
||||
const Database::DatabaseEntry InvertHalf = {
|
||||
"Invert", Precision::kHalf, {
|
||||
{ // Default
|
||||
kDeviceTypeAll, "default", {
|
||||
{ "default", { {"INTERNAL_BLOCK_SIZE",16} } },
|
||||
}
|
||||
},
|
||||
}
|
||||
};
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
const Database::DatabaseEntry InvertSingle = {
|
||||
"Invert", Precision::kSingle, {
|
||||
{ // Default
|
||||
kDeviceTypeAll, "default", {
|
||||
{ "default", { {"INTERNAL_BLOCK_SIZE",16} } },
|
||||
}
|
||||
},
|
||||
}
|
||||
};
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
const Database::DatabaseEntry InvertComplexSingle = {
|
||||
"Invert", Precision::kComplexSingle, {
|
||||
{ // Default
|
||||
kDeviceTypeAll, "default", {
|
||||
{ "default", { {"INTERNAL_BLOCK_SIZE",16} } },
|
||||
}
|
||||
},
|
||||
}
|
||||
};
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
const Database::DatabaseEntry InvertDouble = {
|
||||
"Invert", Precision::kDouble, {
|
||||
{ // Default
|
||||
kDeviceTypeAll, "default", {
|
||||
{ "default", { {"INTERNAL_BLOCK_SIZE",16} } },
|
||||
}
|
||||
},
|
||||
}
|
||||
};
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
const Database::DatabaseEntry InvertComplexDouble = {
|
||||
"Invert", Precision::kComplexDouble, {
|
||||
{ // Default
|
||||
kDeviceTypeAll, "default", {
|
||||
{ "default", { {"INTERNAL_BLOCK_SIZE",16} } },
|
||||
}
|
||||
},
|
||||
}
|
||||
};
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace database
|
||||
} // namespace clblast
|
78
src/database/kernels/xtrsv.hpp
Normal file
78
src/database/kernels/xtrsv.hpp
Normal file
|
@ -0,0 +1,78 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file populates the database with best-found tuning parameters for the 'Xtrsv' kernels.
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
namespace clblast {
|
||||
namespace database {
|
||||
// =================================================================================================
|
||||
|
||||
const Database::DatabaseEntry XtrsvHalf = {
|
||||
"Xtrsv", Precision::kHalf, {
|
||||
{ // Default
|
||||
kDeviceTypeAll, "default", {
|
||||
{ "default", { {"TRSV_BLOCK_SIZE",32} } },
|
||||
}
|
||||
},
|
||||
}
|
||||
};
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
const Database::DatabaseEntry XtrsvSingle = {
|
||||
"Xtrsv", Precision::kSingle, {
|
||||
{ // Default
|
||||
kDeviceTypeAll, "default", {
|
||||
{ "default", { {"TRSV_BLOCK_SIZE",32} } },
|
||||
}
|
||||
},
|
||||
}
|
||||
};
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
const Database::DatabaseEntry XtrsvComplexSingle = {
|
||||
"Xtrsv", Precision::kComplexSingle, {
|
||||
{ // Default
|
||||
kDeviceTypeAll, "default", {
|
||||
{ "default", { {"TRSV_BLOCK_SIZE",32} } },
|
||||
}
|
||||
},
|
||||
}
|
||||
};
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
const Database::DatabaseEntry XtrsvDouble = {
|
||||
"Xtrsv", Precision::kDouble, {
|
||||
{ // Default
|
||||
kDeviceTypeAll, "default", {
|
||||
{ "default", { {"TRSV_BLOCK_SIZE",32} } },
|
||||
}
|
||||
},
|
||||
}
|
||||
};
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
const Database::DatabaseEntry XtrsvComplexDouble = {
|
||||
"Xtrsv", Precision::kComplexDouble, {
|
||||
{ // Default
|
||||
kDeviceTypeAll, "default", {
|
||||
{ "default", { {"TRSV_BLOCK_SIZE",32} } },
|
||||
}
|
||||
},
|
||||
}
|
||||
};
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace database
|
||||
} // namespace clblast
|
|
@ -160,6 +160,13 @@ R"(
|
|||
#define AbsoluteValue(value) value = fabs(value)
|
||||
#endif
|
||||
|
||||
// Negation (component-wise)
|
||||
#if PRECISION == 3232 || PRECISION == 6464
|
||||
#define Negate(value) value.x = -(value.x); value.y = -(value.y)
|
||||
#else
|
||||
#define Negate(value) value = -(value)
|
||||
#endif
|
||||
|
||||
// Adds two complex variables
|
||||
#if PRECISION == 3232 || PRECISION == 6464
|
||||
#define Add(c, a, b) c.x = a.x + b.x; c.y = a.y + b.y
|
||||
|
@ -167,6 +174,13 @@ R"(
|
|||
#define Add(c, a, b) c = a + b
|
||||
#endif
|
||||
|
||||
// Subtracts two complex variables
|
||||
#if PRECISION == 3232 || PRECISION == 6464
|
||||
#define Subtract(c, a, b) c.x = a.x - b.x; c.y = a.y - b.y
|
||||
#else
|
||||
#define Subtract(c, a, b) c = a - b
|
||||
#endif
|
||||
|
||||
// Multiply two complex variables (used in the defines below)
|
||||
#if PRECISION == 3232 || PRECISION == 6464
|
||||
#define MulReal(a, b) a.x*b.x - a.y*b.y
|
||||
|
@ -191,6 +205,20 @@ R"(
|
|||
#endif
|
||||
#endif
|
||||
|
||||
// The scalar multiply-subtract function
|
||||
#if PRECISION == 3232 || PRECISION == 6464
|
||||
#define MultiplySubtract(c, a, b) c.x -= MulReal(a,b); c.y -= MulImag(a,b)
|
||||
#else
|
||||
#define MultiplySubtract(c, a, b) c -= a * b
|
||||
#endif
|
||||
|
||||
// The scalar division function: full division
|
||||
#if PRECISION == 3232 || PRECISION == 6464
|
||||
#define DivideFull(c, a, b) singlereal num_x = (a.x * b.x) + (a.y * b.y); singlereal num_y = (a.y * b.x) - (a.x * b.y); singlereal denom = (b.x * b.x) + (b.y * b.y); c.x = num_x / denom; c.y = num_y / denom
|
||||
#else
|
||||
#define DivideFull(c, a, b) c = a / b
|
||||
#endif
|
||||
|
||||
// The scalar AXPBY function
|
||||
#if PRECISION == 3232 || PRECISION == 6464
|
||||
#define AXPBY(e, a, b, c, d) e.x = MulReal(a,b) + MulReal(c,d); e.y = MulImag(a,b) + MulImag(c,d)
|
||||
|
|
144
src/kernels/level2/xtrsv.opencl
Normal file
144
src/kernels/level2/xtrsv.opencl
Normal file
|
@ -0,0 +1,144 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file contains kernels to perform forward or backward substition, as used in the TRSV routine
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
|
||||
// literal). Comment-out this line for syntax-highlighting when developing.
|
||||
R"(
|
||||
|
||||
// =================================================================================================
|
||||
#if defined(ROUTINE_TRSV)
|
||||
|
||||
__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
|
||||
void FillVector(const int n, const int inc, const int offset,
|
||||
__global real* restrict dest, const real_arg arg_value) {
|
||||
const real value = GetRealArg(arg_value);
|
||||
const int tid = get_global_id(0);
|
||||
if (tid < n) {
|
||||
dest[tid*inc + offset] = value;
|
||||
}
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
|
||||
// this kernel file is used outside of the CLBlast library.
|
||||
|
||||
#ifndef TRSV_BLOCK_SIZE
|
||||
#define TRSV_BLOCK_SIZE 32 // The block size for forward or backward substition
|
||||
#endif
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
__kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1)))
|
||||
void trsv_forward(int n,
|
||||
const __global real *A, const int a_offset, int a_ld,
|
||||
__global real *b, const int b_offset, int b_inc,
|
||||
__global real *x, const int x_offset, int x_inc,
|
||||
const int is_transposed, const int is_unit_diagonal, const int do_conjugate) {
|
||||
__local real alm[TRSV_BLOCK_SIZE][TRSV_BLOCK_SIZE];
|
||||
__local real xlm[TRSV_BLOCK_SIZE];
|
||||
const int tid = get_local_id(0);
|
||||
|
||||
// Pre-loads the data into local memory
|
||||
if (tid < n) {
|
||||
Subtract(xlm[tid], b[tid*b_inc + b_offset], x[tid*x_inc + x_offset]);
|
||||
if (is_transposed == 0) {
|
||||
for (int i = 0; i < n; ++i) {
|
||||
alm[i][tid] = A[i + tid*a_ld + a_offset];
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (int i = 0; i < n; ++i) {
|
||||
alm[i][tid] = A[tid + i*a_ld + a_offset];
|
||||
}
|
||||
}
|
||||
if (do_conjugate) {
|
||||
for (int i = 0; i < n; ++i) {
|
||||
COMPLEX_CONJUGATE(alm[i][tid]);
|
||||
}
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// Computes the result (single-threaded for now)
|
||||
if (tid == 0) {
|
||||
for (int i = 0; i < n; ++i) {
|
||||
for (int j = 0; j < i; ++j) {
|
||||
MultiplySubtract(xlm[i], alm[i][j], xlm[j]);
|
||||
}
|
||||
if (is_unit_diagonal == 0) { DivideFull(xlm[i], xlm[i], alm[i][i]); }
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// Stores the results
|
||||
if (tid < n) {
|
||||
x[tid*x_inc + x_offset] = xlm[tid];
|
||||
}
|
||||
}
|
||||
|
||||
__kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1)))
|
||||
void trsv_backward(int n,
|
||||
const __global real *A, const int a_offset, int a_ld,
|
||||
__global real *b, const int b_offset, int b_inc,
|
||||
__global real *x, const int x_offset, int x_inc,
|
||||
const int is_transposed, const int is_unit_diagonal, const int do_conjugate) {
|
||||
__local real alm[TRSV_BLOCK_SIZE][TRSV_BLOCK_SIZE];
|
||||
__local real xlm[TRSV_BLOCK_SIZE];
|
||||
const int tid = get_local_id(0);
|
||||
|
||||
// Pre-loads the data into local memory
|
||||
if (tid < n) {
|
||||
Subtract(xlm[tid], b[tid*b_inc + b_offset], x[tid*x_inc + x_offset]);
|
||||
if (is_transposed == 0) {
|
||||
for (int i = 0; i < n; ++i) {
|
||||
alm[i][tid] = A[i + tid*a_ld + a_offset];
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (int i = 0; i < n; ++i) {
|
||||
alm[i][tid] = A[tid + i*a_ld + a_offset];
|
||||
}
|
||||
}
|
||||
if (do_conjugate) {
|
||||
for (int i = 0; i < n; ++i) {
|
||||
COMPLEX_CONJUGATE(alm[i][tid]);
|
||||
}
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// Computes the result (single-threaded for now)
|
||||
if (tid == 0) {
|
||||
for (int i = n - 1; i >= 0; --i) {
|
||||
for (int j = i + 1; j < n; ++j) {
|
||||
MultiplySubtract(xlm[i], alm[i][j], xlm[j]);
|
||||
}
|
||||
if (is_unit_diagonal == 0) { DivideFull(xlm[i], xlm[i], alm[i][i]); }
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// Stores the results
|
||||
if (tid < n) {
|
||||
x[tid*x_inc + x_offset] = xlm[tid];
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
// =================================================================================================
|
||||
|
||||
// End of the C++11 raw string literal
|
||||
)"
|
||||
|
||||
// =================================================================================================
|
429
src/kernels/level3/invert_diagonal_blocks.opencl
Normal file
429
src/kernels/level3/invert_diagonal_blocks.opencl
Normal file
|
@ -0,0 +1,429 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file contains kernels to invert squared diagonal blocks of a matrix. These kernels are based
|
||||
// on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular
|
||||
// Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek,
|
||||
// and Jack Dongarra.
|
||||
//
|
||||
// =================================================================================================
|
||||
//
|
||||
// Let A be an block_size*block_size lower triangular matrix, and B its inverse.
|
||||
// Then the block decomposition
|
||||
//
|
||||
// [ A11 0 ] * [ B11 0 ] = [ I 0 ]
|
||||
// [ A21 A22 ] [ B21 B22 ] [ 0 I ]
|
||||
//
|
||||
// yields
|
||||
//
|
||||
// A11*B11 = I ==> B11 = A11^{-1},
|
||||
// A22*B22 = I ==> B22 = A22^{-1},
|
||||
// A21*B11 + A22*B21 = 0 ==> B21 = -A22^{-1}*A21*B11 = -B22*A21*B11.
|
||||
//
|
||||
// The InvertDiagonalBlock kernel inverts A11 and A22.
|
||||
// The TripleMatMul routines multiply:
|
||||
// part 1: B21 = A21 * B11,
|
||||
// part 2: B21 = -B22 * B21.
|
||||
//
|
||||
// At this level, inner block is current_size=16, with one 4 x 4 work-group per inner block. Each
|
||||
// submatrix Aij and Bij is current_size x current_size. The submatrix dimension is multiplied by 2
|
||||
// at each level, so the next level is current_size*2 = 32. A 'page' is the next bigger block,
|
||||
// here current_size*2=32,
|
||||
// [ B11 0 ]
|
||||
// which contains [ B21 B22 ].
|
||||
// Outer blocks are block_size x block_size.
|
||||
//
|
||||
// A21 may have < current_size rows, but is guaranteed to have current_size cols since A22 is on
|
||||
// the right. This makes a single check easy to do.
|
||||
//
|
||||
// B is stored in workspace that is a full multiple of block_size x block_size; no checks needed.
|
||||
//
|
||||
// We split this into part1 & part2 to synchronize all blocks and make sure
|
||||
// that writes to B12 are observed by all blocks.
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
|
||||
// literal). Comment-out this line for syntax-highlighting when developing.
|
||||
R"(
|
||||
|
||||
// =================================================================================================
|
||||
#if defined(ROUTINE_INVERT)
|
||||
|
||||
#define LOCALX 17 // 16 + 1 to avoid bank conflicts
|
||||
#define LOCALY 16
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Inverts a diagonal block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE elements in a larger matrix
|
||||
__kernel __attribute__((reqd_work_group_size(INTERNAL_BLOCK_SIZE, 1, 1)))
|
||||
void InvertDiagonalBlock(int n, __global const real* restrict src, const int src_offset, const int src_ld,
|
||||
__global real* restrict dest, const int outer_block_size,
|
||||
const int unit_diagonal, const int is_upper)
|
||||
{
|
||||
const int thread_index = get_local_id(0);
|
||||
const int block_index = get_group_id(0);
|
||||
|
||||
// Sets the offset for this particular block in the source and destination matrices
|
||||
const int src_block_offset = block_index * (INTERNAL_BLOCK_SIZE + src_ld * INTERNAL_BLOCK_SIZE) + src_offset;
|
||||
const int num_inner_blocks = outer_block_size / INTERNAL_BLOCK_SIZE;
|
||||
const int dest_block_offset = (block_index / num_inner_blocks) * outer_block_size * outer_block_size + // go to the (block_index / num_inner_blocks) outer outer_block_size*outer_block_size block,
|
||||
(block_index % num_inner_blocks) * (outer_block_size*INTERNAL_BLOCK_SIZE + INTERNAL_BLOCK_SIZE); // then to the (block_index % num_inner_blocks) inner INTERNAL_BLOCK_SIZE*INTERNAL_BLOCK_SIZE block inside that
|
||||
|
||||
// Local memory to store the inverted block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE
|
||||
__local real lm[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
|
||||
|
||||
// Loads the source lower triangle into local memory. Any values in the upper triangle or
|
||||
// outside of the matrix are set to zero
|
||||
#pragma unroll
|
||||
for (int j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
|
||||
const bool condition = (is_upper) ? (thread_index <= j && block_index*INTERNAL_BLOCK_SIZE + j < n) :
|
||||
(thread_index >= j && block_index*INTERNAL_BLOCK_SIZE + thread_index < n);
|
||||
if (condition) {
|
||||
lm[thread_index][j] = src[j*src_ld + thread_index + src_block_offset];
|
||||
}
|
||||
else {
|
||||
SetToZero(lm[thread_index][j]);
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// Inverts the diagonal
|
||||
real inverted_diagonal;
|
||||
SetToOne(inverted_diagonal);
|
||||
if (unit_diagonal == 0) {
|
||||
const real diagonal_value = lm[thread_index][thread_index];
|
||||
if (!IsZero(diagonal_value)) { // Only for non-singular values and values inside the matrix
|
||||
real constant_one;
|
||||
SetToOne(constant_one);
|
||||
DivideFull(inverted_diagonal, constant_one, diagonal_value);
|
||||
}
|
||||
}
|
||||
lm[thread_index][thread_index] = inverted_diagonal;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// Upper-triangular
|
||||
if (is_upper) {
|
||||
|
||||
// Computes the elements 0:j-1 of the j-th column
|
||||
for (int j = 1; j < INTERNAL_BLOCK_SIZE; ++j) {
|
||||
if (thread_index < j) {
|
||||
real sum;
|
||||
SetToZero(sum);
|
||||
#pragma unroll
|
||||
for (int k = 0; k < j; ++k) {
|
||||
MultiplyAdd(sum, lm[thread_index][k], lm[k][j]);
|
||||
}
|
||||
real diagonal_value = lm[j][j];
|
||||
Negate(diagonal_value);
|
||||
Multiply(lm[thread_index][j], diagonal_value, sum);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
}
|
||||
|
||||
// Lower triangular
|
||||
else {
|
||||
|
||||
// Computes the elements j+1:INTERNAL_BLOCK_SIZE-1 of the j-th column
|
||||
for (int j = INTERNAL_BLOCK_SIZE - 2; j >= 0; --j) {
|
||||
if (thread_index > j) {
|
||||
real sum;
|
||||
SetToZero(sum);
|
||||
#pragma unroll
|
||||
for (int k = j + 1; k < INTERNAL_BLOCK_SIZE; ++k) {
|
||||
MultiplyAdd(sum, lm[thread_index][k], lm[k][j]);
|
||||
}
|
||||
Multiply(lm[thread_index][j], -lm[j][j], sum);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
}
|
||||
|
||||
// Writes the result to global memory
|
||||
#pragma unroll
|
||||
for (int j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
|
||||
dest[j*outer_block_size + thread_index + dest_block_offset] = lm[thread_index][j];
|
||||
}
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Triple matrix-multiplication kernel: C = A * B
|
||||
inline void TripleMatMul(const int size, const bool upper, const int part, __local real* blm, int n,
|
||||
__global const real* agm, __global const real* bgm, __global real* cgm,
|
||||
const int lda, const int ldb, const int ldc,
|
||||
int current_size, int num_pages, const int block_size) {
|
||||
|
||||
// Emulates a 3D grid: NX * (NY * num_pages)
|
||||
const int by = get_group_id(1) / num_pages;
|
||||
const int page = get_group_id(1) % num_pages;
|
||||
const int lidx = get_local_id(0);
|
||||
const int lidy = get_local_id(1);
|
||||
const int ibx = get_group_id(0) * (get_local_size(0)*get_local_size(1));
|
||||
const int iby = by*16;
|
||||
const int id = lidx + lidy*get_local_size(0);
|
||||
const int row = page*current_size*2 + current_size + ibx + id;
|
||||
int col = page*current_size*2 + current_size;
|
||||
|
||||
// Sets the offsets for this specific thread
|
||||
agm += ibx + id;
|
||||
bgm += lidx + (iby + lidy)*ldb;
|
||||
cgm += ibx + id + iby*ldc;
|
||||
|
||||
// Initializes the result registers
|
||||
real cpm[16];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 16; ++j) {
|
||||
SetToZero(cpm[j]);
|
||||
}
|
||||
|
||||
// Computes NT x 16 block of C, each thread computes one 1 x 16 row
|
||||
for (int k = 0; k < current_size; k += 16) {
|
||||
|
||||
// Loads a 16 x 16 block of B into local memory using NX x 4 threads
|
||||
#pragma unroll
|
||||
for( int i=0; i < 16; i += (size/4) ) { // += get_local_size(0)
|
||||
#pragma unroll
|
||||
for( int j=0; j < 16; j += 4 ) { // += get_local_size(1)
|
||||
blm[(lidx + i) * LOCALX + (lidy + j)] = bgm[k + i + j*ldb];
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// Upper triangular
|
||||
if (upper) {
|
||||
|
||||
// Performs 16 x 16 multiply-add operations
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
if (part == 2 || col++ < n) {
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 16; ++j) {
|
||||
MultiplyAdd(cpm[j], agm[(i + k) * lda], blm[i * LOCALX + j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Lower triangular
|
||||
else {
|
||||
if (row < n) {
|
||||
|
||||
// Performs 16 x 16 multiply-add operations
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 16; ++j) {
|
||||
MultiplyAdd(cpm[j], agm[(i + k) * lda], blm[i * LOCALX + j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
// Stores NT x 16 results: each thread writes one 16 x 1 row
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
if (part == 2) { Negate(cpm[i]); }
|
||||
cgm[0] = cpm[i];
|
||||
cgm += ldc;
|
||||
}
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Triple matrix-multiplication kernel part 1: B12 = A12 * B22 (upper) or B21 = A21 * B11 (lower)
|
||||
inline void TripleMatMulPart1(const int size, const bool upper, __local real* blm, int n,
|
||||
__global const real* src, const int a_offset, const int lda,
|
||||
__global real* dest, int current_size, int num_pages, const int block_size) {
|
||||
|
||||
// Emulates a 3D grid: NX * (NY * num_pages)
|
||||
const int page = get_group_id(1) % num_pages;
|
||||
|
||||
// Computes the destination block offset:
|
||||
// - go to the (page / pages_per_block) outer block_size * block_size block
|
||||
// - then the (page % pages_per_block) inner (current_size*2) * (current_size*2) page inside that
|
||||
const int pages_per_block = block_size / (current_size*2);
|
||||
dest += (page / pages_per_block) * block_size * block_size +
|
||||
(page % pages_per_block) * (current_size*2*block_size + current_size*2);
|
||||
|
||||
// Using the GEMM notation: C = A*B
|
||||
__global const real* agm;
|
||||
__global const real* bgm;
|
||||
__global real* cgm;
|
||||
if (upper) { // upper triangular: B12 = A12 * B22
|
||||
agm = src + a_offset + page*current_size*2*lda + page*current_size*2 + current_size*lda; // A12
|
||||
bgm = dest + current_size*block_size + current_size; // B22
|
||||
cgm = dest + current_size*block_size; // B12
|
||||
}
|
||||
else { // lower triangular: B21 = A21 * B11
|
||||
agm = src + a_offset + page*current_size*2*lda + page*current_size*2 + current_size; // A21
|
||||
bgm = dest; // B11
|
||||
cgm = dest + current_size; // B21
|
||||
}
|
||||
|
||||
// Runs the generic C = A * B matrix multiplication
|
||||
const int ldb = block_size;
|
||||
const int ldc = block_size;
|
||||
TripleMatMul(size, upper, 1, blm, n, agm, bgm, cgm, lda, ldb, ldc, current_size, num_pages, block_size);
|
||||
}
|
||||
|
||||
// Triple matrix-multiplication kernel part 1: B12 = -B11 * B12 (upper) or B21 = -B22 * B21 (lower)
|
||||
inline void TripleMatMulPart2(const int size, const bool upper, __local real* blm, const int n,
|
||||
__global real* dest, int current_size, int num_pages, const int block_size) {
|
||||
|
||||
// Emulates a 3D grid: NX * (NY * num_pages)
|
||||
const int page = get_group_id(1) % num_pages;
|
||||
|
||||
// Computes the destination block offset:
|
||||
// - go to the (page / pages_per_block) outer block_size * block_size block
|
||||
// - then the (page % pages_per_block) inner (current_size*2) * (current_size*2) page inside that
|
||||
const int pages_per_block = block_size / (current_size*2);
|
||||
dest += (page / pages_per_block) * block_size * block_size +
|
||||
(page % pages_per_block) * (current_size*2*block_size + current_size*2);
|
||||
|
||||
// Using the GEMM notation: C = A*B
|
||||
__global const real* agm;
|
||||
__global const real* bgm;
|
||||
__global real* cgm;
|
||||
if (upper) { // upper triangular: B12 = -B11 * B12
|
||||
agm = dest; // B11
|
||||
cgm = dest + current_size*block_size; // B12
|
||||
bgm = cgm; // B12, okay to overwrite
|
||||
}
|
||||
|
||||
else { // lower triangular: B21 = -B22 * B21
|
||||
agm = dest + current_size*block_size + current_size; // B22
|
||||
cgm = dest + current_size; // B21
|
||||
bgm = cgm; // B21, okay to overwrite
|
||||
}
|
||||
|
||||
// Runs the generic C = A * B matrix multiplication
|
||||
const int lda = block_size;
|
||||
const int ldb = block_size;
|
||||
const int ldc = block_size;
|
||||
TripleMatMul(size, upper, 2, blm, n, agm, bgm, cgm, lda, ldb, ldc, current_size, num_pages, block_size);
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// B21 = A21 * B11
|
||||
__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
|
||||
void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
|
||||
__global real* restrict dest, int current_size, int num_pages, const int block_size)
|
||||
{
|
||||
__local real lm[LOCALY * LOCALX];
|
||||
TripleMatMulPart1(16, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
|
||||
}
|
||||
|
||||
// B21 = -B22 * B21
|
||||
__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
|
||||
void TripleMatMul16Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
|
||||
{
|
||||
__local real lm[LOCALY * LOCALX];
|
||||
TripleMatMulPart2(16, false, lm, n, dest, current_size, num_pages, block_size);
|
||||
}
|
||||
|
||||
// B21 = A21 * B11
|
||||
__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
|
||||
void TripleMatMul32Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
|
||||
__global real* restrict dest, int current_size, int num_pages, const int block_size)
|
||||
{
|
||||
__local real lm[LOCALY * LOCALX];
|
||||
TripleMatMulPart1(32, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
|
||||
}
|
||||
|
||||
// B21 = -B22 * B21
|
||||
__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
|
||||
void TripleMatMul32Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
|
||||
{
|
||||
__local real lm[LOCALY * LOCALX];
|
||||
TripleMatMulPart2(32, false, lm, n, dest, current_size, num_pages, block_size);
|
||||
}
|
||||
|
||||
// B21 = A21 * B11
|
||||
__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
|
||||
void TripleMatMul64Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
|
||||
__global real* restrict dest, int current_size, int num_pages, const int block_size)
|
||||
{
|
||||
__local real lm[LOCALY * LOCALX];
|
||||
TripleMatMulPart1(64, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
|
||||
}
|
||||
|
||||
// B21 = -B22 * B21
|
||||
__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
|
||||
void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
|
||||
{
|
||||
__local real lm[LOCALY * LOCALX];
|
||||
TripleMatMulPart2(64, false, lm, n, dest, current_size, num_pages, block_size);
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// B12 = A12 * B22
|
||||
__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
|
||||
void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
|
||||
__global real* restrict dest, int current_size, int num_pages, const int block_size)
|
||||
{
|
||||
__local real lm[LOCALY * LOCALX];
|
||||
TripleMatMulPart1(16, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
|
||||
}
|
||||
|
||||
// B12 = -B11 * B12
|
||||
__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
|
||||
void TripleMatMul16Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
|
||||
{
|
||||
__local real lm[LOCALY * LOCALX];
|
||||
TripleMatMulPart2(16, true, lm, n, dest, current_size, num_pages, block_size);
|
||||
}
|
||||
|
||||
// B12 = A12 * B22
|
||||
__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
|
||||
void TripleMatMul32Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
|
||||
__global real* restrict dest, int current_size, int num_pages, const int block_size)
|
||||
{
|
||||
__local real lm[LOCALY * LOCALX];
|
||||
TripleMatMulPart1(32, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
|
||||
}
|
||||
|
||||
// B12 = -B11 * B12
|
||||
__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
|
||||
void TripleMatMul32Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
|
||||
{
|
||||
__local real lm[LOCALY * LOCALX];
|
||||
TripleMatMulPart2(32, true, lm, n, dest, current_size, num_pages, block_size);
|
||||
}
|
||||
|
||||
// B12 = A12 * B22
|
||||
__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
|
||||
void TripleMatMul64Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
|
||||
__global real* restrict dest, int current_size, int num_pages, const int block_size)
|
||||
{
|
||||
__local real lm[LOCALY * LOCALX];
|
||||
TripleMatMulPart1(64, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
|
||||
}
|
||||
|
||||
// B12 = -B11 * B12
|
||||
__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
|
||||
void TripleMatMul64Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
|
||||
{
|
||||
__local real lm[LOCALY * LOCALX];
|
||||
TripleMatMulPart2(64, true, lm, n, dest, current_size, num_pages, block_size);
|
||||
}
|
||||
|
||||
#endif
|
||||
// =================================================================================================
|
||||
|
||||
// End of the C++11 raw string literal
|
||||
)"
|
||||
|
||||
// =================================================================================================
|
|
@ -73,6 +73,22 @@ R"(
|
|||
#define PADTRA_PAD 0 // Padding of the local memory to avoid bank-conflicts
|
||||
#endif
|
||||
|
||||
// =================================================================================================
|
||||
#if defined(ROUTINE_INVERT) || defined(ROUTINE_TRSM)
|
||||
|
||||
__kernel __attribute__((reqd_work_group_size(8, 8, 1)))
|
||||
void FillMatrix(const int m, const int n, const int ld, const int offset,
|
||||
__global real* restrict dest, const real_arg arg_value) {
|
||||
const real value = GetRealArg(arg_value);
|
||||
const int id_one = get_global_id(0);
|
||||
const int id_two = get_global_id(1);
|
||||
if (id_one < m && id_two < n) {
|
||||
dest[id_two*ld + id_one + offset] = value;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// End of the C++11 raw string literal
|
||||
|
|
|
@ -25,16 +25,17 @@ namespace clblast {
|
|||
const std::vector<std::string> Routine::routines_axpy = {"AXPY", "COPY", "SCAL", "SWAP"};
|
||||
const std::vector<std::string> Routine::routines_dot = {"AMAX", "ASUM", "DOT", "DOTC", "DOTU", "MAX", "MIN", "NRM2", "SUM"};
|
||||
const std::vector<std::string> Routine::routines_ger = {"GER", "GERC", "GERU", "HER", "HER2", "HPR", "HPR2", "SPR", "SPR2", "SYR", "SYR2"};
|
||||
const std::vector<std::string> Routine::routines_gemv = {"GBMV", "GEMV", "HBMV", "HEMV", "HPMV", "SBMV", "SPMV", "SYMV", "TMBV", "TPMV", "TRMV"};
|
||||
const std::vector<std::string> Routine::routines_gemv = {"GBMV", "GEMV", "HBMV", "HEMV", "HPMV", "SBMV", "SPMV", "SYMV", "TMBV", "TPMV", "TRMV", "TRSV"};
|
||||
const std::vector<std::string> Routine::routines_gemm = {"GEMM", "HEMM", "SYMM", "TRMM"};
|
||||
const std::vector<std::string> Routine::routines_gemm_syrk = {"GEMM", "HEMM", "HER2K", "HERK", "SYMM", "SYR2K", "SYRK", "TRMM"};
|
||||
const std::vector<std::string> Routine::routines_gemm_syrk = {"GEMM", "HEMM", "HER2K", "HERK", "SYMM", "SYR2K", "SYRK", "TRMM", "TRSM"};
|
||||
const std::vector<std::string> Routine::routines_trsm = {"TRSM"};
|
||||
const std::unordered_map<std::string, const std::vector<std::string>> Routine::routines_by_kernel = {
|
||||
{"Xaxpy", routines_axpy},
|
||||
{"Xdot", routines_dot},
|
||||
{"Xgemv", routines_gemv},
|
||||
{"XgemvFast", routines_gemv},
|
||||
{"XgemvFastRot", routines_gemv},
|
||||
{"Xgemv", {}},
|
||||
{"Xtrsv", routines_gemv},
|
||||
{"Xger", routines_ger},
|
||||
{"Copy", routines_gemm_syrk},
|
||||
{"Pad", routines_gemm_syrk},
|
||||
|
@ -43,6 +44,7 @@ const std::unordered_map<std::string, const std::vector<std::string>> Routine::r
|
|||
{"Xgemm", routines_gemm_syrk},
|
||||
{"XgemmDirect", routines_gemm},
|
||||
{"KernelSelection", routines_gemm},
|
||||
{"Invert", routines_trsm},
|
||||
};
|
||||
// =================================================================================================
|
||||
|
||||
|
|
|
@ -50,6 +50,7 @@ class Routine {
|
|||
static const std::vector<std::string> routines_gemv;
|
||||
static const std::vector<std::string> routines_gemm;
|
||||
static const std::vector<std::string> routines_gemm_syrk;
|
||||
static const std::vector<std::string> routines_trsm;
|
||||
static const std::unordered_map<std::string, const std::vector<std::string>> routines_by_kernel;
|
||||
|
||||
private:
|
||||
|
|
|
@ -33,6 +33,47 @@ void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
|
|||
|
||||
// =================================================================================================
|
||||
|
||||
// Sets all elements of a matrix to a constant value
|
||||
template <typename T>
|
||||
void FillMatrix(Queue &queue, const Device &device,
|
||||
const Program &program, const Databases &,
|
||||
EventPointer event, const std::vector<Event> &waitForEvents,
|
||||
const size_t m, const size_t n, const size_t ld, const size_t offset,
|
||||
const Buffer<T> &dest,
|
||||
const T constant_value) {
|
||||
auto kernel = Kernel(program, "FillMatrix");
|
||||
kernel.SetArgument(0, static_cast<int>(m));
|
||||
kernel.SetArgument(1, static_cast<int>(n));
|
||||
kernel.SetArgument(2, static_cast<int>(ld));
|
||||
kernel.SetArgument(3, static_cast<int>(offset));
|
||||
kernel.SetArgument(4, dest());
|
||||
kernel.SetArgument(5, GetRealArg(constant_value));
|
||||
auto local = std::vector<size_t>{8, 8};
|
||||
auto global = std::vector<size_t>{Ceil(m, 8), Ceil(n, 8)};
|
||||
RunKernel(kernel, queue, device, global, local, event, waitForEvents);
|
||||
}
|
||||
|
||||
// Sets all elements of a vector to a constant value
|
||||
template <typename T>
|
||||
void FillVector(Queue &queue, const Device &device,
|
||||
const Program &program, const Databases &,
|
||||
EventPointer event, const std::vector<Event> &waitForEvents,
|
||||
const size_t n, const size_t inc, const size_t offset,
|
||||
const Buffer<T> &dest,
|
||||
const T constant_value) {
|
||||
auto kernel = Kernel(program, "FillVector");
|
||||
kernel.SetArgument(0, static_cast<int>(n));
|
||||
kernel.SetArgument(1, static_cast<int>(inc));
|
||||
kernel.SetArgument(2, static_cast<int>(offset));
|
||||
kernel.SetArgument(3, dest());
|
||||
kernel.SetArgument(4, GetRealArg(constant_value));
|
||||
auto local = std::vector<size_t>{64};
|
||||
auto global = std::vector<size_t>{Ceil(n, 64)};
|
||||
RunKernel(kernel, queue, device, global, local, event, waitForEvents);
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able
|
||||
// to write to symmetric and triangular matrices through optional arguments.
|
||||
template <typename T>
|
||||
|
|
|
@ -22,9 +22,10 @@ namespace clblast {
|
|||
// Constructor: forwards to base class constructor
|
||||
template <typename T>
|
||||
Xgemv<T>::Xgemv(Queue &queue, EventPointer event, const std::string &name):
|
||||
Routine(queue, event, name, {"Xgemv", "XgemvFast", "XgemvFastRot"}, PrecisionValue<T>(), {}, {
|
||||
Routine(queue, event, name, {"Xgemv", "XgemvFast", "XgemvFastRot", "Xtrsv"}, PrecisionValue<T>(), {}, {
|
||||
#include "../../kernels/level2/xgemv.opencl"
|
||||
#include "../../kernels/level2/xgemv_fast.opencl"
|
||||
#include "../../kernels/level2/xtrsv.opencl"
|
||||
}) {
|
||||
}
|
||||
|
||||
|
|
161
src/routines/level2/xtrsv.cpp
Normal file
161
src/routines/level2/xtrsv.cpp
Normal file
|
@ -0,0 +1,161 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file implements the Xtrsv class (see the header for information about the class).
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "routines/level2/xtrsv.hpp"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Constructor: forwards to base class constructor
|
||||
template <typename T>
|
||||
Xtrsv<T>::Xtrsv(Queue &queue, EventPointer event, const std::string &name):
|
||||
Xgemv<T>(queue, event, name) {
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
template <typename T>
|
||||
void Xtrsv<T>::Substitution(const Layout layout, const Triangle triangle,
|
||||
const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t n,
|
||||
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc,
|
||||
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
|
||||
|
||||
if (n > db_["TRSV_BLOCK_SIZE"]) { throw BLASError(StatusCode::kUnexpectedError); };
|
||||
|
||||
// Translates CLBlast arguments to 0/1 integers for the OpenCL kernel
|
||||
const auto is_unit_diagonal = (diagonal == Diagonal::kNonUnit) ? 0 : 1;
|
||||
const auto is_transposed = ((a_transpose == Transpose::kNo && layout == Layout::kColMajor) ||
|
||||
(a_transpose != Transpose::kNo && layout != Layout::kColMajor)) ? 0 : 1;
|
||||
const auto do_conjugate = (a_transpose == Transpose::kConjugate) ? 1 : 0;
|
||||
|
||||
// The data is either in the upper or lower triangle
|
||||
const auto is_upper = ((triangle == Triangle::kUpper && a_transpose == Transpose::kNo) ||
|
||||
(triangle == Triangle::kLower && a_transpose != Transpose::kNo));
|
||||
|
||||
// Retrieves the kernel from the compiled binary
|
||||
const auto kernel_name = (is_upper) ? "trsv_backward" : "trsv_forward";
|
||||
auto kernel = Kernel(program_, kernel_name);
|
||||
|
||||
// Sets the kernel arguments
|
||||
kernel.SetArgument(0, static_cast<int>(n));
|
||||
kernel.SetArgument(1, a_buffer());
|
||||
kernel.SetArgument(2, static_cast<int>(a_offset));
|
||||
kernel.SetArgument(3, static_cast<int>(a_ld));
|
||||
kernel.SetArgument(4, b_buffer());
|
||||
kernel.SetArgument(5, static_cast<int>(b_offset));
|
||||
kernel.SetArgument(6, static_cast<int>(b_inc));
|
||||
kernel.SetArgument(7, x_buffer());
|
||||
kernel.SetArgument(8, static_cast<int>(x_offset));
|
||||
kernel.SetArgument(9, static_cast<int>(x_inc));
|
||||
kernel.SetArgument(10, static_cast<int>(is_transposed));
|
||||
kernel.SetArgument(11, static_cast<int>(is_unit_diagonal));
|
||||
kernel.SetArgument(12, static_cast<int>(do_conjugate));
|
||||
|
||||
// Launches the kernel
|
||||
const auto local = std::vector<size_t>{db_["TRSV_BLOCK_SIZE"]};
|
||||
const auto global = std::vector<size_t>{1};
|
||||
auto event = Event();
|
||||
RunKernel(kernel, queue_, device_, global, local, event.pointer());
|
||||
event.WaitForCompletion();
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// The main routine
|
||||
template <typename T>
|
||||
void Xtrsv<T>::DoTrsv(const Layout layout, const Triangle triangle,
|
||||
const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t n,
|
||||
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc) {
|
||||
|
||||
// Makes sure all dimensions are larger than zero
|
||||
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
|
||||
|
||||
// Tests the matrix and vector
|
||||
TestMatrixA(n, n, a_buffer, a_offset, a_ld);
|
||||
TestVectorX(n, b_buffer, b_offset, b_inc);
|
||||
|
||||
// Creates a copy of B to avoid overwriting input while computing output
|
||||
// TODO: Make x with 0 offset and unit increment by creating custom copy-to and copy-from kernels
|
||||
const auto x_offset = b_offset;
|
||||
const auto x_inc = b_inc;
|
||||
const auto x_size = n*x_inc + x_offset;
|
||||
auto x_buffer = Buffer<T>(context_, x_size);
|
||||
b_buffer.CopyTo(queue_, x_size, x_buffer);
|
||||
|
||||
// Fills the output buffer with zeros
|
||||
auto eventWaitList = std::vector<Event>();
|
||||
auto fill_vector_event = Event();
|
||||
FillVector(queue_, device_, program_, db_, fill_vector_event.pointer(), eventWaitList,
|
||||
n, x_inc, x_offset, x_buffer, ConstantZero<T>());
|
||||
fill_vector_event.WaitForCompletion();
|
||||
|
||||
// Derives properties based on the arguments
|
||||
const auto is_upper = ((triangle == Triangle::kUpper && a_transpose == Transpose::kNo) ||
|
||||
(triangle == Triangle::kLower && a_transpose != Transpose::kNo));
|
||||
const auto is_transposed = ((layout == Layout::kColMajor && a_transpose == Transpose::kNo) ||
|
||||
(layout != Layout::kColMajor && a_transpose != Transpose::kNo));
|
||||
|
||||
// Loops over the blocks
|
||||
auto col = n; // the initial column position
|
||||
for (auto i = size_t{0}; i < n; i += db_["TRSV_BLOCK_SIZE"]) {
|
||||
const auto block_size = std::min(db_["TRSV_BLOCK_SIZE"], n - i);
|
||||
|
||||
// Sets the next column position
|
||||
col = (is_upper) ? col - block_size : i;
|
||||
|
||||
// Sets the offsets for upper or lower triangular
|
||||
const auto extra_offset_a = (is_transposed) ?
|
||||
(is_upper ? col + (col+block_size)*a_ld : col) :
|
||||
(is_upper ? col+block_size + col*a_ld : col*a_ld);
|
||||
const auto extra_offset_x = (is_upper) ? (col+block_size)*x_inc : 0;
|
||||
const auto extra_offset_b = col*x_inc;
|
||||
|
||||
// Runs the GEMV routine to compute x' = A * x
|
||||
if (i > 0) {
|
||||
const auto gemv_m = (a_transpose == Transpose::kNo) ? block_size : i;
|
||||
const auto gemv_n = (a_transpose == Transpose::kNo) ? i : block_size;
|
||||
DoGemv(layout, a_transpose, gemv_m, gemv_n, ConstantOne<T>(),
|
||||
a_buffer, a_offset + extra_offset_a, a_ld,
|
||||
x_buffer, x_offset + extra_offset_x, x_inc, ConstantOne<T>(),
|
||||
x_buffer, x_offset + extra_offset_b, x_inc );
|
||||
}
|
||||
|
||||
// Runs the triangular substitution for the block size
|
||||
Substitution(layout, triangle, a_transpose, diagonal, block_size,
|
||||
a_buffer, a_offset + col + col*a_ld, a_ld,
|
||||
b_buffer, b_offset + col*b_inc, b_inc,
|
||||
x_buffer, x_offset + col*x_inc, x_inc);
|
||||
}
|
||||
|
||||
// Retrieves the results
|
||||
x_buffer.CopyTo(queue_, x_size, b_buffer);
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Compiles the templated class
|
||||
template class Xtrsv<half>;
|
||||
template class Xtrsv<float>;
|
||||
template class Xtrsv<double>;
|
||||
template class Xtrsv<float2>;
|
||||
template class Xtrsv<double2>;
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
60
src/routines/level2/xtrsv.hpp
Normal file
60
src/routines/level2/xtrsv.hpp
Normal file
|
@ -0,0 +1,60 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file implements the Xtrsv routine. It uses a block-algorithm and performs small triangular
|
||||
// forward and backward substitutions on the diagonal parts of the matrix in combination with larger
|
||||
// GEMV computation on the remainder of the matrix.
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#ifndef CLBLAST_ROUTINES_XTRSV_H_
|
||||
#define CLBLAST_ROUTINES_XTRSV_H_
|
||||
|
||||
#include "routines/level2/xgemv.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// See comment at top of file for a description of the class
|
||||
template <typename T>
|
||||
class Xtrsv: public Xgemv<T> {
|
||||
public:
|
||||
|
||||
// Uses the generic matrix-vector routine
|
||||
using Xgemv<T>::queue_;
|
||||
using Xgemv<T>::context_;
|
||||
using Xgemv<T>::device_;
|
||||
using Xgemv<T>::db_;
|
||||
using Xgemv<T>::program_;
|
||||
using Xgemv<T>::DoGemv;
|
||||
|
||||
// Constructor
|
||||
Xtrsv(Queue &queue, EventPointer event, const std::string &name = "TRSV");
|
||||
|
||||
// Templated-precision implementation of the routine
|
||||
void DoTrsv(const Layout layout, const Triangle triangle,
|
||||
const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t n,
|
||||
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
|
||||
|
||||
// Performs forward or backward substitution on a small triangular matrix
|
||||
void Substitution(const Layout layout, const Triangle triangle,
|
||||
const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t n,
|
||||
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc,
|
||||
const Buffer<T> &x_buffer, const size_t offset_x, const size_t x_inc);
|
||||
};
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
||||
|
||||
// CLBLAST_ROUTINES_XTRSV_H_
|
||||
#endif
|
|
@ -33,10 +33,11 @@ Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):
|
|||
#include "../../kernels/level3/convert_symmetric.opencl"
|
||||
#include "../../kernels/level3/convert_triangular.opencl"
|
||||
#include "../../kernels/level3/convert_hermitian.opencl"
|
||||
, // separated in multiple parts to prevent C1091 in MSVC 2013
|
||||
#include "../../kernels/level3/xgemm_direct_part1.opencl"
|
||||
#include "../../kernels/level3/xgemm_direct_part2.opencl"
|
||||
#include "../../kernels/level3/xgemm_direct_part3.opencl"
|
||||
, // separated in two parts to prevent C1091 in MSVC 2013
|
||||
, // separated in multiple parts to prevent C1091 in MSVC 2013
|
||||
#include "../../kernels/level3/xgemm_part1.opencl"
|
||||
#include "../../kernels/level3/xgemm_part2.opencl"
|
||||
#include "../../kernels/level3/xgemm_part3.opencl"
|
||||
|
|
227
src/routines/level3/xtrsm.cpp
Normal file
227
src/routines/level3/xtrsm.cpp
Normal file
|
@ -0,0 +1,227 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file implements the triangular matrix solver (A * X = B) TRSM class. This code is based
|
||||
// on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular
|
||||
// Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek,
|
||||
// and Jack Dongarra and the OpenCL implementation in clBLAS.
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "routines/level3/xtrsm.hpp"
|
||||
#include "routines/levelx/xinvert.hpp"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Constructor: forwards to base class constructor
|
||||
template <typename T>
|
||||
Xtrsm<T>::Xtrsm(Queue &queue, EventPointer event, const std::string &name):
|
||||
Xgemm<T>(queue, event, name) {
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// The entry point: transforming into col-major (if needed) and then running the col-major version
|
||||
template <typename T>
|
||||
void Xtrsm<T>::DoTrsm(const Layout layout, Side side, Triangle triangle,
|
||||
const Transpose a_transpose, const Diagonal diagonal,
|
||||
size_t m, size_t n,
|
||||
const T alpha,
|
||||
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) {
|
||||
|
||||
// Converts row-major to a col-major problem:
|
||||
// The idea is that
|
||||
// B = A*X
|
||||
// can be computed as
|
||||
// B' = (A*X)' = X'*A'
|
||||
// Since changing the order is basically a transpose on each matrix, the formula becomes:
|
||||
// B = X*A
|
||||
// So only the side (left/right) and the triangle (upper/lower) are changed and M/N are swapped
|
||||
if (layout == Layout::kRowMajor) {
|
||||
std::swap(m, n);
|
||||
side = (side == Side::kLeft) ? Side::kRight : Side::kLeft;
|
||||
triangle = (triangle == Triangle::kLower) ? Triangle::kUpper : Triangle::kLower;
|
||||
}
|
||||
|
||||
// Runs the col-major version of TRSM
|
||||
TrsmColMajor(side, triangle, a_transpose, diagonal,
|
||||
m, n, alpha,
|
||||
a_buffer, a_offset, a_ld,
|
||||
b_buffer, b_offset, b_ld);
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// The main routine
|
||||
template <typename T>
|
||||
void Xtrsm<T>::TrsmColMajor(const Side side, const Triangle triangle,
|
||||
const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t m, const size_t n,
|
||||
const T alpha,
|
||||
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) {
|
||||
|
||||
// Settings
|
||||
constexpr auto block_size = size_t{32}; // tuneable
|
||||
|
||||
// Makes sure all dimensions are larger than zero
|
||||
if ((m == 0) || (n == 0)) { throw BLASError(StatusCode::kInvalidDimension); }
|
||||
|
||||
// Computes the k dimension. This is based on whether or not matrix is A (on the left)
|
||||
// or B (on the right) in the Xgemm routine.
|
||||
const auto k = (side == Side::kLeft) ? m : n;
|
||||
|
||||
// Checks for validity of the triangular A matrix
|
||||
TestMatrixA(k, k, a_buffer, a_offset, a_ld);
|
||||
|
||||
// Checks for validity of the input B matrix
|
||||
TestMatrixB(m, n, b_buffer, b_offset, b_ld);
|
||||
|
||||
// Creates a copy of B to avoid overwriting input in GEMM while computing output
|
||||
const auto b_size = b_ld * (n - 1) + m + b_offset;
|
||||
const auto x_one = m;
|
||||
const auto x_two = n;
|
||||
const auto x_size = b_size;
|
||||
const auto x_ld = b_ld;
|
||||
const auto x_offset = b_offset;
|
||||
auto x_buffer = Buffer<T>(context_, x_size);
|
||||
b_buffer.CopyTo(queue_, x_size, x_buffer);
|
||||
|
||||
// Temporary buffer for the inverse of the A matrix
|
||||
const auto a_inv_size = Ceil(k, block_size) * block_size;
|
||||
auto a_inv_buffer = Buffer<T>(context_, a_inv_size);
|
||||
|
||||
// Fills the output buffer with zeros
|
||||
auto eventWaitList = std::vector<Event>();
|
||||
auto fill_matrix_event = Event();
|
||||
FillMatrix(queue_, device_, program_, db_, fill_matrix_event.pointer(), eventWaitList,
|
||||
x_one, x_two, x_ld, x_offset, x_buffer, ConstantZero<T>());
|
||||
fill_matrix_event.WaitForCompletion();
|
||||
|
||||
// Inverts the diagonal blocks
|
||||
auto diagonal_invert_event = Event();
|
||||
auto inverter = Xinvert<T>(queue_, diagonal_invert_event.pointer());
|
||||
inverter.InvertMatrixDiagonalBlocks(Layout::kColMajor, triangle, diagonal,
|
||||
k, block_size, a_buffer, a_offset, a_ld, a_inv_buffer);
|
||||
diagonal_invert_event.WaitForCompletion();
|
||||
|
||||
// Derives properties based on the arguments
|
||||
const auto condition = ((triangle == Triangle::kUpper && a_transpose != Transpose::kNo) ||
|
||||
(triangle == Triangle::kLower && a_transpose == Transpose::kNo));
|
||||
|
||||
// Left side
|
||||
if (side == Side::kLeft) {
|
||||
|
||||
// True when (lower triangular) or (upper triangular and transposed)
|
||||
if (condition) {
|
||||
for (auto i = size_t{0}; i < m; i += block_size) {
|
||||
const auto gemm_alpha = (i == 0) ? alpha : ConstantOne<T>();
|
||||
const auto current_block_size = std::min(m - i, block_size);
|
||||
DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo,
|
||||
current_block_size, n, current_block_size, gemm_alpha,
|
||||
a_inv_buffer, i * block_size, block_size,
|
||||
b_buffer, i, b_ld, ConstantZero<T>(),
|
||||
x_buffer, i, x_ld);
|
||||
if (i + block_size >= m) { break; }
|
||||
const auto this_a_offset = (a_transpose == Transpose::kNo) ? (i + block_size) + i * a_ld : i + (block_size + i) * a_ld;
|
||||
DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo,
|
||||
m - i - block_size, n, block_size, ConstantNegOne<T>(),
|
||||
a_buffer, this_a_offset, a_ld,
|
||||
x_buffer, i, x_ld, ConstantOne<T>(),
|
||||
b_buffer, i + block_size, b_ld);
|
||||
}
|
||||
}
|
||||
|
||||
// True when (upper triangular) or (lower triangular and transposed)
|
||||
else {
|
||||
const auto current_block_size = (m % block_size == 0) ? block_size : (m % block_size);
|
||||
const auto i_start = static_cast<int>(m) - static_cast<int>(current_block_size);
|
||||
for (auto i = i_start; i >= 0; i -= static_cast<int>(block_size)) {
|
||||
const auto gemm_alpha = (i == i_start) ? alpha : ConstantOne<T>();
|
||||
DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo,
|
||||
current_block_size, n, current_block_size, gemm_alpha,
|
||||
a_inv_buffer, i * block_size, block_size,
|
||||
b_buffer, i, b_ld, ConstantZero<T>(),
|
||||
x_buffer, i, x_ld);
|
||||
if (i - static_cast<int>(block_size) < 0) { break; }
|
||||
const auto this_a_offset = (a_transpose == Transpose::kNo) ? i * a_ld : i;
|
||||
DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo,
|
||||
i, n, block_size, ConstantNegOne<T>(),
|
||||
a_buffer, this_a_offset, a_ld,
|
||||
x_buffer, i, x_ld, ConstantOne<T>(),
|
||||
b_buffer, 0, b_ld);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Right side
|
||||
else {
|
||||
|
||||
// True when (lower triangular) or (upper triangular and transposed)
|
||||
if (condition) {
|
||||
const auto current_block_size = (n % block_size == 0) ? block_size : (n % block_size);
|
||||
const auto i_start = static_cast<int>(n) - static_cast<int>(current_block_size);
|
||||
for (auto i = i_start; i >= 0; i -= static_cast<int>(block_size)) {
|
||||
const auto gemm_alpha = (i == i_start) ? alpha : ConstantOne<T>();
|
||||
DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose,
|
||||
m, current_block_size, current_block_size, gemm_alpha,
|
||||
b_buffer, i * b_ld, b_ld,
|
||||
a_inv_buffer, i * block_size, block_size, ConstantZero<T>(),
|
||||
x_buffer, i * x_ld, x_ld);
|
||||
if (i - static_cast<int>(block_size) < 0) { break; }
|
||||
const auto this_a_offset = (a_transpose == Transpose::kNo) ? i : i * a_ld;
|
||||
DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose,
|
||||
m, i, current_block_size, ConstantNegOne<T>(),
|
||||
x_buffer, i * x_ld, x_ld,
|
||||
a_buffer, this_a_offset, a_ld, ConstantOne<T>(),
|
||||
b_buffer, 0, b_ld);
|
||||
}
|
||||
}
|
||||
|
||||
// True when (upper triangular) or (lower triangular and transposed)
|
||||
else {
|
||||
for (auto i = size_t{0}; i < n; i += block_size) {
|
||||
const auto gemm_alpha = (i == 0) ? alpha : ConstantOne<T>();
|
||||
const auto current_block_size = std::min(n - i, block_size);
|
||||
DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose,
|
||||
m, current_block_size, current_block_size, gemm_alpha,
|
||||
b_buffer, i * b_ld, b_ld,
|
||||
a_inv_buffer, i * block_size, block_size, ConstantZero<T>(),
|
||||
x_buffer, i * x_ld, x_ld);
|
||||
if (i + block_size >= n) { break; }
|
||||
const auto this_a_offset = (a_transpose == Transpose::kNo) ? i + (block_size + i) * a_ld : (i + block_size) + i * a_ld;
|
||||
DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose,
|
||||
m, n - i - block_size, block_size, ConstantNegOne<T>(),
|
||||
x_buffer, i * x_ld, x_ld,
|
||||
a_buffer, this_a_offset, a_ld, ConstantOne<T>(),
|
||||
b_buffer, (i + block_size) * b_ld, b_ld);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Retrieves the results
|
||||
x_buffer.CopyTo(queue_, b_size, b_buffer);
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Compiles the templated class
|
||||
template class Xtrsm<half>;
|
||||
template class Xtrsm<float>;
|
||||
template class Xtrsm<double>;
|
||||
template class Xtrsm<float2>;
|
||||
template class Xtrsm<double2>;
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
60
src/routines/level3/xtrsm.hpp
Normal file
60
src/routines/level3/xtrsm.hpp
Normal file
|
@ -0,0 +1,60 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file implements the Xtrsm routine. The implementation is based on ??? (TODO).
|
||||
// Therefore, this class inherits from the Xgemm class.
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#ifndef CLBLAST_ROUTINES_XTRSM_H_
|
||||
#define CLBLAST_ROUTINES_XTRSM_H_
|
||||
|
||||
#include "routines/level3/xgemm.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// See comment at top of file for a description of the class
|
||||
template <typename T>
|
||||
class Xtrsm: public Xgemm<T> {
|
||||
public:
|
||||
|
||||
// Uses methods and variables the Xgemm routine
|
||||
using Xgemm<T>::queue_;
|
||||
using Xgemm<T>::context_;
|
||||
using Xgemm<T>::device_;
|
||||
using Xgemm<T>::db_;
|
||||
using Xgemm<T>::program_;
|
||||
using Xgemm<T>::DoGemm;
|
||||
|
||||
// Constructor
|
||||
Xtrsm(Queue &queue, EventPointer event, const std::string &name = "TRSM");
|
||||
|
||||
// Templated-precision implementation of the routine
|
||||
void DoTrsm(const Layout layout, Side side, Triangle triangle,
|
||||
const Transpose a_transpose, const Diagonal diagonal,
|
||||
size_t m, size_t n,
|
||||
const T alpha,
|
||||
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld);
|
||||
|
||||
// Implementation of the column-major version
|
||||
void TrsmColMajor(const Side side, const Triangle triangle,
|
||||
const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t m, const size_t n,
|
||||
const T alpha,
|
||||
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld);
|
||||
};
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
||||
|
||||
// CLBLAST_ROUTINES_XTRSM_H_
|
||||
#endif
|
151
src/routines/levelx/xinvert.cpp
Normal file
151
src/routines/levelx/xinvert.cpp
Normal file
|
@ -0,0 +1,151 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file contains all the common code to perform (partial) matrix inverting. This code is based
|
||||
// on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular
|
||||
// Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek,
|
||||
// and Jack Dongarra.
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "routines/levelx/xinvert.hpp"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <assert.h>
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Constructor: forwards to base class constructor
|
||||
template <typename T>
|
||||
Xinvert<T>::Xinvert(Queue &queue, EventPointer event, const std::string &name):
|
||||
Routine(queue, event, name, {"Invert"}, PrecisionValue<T>(), {}, {
|
||||
#include "../../kernels/level3/level3.opencl"
|
||||
#include "../../kernels/level3/invert_diagonal_blocks.opencl"
|
||||
}) {
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Inverts diagonal square blocks of a matrix
|
||||
template <typename T>
|
||||
void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle triangle, const Diagonal diag,
|
||||
const size_t n, const size_t block_size,
|
||||
const Buffer<T> &src, const size_t offset, const size_t ld_src,
|
||||
Buffer<T> &dest) {
|
||||
|
||||
// Makes sure all dimensions are larger than zero
|
||||
if ((block_size == 0) || (n == 0)) {
|
||||
throw BLASError(StatusCode::kInvalidDimension);
|
||||
}
|
||||
|
||||
// Helper variables
|
||||
const auto internal_block_size = static_cast<size_t>(db_["INTERNAL_BLOCK_SIZE"]);
|
||||
assert(internal_block_size == 16);
|
||||
const auto num_blocks = CeilDiv(n, block_size);
|
||||
const auto num_internal_blocks = CeilDiv(n, internal_block_size);
|
||||
const auto unit_diagonal = (diag == Diagonal::kUnit) ? true : false;
|
||||
|
||||
// This routine only supports block sizes which are a multiple of the internal block size and
|
||||
// block sizes up to and including 128
|
||||
if ((block_size % internal_block_size != 0) || (block_size > 128)) {
|
||||
throw BLASError(StatusCode::kUnknownError);
|
||||
}
|
||||
|
||||
// Checks for validity of the source and destination matrices
|
||||
TestMatrixA(n, n, src, offset, ld_src);
|
||||
TestMatrixB(block_size, num_blocks * block_size, dest, 0, block_size);
|
||||
|
||||
// Determines which kernels to run based on the layout (the kernels assume column-major as
|
||||
// default) and on whether we are dealing with an upper or lower triangle of the triangular matrix
|
||||
const bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
|
||||
(triangle == Triangle::kLower && layout == Layout::kRowMajor));
|
||||
const auto name_postfix = (is_upper) ? "Upper" : "Lower";
|
||||
|
||||
// Fills the output buffer with zeros
|
||||
auto event_wait_list = std::vector<Event>();
|
||||
auto fill_matrix_event = Event();
|
||||
FillMatrix(queue_, device_, program_, db_, fill_matrix_event.pointer(), event_wait_list,
|
||||
block_size, num_blocks * block_size, block_size, 0, dest, ConstantZero<T>());
|
||||
event_wait_list.push_back(fill_matrix_event);
|
||||
|
||||
// Inverts the diagonal IB by IB inner blocks of the matrix: one block per work-group
|
||||
auto kernel = Kernel(program_, "InvertDiagonalBlock");
|
||||
kernel.SetArgument(0, static_cast<int>(n));
|
||||
kernel.SetArgument(1, src());
|
||||
kernel.SetArgument(2, static_cast<int>(offset));
|
||||
kernel.SetArgument(3, static_cast<int>(ld_src));
|
||||
kernel.SetArgument(4, dest());
|
||||
kernel.SetArgument(5, static_cast<int>(block_size));
|
||||
kernel.SetArgument(6, static_cast<int>(unit_diagonal));
|
||||
kernel.SetArgument(7, static_cast<int>(is_upper));
|
||||
const auto local = std::vector<size_t>{internal_block_size};
|
||||
const auto global = std::vector<size_t>{num_internal_blocks * internal_block_size};
|
||||
auto base_kernel_event = Event();
|
||||
auto base_kernel_event_pointer = (internal_block_size == block_size) ? event_ : base_kernel_event.pointer();
|
||||
RunKernel(kernel, queue_, device_, global, local, base_kernel_event_pointer, event_wait_list);
|
||||
if (internal_block_size == block_size) { event_wait_list.push_back(base_kernel_event); }
|
||||
|
||||
// Builds up block_size x block_size blocks. For example, internal_block_size=16:
|
||||
// use 16 x 16 blocks to build 32 x 32 blocks, 1 x (1 x npages) grid, 4 x 4 threads;
|
||||
// then 32 x 32 blocks to build 64 x 64 blocks, 1 x (2 x npages) grid, 8 x 4 threads;
|
||||
// then 64 x 64 blocks to build 128 x 128 blocks, 1 x (4 x npages) grid, 16 x 4 threads;
|
||||
for (auto current_size = internal_block_size; current_size < block_size; current_size *= 2) {
|
||||
assert(current_size == 16 || current_size == 32 || current_size == 64);
|
||||
|
||||
// Emulates a 3D grid: NX * (NY * npages)
|
||||
const auto npages = CeilDiv(n, current_size*2);
|
||||
const auto local0 = (current_size <= 32) ? current_size/4 : 16;
|
||||
const auto local = std::vector<size_t>{local0, 4};
|
||||
const auto global = std::vector<size_t>{(current_size/local[1]), npages*(current_size/16)*local[1]};
|
||||
|
||||
// Part 1
|
||||
auto kernel1 = Kernel(program_, "TripleMatMul" + ToString(current_size) + "Part1" + name_postfix);
|
||||
kernel1.SetArgument(0, static_cast<int>(n));
|
||||
kernel1.SetArgument(1, src());
|
||||
kernel1.SetArgument(2, static_cast<int>(offset));
|
||||
kernel1.SetArgument(3, static_cast<int>(ld_src));
|
||||
kernel1.SetArgument(4, dest());
|
||||
kernel1.SetArgument(5, static_cast<int>(current_size));
|
||||
kernel1.SetArgument(6, static_cast<int>(npages));
|
||||
kernel1.SetArgument(7, static_cast<int>(block_size));
|
||||
auto kernel1_event = Event();
|
||||
RunKernel(kernel1, queue_, device_, global, local, kernel1_event.pointer(), event_wait_list);
|
||||
event_wait_list.push_back(kernel1_event);
|
||||
|
||||
// Part 2
|
||||
const bool is_last_kernel = (current_size * 2 >= block_size);
|
||||
auto kernel2 = Kernel(program_, "TripleMatMul" + ToString(current_size) + "Part2" + name_postfix);
|
||||
kernel2.SetArgument(0, static_cast<int>(n));
|
||||
kernel2.SetArgument(1, dest());
|
||||
kernel2.SetArgument(2, static_cast<int>(current_size));
|
||||
kernel2.SetArgument(3, static_cast<int>(npages));
|
||||
kernel2.SetArgument(4, static_cast<int>(block_size));
|
||||
auto kernel2_event = Event();
|
||||
auto kernel2_event_pointer = (is_last_kernel) ? event_ : kernel2_event.pointer();
|
||||
RunKernel(kernel2, queue_, device_, global, local, kernel2_event_pointer, event_wait_list);
|
||||
if (!is_last_kernel) { event_wait_list.push_back(kernel2_event); }
|
||||
|
||||
// Exit in case we reach beyond the bounds of the input matrix
|
||||
if (current_size*2 >= n) { break; }
|
||||
}
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Compiles the templated class
|
||||
template class Xinvert<half>;
|
||||
template class Xinvert<float>;
|
||||
template class Xinvert<double>;
|
||||
template class Xinvert<float2>;
|
||||
template class Xinvert<double2>;
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
40
src/routines/levelx/xinvert.hpp
Normal file
40
src/routines/levelx/xinvert.hpp
Normal file
|
@ -0,0 +1,40 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file contains all the common code to perform (partial) matrix inverting.
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#ifndef CLBLAST_ROUTINES_XINVERT_H_
|
||||
#define CLBLAST_ROUTINES_XINVERT_H_
|
||||
|
||||
#include "routine.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
template <typename T>
|
||||
class Xinvert: public Routine {
|
||||
public:
|
||||
|
||||
// Constructor
|
||||
Xinvert(Queue &queue, EventPointer event, const std::string &name = "INVERT");
|
||||
|
||||
// Inverts diagonal square blocks of a matrix
|
||||
void InvertMatrixDiagonalBlocks(const Layout layout, const Triangle triangle, const Diagonal diag,
|
||||
const size_t n, const size_t block_size,
|
||||
const Buffer<T> &src, const size_t offset, const size_t ld_src,
|
||||
Buffer<T> &dest);
|
||||
};
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
||||
|
||||
// CLBLAST_ROUTINES_XINVERT_H_
|
||||
#endif
|
|
@ -18,57 +18,74 @@
|
|||
#include <chrono>
|
||||
#include <random>
|
||||
#include <iomanip>
|
||||
#include <cmath>
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Returns a scalar with a default value
|
||||
template <typename T>
|
||||
T GetScalar() {
|
||||
return static_cast<T>(2.0);
|
||||
}
|
||||
template <typename T> T GetScalar() { return static_cast<T>(2.0); }
|
||||
template float GetScalar<float>();
|
||||
template double GetScalar<double>();
|
||||
template <> half GetScalar() { return FloatToHalf(2.0f); }
|
||||
template <> float2 GetScalar() { return {2.0f, 0.5f}; }
|
||||
template <> double2 GetScalar() { return {2.0, 0.5}; }
|
||||
|
||||
// Specialized version of the above for half-precision
|
||||
template <>
|
||||
half GetScalar() {
|
||||
return FloatToHalf(2.0f);
|
||||
}
|
||||
|
||||
// Specialized versions of the above for complex data-types
|
||||
template <>
|
||||
float2 GetScalar() {
|
||||
return {2.0f, 0.5f};
|
||||
}
|
||||
template <>
|
||||
double2 GetScalar() {
|
||||
return {2.0, 0.5};
|
||||
}
|
||||
// Returns a scalar of value 0
|
||||
template <typename T> T ConstantZero() { return static_cast<T>(0.0); }
|
||||
template float ConstantZero<float>();
|
||||
template double ConstantZero<double>();
|
||||
template <> half ConstantZero() { return FloatToHalf(0.0f); }
|
||||
template <> float2 ConstantZero() { return {0.0f, 0.0f}; }
|
||||
template <> double2 ConstantZero() { return {0.0, 0.0}; }
|
||||
|
||||
// Returns a scalar of value 1
|
||||
template <typename T>
|
||||
T ConstantOne() {
|
||||
return static_cast<T>(1.0);
|
||||
}
|
||||
template <typename T> T ConstantOne() { return static_cast<T>(1.0); }
|
||||
template float ConstantOne<float>();
|
||||
template double ConstantOne<double>();
|
||||
template <> half ConstantOne() { return FloatToHalf(1.0f); }
|
||||
template <> float2 ConstantOne() { return {1.0f, 0.0f}; }
|
||||
template <> double2 ConstantOne() { return {1.0, 0.0}; }
|
||||
|
||||
// Specialized version of the above for half-precision
|
||||
template <>
|
||||
half ConstantOne() {
|
||||
return FloatToHalf(1.0f);
|
||||
}
|
||||
// Returns a scalar of value -1
|
||||
template <typename T> T ConstantNegOne() { return static_cast<T>(-1.0); }
|
||||
template float ConstantNegOne<float>();
|
||||
template double ConstantNegOne<double>();
|
||||
template <> half ConstantNegOne() { return FloatToHalf(-1.0f); }
|
||||
template <> float2 ConstantNegOne() { return {-1.0f, 0.0f}; }
|
||||
template <> double2 ConstantNegOne() { return {-1.0, 0.0}; }
|
||||
|
||||
// Specialized versions of the above for complex data-types
|
||||
template <>
|
||||
float2 ConstantOne() {
|
||||
return {1.0f, 0.0f};
|
||||
}
|
||||
template <>
|
||||
double2 ConstantOne() {
|
||||
return {1.0, 0.0};
|
||||
}
|
||||
// Returns a scalar of value 1
|
||||
template <typename T> T ConstantTwo() { return static_cast<T>(2.0); }
|
||||
template float ConstantTwo<float>();
|
||||
template double ConstantTwo<double>();
|
||||
template <> half ConstantTwo() { return FloatToHalf(2.0f); }
|
||||
template <> float2 ConstantTwo() { return {2.0f, 0.0f}; }
|
||||
template <> double2 ConstantTwo() { return {2.0, 0.0}; }
|
||||
|
||||
// Returns a small scalar value just larger than 0
|
||||
template <typename T> T SmallConstant() { return static_cast<T>(1e7); }
|
||||
template float SmallConstant<float>();
|
||||
template double SmallConstant<double>();
|
||||
template <> half SmallConstant() { return FloatToHalf(1e7); }
|
||||
template <> float2 SmallConstant() { return {1e7, 0.0f}; }
|
||||
template <> double2 SmallConstant() { return {1e7, 0.0}; }
|
||||
|
||||
// Returns the absolute value of a scalar
|
||||
template <typename T> T AbsoluteValue(const T value) { return std::fabs(value); }
|
||||
template float AbsoluteValue<float>(const float);
|
||||
template double AbsoluteValue<double>(const double);
|
||||
template <> half AbsoluteValue(const half value) { return FloatToHalf(std::fabs(HalfToFloat(value))); }
|
||||
template <> float2 AbsoluteValue(const float2 value) { return std::abs(value); }
|
||||
template <> double2 AbsoluteValue(const double2 value) { return std::abs(value); }
|
||||
|
||||
// Returns whether a scalar is close to zero
|
||||
template <typename T> bool IsCloseToZero(const T value) { return (value > -SmallConstant<T>()) && (value < SmallConstant<T>()); }
|
||||
template bool IsCloseToZero<float>(const float);
|
||||
template bool IsCloseToZero<double>(const double);
|
||||
template <> bool IsCloseToZero(const half value) { return IsCloseToZero(HalfToFloat(value)); }
|
||||
template <> bool IsCloseToZero(const float2 value) { return IsCloseToZero(value.real()) || IsCloseToZero(value.imag()); }
|
||||
template <> bool IsCloseToZero(const double2 value) { return IsCloseToZero(value.real()) || IsCloseToZero(value.imag()); }
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
|
@ -79,23 +96,27 @@ std::string ToString(T value) {
|
|||
}
|
||||
template std::string ToString<int>(int value);
|
||||
template std::string ToString<size_t>(size_t value);
|
||||
template std::string ToString<float>(float value);
|
||||
template std::string ToString<double>(double value);
|
||||
template <>
|
||||
std::string ToString(float value) {
|
||||
std::ostringstream result;
|
||||
result << std::fixed << std::setprecision(2) << value;
|
||||
return result.str();
|
||||
}
|
||||
template <>
|
||||
std::string ToString(double value) {
|
||||
std::ostringstream result;
|
||||
result << std::fixed << std::setprecision(2) << value;
|
||||
return result.str();
|
||||
}
|
||||
|
||||
// If not possible directly: special cases for complex data-types
|
||||
template <>
|
||||
std::string ToString(float2 value) {
|
||||
std::ostringstream real, imag;
|
||||
real << std::setprecision(2) << value.real();
|
||||
imag << std::setprecision(2) << value.imag();
|
||||
return real.str()+"+"+imag.str()+"i";
|
||||
return ToString(value.real())+"+"+ToString(value.imag())+"i";
|
||||
}
|
||||
template <>
|
||||
std::string ToString(double2 value) {
|
||||
std::ostringstream real, imag;
|
||||
real << std::setprecision(2) << value.real();
|
||||
imag << std::setprecision(2) << value.imag();
|
||||
return real.str()+"+"+imag.str()+"i";
|
||||
return ToString(value.real())+"+"+ToString(value.imag())+"i";
|
||||
}
|
||||
|
||||
// If not possible directly: special case for half-precision
|
||||
|
|
|
@ -99,12 +99,20 @@ constexpr auto kArgNoAbbreviations = "no_abbrv";
|
|||
// =================================================================================================
|
||||
|
||||
// Returns a scalar with a default value
|
||||
template <typename T>
|
||||
T GetScalar();
|
||||
template <typename T> T GetScalar();
|
||||
|
||||
// Returns a scalar of value 1
|
||||
template <typename T>
|
||||
T ConstantOne();
|
||||
// Fixed value scalars
|
||||
template <typename T> T ConstantZero();
|
||||
template <typename T> T ConstantOne();
|
||||
template <typename T> T ConstantNegOne();
|
||||
template <typename T> T ConstantTwo();
|
||||
template <typename T> T SmallConstant();
|
||||
|
||||
// Returns the absolute value of a scalar
|
||||
template <typename T> T AbsoluteValue(const T value);
|
||||
|
||||
// Returns whether a scalar is close to zero
|
||||
template <typename T> bool IsCloseToZero(const T value);
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
|
|
|
@ -23,7 +23,6 @@ int main(int argc, char *argv[]) {
|
|||
errors += clblast::RunTests<clblast::TestXtrsm<double>, double, double>(argc, argv, true, "DTRSM");
|
||||
errors += clblast::RunTests<clblast::TestXtrsm<float2>, float2, float2>(argc, argv, true, "CTRSM");
|
||||
errors += clblast::RunTests<clblast::TestXtrsm<double2>, double2, double2>(argc, argv, true, "ZTRSM");
|
||||
errors += clblast::RunTests<clblast::TestXtrsm<half>, half, half>(argc, argv, true, "HTRSM");
|
||||
if (errors > 0) { return 1; } else { return 0; }
|
||||
}
|
||||
|
||||
|
|
30
test/correctness/routines/levelx/xinvert.cpp
Normal file
30
test/correctness/routines/levelx/xinvert.cpp
Normal file
|
@ -0,0 +1,30 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "test/correctness/testblas.hpp"
|
||||
#include "test/routines/levelx/xinvert.hpp"
|
||||
|
||||
// Shortcuts to the clblast namespace
|
||||
using float2 = clblast::float2;
|
||||
using double2 = clblast::double2;
|
||||
|
||||
// Main function (not within the clblast namespace)
|
||||
int main(int argc, char *argv[]) {
|
||||
auto errors = size_t{0};
|
||||
errors += clblast::RunTests<clblast::TestXinvert<float>, float, float>(argc, argv, false, "SINVERT");
|
||||
errors += clblast::RunTests<clblast::TestXinvert<double>, double, double>(argc, argv, true, "DINVERT");
|
||||
errors += clblast::RunTests<clblast::TestXinvert<float2>, float2, float2>(argc, argv, true, "CINVERT");
|
||||
errors += clblast::RunTests<clblast::TestXinvert<double2>, double2, double2>(argc, argv, true, "ZINVERT");
|
||||
errors += clblast::RunTests<clblast::TestXinvert<half>, half, half>(argc, argv, true, "HINVERT");
|
||||
if (errors > 0) { return 1; } else { return 0; }
|
||||
}
|
||||
|
||||
// =================================================================================================
|
|
@ -138,7 +138,10 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
|
|||
|
||||
// Don't continue with CBLAS if there are incorrect parameters
|
||||
if (compare_cblas_ && status2 != StatusCode::kSuccess) {
|
||||
if (verbose_) { fprintf(stdout, " -> "); std::cout << std::flush; }
|
||||
if (verbose_) {
|
||||
fprintf(stdout, " -> %d -> ", static_cast<int>(status2));
|
||||
std::cout << std::flush;
|
||||
}
|
||||
TestErrorCodes(status2, status2, args);
|
||||
continue;
|
||||
}
|
||||
|
|
|
@ -40,6 +40,12 @@ float getAbsoluteErrorMargin<half>() {
|
|||
return 0.10f; // especially small values are inaccurate for half-precision
|
||||
}
|
||||
|
||||
// Error margin: numbers beyond this value are considered equal to inf or NaN
|
||||
template <typename T>
|
||||
T getAlmostInfNumber() {
|
||||
return T{1e35}; // used for correctness testing of TRSV and TRSM routines
|
||||
}
|
||||
|
||||
// Maximum number of test results printed on a single line
|
||||
template <typename T, typename U> const size_t Tester<T,U>::kResultsPerLine = size_t{64};
|
||||
|
||||
|
@ -426,6 +432,21 @@ bool TestSimilarityNear(const T val1, const T val2,
|
|||
if (val1 == val2) {
|
||||
return true;
|
||||
}
|
||||
// Handles cases with both results NaN or inf
|
||||
else if ((std::isnan(val1) && std::isnan(val2)) || (std::isinf(val1) && std::isinf(val2))) {
|
||||
return true;
|
||||
}
|
||||
// Also considers it OK if one of the results in NaN and the other is inf
|
||||
// Note: for TRSV and TRSM routines
|
||||
else if ((std::isnan(val1) && std::isinf(val2)) || (std::isinf(val1) && std::isnan(val2))) {
|
||||
return true;
|
||||
}
|
||||
// Also considers it OK if one of the values is super large and the other is inf or NaN
|
||||
// Note: for TRSV and TRSM routines
|
||||
else if ((std::abs(val1) > getAlmostInfNumber<T>() && (std::isinf(val2) || std::isnan(val2))) ||
|
||||
(std::abs(val2) > getAlmostInfNumber<T>() && (std::isinf(val1) || std::isnan(val1)))) {
|
||||
return true;
|
||||
}
|
||||
// The values are zero or very small: the relative error is less meaningful
|
||||
else if (val1 == 0 || val2 == 0 || difference < error_margin_absolute) {
|
||||
return (difference < error_margin_absolute);
|
||||
|
@ -452,15 +473,21 @@ template bool TestSimilarity<double>(const double, const double);
|
|||
// Specialisations for non-standard data-types
|
||||
template <>
|
||||
bool TestSimilarity(const float2 val1, const float2 val2) {
|
||||
auto real = TestSimilarity(val1.real(), val2.real());
|
||||
auto imag = TestSimilarity(val1.imag(), val2.imag());
|
||||
return (real && imag);
|
||||
const auto real = TestSimilarity(val1.real(), val2.real());
|
||||
const auto imag = TestSimilarity(val1.imag(), val2.imag());
|
||||
if (real && imag) { return true; }
|
||||
// also OK if one is good and the combined is good (indicates a big diff between real & imag)
|
||||
if (real || imag) { return TestSimilarity(val1.real() + val1.imag(), val2.real() + val2.imag()); }
|
||||
return false; // neither real nor imag is good, return false
|
||||
}
|
||||
template <>
|
||||
bool TestSimilarity(const double2 val1, const double2 val2) {
|
||||
auto real = TestSimilarity(val1.real(), val2.real());
|
||||
auto imag = TestSimilarity(val1.imag(), val2.imag());
|
||||
return (real && imag);
|
||||
const auto real = TestSimilarity(val1.real(), val2.real());
|
||||
const auto imag = TestSimilarity(val1.imag(), val2.imag());
|
||||
if (real && imag) { return true; }
|
||||
// also OK if one is good and the combined is good (indicates a big diff between real & imag)
|
||||
if (real || imag) { return TestSimilarity(val1.real() + val1.imag(), val2.real() + val2.imag()); }
|
||||
return false; // neither real nor imag is good, return false
|
||||
}
|
||||
template <>
|
||||
bool TestSimilarity(const half val1, const half val2) {
|
||||
|
|
|
@ -20,8 +20,7 @@ using double2 = clblast::double2;
|
|||
int main(int argc, char *argv[]) {
|
||||
const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
|
||||
switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
|
||||
case clblast::Precision::kHalf:
|
||||
clblast::RunClient<clblast::TestXtrsm<half>, half, half>(argc, argv); break;
|
||||
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
||||
case clblast::Precision::kSingle:
|
||||
clblast::RunClient<clblast::TestXtrsm<float>, float, float>(argc, argv); break;
|
||||
case clblast::Precision::kDouble:
|
||||
|
|
37
test/performance/routines/levelx/xinvert.cpp
Normal file
37
test/performance/routines/levelx/xinvert.cpp
Normal file
|
@ -0,0 +1,37 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include "test/performance/client.hpp"
|
||||
#include "test/routines/levelx/xinvert.hpp"
|
||||
|
||||
// Shortcuts to the clblast namespace
|
||||
using float2 = clblast::float2;
|
||||
using double2 = clblast::double2;
|
||||
|
||||
// Main function (not within the clblast namespace)
|
||||
int main(int argc, char *argv[]) {
|
||||
const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
|
||||
switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
|
||||
case clblast::Precision::kHalf:
|
||||
clblast::RunClient<clblast::TestXinvert<half>, half, half>(argc, argv); break;
|
||||
case clblast::Precision::kSingle:
|
||||
clblast::RunClient<clblast::TestXinvert<float>, float, float>(argc, argv); break;
|
||||
case clblast::Precision::kDouble:
|
||||
clblast::RunClient<clblast::TestXinvert<double>, double, double>(argc, argv); break;
|
||||
case clblast::Precision::kComplexSingle:
|
||||
clblast::RunClient<clblast::TestXinvert<float2>, float2, float2>(argc, argv); break;
|
||||
case clblast::Precision::kComplexDouble:
|
||||
clblast::RunClient<clblast::TestXinvert<double2>, double2, double2>(argc, argv); break;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// =================================================================================================
|
184
test/routines/level2/xtrsv.hpp
Normal file
184
test/routines/level2/xtrsv.hpp
Normal file
|
@ -0,0 +1,184 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file implements a class with static methods to describe the Xtrsv routine. Examples of
|
||||
// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
|
||||
// static methods are used by the correctness tester and the performance tester.
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#ifndef CLBLAST_TEST_ROUTINES_XTRSV_H_
|
||||
#define CLBLAST_TEST_ROUTINES_XTRSV_H_
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
#include "test/wrapper_clblas.hpp"
|
||||
#endif
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
#include "test/wrapper_cblas.hpp"
|
||||
#endif
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Prepares the data
|
||||
template <typename T>
|
||||
void PrepareData(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
if (args.a_ld < args.n) { return; }
|
||||
if (args.a_size <= 0 || args.x_size <= 0) { return; }
|
||||
|
||||
// Copies input buffers to the host
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
|
||||
// Generates 'proper' input for the TRSV routine
|
||||
// TODO: Improve this, currently loosely based on clBLAS's implementation
|
||||
for (auto i = size_t{0}; i < args.n; ++i) {
|
||||
auto diagonal = a_mat_cpu[i*args.a_ld + i + args.a_offset];
|
||||
diagonal = AbsoluteValue(diagonal) + static_cast<T>(args.n / size_t{4});
|
||||
for (auto j = size_t{0}; j < args.n; ++j) {
|
||||
a_mat_cpu[j*args.a_ld + i + args.a_offset] /= ConstantTwo<T>();
|
||||
}
|
||||
a_mat_cpu[i*args.a_ld + i + args.a_offset] = diagonal;
|
||||
x_vec_cpu[i * args.x_inc + args.x_offset] /= ConstantTwo<T>();
|
||||
}
|
||||
|
||||
// Copies input buffers back to the OpenCL device
|
||||
buffers.a_mat.Write(queue, args.a_size, a_mat_cpu);
|
||||
buffers.x_vec.Write(queue, args.x_size, x_vec_cpu);
|
||||
return;
|
||||
}
|
||||
|
||||
// See comment at top of file for a description of the class
|
||||
template <typename T>
|
||||
class TestXtrsv {
|
||||
public:
|
||||
|
||||
// The BLAS level: 1, 2, or 3
|
||||
static size_t BLASLevel() { return 2; }
|
||||
|
||||
// The list of arguments relevant for this routine
|
||||
static std::vector<std::string> GetOptions() {
|
||||
return {kArgN,
|
||||
kArgLayout, kArgTriangle, kArgATransp, kArgDiagonal,
|
||||
kArgALeadDim, kArgXInc,
|
||||
kArgAOffset, kArgXOffset};
|
||||
}
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeX(const Arguments<T> &args) {
|
||||
return args.n * args.x_inc + args.x_offset;
|
||||
}
|
||||
static size_t GetSizeA(const Arguments<T> &args) {
|
||||
return args.n * args.a_ld + args.a_offset;
|
||||
}
|
||||
|
||||
// Describes how to set the sizes of all the buffers
|
||||
static void SetSizes(Arguments<T> &args) {
|
||||
args.a_size = GetSizeA(args);
|
||||
args.x_size = GetSizeX(args);
|
||||
}
|
||||
|
||||
// Describes what the default values of the leading dimensions of the matrices are
|
||||
static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
|
||||
static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
|
||||
static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
|
||||
|
||||
// Describes which transpose options are relevant for this routine
|
||||
using Transposes = std::vector<Transpose>;
|
||||
static Transposes GetATransposes(const Transposes &all) { return all; }
|
||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
PrepareData(args, buffers, queue);
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Trsv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
|
||||
args.n,
|
||||
buffers.a_mat(), args.a_offset, args.a_ld,
|
||||
buffers.x_vec(), args.x_offset, args.x_inc,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
}
|
||||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
PrepareData(args, buffers, queue);
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXtrsv<T>(convertToCLBLAS(args.layout),
|
||||
convertToCLBLAS(args.triangle),
|
||||
convertToCLBLAS(args.a_transpose),
|
||||
convertToCLBLAS(args.diagonal),
|
||||
args.n,
|
||||
buffers.a_mat, args.a_offset, args.a_ld,
|
||||
buffers.x_vec, args.x_offset, args.x_inc,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
#endif
|
||||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
PrepareData(args, buffers, queue);
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
|
||||
cblasXtrsv(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.triangle),
|
||||
convertToCBLAS(args.a_transpose),
|
||||
convertToCBLAS(args.diagonal),
|
||||
args.n,
|
||||
a_mat_cpu, args.a_offset, args.a_ld,
|
||||
x_vec_cpu, args.x_offset, args.x_inc);
|
||||
buffers.x_vec.Write(queue, args.x_size, x_vec_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> result(args.x_size, static_cast<T>(0));
|
||||
buffers.x_vec.Read(queue, args.x_size, result);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Describes how to compute the indices of the result buffer
|
||||
static size_t ResultID1(const Arguments<T> &args) {
|
||||
return args.n;
|
||||
}
|
||||
static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
|
||||
static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
|
||||
return id1*args.x_inc + args.x_offset;
|
||||
}
|
||||
|
||||
// Describes how to compute performance metrics
|
||||
static size_t GetFlops(const Arguments<T> &args) {
|
||||
return 2 * args.n * args.n;
|
||||
}
|
||||
static size_t GetBytes(const Arguments<T> &args) {
|
||||
return (args.n*args.n + 2*args.n + args.n) * sizeof(T);
|
||||
}
|
||||
};
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
||||
|
||||
// CLBLAST_TEST_ROUTINES_XTRSV_H_
|
||||
#endif
|
194
test/routines/level3/xtrsm.hpp
Normal file
194
test/routines/level3/xtrsm.hpp
Normal file
|
@ -0,0 +1,194 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file implements a class with static methods to describe the Xtrsm routine. Examples of
|
||||
// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
|
||||
// static methods are used by the correctness tester and the performance tester.
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#ifndef CLBLAST_TEST_ROUTINES_XTRSM_H_
|
||||
#define CLBLAST_TEST_ROUTINES_XTRSM_H_
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
#include "test/wrapper_clblas.hpp"
|
||||
#endif
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
#include "test/wrapper_cblas.hpp"
|
||||
#endif
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Prepares the data
|
||||
template <typename T>
|
||||
void PrepareData(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
const auto k = (args.side == Side::kLeft) ? args.m : args.n;
|
||||
if (args.a_ld < k) { return; }
|
||||
if (args.a_size <= 0 || args.b_size <= 0) { return; }
|
||||
|
||||
// Copies input buffers to the host
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
|
||||
|
||||
// Generates 'proper' input for the TRSM routine
|
||||
// TODO: Improve this
|
||||
for (auto i = size_t{0}; i < k; ++i) {
|
||||
for (auto j = size_t{0}; j < k; ++j) {
|
||||
auto value = a_mat_cpu[j*args.a_ld + i + args.a_offset];
|
||||
value *= ConstantTwo<T>();
|
||||
if (IsCloseToZero(value)) { value += ConstantOne<T>(); }
|
||||
a_mat_cpu[j*args.a_ld + i + args.a_offset] = value;
|
||||
}
|
||||
}
|
||||
|
||||
// Copies input buffers back to the OpenCL device
|
||||
buffers.a_mat.Write(queue, args.a_size, a_mat_cpu);
|
||||
buffers.b_mat.Write(queue, args.b_size, b_mat_cpu);
|
||||
return;
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// See comment at top of file for a description of the class
|
||||
template <typename T>
|
||||
class TestXtrsm {
|
||||
public:
|
||||
|
||||
// The BLAS level: 1, 2, or 3
|
||||
static size_t BLASLevel() { return 3; }
|
||||
|
||||
// The list of arguments relevant for this routine
|
||||
static std::vector<std::string> GetOptions() {
|
||||
return {kArgM, kArgN,
|
||||
kArgLayout, kArgSide, kArgTriangle, kArgATransp, kArgDiagonal,
|
||||
kArgALeadDim, kArgBLeadDim,
|
||||
kArgAOffset, kArgBOffset,
|
||||
kArgAlpha};
|
||||
}
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeA(const Arguments<T> &args) {
|
||||
const auto k = (args.side == Side::kLeft) ? args.m : args.n;
|
||||
return k * args.a_ld + args.a_offset;
|
||||
}
|
||||
static size_t GetSizeB(const Arguments<T> &args) {
|
||||
const auto b_rotated = (args.layout == Layout::kRowMajor);
|
||||
const auto b_two = (b_rotated) ? args.m : args.n;
|
||||
return b_two * args.b_ld + args.b_offset;
|
||||
}
|
||||
|
||||
// Describes how to set the sizes of all the buffers
|
||||
static void SetSizes(Arguments<T> &args) {
|
||||
args.a_size = GetSizeA(args);
|
||||
args.b_size = GetSizeB(args);
|
||||
}
|
||||
|
||||
// Describes what the default values of the leading dimensions of the matrices are
|
||||
static size_t DefaultLDA(const Arguments<T> &args) { return args.m; }
|
||||
static size_t DefaultLDB(const Arguments<T> &args) { return args.n; }
|
||||
static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
|
||||
|
||||
// Describes which transpose options are relevant for this routine
|
||||
using Transposes = std::vector<Transpose>;
|
||||
static Transposes GetATransposes(const Transposes &all) { return all; }
|
||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
PrepareData(args, buffers, queue);
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Trsm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal,
|
||||
args.m, args.n, args.alpha,
|
||||
buffers.a_mat(), args.a_offset, args.a_ld,
|
||||
buffers.b_mat(), args.b_offset, args.b_ld,
|
||||
&queue_plain, &event);
|
||||
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
|
||||
return status;
|
||||
}
|
||||
|
||||
// Describes how to run the clBLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CLBLAS
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
PrepareData(args, buffers, queue);
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = clblasXtrsm(convertToCLBLAS(args.layout),
|
||||
convertToCLBLAS(args.side),
|
||||
convertToCLBLAS(args.triangle),
|
||||
convertToCLBLAS(args.a_transpose),
|
||||
convertToCLBLAS(args.diagonal),
|
||||
args.m, args.n, args.alpha,
|
||||
buffers.a_mat, args.a_offset, args.a_ld,
|
||||
buffers.b_mat, args.b_offset, args.b_ld,
|
||||
1, &queue_plain, 0, nullptr, &event);
|
||||
clWaitForEvents(1, &event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
#endif
|
||||
|
||||
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
|
||||
#ifdef CLBLAST_REF_CBLAS
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
PrepareData(args, buffers, queue);
|
||||
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
|
||||
std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
|
||||
cblasXtrsm(convertToCBLAS(args.layout),
|
||||
convertToCBLAS(args.side),
|
||||
convertToCBLAS(args.triangle),
|
||||
convertToCBLAS(args.a_transpose),
|
||||
convertToCBLAS(args.diagonal),
|
||||
args.m, args.n, args.alpha,
|
||||
a_mat_cpu, args.a_offset, args.a_ld,
|
||||
b_mat_cpu, args.b_offset, args.b_ld);
|
||||
buffers.b_mat.Write(queue, args.b_size, b_mat_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> result(args.b_size, static_cast<T>(0));
|
||||
buffers.b_mat.Read(queue, args.b_size, result);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Describes how to compute the indices of the result buffer
|
||||
static size_t ResultID1(const Arguments<T> &args) { return args.m; }
|
||||
static size_t ResultID2(const Arguments<T> &args) { return args.n; }
|
||||
static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
|
||||
return (args.layout == Layout::kRowMajor) ?
|
||||
id1*args.b_ld + id2 + args.b_offset:
|
||||
id2*args.b_ld + id1 + args.b_offset;
|
||||
}
|
||||
|
||||
// Describes how to compute performance metrics
|
||||
static size_t GetFlops(const Arguments<T> &args) {
|
||||
auto k = (args.side == Side::kLeft) ? args.m : args.n;
|
||||
return args.m * args.n * k;
|
||||
}
|
||||
static size_t GetBytes(const Arguments<T> &args) {
|
||||
auto k = (args.side == Side::kLeft) ? args.m : args.n;
|
||||
return (k*k + 2*args.m*args.n) * sizeof(T);
|
||||
}
|
||||
};
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
||||
|
||||
// CLBLAST_TEST_ROUTINES_XTRSM_H_
|
||||
#endif
|
224
test/routines/levelx/xinvert.hpp
Normal file
224
test/routines/levelx/xinvert.hpp
Normal file
|
@ -0,0 +1,224 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file implements a class with static methods to describe the Xinvert routine. Examples of
|
||||
// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
|
||||
// static methods are used by the correctness tester and the performance tester.
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#ifndef CLBLAST_TEST_ROUTINES_XINVERT_H_
|
||||
#define CLBLAST_TEST_ROUTINES_XINVERT_H_
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#include "routines/levelx/xinvert.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
template <typename T>
|
||||
StatusCode RunReference(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
const bool is_upper = ((args.triangle == Triangle::kUpper && args.layout != Layout::kRowMajor) ||
|
||||
(args.triangle == Triangle::kLower && args.layout == Layout::kRowMajor));
|
||||
|
||||
// Data transfer from OpenCL to std::vector
|
||||
std::vector<T> a_mat_cpu(args.a_size, T{0.0});
|
||||
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
|
||||
|
||||
// Creates the output buffer
|
||||
std::vector<T> b_mat_cpu(args.b_size, T{0.0});
|
||||
|
||||
// Helper variables
|
||||
const auto block_size = args.m;
|
||||
const auto num_blocks = CeilDiv(args.n, block_size);
|
||||
const auto a_ld = args.a_ld;
|
||||
const auto b_ld = block_size;
|
||||
|
||||
// Checks for valid arguments
|
||||
if ((block_size == 0) || (args.n == 0)) {
|
||||
return StatusCode::kInvalidDimension;
|
||||
}
|
||||
if ((block_size % 16 != 0) || (block_size > 128)) {
|
||||
return StatusCode::kUnknownError;
|
||||
}
|
||||
|
||||
// Loops over the amount of diagonal blocks of size args.m by args.m each
|
||||
for (auto block_id = size_t{0}; block_id < num_blocks; ++block_id) {
|
||||
const auto a_offset = block_id * (block_size + a_ld * block_size) + args.a_offset;
|
||||
const auto b_offset = block_id * block_size * block_size;
|
||||
|
||||
// Inverts the diagonal elements of the matrix
|
||||
for (auto i = size_t{0}; i < block_size; ++i) {
|
||||
auto a_value = T{1.0};
|
||||
if (args.diagonal == Diagonal::kNonUnit) {
|
||||
if (i + block_id * block_size < args.n) {
|
||||
if (a_mat_cpu[i * a_ld + i + a_offset] == T{0.0}) { return StatusCode::kUnknownError; }
|
||||
a_value = T{1.0} / a_mat_cpu[i * a_ld + i + a_offset];
|
||||
}
|
||||
}
|
||||
b_mat_cpu[i * b_ld + i + b_offset] = a_value;
|
||||
}
|
||||
|
||||
// Inverts the upper triangle row by row
|
||||
if (is_upper) {
|
||||
for (int i = static_cast<int>(block_size) - 2; i >= 0; --i) {
|
||||
for (auto j = static_cast<int>(block_size) - 1; j > i; --j) {
|
||||
auto sum = T{0.0};
|
||||
for (auto k = i + 1; k <= j; ++k) {
|
||||
auto a_value = T{0.0};
|
||||
if ((i + block_id * block_size < args.n) && (k + block_id * block_size < args.n)) {
|
||||
a_value = a_mat_cpu[k * a_ld + i + a_offset];
|
||||
}
|
||||
sum += a_value * b_mat_cpu[j * b_ld + k + b_offset];
|
||||
}
|
||||
b_mat_cpu[j * b_ld + i + b_offset] = - sum * b_mat_cpu[i * b_ld + i + b_offset];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Inverts the lower triangle row by row
|
||||
else {
|
||||
for (auto i = size_t{1}; i < block_size; ++i) {
|
||||
for (auto j = size_t{0}; j < i; ++j) {
|
||||
auto sum = T{0.0};
|
||||
for (auto k = j; k < i; ++k) {
|
||||
auto a_value = T{0.0};
|
||||
if ((i + block_id * block_size < args.n) && (k + block_id * block_size < args.n)) {
|
||||
a_value = a_mat_cpu[k * a_ld + i + a_offset];
|
||||
}
|
||||
sum += a_value * b_mat_cpu[j * b_ld + k + b_offset];
|
||||
}
|
||||
b_mat_cpu[j * b_ld + i + b_offset] = - sum * b_mat_cpu[i * b_ld + i + b_offset];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Data transfer back to OpenCL
|
||||
buffers.b_mat.Write(queue, args.b_size, b_mat_cpu);
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
|
||||
// Half-precision version calling the above reference implementation after conversions
|
||||
template <>
|
||||
StatusCode RunReference<half>(const Arguments<half> &args, Buffers<half> &buffers, Queue &queue) {
|
||||
auto a_buffer2 = HalfToFloatBuffer(buffers.a_mat, queue());
|
||||
auto b_buffer2 = HalfToFloatBuffer(buffers.b_mat, queue());
|
||||
auto dummy = clblast::Buffer<float>(0);
|
||||
auto buffers2 = Buffers<float>{dummy, dummy, a_buffer2, b_buffer2, dummy, dummy, dummy};
|
||||
auto args2 = Arguments<float>();
|
||||
args2.a_size = args.a_size; args2.b_size = args.b_size;
|
||||
args2.a_ld = args.a_ld; args2.m = args.m; args2.n = args.n;
|
||||
args2.a_offset = args.a_offset;
|
||||
args2.layout = args.layout; args2.triangle = args.triangle; args2.diagonal = args.diagonal;
|
||||
auto status = RunReference(args2, buffers2, queue);
|
||||
FloatToHalfBuffer(buffers.b_mat, b_buffer2, queue());
|
||||
return status;
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// See comment at top of file for a description of the class
|
||||
template <typename T>
|
||||
class TestXinvert {
|
||||
public:
|
||||
|
||||
// The BLAS level: 4 for the extra routines
|
||||
static size_t BLASLevel() { return 4; }
|
||||
|
||||
// The list of arguments relevant for this routine
|
||||
static std::vector<std::string> GetOptions() {
|
||||
return {kArgN, kArgM,
|
||||
kArgLayout, kArgTriangle, kArgDiagonal,
|
||||
kArgALeadDim, kArgAOffset};
|
||||
}
|
||||
|
||||
// Describes how to obtain the sizes of the buffers
|
||||
static size_t GetSizeA(const Arguments<T> &args) {
|
||||
return args.n * args.a_ld + args.a_offset;
|
||||
}
|
||||
static size_t GetSizeB(const Arguments<T> &args) {
|
||||
const auto block_size = args.m;
|
||||
const auto num_blocks = CeilDiv(args.n, block_size);
|
||||
return num_blocks * block_size * block_size;
|
||||
}
|
||||
|
||||
// Describes how to set the sizes of all the buffers
|
||||
static void SetSizes(Arguments<T> &args) {
|
||||
args.a_size = GetSizeA(args);
|
||||
args.b_size = GetSizeB(args);
|
||||
}
|
||||
|
||||
// Describes what the default values of the leading dimensions of the matrices are
|
||||
static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
|
||||
static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
|
||||
static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
|
||||
|
||||
// Describes which omatcopyose options are relevant for this routine
|
||||
using Transposes = std::vector<Transpose>;
|
||||
static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
|
||||
|
||||
// Describes how to run the CLBlast routine
|
||||
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
try {
|
||||
auto event = cl_event{};
|
||||
auto inverter = Xinvert<T>(queue, &event);
|
||||
inverter.InvertMatrixDiagonalBlocks(args.layout, args.triangle, args.diagonal,
|
||||
args.n, args.m,
|
||||
buffers.a_mat, args.a_offset, args.a_ld,
|
||||
buffers.b_mat);
|
||||
clWaitForEvents(1, &event);
|
||||
clReleaseEvent(event);
|
||||
} catch (...) { return DispatchException(); }
|
||||
return StatusCode::kSuccess;
|
||||
}
|
||||
|
||||
// Describes how to run a naive version of the routine (for correctness/performance comparison).
|
||||
// Note that a proper clBLAS or CPU BLAS comparison is not available for non-BLAS routines.
|
||||
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
return RunReference(args, buffers, queue);
|
||||
}
|
||||
|
||||
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
return RunReference(args, buffers, queue);
|
||||
}
|
||||
|
||||
// Describes how to download the results of the computation (more importantly: which buffer)
|
||||
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
|
||||
std::vector<T> result(args.b_size, static_cast<T>(0));
|
||||
buffers.b_mat.Read(queue, args.b_size, result);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Describes how to compute the indices of the result buffer
|
||||
static size_t ResultID1(const Arguments<T> &args) { return args.m; }
|
||||
static size_t ResultID2(const Arguments<T> &args) { return Ceil(args.n, args.m); }
|
||||
static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
|
||||
return id1 * Ceil(args.n, args.m) + id2;
|
||||
}
|
||||
|
||||
// Describes how to compute performance metrics
|
||||
static size_t GetFlops(const Arguments<T> &args) {
|
||||
const auto block_size = args.m;
|
||||
const auto num_blocks = CeilDiv(args.n, block_size);
|
||||
return num_blocks * (block_size * (block_size / 2) * (block_size / 2));
|
||||
}
|
||||
static size_t GetBytes(const Arguments<T> &args) {
|
||||
return (args.a_size * args.b_size) * sizeof(T);
|
||||
}
|
||||
};
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
||||
|
||||
// CLBLAST_TEST_ROUTINES_XINVERT_H_
|
||||
#endif
|
|
@ -2103,20 +2103,6 @@ void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL
|
|||
reinterpret_cast<const double*>(&a_buffer[a_offset]), a_ld,
|
||||
reinterpret_cast<double*>(&b_buffer[b_offset]), b_ld);
|
||||
}
|
||||
void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
|
||||
const size_t m, const size_t n,
|
||||
const half alpha,
|
||||
const std::vector<half>& a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
std::vector<half>& b_buffer, const size_t b_offset, const size_t b_ld) {
|
||||
auto a_buffer_bis = HalfToFloatBuffer(a_buffer);
|
||||
auto b_buffer_bis = HalfToFloatBuffer(b_buffer);
|
||||
cblasXtrsm(layout, side, triangle, a_transpose, diagonal,
|
||||
m, n,
|
||||
HalfToFloat(alpha),
|
||||
a_buffer_bis, a_offset, a_ld,
|
||||
b_buffer_bis, b_offset, b_ld);
|
||||
FloatToHalfBuffer(b_buffer, b_buffer_bis);
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
||||
|
|
|
@ -2865,24 +2865,6 @@ clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const
|
|||
b_buffer(), b_offset, b_ld,
|
||||
num_queues, queues, num_wait_events, wait_events, events);
|
||||
}
|
||||
clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
|
||||
const size_t m, const size_t n,
|
||||
const half alpha,
|
||||
const Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
Buffer<half>& b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
cl_uint num_queues, cl_command_queue *queues,
|
||||
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
|
||||
auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]);
|
||||
auto b_buffer_bis = HalfToFloatBuffer(b_buffer, queues[0]);
|
||||
auto status = clblasXtrsm(layout, side, triangle, a_transpose, diagonal,
|
||||
m, n,
|
||||
HalfToFloat(alpha),
|
||||
a_buffer_bis, a_offset, a_ld,
|
||||
b_buffer_bis, b_offset, b_ld,
|
||||
num_queues, queues, num_wait_events, wait_events, events);
|
||||
FloatToHalfBuffer(b_buffer, b_buffer_bis, queues[0]);
|
||||
return status;
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
||||
|
|
Loading…
Reference in a new issue