Merge pull request #138 from CNugteren/triangular_solvers

Added the triangular solvers (TRSV/TRSM)
This commit is contained in:
Cedric Nugteren 2017-02-26 16:26:41 +01:00 committed by GitHub
commit 7de7e7d8ed
41 changed files with 2487 additions and 157 deletions

View file

@ -8,6 +8,10 @@ Development version (next release)
- Added the OverrideParameters function to the API to be able to supply custom tuning parmeters
- Various minor fixes and enhancements
- Added tuned parameters for various devices (see README)
- Added level-2 routines:
* STRSV/DTRSV/CTRSV/ZTRSV (experimental, un-optimized)
- Added level-3 routines:
* STRSM/DTRSM/CTRSM/ZTRSM (experimental, un-optimized)
Version 0.10.0
- Updated to version 8.0 of the CLCudaAPI C++11 OpenCL header

View file

@ -156,9 +156,9 @@ if(NETLIB)
set(SAMPLE_PROGRAMS_C ${SAMPLE_PROGRAMS_C} sgemm_netlib)
endif()
set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv
set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xtrsv
xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm)
set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm xtrsm)
set(LEVELX_ROUTINES xomatcopy)
set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES} ${LEVELX_ROUTINES})
set(PRECISIONS 32 64 3232 6464 16)
@ -175,6 +175,7 @@ set(SOURCES
src/clblast.cpp
src/clblast_c.cpp
src/routine.cpp
src/routines/levelx/xinvert.cpp # only source, don't include it as a test
)
if(NETLIB)
set(SOURCES ${SOURCES} src/clblast_netlib_c.cpp)

View file

@ -258,6 +258,7 @@ CLBlast supports almost all the Netlib BLAS routines plus a couple of extra non-
| xSPR | ✔ | ✔ | - | - | ✔ |
| xSYR2 | ✔ | ✔ | - | - | ✔ |
| xSPR2 | ✔ | ✔ | - | - | ✔ |
| xTRSV | ✔ | ✔ | ✔ | ✔ | | (experimental, un-optimized)
| Level-3 | S | D | C | Z | H |
| ---------|---|---|---|---|---|
@ -269,6 +270,7 @@ CLBlast supports almost all the Netlib BLAS routines plus a couple of extra non-
| xSYR2K | ✔ | ✔ | ✔ | ✔ | ✔ |
| xHER2K | - | - | ✔ | ✔ | - |
| xTRMM | ✔ | ✔ | ✔ | ✔ | ✔ |
| xTRSM | ✔ | ✔ | ✔ | ✔ | | (experimental, un-optimized)
In addition, some extra non-BLAS routines are also supported by CLBlast, classified as level-X. They are experimental and should be used with care:
@ -279,7 +281,7 @@ In addition, some extra non-BLAS routines are also supported by CLBlast, classif
| IxMIN | ✔ | ✔ | ✔ | ✔ | ✔ |
| xOMATCOPY | ✔ | ✔ | ✔ | ✔ | ✔ |
Some less commonly used BLAS routines are not yet supported yet by CLBlast. They are xROTG, xROTMG, xROT, xROTM, xTRSV, xTBSV, xTPSV, and xTRSM.
Some less commonly used BLAS routines are not yet supported yet by CLBlast. They are xROTG, xROTMG, xROT, xROTM, xTBSV, and xTPSV.
Half precision (fp16)

View file

@ -1445,6 +1445,63 @@ Arguments to TPMV:
xTRSV: Solves a triangular system of equations
-------------
C++ API:
```
template <typename T>
StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event)
```
C API:
```
CLBlastStatusCode CLBlastStrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const size_t n,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const size_t n,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastCtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const size_t n,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const size_t n,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event)
```
Arguments to TRSV:
* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
* `const Diagonal diagonal`: The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal.
* `const size_t n`: Integer size argument. This value must be positive.
* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
* `cl_mem x_buffer`: OpenCL buffer to store the output x vector.
* `const size_t x_offset`: The offset in elements from the start of the output x vector.
* `const size_t x_inc`: Stride/increment of the output x vector. This value must be greater than 0.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
xGER: General rank-1 matrix update
-------------
@ -2708,6 +2765,71 @@ Requirements for TRMM:
xTRSM: Solves a triangular system of equations
-------------
Solves the equation _A * X = alpha * B_ for the unknown _m_ by _n_ matrix X, in which _A_ is an _n_ by _n_ unit or non-unit triangular matrix and B is an _m_ by _n_ matrix. The matrix _B_ is overwritten by the solution _X_.
C++ API:
```
template <typename T>
StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t m, const size_t n,
const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event)
```
C API:
```
CLBlastStatusCode CLBlastStrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const size_t m, const size_t n,
const float alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const size_t m, const size_t n,
const double alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastCtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const size_t m, const size_t n,
const cl_float2 alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const size_t m, const size_t n,
const cl_double2 alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event)
```
Arguments to TRSM:
* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const Side side`: The position of the triangular matrix in the operation, either on the `Side::kLeft` (141) or `Side::kRight` (142).
* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
* `const Diagonal diagonal`: The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal.
* `const size_t m`: Integer size argument. This value must be positive.
* `const size_t n`: Integer size argument. This value must be positive.
* `const T alpha`: Input scalar constant.
* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
* `cl_mem b_buffer`: OpenCL buffer to store the output B matrix.
* `const size_t b_offset`: The offset in elements from the start of the output B matrix.
* `const size_t b_ld`: Leading dimension of the output B matrix. This value must be greater than 0.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
xOMATCOPY: Scaling and out-place transpose/copy (non-BLAS function)
-------------

View file

@ -587,7 +587,7 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, c
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event = nullptr);
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
template <typename T>
StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t m, const size_t n,

View file

@ -1265,7 +1265,7 @@ CLBlastStatusCode PUBLIC_API CLBlastHtrmm(const CLBlastLayout layout, const CLBl
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event);
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
CLBlastStatusCode PUBLIC_API CLBlastStrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const size_t m, const size_t n,
const float alpha,
@ -1290,12 +1290,6 @@ CLBlastStatusCode PUBLIC_API CLBlastZtrsm(const CLBlastLayout layout, const CLBl
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const size_t m, const size_t n,
const cl_half alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event);
// =================================================================================================
// Extra non-BLAS routines (level-X)

View file

@ -862,7 +862,7 @@ void PUBLIC_API cblas_ztrmm(const CLBlastLayout layout, const CLBlastSide side,
const void* a, const int a_ld,
void* b, const int b_ld);
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
void PUBLIC_API cblas_strsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int m, const int n,
const float alpha,

View file

@ -41,8 +41,8 @@ FILES = [
"/include/clblast_netlib_c.h",
"/src/clblast_netlib_c.cpp",
]
HEADER_LINES = [121, 73, 125, 23, 29, 41, 65, 32]
FOOTER_LINES = [25, 139, 27, 38, 6, 6, 9, 2]
HEADER_LINES = [121, 75, 125, 23, 29, 41, 65, 32]
FOOTER_LINES = [25, 138, 27, 38, 6, 6, 9, 2]
HEADER_LINES_DOC = 0
FOOTER_LINES_DOC = 63
@ -131,7 +131,7 @@ ROUTINES = [
Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]),
Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]),
Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [apn,xn], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []),
Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a triangular system of equations", "", []),
Routine(True, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a triangular system of equations", "", []),
Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]),
Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [apn,xn], [], "", "Solves a packed triangular system of equations", "", []),
# Level 2: matrix update
@ -156,7 +156,7 @@ ROUTINES = [
Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], [ankab,bnkab,cn],["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], [ankab,bnkab,cn],["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]),
Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Solves a triangular system of equations", "", []),
Routine(True, True, "3", "trsm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Solves a triangular system of equations", "Solves the equation _A * X = alpha * B_ for the unknown _m_ by _n_ matrix X, in which _A_ is an _n_ by _n_ unit or non-unit triangular matrix and B is an _m_ by _n_ matrix. The matrix _B_ is overwritten by the solution _X_.", []),
],
[ # Level X: extra routines (not part of BLAS)
Routine(True, True, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], [amn,bnma], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]),

View file

@ -45,6 +45,7 @@
#include "routines/level2/xtrmv.hpp"
#include "routines/level2/xtbmv.hpp"
#include "routines/level2/xtpmv.hpp"
#include "routines/level2/xtrsv.hpp"
#include "routines/level2/xger.hpp"
#include "routines/level2/xgeru.hpp"
#include "routines/level2/xgerc.hpp"
@ -66,6 +67,7 @@
#include "routines/level3/xsyr2k.hpp"
#include "routines/level3/xher2k.hpp"
#include "routines/level3/xtrmm.hpp"
#include "routines/level3/xtrsm.hpp"
// Level-x includes (non-BLAS)
#include "routines/levelx/xomatcopy.hpp"
@ -1145,12 +1147,20 @@ template StatusCode PUBLIC_API Tpmv<half>(const Layout, const Triangle, const Tr
// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV
template <typename T>
StatusCode Trsv(const Layout, const Triangle, const Transpose, const Diagonal,
const size_t,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*) {
return StatusCode::kNotImplemented;
StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
try {
auto queue_cpp = Queue(*queue);
auto routine = Xtrsv<T>(queue_cpp, event);
routine.DoTrsv(layout, triangle, a_transpose, diagonal,
n,
Buffer<T>(a_buffer), a_offset, a_ld,
Buffer<T>(x_buffer), x_offset, x_inc);
return StatusCode::kSuccess;
} catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Trsv<float>(const Layout, const Triangle, const Transpose, const Diagonal,
const size_t,
@ -2065,15 +2075,24 @@ template StatusCode PUBLIC_API Trmm<half>(const Layout, const Side, const Triang
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
template <typename T>
StatusCode Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal,
const size_t, const size_t,
const T,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*) {
return StatusCode::kNotImplemented;
StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t m, const size_t n,
const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event) {
try {
auto queue_cpp = Queue(*queue);
auto routine = Xtrsm<T>(queue_cpp, event);
routine.DoTrsm(layout, side, triangle, a_transpose, diagonal,
m, n,
alpha,
Buffer<T>(a_buffer), a_offset, a_ld,
Buffer<T>(b_buffer), b_offset, b_ld);
return StatusCode::kSuccess;
} catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Trsm<float>(const Layout, const Side, const Triangle, const Transpose, const Diagonal,
const size_t, const size_t,
@ -2099,12 +2118,6 @@ template StatusCode PUBLIC_API Trsm<double2>(const Layout, const Side, const Tri
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Trsm<half>(const Layout, const Side, const Triangle, const Transpose, const Diagonal,
const size_t, const size_t,
const half,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// =================================================================================================
// Extra non-BLAS routines (level-X)
@ -2159,7 +2172,6 @@ template StatusCode PUBLIC_API Omatcopy<half>(const Layout, const Transpose,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// =================================================================================================
// Clears the cache of stored binaries

View file

@ -3350,27 +3350,6 @@ CLBlastStatusCode CLBlastZtrsm(const CLBlastLayout layout, const CLBlastSide sid
);
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const size_t m, const size_t n,
const cl_half alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event) {
try {
return static_cast<CLBlastStatusCode>(
clblast::Trsm(static_cast<clblast::Layout>(layout),
static_cast<clblast::Side>(side),
static_cast<clblast::Triangle>(triangle),
static_cast<clblast::Transpose>(a_transpose),
static_cast<clblast::Diagonal>(diagonal),
m, n,
alpha,
a_buffer, a_offset, a_ld,
b_buffer, b_offset, b_ld,
queue, event)
);
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// =================================================================================================
// Extra non-BLAS routines (level-X)

View file

@ -20,12 +20,14 @@
#include "database/kernels/xgemv_fast.hpp"
#include "database/kernels/xgemv_fast_rot.hpp"
#include "database/kernels/xger.hpp"
#include "database/kernels/xtrsv.hpp"
#include "database/kernels/xgemm.hpp"
#include "database/kernels/xgemm_direct.hpp"
#include "database/kernels/copy.hpp"
#include "database/kernels/pad.hpp"
#include "database/kernels/transpose.hpp"
#include "database/kernels/padtranspose.hpp"
#include "database/kernels/invert.hpp"
#include "database/kernel_selection.hpp"
namespace clblast {
@ -39,12 +41,14 @@ const std::vector<const Database::DatabaseEntry*> Database::database = {
&database::XgemvFastHalf, &database::XgemvFastSingle, &database::XgemvFastDouble, &database::XgemvFastComplexSingle, &database::XgemvFastComplexDouble,
&database::XgemvFastRotHalf, &database::XgemvFastRotSingle, &database::XgemvFastRotDouble, &database::XgemvFastRotComplexSingle, &database::XgemvFastRotComplexDouble,
&database::XgerHalf, &database::XgerSingle, &database::XgerDouble, &database::XgerComplexSingle, &database::XgerComplexDouble,
&database::XtrsvHalf, &database::XtrsvSingle, &database::XtrsvDouble, &database::XtrsvComplexSingle, &database::XtrsvComplexDouble,
&database::XgemmHalf, &database::XgemmSingle, &database::XgemmDouble, &database::XgemmComplexSingle, &database::XgemmComplexDouble,
&database::XgemmDirectHalf, &database::XgemmDirectSingle, &database::XgemmDirectDouble, &database::XgemmDirectComplexSingle, &database::XgemmDirectComplexDouble,
&database::CopyHalf, &database::CopySingle, &database::CopyDouble, &database::CopyComplexSingle, &database::CopyComplexDouble,
&database::PadHalf, &database::PadSingle, &database::PadDouble, &database::PadComplexSingle, &database::PadComplexDouble,
&database::TransposeHalf, &database::TransposeSingle, &database::TransposeDouble, &database::TransposeComplexSingle, &database::TransposeComplexDouble,
&database::PadtransposeHalf, &database::PadtransposeSingle, &database::PadtransposeDouble, &database::PadtransposeComplexSingle, &database::PadtransposeComplexDouble,
&database::InvertHalf, &database::InvertSingle, &database::InvertDouble, &database::InvertComplexSingle, &database::InvertComplexDouble,
&database::KernelSelectionHalf, &database::KernelSelectionSingle, &database::KernelSelectionDouble, &database::KernelSelectionComplexSingle, &database::KernelSelectionComplexDouble
};

View file

@ -0,0 +1,78 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// Tuning parameters for the diagonal matrix inversion kernels
//
// =================================================================================================
namespace clblast {
namespace database {
// =================================================================================================
const Database::DatabaseEntry InvertHalf = {
"Invert", Precision::kHalf, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"INTERNAL_BLOCK_SIZE",16} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry InvertSingle = {
"Invert", Precision::kSingle, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"INTERNAL_BLOCK_SIZE",16} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry InvertComplexSingle = {
"Invert", Precision::kComplexSingle, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"INTERNAL_BLOCK_SIZE",16} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry InvertDouble = {
"Invert", Precision::kDouble, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"INTERNAL_BLOCK_SIZE",16} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry InvertComplexDouble = {
"Invert", Precision::kComplexDouble, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"INTERNAL_BLOCK_SIZE",16} } },
}
},
}
};
// =================================================================================================
} // namespace database
} // namespace clblast

View file

@ -0,0 +1,78 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file populates the database with best-found tuning parameters for the 'Xtrsv' kernels.
//
// =================================================================================================
namespace clblast {
namespace database {
// =================================================================================================
const Database::DatabaseEntry XtrsvHalf = {
"Xtrsv", Precision::kHalf, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"TRSV_BLOCK_SIZE",32} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry XtrsvSingle = {
"Xtrsv", Precision::kSingle, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"TRSV_BLOCK_SIZE",32} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry XtrsvComplexSingle = {
"Xtrsv", Precision::kComplexSingle, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"TRSV_BLOCK_SIZE",32} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry XtrsvDouble = {
"Xtrsv", Precision::kDouble, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"TRSV_BLOCK_SIZE",32} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry XtrsvComplexDouble = {
"Xtrsv", Precision::kComplexDouble, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"TRSV_BLOCK_SIZE",32} } },
}
},
}
};
// =================================================================================================
} // namespace database
} // namespace clblast

View file

@ -160,6 +160,13 @@ R"(
#define AbsoluteValue(value) value = fabs(value)
#endif
// Negation (component-wise)
#if PRECISION == 3232 || PRECISION == 6464
#define Negate(value) value.x = -(value.x); value.y = -(value.y)
#else
#define Negate(value) value = -(value)
#endif
// Adds two complex variables
#if PRECISION == 3232 || PRECISION == 6464
#define Add(c, a, b) c.x = a.x + b.x; c.y = a.y + b.y
@ -167,6 +174,13 @@ R"(
#define Add(c, a, b) c = a + b
#endif
// Subtracts two complex variables
#if PRECISION == 3232 || PRECISION == 6464
#define Subtract(c, a, b) c.x = a.x - b.x; c.y = a.y - b.y
#else
#define Subtract(c, a, b) c = a - b
#endif
// Multiply two complex variables (used in the defines below)
#if PRECISION == 3232 || PRECISION == 6464
#define MulReal(a, b) a.x*b.x - a.y*b.y
@ -191,6 +205,20 @@ R"(
#endif
#endif
// The scalar multiply-subtract function
#if PRECISION == 3232 || PRECISION == 6464
#define MultiplySubtract(c, a, b) c.x -= MulReal(a,b); c.y -= MulImag(a,b)
#else
#define MultiplySubtract(c, a, b) c -= a * b
#endif
// The scalar division function: full division
#if PRECISION == 3232 || PRECISION == 6464
#define DivideFull(c, a, b) singlereal num_x = (a.x * b.x) + (a.y * b.y); singlereal num_y = (a.y * b.x) - (a.x * b.y); singlereal denom = (b.x * b.x) + (b.y * b.y); c.x = num_x / denom; c.y = num_y / denom
#else
#define DivideFull(c, a, b) c = a / b
#endif
// The scalar AXPBY function
#if PRECISION == 3232 || PRECISION == 6464
#define AXPBY(e, a, b, c, d) e.x = MulReal(a,b) + MulReal(c,d); e.y = MulImag(a,b) + MulImag(c,d)

View file

@ -0,0 +1,144 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains kernels to perform forward or backward substition, as used in the TRSV routine
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// =================================================================================================
#if defined(ROUTINE_TRSV)
__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void FillVector(const int n, const int inc, const int offset,
__global real* restrict dest, const real_arg arg_value) {
const real value = GetRealArg(arg_value);
const int tid = get_global_id(0);
if (tid < n) {
dest[tid*inc + offset] = value;
}
}
// =================================================================================================
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
// this kernel file is used outside of the CLBlast library.
#ifndef TRSV_BLOCK_SIZE
#define TRSV_BLOCK_SIZE 32 // The block size for forward or backward substition
#endif
// =================================================================================================
__kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1)))
void trsv_forward(int n,
const __global real *A, const int a_offset, int a_ld,
__global real *b, const int b_offset, int b_inc,
__global real *x, const int x_offset, int x_inc,
const int is_transposed, const int is_unit_diagonal, const int do_conjugate) {
__local real alm[TRSV_BLOCK_SIZE][TRSV_BLOCK_SIZE];
__local real xlm[TRSV_BLOCK_SIZE];
const int tid = get_local_id(0);
// Pre-loads the data into local memory
if (tid < n) {
Subtract(xlm[tid], b[tid*b_inc + b_offset], x[tid*x_inc + x_offset]);
if (is_transposed == 0) {
for (int i = 0; i < n; ++i) {
alm[i][tid] = A[i + tid*a_ld + a_offset];
}
}
else {
for (int i = 0; i < n; ++i) {
alm[i][tid] = A[tid + i*a_ld + a_offset];
}
}
if (do_conjugate) {
for (int i = 0; i < n; ++i) {
COMPLEX_CONJUGATE(alm[i][tid]);
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Computes the result (single-threaded for now)
if (tid == 0) {
for (int i = 0; i < n; ++i) {
for (int j = 0; j < i; ++j) {
MultiplySubtract(xlm[i], alm[i][j], xlm[j]);
}
if (is_unit_diagonal == 0) { DivideFull(xlm[i], xlm[i], alm[i][i]); }
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Stores the results
if (tid < n) {
x[tid*x_inc + x_offset] = xlm[tid];
}
}
__kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1)))
void trsv_backward(int n,
const __global real *A, const int a_offset, int a_ld,
__global real *b, const int b_offset, int b_inc,
__global real *x, const int x_offset, int x_inc,
const int is_transposed, const int is_unit_diagonal, const int do_conjugate) {
__local real alm[TRSV_BLOCK_SIZE][TRSV_BLOCK_SIZE];
__local real xlm[TRSV_BLOCK_SIZE];
const int tid = get_local_id(0);
// Pre-loads the data into local memory
if (tid < n) {
Subtract(xlm[tid], b[tid*b_inc + b_offset], x[tid*x_inc + x_offset]);
if (is_transposed == 0) {
for (int i = 0; i < n; ++i) {
alm[i][tid] = A[i + tid*a_ld + a_offset];
}
}
else {
for (int i = 0; i < n; ++i) {
alm[i][tid] = A[tid + i*a_ld + a_offset];
}
}
if (do_conjugate) {
for (int i = 0; i < n; ++i) {
COMPLEX_CONJUGATE(alm[i][tid]);
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Computes the result (single-threaded for now)
if (tid == 0) {
for (int i = n - 1; i >= 0; --i) {
for (int j = i + 1; j < n; ++j) {
MultiplySubtract(xlm[i], alm[i][j], xlm[j]);
}
if (is_unit_diagonal == 0) { DivideFull(xlm[i], xlm[i], alm[i][i]); }
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Stores the results
if (tid < n) {
x[tid*x_inc + x_offset] = xlm[tid];
}
}
#endif
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View file

@ -0,0 +1,429 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains kernels to invert squared diagonal blocks of a matrix. These kernels are based
// on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular
// Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek,
// and Jack Dongarra.
//
// =================================================================================================
//
// Let A be an block_size*block_size lower triangular matrix, and B its inverse.
// Then the block decomposition
//
// [ A11 0 ] * [ B11 0 ] = [ I 0 ]
// [ A21 A22 ] [ B21 B22 ] [ 0 I ]
//
// yields
//
// A11*B11 = I ==> B11 = A11^{-1},
// A22*B22 = I ==> B22 = A22^{-1},
// A21*B11 + A22*B21 = 0 ==> B21 = -A22^{-1}*A21*B11 = -B22*A21*B11.
//
// The InvertDiagonalBlock kernel inverts A11 and A22.
// The TripleMatMul routines multiply:
// part 1: B21 = A21 * B11,
// part 2: B21 = -B22 * B21.
//
// At this level, inner block is current_size=16, with one 4 x 4 work-group per inner block. Each
// submatrix Aij and Bij is current_size x current_size. The submatrix dimension is multiplied by 2
// at each level, so the next level is current_size*2 = 32. A 'page' is the next bigger block,
// here current_size*2=32,
// [ B11 0 ]
// which contains [ B21 B22 ].
// Outer blocks are block_size x block_size.
//
// A21 may have < current_size rows, but is guaranteed to have current_size cols since A22 is on
// the right. This makes a single check easy to do.
//
// B is stored in workspace that is a full multiple of block_size x block_size; no checks needed.
//
// We split this into part1 & part2 to synchronize all blocks and make sure
// that writes to B12 are observed by all blocks.
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// =================================================================================================
#if defined(ROUTINE_INVERT)
#define LOCALX 17 // 16 + 1 to avoid bank conflicts
#define LOCALY 16
// =================================================================================================
// Inverts a diagonal block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE elements in a larger matrix
__kernel __attribute__((reqd_work_group_size(INTERNAL_BLOCK_SIZE, 1, 1)))
void InvertDiagonalBlock(int n, __global const real* restrict src, const int src_offset, const int src_ld,
__global real* restrict dest, const int outer_block_size,
const int unit_diagonal, const int is_upper)
{
const int thread_index = get_local_id(0);
const int block_index = get_group_id(0);
// Sets the offset for this particular block in the source and destination matrices
const int src_block_offset = block_index * (INTERNAL_BLOCK_SIZE + src_ld * INTERNAL_BLOCK_SIZE) + src_offset;
const int num_inner_blocks = outer_block_size / INTERNAL_BLOCK_SIZE;
const int dest_block_offset = (block_index / num_inner_blocks) * outer_block_size * outer_block_size + // go to the (block_index / num_inner_blocks) outer outer_block_size*outer_block_size block,
(block_index % num_inner_blocks) * (outer_block_size*INTERNAL_BLOCK_SIZE + INTERNAL_BLOCK_SIZE); // then to the (block_index % num_inner_blocks) inner INTERNAL_BLOCK_SIZE*INTERNAL_BLOCK_SIZE block inside that
// Local memory to store the inverted block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE
__local real lm[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
// Loads the source lower triangle into local memory. Any values in the upper triangle or
// outside of the matrix are set to zero
#pragma unroll
for (int j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
const bool condition = (is_upper) ? (thread_index <= j && block_index*INTERNAL_BLOCK_SIZE + j < n) :
(thread_index >= j && block_index*INTERNAL_BLOCK_SIZE + thread_index < n);
if (condition) {
lm[thread_index][j] = src[j*src_ld + thread_index + src_block_offset];
}
else {
SetToZero(lm[thread_index][j]);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Inverts the diagonal
real inverted_diagonal;
SetToOne(inverted_diagonal);
if (unit_diagonal == 0) {
const real diagonal_value = lm[thread_index][thread_index];
if (!IsZero(diagonal_value)) { // Only for non-singular values and values inside the matrix
real constant_one;
SetToOne(constant_one);
DivideFull(inverted_diagonal, constant_one, diagonal_value);
}
}
lm[thread_index][thread_index] = inverted_diagonal;
barrier(CLK_LOCAL_MEM_FENCE);
// Upper-triangular
if (is_upper) {
// Computes the elements 0:j-1 of the j-th column
for (int j = 1; j < INTERNAL_BLOCK_SIZE; ++j) {
if (thread_index < j) {
real sum;
SetToZero(sum);
#pragma unroll
for (int k = 0; k < j; ++k) {
MultiplyAdd(sum, lm[thread_index][k], lm[k][j]);
}
real diagonal_value = lm[j][j];
Negate(diagonal_value);
Multiply(lm[thread_index][j], diagonal_value, sum);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
// Lower triangular
else {
// Computes the elements j+1:INTERNAL_BLOCK_SIZE-1 of the j-th column
for (int j = INTERNAL_BLOCK_SIZE - 2; j >= 0; --j) {
if (thread_index > j) {
real sum;
SetToZero(sum);
#pragma unroll
for (int k = j + 1; k < INTERNAL_BLOCK_SIZE; ++k) {
MultiplyAdd(sum, lm[thread_index][k], lm[k][j]);
}
Multiply(lm[thread_index][j], -lm[j][j], sum);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
// Writes the result to global memory
#pragma unroll
for (int j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
dest[j*outer_block_size + thread_index + dest_block_offset] = lm[thread_index][j];
}
}
// =================================================================================================
// Triple matrix-multiplication kernel: C = A * B
inline void TripleMatMul(const int size, const bool upper, const int part, __local real* blm, int n,
__global const real* agm, __global const real* bgm, __global real* cgm,
const int lda, const int ldb, const int ldc,
int current_size, int num_pages, const int block_size) {
// Emulates a 3D grid: NX * (NY * num_pages)
const int by = get_group_id(1) / num_pages;
const int page = get_group_id(1) % num_pages;
const int lidx = get_local_id(0);
const int lidy = get_local_id(1);
const int ibx = get_group_id(0) * (get_local_size(0)*get_local_size(1));
const int iby = by*16;
const int id = lidx + lidy*get_local_size(0);
const int row = page*current_size*2 + current_size + ibx + id;
int col = page*current_size*2 + current_size;
// Sets the offsets for this specific thread
agm += ibx + id;
bgm += lidx + (iby + lidy)*ldb;
cgm += ibx + id + iby*ldc;
// Initializes the result registers
real cpm[16];
#pragma unroll
for (int j = 0; j < 16; ++j) {
SetToZero(cpm[j]);
}
// Computes NT x 16 block of C, each thread computes one 1 x 16 row
for (int k = 0; k < current_size; k += 16) {
// Loads a 16 x 16 block of B into local memory using NX x 4 threads
#pragma unroll
for( int i=0; i < 16; i += (size/4) ) { // += get_local_size(0)
#pragma unroll
for( int j=0; j < 16; j += 4 ) { // += get_local_size(1)
blm[(lidx + i) * LOCALX + (lidy + j)] = bgm[k + i + j*ldb];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Upper triangular
if (upper) {
// Performs 16 x 16 multiply-add operations
#pragma unroll
for (int i = 0; i < 16; ++i) {
if (part == 2 || col++ < n) {
#pragma unroll
for (int j = 0; j < 16; ++j) {
MultiplyAdd(cpm[j], agm[(i + k) * lda], blm[i * LOCALX + j]);
}
}
}
}
// Lower triangular
else {
if (row < n) {
// Performs 16 x 16 multiply-add operations
#pragma unroll
for (int i = 0; i < 16; ++i) {
#pragma unroll
for (int j = 0; j < 16; ++j) {
MultiplyAdd(cpm[j], agm[(i + k) * lda], blm[i * LOCALX + j]);
}
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// Stores NT x 16 results: each thread writes one 16 x 1 row
#pragma unroll
for (int i = 0; i < 16; ++i) {
if (part == 2) { Negate(cpm[i]); }
cgm[0] = cpm[i];
cgm += ldc;
}
}
// =================================================================================================
// Triple matrix-multiplication kernel part 1: B12 = A12 * B22 (upper) or B21 = A21 * B11 (lower)
inline void TripleMatMulPart1(const int size, const bool upper, __local real* blm, int n,
__global const real* src, const int a_offset, const int lda,
__global real* dest, int current_size, int num_pages, const int block_size) {
// Emulates a 3D grid: NX * (NY * num_pages)
const int page = get_group_id(1) % num_pages;
// Computes the destination block offset:
// - go to the (page / pages_per_block) outer block_size * block_size block
// - then the (page % pages_per_block) inner (current_size*2) * (current_size*2) page inside that
const int pages_per_block = block_size / (current_size*2);
dest += (page / pages_per_block) * block_size * block_size +
(page % pages_per_block) * (current_size*2*block_size + current_size*2);
// Using the GEMM notation: C = A*B
__global const real* agm;
__global const real* bgm;
__global real* cgm;
if (upper) { // upper triangular: B12 = A12 * B22
agm = src + a_offset + page*current_size*2*lda + page*current_size*2 + current_size*lda; // A12
bgm = dest + current_size*block_size + current_size; // B22
cgm = dest + current_size*block_size; // B12
}
else { // lower triangular: B21 = A21 * B11
agm = src + a_offset + page*current_size*2*lda + page*current_size*2 + current_size; // A21
bgm = dest; // B11
cgm = dest + current_size; // B21
}
// Runs the generic C = A * B matrix multiplication
const int ldb = block_size;
const int ldc = block_size;
TripleMatMul(size, upper, 1, blm, n, agm, bgm, cgm, lda, ldb, ldc, current_size, num_pages, block_size);
}
// Triple matrix-multiplication kernel part 1: B12 = -B11 * B12 (upper) or B21 = -B22 * B21 (lower)
inline void TripleMatMulPart2(const int size, const bool upper, __local real* blm, const int n,
__global real* dest, int current_size, int num_pages, const int block_size) {
// Emulates a 3D grid: NX * (NY * num_pages)
const int page = get_group_id(1) % num_pages;
// Computes the destination block offset:
// - go to the (page / pages_per_block) outer block_size * block_size block
// - then the (page % pages_per_block) inner (current_size*2) * (current_size*2) page inside that
const int pages_per_block = block_size / (current_size*2);
dest += (page / pages_per_block) * block_size * block_size +
(page % pages_per_block) * (current_size*2*block_size + current_size*2);
// Using the GEMM notation: C = A*B
__global const real* agm;
__global const real* bgm;
__global real* cgm;
if (upper) { // upper triangular: B12 = -B11 * B12
agm = dest; // B11
cgm = dest + current_size*block_size; // B12
bgm = cgm; // B12, okay to overwrite
}
else { // lower triangular: B21 = -B22 * B21
agm = dest + current_size*block_size + current_size; // B22
cgm = dest + current_size; // B21
bgm = cgm; // B21, okay to overwrite
}
// Runs the generic C = A * B matrix multiplication
const int lda = block_size;
const int ldb = block_size;
const int ldc = block_size;
TripleMatMul(size, upper, 2, blm, n, agm, bgm, cgm, lda, ldb, ldc, current_size, num_pages, block_size);
}
// =================================================================================================
// B21 = A21 * B11
__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart1(16, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
}
// B21 = -B22 * B21
__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
void TripleMatMul16Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart2(16, false, lm, n, dest, current_size, num_pages, block_size);
}
// B21 = A21 * B11
__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
void TripleMatMul32Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart1(32, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
}
// B21 = -B22 * B21
__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
void TripleMatMul32Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart2(32, false, lm, n, dest, current_size, num_pages, block_size);
}
// B21 = A21 * B11
__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
void TripleMatMul64Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart1(64, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
}
// B21 = -B22 * B21
__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart2(64, false, lm, n, dest, current_size, num_pages, block_size);
}
// =================================================================================================
// B12 = A12 * B22
__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart1(16, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
}
// B12 = -B11 * B12
__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
void TripleMatMul16Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart2(16, true, lm, n, dest, current_size, num_pages, block_size);
}
// B12 = A12 * B22
__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
void TripleMatMul32Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart1(32, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
}
// B12 = -B11 * B12
__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
void TripleMatMul32Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart2(32, true, lm, n, dest, current_size, num_pages, block_size);
}
// B12 = A12 * B22
__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
void TripleMatMul64Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart1(64, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
}
// B12 = -B11 * B12
__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
void TripleMatMul64Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart2(64, true, lm, n, dest, current_size, num_pages, block_size);
}
#endif
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View file

@ -73,6 +73,22 @@ R"(
#define PADTRA_PAD 0 // Padding of the local memory to avoid bank-conflicts
#endif
// =================================================================================================
#if defined(ROUTINE_INVERT) || defined(ROUTINE_TRSM)
__kernel __attribute__((reqd_work_group_size(8, 8, 1)))
void FillMatrix(const int m, const int n, const int ld, const int offset,
__global real* restrict dest, const real_arg arg_value) {
const real value = GetRealArg(arg_value);
const int id_one = get_global_id(0);
const int id_two = get_global_id(1);
if (id_one < m && id_two < n) {
dest[id_two*ld + id_one + offset] = value;
}
}
#endif
// =================================================================================================
// End of the C++11 raw string literal

View file

@ -25,16 +25,17 @@ namespace clblast {
const std::vector<std::string> Routine::routines_axpy = {"AXPY", "COPY", "SCAL", "SWAP"};
const std::vector<std::string> Routine::routines_dot = {"AMAX", "ASUM", "DOT", "DOTC", "DOTU", "MAX", "MIN", "NRM2", "SUM"};
const std::vector<std::string> Routine::routines_ger = {"GER", "GERC", "GERU", "HER", "HER2", "HPR", "HPR2", "SPR", "SPR2", "SYR", "SYR2"};
const std::vector<std::string> Routine::routines_gemv = {"GBMV", "GEMV", "HBMV", "HEMV", "HPMV", "SBMV", "SPMV", "SYMV", "TMBV", "TPMV", "TRMV"};
const std::vector<std::string> Routine::routines_gemv = {"GBMV", "GEMV", "HBMV", "HEMV", "HPMV", "SBMV", "SPMV", "SYMV", "TMBV", "TPMV", "TRMV", "TRSV"};
const std::vector<std::string> Routine::routines_gemm = {"GEMM", "HEMM", "SYMM", "TRMM"};
const std::vector<std::string> Routine::routines_gemm_syrk = {"GEMM", "HEMM", "HER2K", "HERK", "SYMM", "SYR2K", "SYRK", "TRMM"};
const std::vector<std::string> Routine::routines_gemm_syrk = {"GEMM", "HEMM", "HER2K", "HERK", "SYMM", "SYR2K", "SYRK", "TRMM", "TRSM"};
const std::vector<std::string> Routine::routines_trsm = {"TRSM"};
const std::unordered_map<std::string, const std::vector<std::string>> Routine::routines_by_kernel = {
{"Xaxpy", routines_axpy},
{"Xdot", routines_dot},
{"Xgemv", routines_gemv},
{"XgemvFast", routines_gemv},
{"XgemvFastRot", routines_gemv},
{"Xgemv", {}},
{"Xtrsv", routines_gemv},
{"Xger", routines_ger},
{"Copy", routines_gemm_syrk},
{"Pad", routines_gemm_syrk},
@ -43,6 +44,7 @@ const std::unordered_map<std::string, const std::vector<std::string>> Routine::r
{"Xgemm", routines_gemm_syrk},
{"XgemmDirect", routines_gemm},
{"KernelSelection", routines_gemm},
{"Invert", routines_trsm},
};
// =================================================================================================

View file

@ -50,6 +50,7 @@ class Routine {
static const std::vector<std::string> routines_gemv;
static const std::vector<std::string> routines_gemm;
static const std::vector<std::string> routines_gemm_syrk;
static const std::vector<std::string> routines_trsm;
static const std::unordered_map<std::string, const std::vector<std::string>> routines_by_kernel;
private:

View file

@ -33,6 +33,47 @@ void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
// =================================================================================================
// Sets all elements of a matrix to a constant value
template <typename T>
void FillMatrix(Queue &queue, const Device &device,
const Program &program, const Databases &,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t m, const size_t n, const size_t ld, const size_t offset,
const Buffer<T> &dest,
const T constant_value) {
auto kernel = Kernel(program, "FillMatrix");
kernel.SetArgument(0, static_cast<int>(m));
kernel.SetArgument(1, static_cast<int>(n));
kernel.SetArgument(2, static_cast<int>(ld));
kernel.SetArgument(3, static_cast<int>(offset));
kernel.SetArgument(4, dest());
kernel.SetArgument(5, GetRealArg(constant_value));
auto local = std::vector<size_t>{8, 8};
auto global = std::vector<size_t>{Ceil(m, 8), Ceil(n, 8)};
RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
// Sets all elements of a vector to a constant value
template <typename T>
void FillVector(Queue &queue, const Device &device,
const Program &program, const Databases &,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t n, const size_t inc, const size_t offset,
const Buffer<T> &dest,
const T constant_value) {
auto kernel = Kernel(program, "FillVector");
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, static_cast<int>(inc));
kernel.SetArgument(2, static_cast<int>(offset));
kernel.SetArgument(3, dest());
kernel.SetArgument(4, GetRealArg(constant_value));
auto local = std::vector<size_t>{64};
auto global = std::vector<size_t>{Ceil(n, 64)};
RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
// =================================================================================================
// Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able
// to write to symmetric and triangular matrices through optional arguments.
template <typename T>

View file

@ -22,9 +22,10 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xgemv<T>::Xgemv(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xgemv", "XgemvFast", "XgemvFastRot"}, PrecisionValue<T>(), {}, {
Routine(queue, event, name, {"Xgemv", "XgemvFast", "XgemvFastRot", "Xtrsv"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level2/xgemv.opencl"
#include "../../kernels/level2/xgemv_fast.opencl"
#include "../../kernels/level2/xtrsv.opencl"
}) {
}

View file

@ -0,0 +1,161 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xtrsv class (see the header for information about the class).
//
// =================================================================================================
#include "routines/level2/xtrsv.hpp"
#include <string>
#include <vector>
namespace clblast {
// =================================================================================================
// Constructor: forwards to base class constructor
template <typename T>
Xtrsv<T>::Xtrsv(Queue &queue, EventPointer event, const std::string &name):
Xgemv<T>(queue, event, name) {
}
// =================================================================================================
template <typename T>
void Xtrsv<T>::Substitution(const Layout layout, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
if (n > db_["TRSV_BLOCK_SIZE"]) { throw BLASError(StatusCode::kUnexpectedError); };
// Translates CLBlast arguments to 0/1 integers for the OpenCL kernel
const auto is_unit_diagonal = (diagonal == Diagonal::kNonUnit) ? 0 : 1;
const auto is_transposed = ((a_transpose == Transpose::kNo && layout == Layout::kColMajor) ||
(a_transpose != Transpose::kNo && layout != Layout::kColMajor)) ? 0 : 1;
const auto do_conjugate = (a_transpose == Transpose::kConjugate) ? 1 : 0;
// The data is either in the upper or lower triangle
const auto is_upper = ((triangle == Triangle::kUpper && a_transpose == Transpose::kNo) ||
(triangle == Triangle::kLower && a_transpose != Transpose::kNo));
// Retrieves the kernel from the compiled binary
const auto kernel_name = (is_upper) ? "trsv_backward" : "trsv_forward";
auto kernel = Kernel(program_, kernel_name);
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, a_buffer());
kernel.SetArgument(2, static_cast<int>(a_offset));
kernel.SetArgument(3, static_cast<int>(a_ld));
kernel.SetArgument(4, b_buffer());
kernel.SetArgument(5, static_cast<int>(b_offset));
kernel.SetArgument(6, static_cast<int>(b_inc));
kernel.SetArgument(7, x_buffer());
kernel.SetArgument(8, static_cast<int>(x_offset));
kernel.SetArgument(9, static_cast<int>(x_inc));
kernel.SetArgument(10, static_cast<int>(is_transposed));
kernel.SetArgument(11, static_cast<int>(is_unit_diagonal));
kernel.SetArgument(12, static_cast<int>(do_conjugate));
// Launches the kernel
const auto local = std::vector<size_t>{db_["TRSV_BLOCK_SIZE"]};
const auto global = std::vector<size_t>{1};
auto event = Event();
RunKernel(kernel, queue_, device_, global, local, event.pointer());
event.WaitForCompletion();
}
// =================================================================================================
// The main routine
template <typename T>
void Xtrsv<T>::DoTrsv(const Layout layout, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc) {
// Makes sure all dimensions are larger than zero
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the matrix and vector
TestMatrixA(n, n, a_buffer, a_offset, a_ld);
TestVectorX(n, b_buffer, b_offset, b_inc);
// Creates a copy of B to avoid overwriting input while computing output
// TODO: Make x with 0 offset and unit increment by creating custom copy-to and copy-from kernels
const auto x_offset = b_offset;
const auto x_inc = b_inc;
const auto x_size = n*x_inc + x_offset;
auto x_buffer = Buffer<T>(context_, x_size);
b_buffer.CopyTo(queue_, x_size, x_buffer);
// Fills the output buffer with zeros
auto eventWaitList = std::vector<Event>();
auto fill_vector_event = Event();
FillVector(queue_, device_, program_, db_, fill_vector_event.pointer(), eventWaitList,
n, x_inc, x_offset, x_buffer, ConstantZero<T>());
fill_vector_event.WaitForCompletion();
// Derives properties based on the arguments
const auto is_upper = ((triangle == Triangle::kUpper && a_transpose == Transpose::kNo) ||
(triangle == Triangle::kLower && a_transpose != Transpose::kNo));
const auto is_transposed = ((layout == Layout::kColMajor && a_transpose == Transpose::kNo) ||
(layout != Layout::kColMajor && a_transpose != Transpose::kNo));
// Loops over the blocks
auto col = n; // the initial column position
for (auto i = size_t{0}; i < n; i += db_["TRSV_BLOCK_SIZE"]) {
const auto block_size = std::min(db_["TRSV_BLOCK_SIZE"], n - i);
// Sets the next column position
col = (is_upper) ? col - block_size : i;
// Sets the offsets for upper or lower triangular
const auto extra_offset_a = (is_transposed) ?
(is_upper ? col + (col+block_size)*a_ld : col) :
(is_upper ? col+block_size + col*a_ld : col*a_ld);
const auto extra_offset_x = (is_upper) ? (col+block_size)*x_inc : 0;
const auto extra_offset_b = col*x_inc;
// Runs the GEMV routine to compute x' = A * x
if (i > 0) {
const auto gemv_m = (a_transpose == Transpose::kNo) ? block_size : i;
const auto gemv_n = (a_transpose == Transpose::kNo) ? i : block_size;
DoGemv(layout, a_transpose, gemv_m, gemv_n, ConstantOne<T>(),
a_buffer, a_offset + extra_offset_a, a_ld,
x_buffer, x_offset + extra_offset_x, x_inc, ConstantOne<T>(),
x_buffer, x_offset + extra_offset_b, x_inc );
}
// Runs the triangular substitution for the block size
Substitution(layout, triangle, a_transpose, diagonal, block_size,
a_buffer, a_offset + col + col*a_ld, a_ld,
b_buffer, b_offset + col*b_inc, b_inc,
x_buffer, x_offset + col*x_inc, x_inc);
}
// Retrieves the results
x_buffer.CopyTo(queue_, x_size, b_buffer);
}
// =================================================================================================
// Compiles the templated class
template class Xtrsv<half>;
template class Xtrsv<float>;
template class Xtrsv<double>;
template class Xtrsv<float2>;
template class Xtrsv<double2>;
// =================================================================================================
} // namespace clblast

View file

@ -0,0 +1,60 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xtrsv routine. It uses a block-algorithm and performs small triangular
// forward and backward substitutions on the diagonal parts of the matrix in combination with larger
// GEMV computation on the remainder of the matrix.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XTRSV_H_
#define CLBLAST_ROUTINES_XTRSV_H_
#include "routines/level2/xgemv.hpp"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class Xtrsv: public Xgemv<T> {
public:
// Uses the generic matrix-vector routine
using Xgemv<T>::queue_;
using Xgemv<T>::context_;
using Xgemv<T>::device_;
using Xgemv<T>::db_;
using Xgemv<T>::program_;
using Xgemv<T>::DoGemv;
// Constructor
Xtrsv(Queue &queue, EventPointer event, const std::string &name = "TRSV");
// Templated-precision implementation of the routine
void DoTrsv(const Layout layout, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
// Performs forward or backward substitution on a small triangular matrix
void Substitution(const Layout layout, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc,
const Buffer<T> &x_buffer, const size_t offset_x, const size_t x_inc);
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XTRSV_H_
#endif

View file

@ -33,10 +33,11 @@ Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):
#include "../../kernels/level3/convert_symmetric.opencl"
#include "../../kernels/level3/convert_triangular.opencl"
#include "../../kernels/level3/convert_hermitian.opencl"
, // separated in multiple parts to prevent C1091 in MSVC 2013
#include "../../kernels/level3/xgemm_direct_part1.opencl"
#include "../../kernels/level3/xgemm_direct_part2.opencl"
#include "../../kernels/level3/xgemm_direct_part3.opencl"
, // separated in two parts to prevent C1091 in MSVC 2013
, // separated in multiple parts to prevent C1091 in MSVC 2013
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
#include "../../kernels/level3/xgemm_part3.opencl"

View file

@ -0,0 +1,227 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the triangular matrix solver (A * X = B) TRSM class. This code is based
// on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular
// Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek,
// and Jack Dongarra and the OpenCL implementation in clBLAS.
//
// =================================================================================================
#include "routines/level3/xtrsm.hpp"
#include "routines/levelx/xinvert.hpp"
#include <string>
#include <vector>
namespace clblast {
// =================================================================================================
// Constructor: forwards to base class constructor
template <typename T>
Xtrsm<T>::Xtrsm(Queue &queue, EventPointer event, const std::string &name):
Xgemm<T>(queue, event, name) {
}
// =================================================================================================
// The entry point: transforming into col-major (if needed) and then running the col-major version
template <typename T>
void Xtrsm<T>::DoTrsm(const Layout layout, Side side, Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
size_t m, size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) {
// Converts row-major to a col-major problem:
// The idea is that
// B = A*X
// can be computed as
// B' = (A*X)' = X'*A'
// Since changing the order is basically a transpose on each matrix, the formula becomes:
// B = X*A
// So only the side (left/right) and the triangle (upper/lower) are changed and M/N are swapped
if (layout == Layout::kRowMajor) {
std::swap(m, n);
side = (side == Side::kLeft) ? Side::kRight : Side::kLeft;
triangle = (triangle == Triangle::kLower) ? Triangle::kUpper : Triangle::kLower;
}
// Runs the col-major version of TRSM
TrsmColMajor(side, triangle, a_transpose, diagonal,
m, n, alpha,
a_buffer, a_offset, a_ld,
b_buffer, b_offset, b_ld);
}
// =================================================================================================
// The main routine
template <typename T>
void Xtrsm<T>::TrsmColMajor(const Side side, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) {
// Settings
constexpr auto block_size = size_t{32}; // tuneable
// Makes sure all dimensions are larger than zero
if ((m == 0) || (n == 0)) { throw BLASError(StatusCode::kInvalidDimension); }
// Computes the k dimension. This is based on whether or not matrix is A (on the left)
// or B (on the right) in the Xgemm routine.
const auto k = (side == Side::kLeft) ? m : n;
// Checks for validity of the triangular A matrix
TestMatrixA(k, k, a_buffer, a_offset, a_ld);
// Checks for validity of the input B matrix
TestMatrixB(m, n, b_buffer, b_offset, b_ld);
// Creates a copy of B to avoid overwriting input in GEMM while computing output
const auto b_size = b_ld * (n - 1) + m + b_offset;
const auto x_one = m;
const auto x_two = n;
const auto x_size = b_size;
const auto x_ld = b_ld;
const auto x_offset = b_offset;
auto x_buffer = Buffer<T>(context_, x_size);
b_buffer.CopyTo(queue_, x_size, x_buffer);
// Temporary buffer for the inverse of the A matrix
const auto a_inv_size = Ceil(k, block_size) * block_size;
auto a_inv_buffer = Buffer<T>(context_, a_inv_size);
// Fills the output buffer with zeros
auto eventWaitList = std::vector<Event>();
auto fill_matrix_event = Event();
FillMatrix(queue_, device_, program_, db_, fill_matrix_event.pointer(), eventWaitList,
x_one, x_two, x_ld, x_offset, x_buffer, ConstantZero<T>());
fill_matrix_event.WaitForCompletion();
// Inverts the diagonal blocks
auto diagonal_invert_event = Event();
auto inverter = Xinvert<T>(queue_, diagonal_invert_event.pointer());
inverter.InvertMatrixDiagonalBlocks(Layout::kColMajor, triangle, diagonal,
k, block_size, a_buffer, a_offset, a_ld, a_inv_buffer);
diagonal_invert_event.WaitForCompletion();
// Derives properties based on the arguments
const auto condition = ((triangle == Triangle::kUpper && a_transpose != Transpose::kNo) ||
(triangle == Triangle::kLower && a_transpose == Transpose::kNo));
// Left side
if (side == Side::kLeft) {
// True when (lower triangular) or (upper triangular and transposed)
if (condition) {
for (auto i = size_t{0}; i < m; i += block_size) {
const auto gemm_alpha = (i == 0) ? alpha : ConstantOne<T>();
const auto current_block_size = std::min(m - i, block_size);
DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo,
current_block_size, n, current_block_size, gemm_alpha,
a_inv_buffer, i * block_size, block_size,
b_buffer, i, b_ld, ConstantZero<T>(),
x_buffer, i, x_ld);
if (i + block_size >= m) { break; }
const auto this_a_offset = (a_transpose == Transpose::kNo) ? (i + block_size) + i * a_ld : i + (block_size + i) * a_ld;
DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo,
m - i - block_size, n, block_size, ConstantNegOne<T>(),
a_buffer, this_a_offset, a_ld,
x_buffer, i, x_ld, ConstantOne<T>(),
b_buffer, i + block_size, b_ld);
}
}
// True when (upper triangular) or (lower triangular and transposed)
else {
const auto current_block_size = (m % block_size == 0) ? block_size : (m % block_size);
const auto i_start = static_cast<int>(m) - static_cast<int>(current_block_size);
for (auto i = i_start; i >= 0; i -= static_cast<int>(block_size)) {
const auto gemm_alpha = (i == i_start) ? alpha : ConstantOne<T>();
DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo,
current_block_size, n, current_block_size, gemm_alpha,
a_inv_buffer, i * block_size, block_size,
b_buffer, i, b_ld, ConstantZero<T>(),
x_buffer, i, x_ld);
if (i - static_cast<int>(block_size) < 0) { break; }
const auto this_a_offset = (a_transpose == Transpose::kNo) ? i * a_ld : i;
DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo,
i, n, block_size, ConstantNegOne<T>(),
a_buffer, this_a_offset, a_ld,
x_buffer, i, x_ld, ConstantOne<T>(),
b_buffer, 0, b_ld);
}
}
}
// Right side
else {
// True when (lower triangular) or (upper triangular and transposed)
if (condition) {
const auto current_block_size = (n % block_size == 0) ? block_size : (n % block_size);
const auto i_start = static_cast<int>(n) - static_cast<int>(current_block_size);
for (auto i = i_start; i >= 0; i -= static_cast<int>(block_size)) {
const auto gemm_alpha = (i == i_start) ? alpha : ConstantOne<T>();
DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose,
m, current_block_size, current_block_size, gemm_alpha,
b_buffer, i * b_ld, b_ld,
a_inv_buffer, i * block_size, block_size, ConstantZero<T>(),
x_buffer, i * x_ld, x_ld);
if (i - static_cast<int>(block_size) < 0) { break; }
const auto this_a_offset = (a_transpose == Transpose::kNo) ? i : i * a_ld;
DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose,
m, i, current_block_size, ConstantNegOne<T>(),
x_buffer, i * x_ld, x_ld,
a_buffer, this_a_offset, a_ld, ConstantOne<T>(),
b_buffer, 0, b_ld);
}
}
// True when (upper triangular) or (lower triangular and transposed)
else {
for (auto i = size_t{0}; i < n; i += block_size) {
const auto gemm_alpha = (i == 0) ? alpha : ConstantOne<T>();
const auto current_block_size = std::min(n - i, block_size);
DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose,
m, current_block_size, current_block_size, gemm_alpha,
b_buffer, i * b_ld, b_ld,
a_inv_buffer, i * block_size, block_size, ConstantZero<T>(),
x_buffer, i * x_ld, x_ld);
if (i + block_size >= n) { break; }
const auto this_a_offset = (a_transpose == Transpose::kNo) ? i + (block_size + i) * a_ld : (i + block_size) + i * a_ld;
DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose,
m, n - i - block_size, block_size, ConstantNegOne<T>(),
x_buffer, i * x_ld, x_ld,
a_buffer, this_a_offset, a_ld, ConstantOne<T>(),
b_buffer, (i + block_size) * b_ld, b_ld);
}
}
}
// Retrieves the results
x_buffer.CopyTo(queue_, b_size, b_buffer);
}
// =================================================================================================
// Compiles the templated class
template class Xtrsm<half>;
template class Xtrsm<float>;
template class Xtrsm<double>;
template class Xtrsm<float2>;
template class Xtrsm<double2>;
// =================================================================================================
} // namespace clblast

View file

@ -0,0 +1,60 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xtrsm routine. The implementation is based on ??? (TODO).
// Therefore, this class inherits from the Xgemm class.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XTRSM_H_
#define CLBLAST_ROUTINES_XTRSM_H_
#include "routines/level3/xgemm.hpp"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class Xtrsm: public Xgemm<T> {
public:
// Uses methods and variables the Xgemm routine
using Xgemm<T>::queue_;
using Xgemm<T>::context_;
using Xgemm<T>::device_;
using Xgemm<T>::db_;
using Xgemm<T>::program_;
using Xgemm<T>::DoGemm;
// Constructor
Xtrsm(Queue &queue, EventPointer event, const std::string &name = "TRSM");
// Templated-precision implementation of the routine
void DoTrsm(const Layout layout, Side side, Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
size_t m, size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld);
// Implementation of the column-major version
void TrsmColMajor(const Side side, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld);
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XTRSM_H_
#endif

View file

@ -0,0 +1,151 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains all the common code to perform (partial) matrix inverting. This code is based
// on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular
// Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek,
// and Jack Dongarra.
//
// =================================================================================================
#include "routines/levelx/xinvert.hpp"
#include <string>
#include <vector>
#include <assert.h>
namespace clblast {
// =================================================================================================
// Constructor: forwards to base class constructor
template <typename T>
Xinvert<T>::Xinvert(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Invert"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level3/level3.opencl"
#include "../../kernels/level3/invert_diagonal_blocks.opencl"
}) {
}
// =================================================================================================
// Inverts diagonal square blocks of a matrix
template <typename T>
void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle triangle, const Diagonal diag,
const size_t n, const size_t block_size,
const Buffer<T> &src, const size_t offset, const size_t ld_src,
Buffer<T> &dest) {
// Makes sure all dimensions are larger than zero
if ((block_size == 0) || (n == 0)) {
throw BLASError(StatusCode::kInvalidDimension);
}
// Helper variables
const auto internal_block_size = static_cast<size_t>(db_["INTERNAL_BLOCK_SIZE"]);
assert(internal_block_size == 16);
const auto num_blocks = CeilDiv(n, block_size);
const auto num_internal_blocks = CeilDiv(n, internal_block_size);
const auto unit_diagonal = (diag == Diagonal::kUnit) ? true : false;
// This routine only supports block sizes which are a multiple of the internal block size and
// block sizes up to and including 128
if ((block_size % internal_block_size != 0) || (block_size > 128)) {
throw BLASError(StatusCode::kUnknownError);
}
// Checks for validity of the source and destination matrices
TestMatrixA(n, n, src, offset, ld_src);
TestMatrixB(block_size, num_blocks * block_size, dest, 0, block_size);
// Determines which kernels to run based on the layout (the kernels assume column-major as
// default) and on whether we are dealing with an upper or lower triangle of the triangular matrix
const bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
(triangle == Triangle::kLower && layout == Layout::kRowMajor));
const auto name_postfix = (is_upper) ? "Upper" : "Lower";
// Fills the output buffer with zeros
auto event_wait_list = std::vector<Event>();
auto fill_matrix_event = Event();
FillMatrix(queue_, device_, program_, db_, fill_matrix_event.pointer(), event_wait_list,
block_size, num_blocks * block_size, block_size, 0, dest, ConstantZero<T>());
event_wait_list.push_back(fill_matrix_event);
// Inverts the diagonal IB by IB inner blocks of the matrix: one block per work-group
auto kernel = Kernel(program_, "InvertDiagonalBlock");
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, src());
kernel.SetArgument(2, static_cast<int>(offset));
kernel.SetArgument(3, static_cast<int>(ld_src));
kernel.SetArgument(4, dest());
kernel.SetArgument(5, static_cast<int>(block_size));
kernel.SetArgument(6, static_cast<int>(unit_diagonal));
kernel.SetArgument(7, static_cast<int>(is_upper));
const auto local = std::vector<size_t>{internal_block_size};
const auto global = std::vector<size_t>{num_internal_blocks * internal_block_size};
auto base_kernel_event = Event();
auto base_kernel_event_pointer = (internal_block_size == block_size) ? event_ : base_kernel_event.pointer();
RunKernel(kernel, queue_, device_, global, local, base_kernel_event_pointer, event_wait_list);
if (internal_block_size == block_size) { event_wait_list.push_back(base_kernel_event); }
// Builds up block_size x block_size blocks. For example, internal_block_size=16:
// use 16 x 16 blocks to build 32 x 32 blocks, 1 x (1 x npages) grid, 4 x 4 threads;
// then 32 x 32 blocks to build 64 x 64 blocks, 1 x (2 x npages) grid, 8 x 4 threads;
// then 64 x 64 blocks to build 128 x 128 blocks, 1 x (4 x npages) grid, 16 x 4 threads;
for (auto current_size = internal_block_size; current_size < block_size; current_size *= 2) {
assert(current_size == 16 || current_size == 32 || current_size == 64);
// Emulates a 3D grid: NX * (NY * npages)
const auto npages = CeilDiv(n, current_size*2);
const auto local0 = (current_size <= 32) ? current_size/4 : 16;
const auto local = std::vector<size_t>{local0, 4};
const auto global = std::vector<size_t>{(current_size/local[1]), npages*(current_size/16)*local[1]};
// Part 1
auto kernel1 = Kernel(program_, "TripleMatMul" + ToString(current_size) + "Part1" + name_postfix);
kernel1.SetArgument(0, static_cast<int>(n));
kernel1.SetArgument(1, src());
kernel1.SetArgument(2, static_cast<int>(offset));
kernel1.SetArgument(3, static_cast<int>(ld_src));
kernel1.SetArgument(4, dest());
kernel1.SetArgument(5, static_cast<int>(current_size));
kernel1.SetArgument(6, static_cast<int>(npages));
kernel1.SetArgument(7, static_cast<int>(block_size));
auto kernel1_event = Event();
RunKernel(kernel1, queue_, device_, global, local, kernel1_event.pointer(), event_wait_list);
event_wait_list.push_back(kernel1_event);
// Part 2
const bool is_last_kernel = (current_size * 2 >= block_size);
auto kernel2 = Kernel(program_, "TripleMatMul" + ToString(current_size) + "Part2" + name_postfix);
kernel2.SetArgument(0, static_cast<int>(n));
kernel2.SetArgument(1, dest());
kernel2.SetArgument(2, static_cast<int>(current_size));
kernel2.SetArgument(3, static_cast<int>(npages));
kernel2.SetArgument(4, static_cast<int>(block_size));
auto kernel2_event = Event();
auto kernel2_event_pointer = (is_last_kernel) ? event_ : kernel2_event.pointer();
RunKernel(kernel2, queue_, device_, global, local, kernel2_event_pointer, event_wait_list);
if (!is_last_kernel) { event_wait_list.push_back(kernel2_event); }
// Exit in case we reach beyond the bounds of the input matrix
if (current_size*2 >= n) { break; }
}
}
// =================================================================================================
// Compiles the templated class
template class Xinvert<half>;
template class Xinvert<float>;
template class Xinvert<double>;
template class Xinvert<float2>;
template class Xinvert<double2>;
// =================================================================================================
} // namespace clblast

View file

@ -0,0 +1,40 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains all the common code to perform (partial) matrix inverting.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XINVERT_H_
#define CLBLAST_ROUTINES_XINVERT_H_
#include "routine.hpp"
namespace clblast {
// =================================================================================================
template <typename T>
class Xinvert: public Routine {
public:
// Constructor
Xinvert(Queue &queue, EventPointer event, const std::string &name = "INVERT");
// Inverts diagonal square blocks of a matrix
void InvertMatrixDiagonalBlocks(const Layout layout, const Triangle triangle, const Diagonal diag,
const size_t n, const size_t block_size,
const Buffer<T> &src, const size_t offset, const size_t ld_src,
Buffer<T> &dest);
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XINVERT_H_
#endif

View file

@ -18,57 +18,74 @@
#include <chrono>
#include <random>
#include <iomanip>
#include <cmath>
namespace clblast {
// =================================================================================================
// Returns a scalar with a default value
template <typename T>
T GetScalar() {
return static_cast<T>(2.0);
}
template <typename T> T GetScalar() { return static_cast<T>(2.0); }
template float GetScalar<float>();
template double GetScalar<double>();
template <> half GetScalar() { return FloatToHalf(2.0f); }
template <> float2 GetScalar() { return {2.0f, 0.5f}; }
template <> double2 GetScalar() { return {2.0, 0.5}; }
// Specialized version of the above for half-precision
template <>
half GetScalar() {
return FloatToHalf(2.0f);
}
// Specialized versions of the above for complex data-types
template <>
float2 GetScalar() {
return {2.0f, 0.5f};
}
template <>
double2 GetScalar() {
return {2.0, 0.5};
}
// Returns a scalar of value 0
template <typename T> T ConstantZero() { return static_cast<T>(0.0); }
template float ConstantZero<float>();
template double ConstantZero<double>();
template <> half ConstantZero() { return FloatToHalf(0.0f); }
template <> float2 ConstantZero() { return {0.0f, 0.0f}; }
template <> double2 ConstantZero() { return {0.0, 0.0}; }
// Returns a scalar of value 1
template <typename T>
T ConstantOne() {
return static_cast<T>(1.0);
}
template <typename T> T ConstantOne() { return static_cast<T>(1.0); }
template float ConstantOne<float>();
template double ConstantOne<double>();
template <> half ConstantOne() { return FloatToHalf(1.0f); }
template <> float2 ConstantOne() { return {1.0f, 0.0f}; }
template <> double2 ConstantOne() { return {1.0, 0.0}; }
// Specialized version of the above for half-precision
template <>
half ConstantOne() {
return FloatToHalf(1.0f);
}
// Returns a scalar of value -1
template <typename T> T ConstantNegOne() { return static_cast<T>(-1.0); }
template float ConstantNegOne<float>();
template double ConstantNegOne<double>();
template <> half ConstantNegOne() { return FloatToHalf(-1.0f); }
template <> float2 ConstantNegOne() { return {-1.0f, 0.0f}; }
template <> double2 ConstantNegOne() { return {-1.0, 0.0}; }
// Specialized versions of the above for complex data-types
template <>
float2 ConstantOne() {
return {1.0f, 0.0f};
}
template <>
double2 ConstantOne() {
return {1.0, 0.0};
}
// Returns a scalar of value 1
template <typename T> T ConstantTwo() { return static_cast<T>(2.0); }
template float ConstantTwo<float>();
template double ConstantTwo<double>();
template <> half ConstantTwo() { return FloatToHalf(2.0f); }
template <> float2 ConstantTwo() { return {2.0f, 0.0f}; }
template <> double2 ConstantTwo() { return {2.0, 0.0}; }
// Returns a small scalar value just larger than 0
template <typename T> T SmallConstant() { return static_cast<T>(1e7); }
template float SmallConstant<float>();
template double SmallConstant<double>();
template <> half SmallConstant() { return FloatToHalf(1e7); }
template <> float2 SmallConstant() { return {1e7, 0.0f}; }
template <> double2 SmallConstant() { return {1e7, 0.0}; }
// Returns the absolute value of a scalar
template <typename T> T AbsoluteValue(const T value) { return std::fabs(value); }
template float AbsoluteValue<float>(const float);
template double AbsoluteValue<double>(const double);
template <> half AbsoluteValue(const half value) { return FloatToHalf(std::fabs(HalfToFloat(value))); }
template <> float2 AbsoluteValue(const float2 value) { return std::abs(value); }
template <> double2 AbsoluteValue(const double2 value) { return std::abs(value); }
// Returns whether a scalar is close to zero
template <typename T> bool IsCloseToZero(const T value) { return (value > -SmallConstant<T>()) && (value < SmallConstant<T>()); }
template bool IsCloseToZero<float>(const float);
template bool IsCloseToZero<double>(const double);
template <> bool IsCloseToZero(const half value) { return IsCloseToZero(HalfToFloat(value)); }
template <> bool IsCloseToZero(const float2 value) { return IsCloseToZero(value.real()) || IsCloseToZero(value.imag()); }
template <> bool IsCloseToZero(const double2 value) { return IsCloseToZero(value.real()) || IsCloseToZero(value.imag()); }
// =================================================================================================
@ -79,23 +96,27 @@ std::string ToString(T value) {
}
template std::string ToString<int>(int value);
template std::string ToString<size_t>(size_t value);
template std::string ToString<float>(float value);
template std::string ToString<double>(double value);
template <>
std::string ToString(float value) {
std::ostringstream result;
result << std::fixed << std::setprecision(2) << value;
return result.str();
}
template <>
std::string ToString(double value) {
std::ostringstream result;
result << std::fixed << std::setprecision(2) << value;
return result.str();
}
// If not possible directly: special cases for complex data-types
template <>
std::string ToString(float2 value) {
std::ostringstream real, imag;
real << std::setprecision(2) << value.real();
imag << std::setprecision(2) << value.imag();
return real.str()+"+"+imag.str()+"i";
return ToString(value.real())+"+"+ToString(value.imag())+"i";
}
template <>
std::string ToString(double2 value) {
std::ostringstream real, imag;
real << std::setprecision(2) << value.real();
imag << std::setprecision(2) << value.imag();
return real.str()+"+"+imag.str()+"i";
return ToString(value.real())+"+"+ToString(value.imag())+"i";
}
// If not possible directly: special case for half-precision

View file

@ -99,12 +99,20 @@ constexpr auto kArgNoAbbreviations = "no_abbrv";
// =================================================================================================
// Returns a scalar with a default value
template <typename T>
T GetScalar();
template <typename T> T GetScalar();
// Returns a scalar of value 1
template <typename T>
T ConstantOne();
// Fixed value scalars
template <typename T> T ConstantZero();
template <typename T> T ConstantOne();
template <typename T> T ConstantNegOne();
template <typename T> T ConstantTwo();
template <typename T> T SmallConstant();
// Returns the absolute value of a scalar
template <typename T> T AbsoluteValue(const T value);
// Returns whether a scalar is close to zero
template <typename T> bool IsCloseToZero(const T value);
// =================================================================================================

View file

@ -23,7 +23,6 @@ int main(int argc, char *argv[]) {
errors += clblast::RunTests<clblast::TestXtrsm<double>, double, double>(argc, argv, true, "DTRSM");
errors += clblast::RunTests<clblast::TestXtrsm<float2>, float2, float2>(argc, argv, true, "CTRSM");
errors += clblast::RunTests<clblast::TestXtrsm<double2>, double2, double2>(argc, argv, true, "ZTRSM");
errors += clblast::RunTests<clblast::TestXtrsm<half>, half, half>(argc, argv, true, "HTRSM");
if (errors > 0) { return 1; } else { return 0; }
}

View file

@ -0,0 +1,30 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================
#include "test/correctness/testblas.hpp"
#include "test/routines/levelx/xinvert.hpp"
// Shortcuts to the clblast namespace
using float2 = clblast::float2;
using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
auto errors = size_t{0};
errors += clblast::RunTests<clblast::TestXinvert<float>, float, float>(argc, argv, false, "SINVERT");
errors += clblast::RunTests<clblast::TestXinvert<double>, double, double>(argc, argv, true, "DINVERT");
errors += clblast::RunTests<clblast::TestXinvert<float2>, float2, float2>(argc, argv, true, "CINVERT");
errors += clblast::RunTests<clblast::TestXinvert<double2>, double2, double2>(argc, argv, true, "ZINVERT");
errors += clblast::RunTests<clblast::TestXinvert<half>, half, half>(argc, argv, true, "HINVERT");
if (errors > 0) { return 1; } else { return 0; }
}
// =================================================================================================

View file

@ -138,7 +138,10 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
// Don't continue with CBLAS if there are incorrect parameters
if (compare_cblas_ && status2 != StatusCode::kSuccess) {
if (verbose_) { fprintf(stdout, " -> "); std::cout << std::flush; }
if (verbose_) {
fprintf(stdout, " -> %d -> ", static_cast<int>(status2));
std::cout << std::flush;
}
TestErrorCodes(status2, status2, args);
continue;
}

View file

@ -40,6 +40,12 @@ float getAbsoluteErrorMargin<half>() {
return 0.10f; // especially small values are inaccurate for half-precision
}
// Error margin: numbers beyond this value are considered equal to inf or NaN
template <typename T>
T getAlmostInfNumber() {
return T{1e35}; // used for correctness testing of TRSV and TRSM routines
}
// Maximum number of test results printed on a single line
template <typename T, typename U> const size_t Tester<T,U>::kResultsPerLine = size_t{64};
@ -426,6 +432,21 @@ bool TestSimilarityNear(const T val1, const T val2,
if (val1 == val2) {
return true;
}
// Handles cases with both results NaN or inf
else if ((std::isnan(val1) && std::isnan(val2)) || (std::isinf(val1) && std::isinf(val2))) {
return true;
}
// Also considers it OK if one of the results in NaN and the other is inf
// Note: for TRSV and TRSM routines
else if ((std::isnan(val1) && std::isinf(val2)) || (std::isinf(val1) && std::isnan(val2))) {
return true;
}
// Also considers it OK if one of the values is super large and the other is inf or NaN
// Note: for TRSV and TRSM routines
else if ((std::abs(val1) > getAlmostInfNumber<T>() && (std::isinf(val2) || std::isnan(val2))) ||
(std::abs(val2) > getAlmostInfNumber<T>() && (std::isinf(val1) || std::isnan(val1)))) {
return true;
}
// The values are zero or very small: the relative error is less meaningful
else if (val1 == 0 || val2 == 0 || difference < error_margin_absolute) {
return (difference < error_margin_absolute);
@ -452,15 +473,21 @@ template bool TestSimilarity<double>(const double, const double);
// Specialisations for non-standard data-types
template <>
bool TestSimilarity(const float2 val1, const float2 val2) {
auto real = TestSimilarity(val1.real(), val2.real());
auto imag = TestSimilarity(val1.imag(), val2.imag());
return (real && imag);
const auto real = TestSimilarity(val1.real(), val2.real());
const auto imag = TestSimilarity(val1.imag(), val2.imag());
if (real && imag) { return true; }
// also OK if one is good and the combined is good (indicates a big diff between real & imag)
if (real || imag) { return TestSimilarity(val1.real() + val1.imag(), val2.real() + val2.imag()); }
return false; // neither real nor imag is good, return false
}
template <>
bool TestSimilarity(const double2 val1, const double2 val2) {
auto real = TestSimilarity(val1.real(), val2.real());
auto imag = TestSimilarity(val1.imag(), val2.imag());
return (real && imag);
const auto real = TestSimilarity(val1.real(), val2.real());
const auto imag = TestSimilarity(val1.imag(), val2.imag());
if (real && imag) { return true; }
// also OK if one is good and the combined is good (indicates a big diff between real & imag)
if (real || imag) { return TestSimilarity(val1.real() + val1.imag(), val2.real() + val2.imag()); }
return false; // neither real nor imag is good, return false
}
template <>
bool TestSimilarity(const half val1, const half val2) {

View file

@ -20,8 +20,7 @@ using double2 = clblast::double2;
int main(int argc, char *argv[]) {
const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
case clblast::Precision::kHalf:
clblast::RunClient<clblast::TestXtrsm<half>, half, half>(argc, argv); break;
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kSingle:
clblast::RunClient<clblast::TestXtrsm<float>, float, float>(argc, argv); break;
case clblast::Precision::kDouble:

View file

@ -0,0 +1,37 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================
#include "test/performance/client.hpp"
#include "test/routines/levelx/xinvert.hpp"
// Shortcuts to the clblast namespace
using float2 = clblast::float2;
using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
case clblast::Precision::kHalf:
clblast::RunClient<clblast::TestXinvert<half>, half, half>(argc, argv); break;
case clblast::Precision::kSingle:
clblast::RunClient<clblast::TestXinvert<float>, float, float>(argc, argv); break;
case clblast::Precision::kDouble:
clblast::RunClient<clblast::TestXinvert<double>, double, double>(argc, argv); break;
case clblast::Precision::kComplexSingle:
clblast::RunClient<clblast::TestXinvert<float2>, float2, float2>(argc, argv); break;
case clblast::Precision::kComplexDouble:
clblast::RunClient<clblast::TestXinvert<double2>, double2, double2>(argc, argv); break;
}
return 0;
}
// =================================================================================================

View file

@ -0,0 +1,184 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements a class with static methods to describe the Xtrsv routine. Examples of
// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
// static methods are used by the correctness tester and the performance tester.
//
// =================================================================================================
#ifndef CLBLAST_TEST_ROUTINES_XTRSV_H_
#define CLBLAST_TEST_ROUTINES_XTRSV_H_
#include <vector>
#include <string>
#ifdef CLBLAST_REF_CLBLAS
#include "test/wrapper_clblas.hpp"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "test/wrapper_cblas.hpp"
#endif
namespace clblast {
// =================================================================================================
// Prepares the data
template <typename T>
void PrepareData(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
if (args.a_ld < args.n) { return; }
if (args.a_size <= 0 || args.x_size <= 0) { return; }
// Copies input buffers to the host
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
// Generates 'proper' input for the TRSV routine
// TODO: Improve this, currently loosely based on clBLAS's implementation
for (auto i = size_t{0}; i < args.n; ++i) {
auto diagonal = a_mat_cpu[i*args.a_ld + i + args.a_offset];
diagonal = AbsoluteValue(diagonal) + static_cast<T>(args.n / size_t{4});
for (auto j = size_t{0}; j < args.n; ++j) {
a_mat_cpu[j*args.a_ld + i + args.a_offset] /= ConstantTwo<T>();
}
a_mat_cpu[i*args.a_ld + i + args.a_offset] = diagonal;
x_vec_cpu[i * args.x_inc + args.x_offset] /= ConstantTwo<T>();
}
// Copies input buffers back to the OpenCL device
buffers.a_mat.Write(queue, args.a_size, a_mat_cpu);
buffers.x_vec.Write(queue, args.x_size, x_vec_cpu);
return;
}
// See comment at top of file for a description of the class
template <typename T>
class TestXtrsv {
public:
// The BLAS level: 1, 2, or 3
static size_t BLASLevel() { return 2; }
// The list of arguments relevant for this routine
static std::vector<std::string> GetOptions() {
return {kArgN,
kArgLayout, kArgTriangle, kArgATransp, kArgDiagonal,
kArgALeadDim, kArgXInc,
kArgAOffset, kArgXOffset};
}
// Describes how to obtain the sizes of the buffers
static size_t GetSizeX(const Arguments<T> &args) {
return args.n * args.x_inc + args.x_offset;
}
static size_t GetSizeA(const Arguments<T> &args) {
return args.n * args.a_ld + args.a_offset;
}
// Describes how to set the sizes of all the buffers
static void SetSizes(Arguments<T> &args) {
args.a_size = GetSizeA(args);
args.x_size = GetSizeX(args);
}
// Describes what the default values of the leading dimensions of the matrices are
static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
// Describes which transpose options are relevant for this routine
using Transposes = std::vector<Transpose>;
static Transposes GetATransposes(const Transposes &all) { return all; }
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
PrepareData(args, buffers, queue);
auto queue_plain = queue();
auto event = cl_event{};
auto status = Trsv<T>(args.layout, args.triangle, args.a_transpose, args.diagonal,
args.n,
buffers.a_mat(), args.a_offset, args.a_ld,
buffers.x_vec(), args.x_offset, args.x_inc,
&queue_plain, &event);
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CLBLAS
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
PrepareData(args, buffers, queue);
auto queue_plain = queue();
auto event = cl_event{};
auto status = clblasXtrsv<T>(convertToCLBLAS(args.layout),
convertToCLBLAS(args.triangle),
convertToCLBLAS(args.a_transpose),
convertToCLBLAS(args.diagonal),
args.n,
buffers.a_mat, args.a_offset, args.a_ld,
buffers.x_vec, args.x_offset, args.x_inc,
1, &queue_plain, 0, nullptr, &event);
clWaitForEvents(1, &event);
return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
PrepareData(args, buffers, queue);
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0));
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
buffers.x_vec.Read(queue, args.x_size, x_vec_cpu);
cblasXtrsv(convertToCBLAS(args.layout),
convertToCBLAS(args.triangle),
convertToCBLAS(args.a_transpose),
convertToCBLAS(args.diagonal),
args.n,
a_mat_cpu, args.a_offset, args.a_ld,
x_vec_cpu, args.x_offset, args.x_inc);
buffers.x_vec.Write(queue, args.x_size, x_vec_cpu);
return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
std::vector<T> result(args.x_size, static_cast<T>(0));
buffers.x_vec.Read(queue, args.x_size, result);
return result;
}
// Describes how to compute the indices of the result buffer
static size_t ResultID1(const Arguments<T> &args) {
return args.n;
}
static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
return id1*args.x_inc + args.x_offset;
}
// Describes how to compute performance metrics
static size_t GetFlops(const Arguments<T> &args) {
return 2 * args.n * args.n;
}
static size_t GetBytes(const Arguments<T> &args) {
return (args.n*args.n + 2*args.n + args.n) * sizeof(T);
}
};
// =================================================================================================
} // namespace clblast
// CLBLAST_TEST_ROUTINES_XTRSV_H_
#endif

View file

@ -0,0 +1,194 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements a class with static methods to describe the Xtrsm routine. Examples of
// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
// static methods are used by the correctness tester and the performance tester.
//
// =================================================================================================
#ifndef CLBLAST_TEST_ROUTINES_XTRSM_H_
#define CLBLAST_TEST_ROUTINES_XTRSM_H_
#include <vector>
#include <string>
#ifdef CLBLAST_REF_CLBLAS
#include "test/wrapper_clblas.hpp"
#endif
#ifdef CLBLAST_REF_CBLAS
#include "test/wrapper_cblas.hpp"
#endif
namespace clblast {
// =================================================================================================
// Prepares the data
template <typename T>
void PrepareData(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
const auto k = (args.side == Side::kLeft) ? args.m : args.n;
if (args.a_ld < k) { return; }
if (args.a_size <= 0 || args.b_size <= 0) { return; }
// Copies input buffers to the host
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
// Generates 'proper' input for the TRSM routine
// TODO: Improve this
for (auto i = size_t{0}; i < k; ++i) {
for (auto j = size_t{0}; j < k; ++j) {
auto value = a_mat_cpu[j*args.a_ld + i + args.a_offset];
value *= ConstantTwo<T>();
if (IsCloseToZero(value)) { value += ConstantOne<T>(); }
a_mat_cpu[j*args.a_ld + i + args.a_offset] = value;
}
}
// Copies input buffers back to the OpenCL device
buffers.a_mat.Write(queue, args.a_size, a_mat_cpu);
buffers.b_mat.Write(queue, args.b_size, b_mat_cpu);
return;
}
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class TestXtrsm {
public:
// The BLAS level: 1, 2, or 3
static size_t BLASLevel() { return 3; }
// The list of arguments relevant for this routine
static std::vector<std::string> GetOptions() {
return {kArgM, kArgN,
kArgLayout, kArgSide, kArgTriangle, kArgATransp, kArgDiagonal,
kArgALeadDim, kArgBLeadDim,
kArgAOffset, kArgBOffset,
kArgAlpha};
}
// Describes how to obtain the sizes of the buffers
static size_t GetSizeA(const Arguments<T> &args) {
const auto k = (args.side == Side::kLeft) ? args.m : args.n;
return k * args.a_ld + args.a_offset;
}
static size_t GetSizeB(const Arguments<T> &args) {
const auto b_rotated = (args.layout == Layout::kRowMajor);
const auto b_two = (b_rotated) ? args.m : args.n;
return b_two * args.b_ld + args.b_offset;
}
// Describes how to set the sizes of all the buffers
static void SetSizes(Arguments<T> &args) {
args.a_size = GetSizeA(args);
args.b_size = GetSizeB(args);
}
// Describes what the default values of the leading dimensions of the matrices are
static size_t DefaultLDA(const Arguments<T> &args) { return args.m; }
static size_t DefaultLDB(const Arguments<T> &args) { return args.n; }
static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
// Describes which transpose options are relevant for this routine
using Transposes = std::vector<Transpose>;
static Transposes GetATransposes(const Transposes &all) { return all; }
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
PrepareData(args, buffers, queue);
auto queue_plain = queue();
auto event = cl_event{};
auto status = Trsm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal,
args.m, args.n, args.alpha,
buffers.a_mat(), args.a_offset, args.a_ld,
buffers.b_mat(), args.b_offset, args.b_ld,
&queue_plain, &event);
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
return status;
}
// Describes how to run the clBLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CLBLAS
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
PrepareData(args, buffers, queue);
auto queue_plain = queue();
auto event = cl_event{};
auto status = clblasXtrsm(convertToCLBLAS(args.layout),
convertToCLBLAS(args.side),
convertToCLBLAS(args.triangle),
convertToCLBLAS(args.a_transpose),
convertToCLBLAS(args.diagonal),
args.m, args.n, args.alpha,
buffers.a_mat, args.a_offset, args.a_ld,
buffers.b_mat, args.b_offset, args.b_ld,
1, &queue_plain, 0, nullptr, &event);
clWaitForEvents(1, &event);
return static_cast<StatusCode>(status);
}
#endif
// Describes how to run the CPU BLAS routine (for correctness/performance comparison)
#ifdef CLBLAST_REF_CBLAS
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
PrepareData(args, buffers, queue);
std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
cblasXtrsm(convertToCBLAS(args.layout),
convertToCBLAS(args.side),
convertToCBLAS(args.triangle),
convertToCBLAS(args.a_transpose),
convertToCBLAS(args.diagonal),
args.m, args.n, args.alpha,
a_mat_cpu, args.a_offset, args.a_ld,
b_mat_cpu, args.b_offset, args.b_ld);
buffers.b_mat.Write(queue, args.b_size, b_mat_cpu);
return StatusCode::kSuccess;
}
#endif
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
std::vector<T> result(args.b_size, static_cast<T>(0));
buffers.b_mat.Read(queue, args.b_size, result);
return result;
}
// Describes how to compute the indices of the result buffer
static size_t ResultID1(const Arguments<T> &args) { return args.m; }
static size_t ResultID2(const Arguments<T> &args) { return args.n; }
static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
return (args.layout == Layout::kRowMajor) ?
id1*args.b_ld + id2 + args.b_offset:
id2*args.b_ld + id1 + args.b_offset;
}
// Describes how to compute performance metrics
static size_t GetFlops(const Arguments<T> &args) {
auto k = (args.side == Side::kLeft) ? args.m : args.n;
return args.m * args.n * k;
}
static size_t GetBytes(const Arguments<T> &args) {
auto k = (args.side == Side::kLeft) ? args.m : args.n;
return (k*k + 2*args.m*args.n) * sizeof(T);
}
};
// =================================================================================================
} // namespace clblast
// CLBLAST_TEST_ROUTINES_XTRSM_H_
#endif

View file

@ -0,0 +1,224 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements a class with static methods to describe the Xinvert routine. Examples of
// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
// static methods are used by the correctness tester and the performance tester.
//
// =================================================================================================
#ifndef CLBLAST_TEST_ROUTINES_XINVERT_H_
#define CLBLAST_TEST_ROUTINES_XINVERT_H_
#include <vector>
#include <string>
#include "routines/levelx/xinvert.hpp"
namespace clblast {
// =================================================================================================
template <typename T>
StatusCode RunReference(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
const bool is_upper = ((args.triangle == Triangle::kUpper && args.layout != Layout::kRowMajor) ||
(args.triangle == Triangle::kLower && args.layout == Layout::kRowMajor));
// Data transfer from OpenCL to std::vector
std::vector<T> a_mat_cpu(args.a_size, T{0.0});
buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
// Creates the output buffer
std::vector<T> b_mat_cpu(args.b_size, T{0.0});
// Helper variables
const auto block_size = args.m;
const auto num_blocks = CeilDiv(args.n, block_size);
const auto a_ld = args.a_ld;
const auto b_ld = block_size;
// Checks for valid arguments
if ((block_size == 0) || (args.n == 0)) {
return StatusCode::kInvalidDimension;
}
if ((block_size % 16 != 0) || (block_size > 128)) {
return StatusCode::kUnknownError;
}
// Loops over the amount of diagonal blocks of size args.m by args.m each
for (auto block_id = size_t{0}; block_id < num_blocks; ++block_id) {
const auto a_offset = block_id * (block_size + a_ld * block_size) + args.a_offset;
const auto b_offset = block_id * block_size * block_size;
// Inverts the diagonal elements of the matrix
for (auto i = size_t{0}; i < block_size; ++i) {
auto a_value = T{1.0};
if (args.diagonal == Diagonal::kNonUnit) {
if (i + block_id * block_size < args.n) {
if (a_mat_cpu[i * a_ld + i + a_offset] == T{0.0}) { return StatusCode::kUnknownError; }
a_value = T{1.0} / a_mat_cpu[i * a_ld + i + a_offset];
}
}
b_mat_cpu[i * b_ld + i + b_offset] = a_value;
}
// Inverts the upper triangle row by row
if (is_upper) {
for (int i = static_cast<int>(block_size) - 2; i >= 0; --i) {
for (auto j = static_cast<int>(block_size) - 1; j > i; --j) {
auto sum = T{0.0};
for (auto k = i + 1; k <= j; ++k) {
auto a_value = T{0.0};
if ((i + block_id * block_size < args.n) && (k + block_id * block_size < args.n)) {
a_value = a_mat_cpu[k * a_ld + i + a_offset];
}
sum += a_value * b_mat_cpu[j * b_ld + k + b_offset];
}
b_mat_cpu[j * b_ld + i + b_offset] = - sum * b_mat_cpu[i * b_ld + i + b_offset];
}
}
}
// Inverts the lower triangle row by row
else {
for (auto i = size_t{1}; i < block_size; ++i) {
for (auto j = size_t{0}; j < i; ++j) {
auto sum = T{0.0};
for (auto k = j; k < i; ++k) {
auto a_value = T{0.0};
if ((i + block_id * block_size < args.n) && (k + block_id * block_size < args.n)) {
a_value = a_mat_cpu[k * a_ld + i + a_offset];
}
sum += a_value * b_mat_cpu[j * b_ld + k + b_offset];
}
b_mat_cpu[j * b_ld + i + b_offset] = - sum * b_mat_cpu[i * b_ld + i + b_offset];
}
}
}
}
// Data transfer back to OpenCL
buffers.b_mat.Write(queue, args.b_size, b_mat_cpu);
return StatusCode::kSuccess;
}
// Half-precision version calling the above reference implementation after conversions
template <>
StatusCode RunReference<half>(const Arguments<half> &args, Buffers<half> &buffers, Queue &queue) {
auto a_buffer2 = HalfToFloatBuffer(buffers.a_mat, queue());
auto b_buffer2 = HalfToFloatBuffer(buffers.b_mat, queue());
auto dummy = clblast::Buffer<float>(0);
auto buffers2 = Buffers<float>{dummy, dummy, a_buffer2, b_buffer2, dummy, dummy, dummy};
auto args2 = Arguments<float>();
args2.a_size = args.a_size; args2.b_size = args.b_size;
args2.a_ld = args.a_ld; args2.m = args.m; args2.n = args.n;
args2.a_offset = args.a_offset;
args2.layout = args.layout; args2.triangle = args.triangle; args2.diagonal = args.diagonal;
auto status = RunReference(args2, buffers2, queue);
FloatToHalfBuffer(buffers.b_mat, b_buffer2, queue());
return status;
}
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class TestXinvert {
public:
// The BLAS level: 4 for the extra routines
static size_t BLASLevel() { return 4; }
// The list of arguments relevant for this routine
static std::vector<std::string> GetOptions() {
return {kArgN, kArgM,
kArgLayout, kArgTriangle, kArgDiagonal,
kArgALeadDim, kArgAOffset};
}
// Describes how to obtain the sizes of the buffers
static size_t GetSizeA(const Arguments<T> &args) {
return args.n * args.a_ld + args.a_offset;
}
static size_t GetSizeB(const Arguments<T> &args) {
const auto block_size = args.m;
const auto num_blocks = CeilDiv(args.n, block_size);
return num_blocks * block_size * block_size;
}
// Describes how to set the sizes of all the buffers
static void SetSizes(Arguments<T> &args) {
args.a_size = GetSizeA(args);
args.b_size = GetSizeB(args);
}
// Describes what the default values of the leading dimensions of the matrices are
static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
// Describes which omatcopyose options are relevant for this routine
using Transposes = std::vector<Transpose>;
static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
try {
auto event = cl_event{};
auto inverter = Xinvert<T>(queue, &event);
inverter.InvertMatrixDiagonalBlocks(args.layout, args.triangle, args.diagonal,
args.n, args.m,
buffers.a_mat, args.a_offset, args.a_ld,
buffers.b_mat);
clWaitForEvents(1, &event);
clReleaseEvent(event);
} catch (...) { return DispatchException(); }
return StatusCode::kSuccess;
}
// Describes how to run a naive version of the routine (for correctness/performance comparison).
// Note that a proper clBLAS or CPU BLAS comparison is not available for non-BLAS routines.
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
return RunReference(args, buffers, queue);
}
static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
return RunReference(args, buffers, queue);
}
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
std::vector<T> result(args.b_size, static_cast<T>(0));
buffers.b_mat.Read(queue, args.b_size, result);
return result;
}
// Describes how to compute the indices of the result buffer
static size_t ResultID1(const Arguments<T> &args) { return args.m; }
static size_t ResultID2(const Arguments<T> &args) { return Ceil(args.n, args.m); }
static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
return id1 * Ceil(args.n, args.m) + id2;
}
// Describes how to compute performance metrics
static size_t GetFlops(const Arguments<T> &args) {
const auto block_size = args.m;
const auto num_blocks = CeilDiv(args.n, block_size);
return num_blocks * (block_size * (block_size / 2) * (block_size / 2));
}
static size_t GetBytes(const Arguments<T> &args) {
return (args.a_size * args.b_size) * sizeof(T);
}
};
// =================================================================================================
} // namespace clblast
// CLBLAST_TEST_ROUTINES_XINVERT_H_
#endif

View file

@ -2103,20 +2103,6 @@ void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL
reinterpret_cast<const double*>(&a_buffer[a_offset]), a_ld,
reinterpret_cast<double*>(&b_buffer[b_offset]), b_ld);
}
void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
const size_t m, const size_t n,
const half alpha,
const std::vector<half>& a_buffer, const size_t a_offset, const size_t a_ld,
std::vector<half>& b_buffer, const size_t b_offset, const size_t b_ld) {
auto a_buffer_bis = HalfToFloatBuffer(a_buffer);
auto b_buffer_bis = HalfToFloatBuffer(b_buffer);
cblasXtrsm(layout, side, triangle, a_transpose, diagonal,
m, n,
HalfToFloat(alpha),
a_buffer_bis, a_offset, a_ld,
b_buffer_bis, b_offset, b_ld);
FloatToHalfBuffer(b_buffer, b_buffer_bis);
}
// =================================================================================================
} // namespace clblast

View file

@ -2865,24 +2865,6 @@ clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const
b_buffer(), b_offset, b_ld,
num_queues, queues, num_wait_events, wait_events, events);
}
clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
const size_t m, const size_t n,
const half alpha,
const Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld,
Buffer<half>& b_buffer, const size_t b_offset, const size_t b_ld,
cl_uint num_queues, cl_command_queue *queues,
cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]);
auto b_buffer_bis = HalfToFloatBuffer(b_buffer, queues[0]);
auto status = clblasXtrsm(layout, side, triangle, a_transpose, diagonal,
m, n,
HalfToFloat(alpha),
a_buffer_bis, a_offset, a_ld,
b_buffer_bis, b_offset, b_ld,
num_queues, queues, num_wait_events, wait_events, events);
FloatToHalfBuffer(b_buffer, b_buffer_bis, queues[0]);
return status;
}
// =================================================================================================
} // namespace clblast