diff --git a/CHANGELOG b/CHANGELOG index 80551611..38bbaa07 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -8,6 +8,10 @@ Development version (next release) - Added the OverrideParameters function to the API to be able to supply custom tuning parmeters - Various minor fixes and enhancements - Added tuned parameters for various devices (see README) +- Added level-2 routines: + * STRSV/DTRSV/CTRSV/ZTRSV (experimental, un-optimized) +- Added level-3 routines: + * STRSM/DTRSM/CTRSM/ZTRSM (experimental, un-optimized) Version 0.10.0 - Updated to version 8.0 of the CLCudaAPI C++11 OpenCL header diff --git a/CMakeLists.txt b/CMakeLists.txt index 6edc70da..bf905bc8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -156,9 +156,9 @@ if(NETLIB) set(SAMPLE_PROGRAMS_C ${SAMPLE_PROGRAMS_C} sgemm_netlib) endif() set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax) -set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv +set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xtrsv xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2) -set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm) +set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm xtrsm) set(LEVELX_ROUTINES xomatcopy) set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES} ${LEVELX_ROUTINES}) set(PRECISIONS 32 64 3232 6464 16) @@ -175,6 +175,7 @@ set(SOURCES src/clblast.cpp src/clblast_c.cpp src/routine.cpp + src/routines/levelx/xinvert.cpp # only source, don't include it as a test ) if(NETLIB) set(SOURCES ${SOURCES} src/clblast_netlib_c.cpp) diff --git a/README.md b/README.md index ceebc588..34cc1127 100644 --- a/README.md +++ b/README.md @@ -258,6 +258,7 @@ CLBlast supports almost all the Netlib BLAS routines plus a couple of extra non- | xSPR | ✔ | ✔ | - | - | ✔ | | xSYR2 | ✔ | ✔ | - | - | ✔ | | xSPR2 | ✔ | ✔ | - | - | ✔ | +| xTRSV | ✔ | ✔ | ✔ | ✔ | | (experimental, un-optimized) | Level-3 | S | D | C | Z | H | | ---------|---|---|---|---|---| @@ -269,6 +270,7 @@ CLBlast supports almost all the Netlib BLAS routines plus a couple of extra non- | xSYR2K | ✔ | ✔ | ✔ | ✔ | ✔ | | xHER2K | - | - | ✔ | ✔ | - | | xTRMM | ✔ | ✔ | ✔ | ✔ | ✔ | +| xTRSM | ✔ | ✔ | ✔ | ✔ | | (experimental, un-optimized) In addition, some extra non-BLAS routines are also supported by CLBlast, classified as level-X. They are experimental and should be used with care: @@ -279,7 +281,7 @@ In addition, some extra non-BLAS routines are also supported by CLBlast, classif | IxMIN | ✔ | ✔ | ✔ | ✔ | ✔ | | xOMATCOPY | ✔ | ✔ | ✔ | ✔ | ✔ | -Some less commonly used BLAS routines are not yet supported yet by CLBlast. They are xROTG, xROTMG, xROT, xROTM, xTRSV, xTBSV, xTPSV, and xTRSM. +Some less commonly used BLAS routines are not yet supported by CLBlast. They are xROTG, xROTMG, xROT, xROTM, xTBSV, and xTPSV.
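To make the newly announced TRSV/TRSM routines concrete, here is a small stand-alone host program (not part of this patch) that calls the new Trsm routine through the C++ API documented in the doc/clblast.md changes further below. The OpenCL host setup, the buffer names (a_mem, b_mem) and the numeric matrix values are illustrative assumptions only; a minimal sketch, assuming CLBlast and an OpenCL SDK are installed.

```
// Illustrative only: solve A * X = B on the GPU with the new Trsm routine.
// Build e.g. with: g++ trsm_example.cpp -lclblast -lOpenCL
#include <cstdio>
#include <vector>
#include <clblast.h>

int main() {
  // Minimal OpenCL setup on the first platform/device (error checking omitted)
  cl_platform_id platform; clGetPlatformIDs(1, &platform, nullptr);
  cl_device_id device; clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, nullptr);
  cl_context context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, nullptr);
  cl_command_queue queue = clCreateCommandQueue(context, device, 0, nullptr);

  // A is a 3x3 lower-triangular matrix, B is 3x2; both stored column-major
  const size_t m = 3, n = 2;
  const std::vector<float> a = {2.0f, 3.0f, 1.0f,   // column 0
                                0.0f, 4.0f, 5.0f,   // column 1 (upper part is not read)
                                0.0f, 0.0f, 6.0f};  // column 2
  std::vector<float> b = {2.0f, 7.0f, 12.0f,        // B column 0 -> X column 0 = (1, 1, 1)
                          4.0f, 10.0f, 25.0f};      // B column 1 -> X column 1 = (2, 1, 3)

  cl_mem a_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, a.size()*sizeof(float), nullptr, nullptr);
  cl_mem b_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, b.size()*sizeof(float), nullptr, nullptr);
  clEnqueueWriteBuffer(queue, a_mem, CL_TRUE, 0, a.size()*sizeof(float), a.data(), 0, nullptr, nullptr);
  clEnqueueWriteBuffer(queue, b_mem, CL_TRUE, 0, b.size()*sizeof(float), b.data(), 0, nullptr, nullptr);

  // Solves A * X = 1.0 * B for X; B is overwritten with the solution X
  const auto status = clblast::Trsm<float>(clblast::Layout::kColMajor, clblast::Side::kLeft,
                                           clblast::Triangle::kLower, clblast::Transpose::kNo,
                                           clblast::Diagonal::kNonUnit,
                                           m, n, 1.0f,
                                           a_mem, 0, m,   // A buffer, offset, leading dimension
                                           b_mem, 0, m,   // B buffer, offset, leading dimension
                                           &queue);
  if (status != clblast::StatusCode::kSuccess) { printf("Trsm failed\n"); return 1; }

  // A blocking read on the in-order queue also waits for the Trsm kernels to finish
  clEnqueueReadBuffer(queue, b_mem, CL_TRUE, 0, b.size()*sizeof(float), b.data(), 0, nullptr, nullptr);
  for (const auto value : b) { printf("%.3f\n", value); }

  clReleaseMemObject(a_mem); clReleaseMemObject(b_mem);
  clReleaseCommandQueue(queue); clReleaseContext(context);
  return 0;
}
```

Running this should print the two solution columns (1, 1, 1) and (2, 1, 3); the equivalent C API call is CLBlastStrsm with the same argument order and the numeric enum values listed in the documentation below.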
Half precision (fp16) diff --git a/doc/clblast.md b/doc/clblast.md index 79ff5eb2..1d7c0df2 100644 --- a/doc/clblast.md +++ b/doc/clblast.md @@ -1445,6 +1445,63 @@ Arguments to TPMV: +xTRSV: Solves a triangular system of equations +------------- + + + +C++ API: +``` +template +StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +CLBlastStatusCode CLBlastStrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +CLBlastStatusCode CLBlastDtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +CLBlastStatusCode CLBlastCtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +CLBlastStatusCode CLBlastZtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to TRSV: + +* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout. +* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122). +* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose. +* `const Diagonal diagonal`: The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal. +* `const size_t n`: Integer size argument. This value must be positive. +* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix. +* `const size_t a_offset`: The offset in elements from the start of the input A matrix. +* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0. +* `cl_mem x_buffer`: OpenCL buffer to store the output x vector. +* `const size_t x_offset`: The offset in elements from the start of the output x vector. +* `const size_t x_inc`: Stride/increment of the output x vector. This value must be greater than 0. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. 
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + xGER: General rank-1 matrix update ------------- @@ -2708,6 +2765,71 @@ Requirements for TRMM: +xTRSM: Solves a triangular system of equations +------------- + +Solves the equation _A * X = alpha * B_ for the unknown _m_ by _n_ matrix X, in which _A_ is an _n_ by _n_ unit or non-unit triangular matrix and B is an _m_ by _n_ matrix. The matrix _B_ is overwritten by the solution _X_. + +C++ API: +``` +template +StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +CLBlastStatusCode CLBlastStrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const size_t m, const size_t n, + const float alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) +CLBlastStatusCode CLBlastDtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const size_t m, const size_t n, + const double alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) +CLBlastStatusCode CLBlastCtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const size_t m, const size_t n, + const cl_float2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) +CLBlastStatusCode CLBlastZtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const size_t m, const size_t n, + const cl_double2 alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to TRSM: + +* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout. +* `const Side side`: The position of the triangular matrix in the operation, either on the `Side::kLeft` (141) or `Side::kRight` (142). +* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122). +* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose. +* `const Diagonal diagonal`: The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal. +* `const size_t m`: Integer size argument. This value must be positive. 
+* `const size_t n`: Integer size argument. This value must be positive. +* `const T alpha`: Input scalar constant. +* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix. +* `const size_t a_offset`: The offset in elements from the start of the input A matrix. +* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0. +* `cl_mem b_buffer`: OpenCL buffer to store the output B matrix. +* `const size_t b_offset`: The offset in elements from the start of the output B matrix. +* `const size_t b_ld`: Leading dimension of the output B matrix. This value must be greater than 0. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + xOMATCOPY: Scaling and out-place transpose/copy (non-BLAS function) ------------- diff --git a/include/clblast.h b/include/clblast.h index 9fdd5df1..020f8e79 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -587,7 +587,7 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, c cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event = nullptr); -// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM +// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM template StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t m, const size_t n, diff --git a/include/clblast_c.h b/include/clblast_c.h index d4b0b004..12d03f81 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -1265,7 +1265,7 @@ CLBlastStatusCode PUBLIC_API CLBlastHtrmm(const CLBlastLayout layout, const CLBl cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event); -// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM +// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM CLBlastStatusCode PUBLIC_API CLBlastStrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const size_t m, const size_t n, const float alpha, @@ -1290,12 +1290,6 @@ CLBlastStatusCode PUBLIC_API CLBlastZtrsm(const CLBlastLayout layout, const CLBl const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event); -CLBlastStatusCode PUBLIC_API CLBlastHtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t m, const size_t n, - const cl_half alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event); // ================================================================================================= // Extra non-BLAS routines (level-X) diff --git a/include/clblast_netlib_c.h b/include/clblast_netlib_c.h index b5577cfa..384fab20 100644 --- a/include/clblast_netlib_c.h +++ b/include/clblast_netlib_c.h @@ -862,7 +862,7 @@ void PUBLIC_API cblas_ztrmm(const CLBlastLayout layout, const CLBlastSide side, const void* a, const int a_ld, void* b, const int b_ld); -// 
Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM +// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM void PUBLIC_API cblas_strsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const float alpha, diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 01231a6d..09c743bb 100755 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -41,8 +41,8 @@ FILES = [ "/include/clblast_netlib_c.h", "/src/clblast_netlib_c.cpp", ] -HEADER_LINES = [121, 73, 125, 23, 29, 41, 65, 32] -FOOTER_LINES = [25, 139, 27, 38, 6, 6, 9, 2] +HEADER_LINES = [121, 75, 125, 23, 29, 41, 65, 32] +FOOTER_LINES = [25, 138, 27, 38, 6, 6, 9, 2] HEADER_LINES_DOC = 0 FOOTER_LINES_DOC = 63 @@ -131,7 +131,7 @@ ROUTINES = [ Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]), Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]), Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [apn,xn], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []), - Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a triangular system of equations", "", []), + Routine(True, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a triangular system of equations", "", []), Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]), Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [apn,xn], [], "", "Solves a packed triangular system of equations", "", []), # Level 2: matrix update @@ -156,7 +156,7 @@ ROUTINES = [ Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], [ankab,bnkab,cn],["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]), Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], [ankab,bnkab,cn],["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]), Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], 
["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]), - Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Solves a triangular system of equations", "", []), + Routine(True, True, "3", "trsm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Solves a triangular system of equations", "Solves the equation _A * X = alpha * B_ for the unknown _m_ by _n_ matrix X, in which _A_ is an _n_ by _n_ unit or non-unit triangular matrix and B is an _m_ by _n_ matrix. The matrix _B_ is overwritten by the solution _X_.", []), ], [ # Level X: extra routines (not part of BLAS) Routine(True, True, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], [amn,bnma], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]), diff --git a/src/clblast.cpp b/src/clblast.cpp index a8e4d084..a63d766c 100644 --- a/src/clblast.cpp +++ b/src/clblast.cpp @@ -45,6 +45,7 @@ #include "routines/level2/xtrmv.hpp" #include "routines/level2/xtbmv.hpp" #include "routines/level2/xtpmv.hpp" +#include "routines/level2/xtrsv.hpp" #include "routines/level2/xger.hpp" #include "routines/level2/xgeru.hpp" #include "routines/level2/xgerc.hpp" @@ -66,6 +67,7 @@ #include "routines/level3/xsyr2k.hpp" #include "routines/level3/xher2k.hpp" #include "routines/level3/xtrmm.hpp" +#include "routines/level3/xtrsm.hpp" // Level-x includes (non-BLAS) #include "routines/levelx/xomatcopy.hpp" @@ -1145,12 +1147,20 @@ template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Tr // Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV template -StatusCode Trsv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; +StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xtrsv(queue_cpp, event); + routine.DoTrsv(layout, triangle, a_transpose, diagonal, + n, + Buffer(a_buffer), a_offset, a_ld, + Buffer(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, @@ -2065,15 +2075,24 @@ template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Triang cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM +// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM template -StatusCode Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const T, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; +StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xtrsm(queue_cpp, event); + routine.DoTrsm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha, + Buffer(a_buffer), a_offset, a_ld, + Buffer(b_buffer), b_offset, b_ld); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, @@ -2099,12 +2118,6 @@ template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Tri const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const half, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*); // ================================================================================================= // Extra non-BLAS routines (level-X) @@ -2159,7 +2172,6 @@ template StatusCode PUBLIC_API Omatcopy(const Layout, const Transpose, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); - // ================================================================================================= // Clears the cache of stored binaries diff --git a/src/clblast_c.cpp b/src/clblast_c.cpp index de431fa4..6018bcfa 100644 --- a/src/clblast_c.cpp +++ b/src/clblast_c.cpp @@ -3350,27 +3350,6 @@ CLBlastStatusCode CLBlastZtrsm(const CLBlastLayout layout, const CLBlastSide sid ); } catch (...) { return static_cast(clblast::DispatchExceptionForC()); } } -CLBlastStatusCode CLBlastHtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const size_t m, const size_t n, - const cl_half alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_command_queue* queue, cl_event* event) { - try { - return static_cast( - clblast::Trsm(static_cast(layout), - static_cast(side), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - m, n, - alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, - queue, event) - ); - } catch (...) 
{ return static_cast(clblast::DispatchExceptionForC()); } -} // ================================================================================================= // Extra non-BLAS routines (level-X) diff --git a/src/database/database.cpp b/src/database/database.cpp index 02d0b139..f1d1dc66 100644 --- a/src/database/database.cpp +++ b/src/database/database.cpp @@ -20,12 +20,14 @@ #include "database/kernels/xgemv_fast.hpp" #include "database/kernels/xgemv_fast_rot.hpp" #include "database/kernels/xger.hpp" +#include "database/kernels/xtrsv.hpp" #include "database/kernels/xgemm.hpp" #include "database/kernels/xgemm_direct.hpp" #include "database/kernels/copy.hpp" #include "database/kernels/pad.hpp" #include "database/kernels/transpose.hpp" #include "database/kernels/padtranspose.hpp" +#include "database/kernels/invert.hpp" #include "database/kernel_selection.hpp" namespace clblast { @@ -39,12 +41,14 @@ const std::vector Database::database = { &database::XgemvFastHalf, &database::XgemvFastSingle, &database::XgemvFastDouble, &database::XgemvFastComplexSingle, &database::XgemvFastComplexDouble, &database::XgemvFastRotHalf, &database::XgemvFastRotSingle, &database::XgemvFastRotDouble, &database::XgemvFastRotComplexSingle, &database::XgemvFastRotComplexDouble, &database::XgerHalf, &database::XgerSingle, &database::XgerDouble, &database::XgerComplexSingle, &database::XgerComplexDouble, + &database::XtrsvHalf, &database::XtrsvSingle, &database::XtrsvDouble, &database::XtrsvComplexSingle, &database::XtrsvComplexDouble, &database::XgemmHalf, &database::XgemmSingle, &database::XgemmDouble, &database::XgemmComplexSingle, &database::XgemmComplexDouble, &database::XgemmDirectHalf, &database::XgemmDirectSingle, &database::XgemmDirectDouble, &database::XgemmDirectComplexSingle, &database::XgemmDirectComplexDouble, &database::CopyHalf, &database::CopySingle, &database::CopyDouble, &database::CopyComplexSingle, &database::CopyComplexDouble, &database::PadHalf, &database::PadSingle, &database::PadDouble, &database::PadComplexSingle, &database::PadComplexDouble, &database::TransposeHalf, &database::TransposeSingle, &database::TransposeDouble, &database::TransposeComplexSingle, &database::TransposeComplexDouble, &database::PadtransposeHalf, &database::PadtransposeSingle, &database::PadtransposeDouble, &database::PadtransposeComplexSingle, &database::PadtransposeComplexDouble, + &database::InvertHalf, &database::InvertSingle, &database::InvertDouble, &database::InvertComplexSingle, &database::InvertComplexDouble, &database::KernelSelectionHalf, &database::KernelSelectionSingle, &database::KernelSelectionDouble, &database::KernelSelectionComplexSingle, &database::KernelSelectionComplexDouble }; diff --git a/src/database/kernels/invert.hpp b/src/database/kernels/invert.hpp new file mode 100644 index 00000000..2717f182 --- /dev/null +++ b/src/database/kernels/invert.hpp @@ -0,0 +1,78 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren +// +// Tuning parameters for the diagonal matrix inversion kernels +// +// ================================================================================================= + +namespace clblast { +namespace database { +// ================================================================================================= + +const Database::DatabaseEntry InvertHalf = { + "Invert", Precision::kHalf, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"INTERNAL_BLOCK_SIZE",16} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry InvertSingle = { + "Invert", Precision::kSingle, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"INTERNAL_BLOCK_SIZE",16} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry InvertComplexSingle = { + "Invert", Precision::kComplexSingle, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"INTERNAL_BLOCK_SIZE",16} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry InvertDouble = { + "Invert", Precision::kDouble, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"INTERNAL_BLOCK_SIZE",16} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry InvertComplexDouble = { + "Invert", Precision::kComplexDouble, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"INTERNAL_BLOCK_SIZE",16} } }, + } + }, + } +}; + +// ================================================================================================= +} // namespace database +} // namespace clblast diff --git a/src/database/kernels/xtrsv.hpp b/src/database/kernels/xtrsv.hpp new file mode 100644 index 00000000..0741569e --- /dev/null +++ b/src/database/kernels/xtrsv.hpp @@ -0,0 +1,78 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file populates the database with best-found tuning parameters for the 'Xtrsv' kernels. 
+// +// ================================================================================================= + +namespace clblast { +namespace database { +// ================================================================================================= + +const Database::DatabaseEntry XtrsvHalf = { + "Xtrsv", Precision::kHalf, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"TRSV_BLOCK_SIZE",32} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry XtrsvSingle = { + "Xtrsv", Precision::kSingle, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"TRSV_BLOCK_SIZE",32} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry XtrsvComplexSingle = { + "Xtrsv", Precision::kComplexSingle, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"TRSV_BLOCK_SIZE",32} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry XtrsvDouble = { + "Xtrsv", Precision::kDouble, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"TRSV_BLOCK_SIZE",32} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry XtrsvComplexDouble = { + "Xtrsv", Precision::kComplexDouble, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"TRSV_BLOCK_SIZE",32} } }, + } + }, + } +}; + +// ================================================================================================= +} // namespace database +} // namespace clblast diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl index c7743f90..32e3fbb9 100644 --- a/src/kernels/common.opencl +++ b/src/kernels/common.opencl @@ -160,6 +160,13 @@ R"( #define AbsoluteValue(value) value = fabs(value) #endif +// Negation (component-wise) +#if PRECISION == 3232 || PRECISION == 6464 + #define Negate(value) value.x = -(value.x); value.y = -(value.y) +#else + #define Negate(value) value = -(value) +#endif + // Adds two complex variables #if PRECISION == 3232 || PRECISION == 6464 #define Add(c, a, b) c.x = a.x + b.x; c.y = a.y + b.y @@ -167,6 +174,13 @@ R"( #define Add(c, a, b) c = a + b #endif +// Subtracts two complex variables +#if PRECISION == 3232 || PRECISION == 6464 + #define Subtract(c, a, b) c.x = a.x - b.x; c.y = a.y - b.y +#else + #define Subtract(c, a, b) c = a - b +#endif + // Multiply two complex variables (used in the defines below) #if PRECISION == 3232 || PRECISION == 6464 #define MulReal(a, b) a.x*b.x - a.y*b.y @@ -191,6 +205,20 @@ R"( #endif #endif +// The scalar multiply-subtract function +#if PRECISION == 3232 || PRECISION == 6464 + #define MultiplySubtract(c, a, b) c.x -= MulReal(a,b); c.y -= MulImag(a,b) +#else + #define MultiplySubtract(c, a, b) c -= a * b +#endif + +// The scalar division function: full division +#if PRECISION == 3232 || PRECISION == 6464 + #define DivideFull(c, a, b) singlereal num_x = (a.x * b.x) + (a.y * b.y); singlereal num_y = (a.y * b.x) - (a.x * b.y); singlereal denom = (b.x * b.x) + (b.y * b.y); c.x = num_x / denom; c.y = num_y / denom +#else + #define DivideFull(c, a, b) c = a / b +#endif + // The scalar AXPBY function #if PRECISION == 3232 || PRECISION == 6464 #define AXPBY(e, a, b, c, d) e.x = MulReal(a,b) + MulReal(c,d); e.y = MulImag(a,b) + 
MulImag(c,d) diff --git a/src/kernels/level2/xtrsv.opencl b/src/kernels/level2/xtrsv.opencl new file mode 100644 index 00000000..ebea77a3 --- /dev/null +++ b/src/kernels/level2/xtrsv.opencl @@ -0,0 +1,144 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains kernels to perform forward or backward substitution, as used in the TRSV routine +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= +#if defined(ROUTINE_TRSV) + +__kernel __attribute__((reqd_work_group_size(64, 1, 1))) +void FillVector(const int n, const int inc, const int offset, + __global real* restrict dest, const real_arg arg_value) { + const real value = GetRealArg(arg_value); + const int tid = get_global_id(0); + if (tid < n) { + dest[tid*inc + offset] = value; + } +} + +// ================================================================================================= + +// Parameters set by the tuner or by the database. Here they are given a basic default value in case +// this kernel file is used outside of the CLBlast library. + +#ifndef TRSV_BLOCK_SIZE + #define TRSV_BLOCK_SIZE 32 // The block size for forward or backward substitution +#endif + +// ================================================================================================= + +__kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1))) +void trsv_forward(int n, + const __global real *A, const int a_offset, int a_ld, + __global real *b, const int b_offset, int b_inc, + __global real *x, const int x_offset, int x_inc, + const int is_transposed, const int is_unit_diagonal, const int do_conjugate) { + __local real alm[TRSV_BLOCK_SIZE][TRSV_BLOCK_SIZE]; + __local real xlm[TRSV_BLOCK_SIZE]; + const int tid = get_local_id(0); + + // Pre-loads the data into local memory + if (tid < n) { + Subtract(xlm[tid], b[tid*b_inc + b_offset], x[tid*x_inc + x_offset]); + if (is_transposed == 0) { + for (int i = 0; i < n; ++i) { + alm[i][tid] = A[i + tid*a_ld + a_offset]; + } + } + else { + for (int i = 0; i < n; ++i) { + alm[i][tid] = A[tid + i*a_ld + a_offset]; + } + } + if (do_conjugate) { + for (int i = 0; i < n; ++i) { + COMPLEX_CONJUGATE(alm[i][tid]); + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + // Computes the result (single-threaded for now) + if (tid == 0) { + for (int i = 0; i < n; ++i) { + for (int j = 0; j < i; ++j) { + MultiplySubtract(xlm[i], alm[i][j], xlm[j]); + } + if (is_unit_diagonal == 0) { DivideFull(xlm[i], xlm[i], alm[i][i]); } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + // Stores the results + if (tid < n) { + x[tid*x_inc + x_offset] = xlm[tid]; + } +} + +__kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1))) +void trsv_backward(int n, + const __global real *A, const int a_offset, int a_ld, + __global real *b, const int b_offset, int b_inc, + __global real *x, const int x_offset, int x_inc, + const int is_transposed, const int
is_unit_diagonal, const int do_conjugate) { + __local real alm[TRSV_BLOCK_SIZE][TRSV_BLOCK_SIZE]; + __local real xlm[TRSV_BLOCK_SIZE]; + const int tid = get_local_id(0); + + // Pre-loads the data into local memory + if (tid < n) { + Subtract(xlm[tid], b[tid*b_inc + b_offset], x[tid*x_inc + x_offset]); + if (is_transposed == 0) { + for (int i = 0; i < n; ++i) { + alm[i][tid] = A[i + tid*a_ld + a_offset]; + } + } + else { + for (int i = 0; i < n; ++i) { + alm[i][tid] = A[tid + i*a_ld + a_offset]; + } + } + if (do_conjugate) { + for (int i = 0; i < n; ++i) { + COMPLEX_CONJUGATE(alm[i][tid]); + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + // Computes the result (single-threaded for now) + if (tid == 0) { + for (int i = n - 1; i >= 0; --i) { + for (int j = i + 1; j < n; ++j) { + MultiplySubtract(xlm[i], alm[i][j], xlm[j]); + } + if (is_unit_diagonal == 0) { DivideFull(xlm[i], xlm[i], alm[i][i]); } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + // Stores the results + if (tid < n) { + x[tid*x_inc + x_offset] = xlm[tid]; + } +} + +#endif +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/kernels/level3/invert_diagonal_blocks.opencl b/src/kernels/level3/invert_diagonal_blocks.opencl new file mode 100644 index 00000000..c59bcbcb --- /dev/null +++ b/src/kernels/level3/invert_diagonal_blocks.opencl @@ -0,0 +1,429 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains kernels to invert squared diagonal blocks of a matrix. These kernels are based +// on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular +// Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek, +// and Jack Dongarra. +// +// ================================================================================================= +// +// Let A be an block_size*block_size lower triangular matrix, and B its inverse. +// Then the block decomposition +// +// [ A11 0 ] * [ B11 0 ] = [ I 0 ] +// [ A21 A22 ] [ B21 B22 ] [ 0 I ] +// +// yields +// +// A11*B11 = I ==> B11 = A11^{-1}, +// A22*B22 = I ==> B22 = A22^{-1}, +// A21*B11 + A22*B21 = 0 ==> B21 = -A22^{-1}*A21*B11 = -B22*A21*B11. +// +// The InvertDiagonalBlock kernel inverts A11 and A22. +// The TripleMatMul routines multiply: +// part 1: B21 = A21 * B11, +// part 2: B21 = -B22 * B21. +// +// At this level, inner block is current_size=16, with one 4 x 4 work-group per inner block. Each +// submatrix Aij and Bij is current_size x current_size. The submatrix dimension is multiplied by 2 +// at each level, so the next level is current_size*2 = 32. A 'page' is the next bigger block, +// here current_size*2=32, +// [ B11 0 ] +// which contains [ B21 B22 ]. +// Outer blocks are block_size x block_size. +// +// A21 may have < current_size rows, but is guaranteed to have current_size cols since A22 is on +// the right. This makes a single check easy to do. +// +// B is stored in workspace that is a full multiple of block_size x block_size; no checks needed. 
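+// +// As a tiny worked illustration with 1 x 1 inner blocks (values chosen arbitrarily): for A = [ 2 0 ; 3 4 ] the inverse is B = [ 1/2 0 ; -3/8 1/4 ]. +// InvertDiagonalBlock yields B11 = 1/2 and B22 = 1/4, TripleMatMul part 1 then computes B21 = A21 * B11 = 3 * 1/2 = 3/2, +// and part 2 computes B21 = -B22 * B21 = -(1/4) * (3/2) = -3/8, which matches B21 = -A22^{-1}*A21*B11.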
+// +// We split this into part1 & part2 to synchronize all blocks and make sure +// that writes to B12 are observed by all blocks. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= +#if defined(ROUTINE_INVERT) + +#define LOCALX 17 // 16 + 1 to avoid bank conflicts +#define LOCALY 16 + +// ================================================================================================= + +// Inverts a diagonal block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE elements in a larger matrix +__kernel __attribute__((reqd_work_group_size(INTERNAL_BLOCK_SIZE, 1, 1))) +void InvertDiagonalBlock(int n, __global const real* restrict src, const int src_offset, const int src_ld, + __global real* restrict dest, const int outer_block_size, + const int unit_diagonal, const int is_upper) +{ + const int thread_index = get_local_id(0); + const int block_index = get_group_id(0); + + // Sets the offset for this particular block in the source and destination matrices + const int src_block_offset = block_index * (INTERNAL_BLOCK_SIZE + src_ld * INTERNAL_BLOCK_SIZE) + src_offset; + const int num_inner_blocks = outer_block_size / INTERNAL_BLOCK_SIZE; + const int dest_block_offset = (block_index / num_inner_blocks) * outer_block_size * outer_block_size + // go to the (block_index / num_inner_blocks) outer outer_block_size*outer_block_size block, + (block_index % num_inner_blocks) * (outer_block_size*INTERNAL_BLOCK_SIZE + INTERNAL_BLOCK_SIZE); // then to the (block_index % num_inner_blocks) inner INTERNAL_BLOCK_SIZE*INTERNAL_BLOCK_SIZE block inside that + + // Local memory to store the inverted block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE + __local real lm[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; + + // Loads the source lower triangle into local memory. Any values in the upper triangle or + // outside of the matrix are set to zero + #pragma unroll + for (int j = 0; j < INTERNAL_BLOCK_SIZE; ++j) { + const bool condition = (is_upper) ? 
(thread_index <= j && block_index*INTERNAL_BLOCK_SIZE + j < n) : + (thread_index >= j && block_index*INTERNAL_BLOCK_SIZE + thread_index < n); + if (condition) { + lm[thread_index][j] = src[j*src_ld + thread_index + src_block_offset]; + } + else { + SetToZero(lm[thread_index][j]); + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + // Inverts the diagonal + real inverted_diagonal; + SetToOne(inverted_diagonal); + if (unit_diagonal == 0) { + const real diagonal_value = lm[thread_index][thread_index]; + if (!IsZero(diagonal_value)) { // Only for non-singular values and values inside the matrix + real constant_one; + SetToOne(constant_one); + DivideFull(inverted_diagonal, constant_one, diagonal_value); + } + } + lm[thread_index][thread_index] = inverted_diagonal; + barrier(CLK_LOCAL_MEM_FENCE); + + // Upper-triangular + if (is_upper) { + + // Computes the elements 0:j-1 of the j-th column + for (int j = 1; j < INTERNAL_BLOCK_SIZE; ++j) { + if (thread_index < j) { + real sum; + SetToZero(sum); + #pragma unroll + for (int k = 0; k < j; ++k) { + MultiplyAdd(sum, lm[thread_index][k], lm[k][j]); + } + real diagonal_value = lm[j][j]; + Negate(diagonal_value); + Multiply(lm[thread_index][j], diagonal_value, sum); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + } + + // Lower triangular + else { + + // Computes the elements j+1:INTERNAL_BLOCK_SIZE-1 of the j-th column + for (int j = INTERNAL_BLOCK_SIZE - 2; j >= 0; --j) { + if (thread_index > j) { + real sum; + SetToZero(sum); + #pragma unroll + for (int k = j + 1; k < INTERNAL_BLOCK_SIZE; ++k) { + MultiplyAdd(sum, lm[thread_index][k], lm[k][j]); + } + Multiply(lm[thread_index][j], -lm[j][j], sum); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + } + + // Writes the result to global memory + #pragma unroll + for (int j = 0; j < INTERNAL_BLOCK_SIZE; ++j) { + dest[j*outer_block_size + thread_index + dest_block_offset] = lm[thread_index][j]; + } +} + +// ================================================================================================= + +// Triple matrix-multiplication kernel: C = A * B +inline void TripleMatMul(const int size, const bool upper, const int part, __local real* blm, int n, + __global const real* agm, __global const real* bgm, __global real* cgm, + const int lda, const int ldb, const int ldc, + int current_size, int num_pages, const int block_size) { + + // Emulates a 3D grid: NX * (NY * num_pages) + const int by = get_group_id(1) / num_pages; + const int page = get_group_id(1) % num_pages; + const int lidx = get_local_id(0); + const int lidy = get_local_id(1); + const int ibx = get_group_id(0) * (get_local_size(0)*get_local_size(1)); + const int iby = by*16; + const int id = lidx + lidy*get_local_size(0); + const int row = page*current_size*2 + current_size + ibx + id; + int col = page*current_size*2 + current_size; + + // Sets the offsets for this specific thread + agm += ibx + id; + bgm += lidx + (iby + lidy)*ldb; + cgm += ibx + id + iby*ldc; + + // Initializes the result registers + real cpm[16]; + #pragma unroll + for (int j = 0; j < 16; ++j) { + SetToZero(cpm[j]); + } + + // Computes NT x 16 block of C, each thread computes one 1 x 16 row + for (int k = 0; k < current_size; k += 16) { + + // Loads a 16 x 16 block of B into local memory using NX x 4 threads + #pragma unroll + for( int i=0; i < 16; i += (size/4) ) { // += get_local_size(0) + #pragma unroll + for( int j=0; j < 16; j += 4 ) { // += get_local_size(1) + blm[(lidx + i) * LOCALX + (lidy + j)] = bgm[k + i + j*ldb]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + // Upper triangular + if 
(upper) { + + // Performs 16 x 16 multiply-add operations + #pragma unroll + for (int i = 0; i < 16; ++i) { + if (part == 2 || col++ < n) { + #pragma unroll + for (int j = 0; j < 16; ++j) { + MultiplyAdd(cpm[j], agm[(i + k) * lda], blm[i * LOCALX + j]); + } + } + } + } + + // Lower triangular + else { + if (row < n) { + + // Performs 16 x 16 multiply-add operations + #pragma unroll + for (int i = 0; i < 16; ++i) { + #pragma unroll + for (int j = 0; j < 16; ++j) { + MultiplyAdd(cpm[j], agm[(i + k) * lda], blm[i * LOCALX + j]); + } + } + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + } + + // Stores NT x 16 results: each thread writes one 16 x 1 row + #pragma unroll + for (int i = 0; i < 16; ++i) { + if (part == 2) { Negate(cpm[i]); } + cgm[0] = cpm[i]; + cgm += ldc; + } +} + +// ================================================================================================= + +// Triple matrix-multiplication kernel part 1: B12 = A12 * B22 (upper) or B21 = A21 * B11 (lower) +inline void TripleMatMulPart1(const int size, const bool upper, __local real* blm, int n, + __global const real* src, const int a_offset, const int lda, + __global real* dest, int current_size, int num_pages, const int block_size) { + + // Emulates a 3D grid: NX * (NY * num_pages) + const int page = get_group_id(1) % num_pages; + + // Computes the destination block offset: + // - go to the (page / pages_per_block) outer block_size * block_size block + // - then the (page % pages_per_block) inner (current_size*2) * (current_size*2) page inside that + const int pages_per_block = block_size / (current_size*2); + dest += (page / pages_per_block) * block_size * block_size + + (page % pages_per_block) * (current_size*2*block_size + current_size*2); + + // Using the GEMM notation: C = A*B + __global const real* agm; + __global const real* bgm; + __global real* cgm; + if (upper) { // upper triangular: B12 = A12 * B22 + agm = src + a_offset + page*current_size*2*lda + page*current_size*2 + current_size*lda; // A12 + bgm = dest + current_size*block_size + current_size; // B22 + cgm = dest + current_size*block_size; // B12 + } + else { // lower triangular: B21 = A21 * B11 + agm = src + a_offset + page*current_size*2*lda + page*current_size*2 + current_size; // A21 + bgm = dest; // B11 + cgm = dest + current_size; // B21 + } + + // Runs the generic C = A * B matrix multiplication + const int ldb = block_size; + const int ldc = block_size; + TripleMatMul(size, upper, 1, blm, n, agm, bgm, cgm, lda, ldb, ldc, current_size, num_pages, block_size); +} + +// Triple matrix-multiplication kernel part 1: B12 = -B11 * B12 (upper) or B21 = -B22 * B21 (lower) +inline void TripleMatMulPart2(const int size, const bool upper, __local real* blm, const int n, + __global real* dest, int current_size, int num_pages, const int block_size) { + + // Emulates a 3D grid: NX * (NY * num_pages) + const int page = get_group_id(1) % num_pages; + + // Computes the destination block offset: + // - go to the (page / pages_per_block) outer block_size * block_size block + // - then the (page % pages_per_block) inner (current_size*2) * (current_size*2) page inside that + const int pages_per_block = block_size / (current_size*2); + dest += (page / pages_per_block) * block_size * block_size + + (page % pages_per_block) * (current_size*2*block_size + current_size*2); + + // Using the GEMM notation: C = A*B + __global const real* agm; + __global const real* bgm; + __global real* cgm; + if (upper) { // upper triangular: B12 = -B11 * B12 + agm = dest; // B11 + cgm = dest + 
current_size*block_size; // B12 + bgm = cgm; // B12, okay to overwrite + } + + else { // lower triangular: B21 = -B22 * B21 + agm = dest + current_size*block_size + current_size; // B22 + cgm = dest + current_size; // B21 + bgm = cgm; // B21, okay to overwrite + } + + // Runs the generic C = A * B matrix multiplication + const int lda = block_size; + const int ldb = block_size; + const int ldc = block_size; + TripleMatMul(size, upper, 2, blm, n, agm, bgm, cgm, lda, ldb, ldc, current_size, num_pages, block_size); +} + +// ================================================================================================= + +// B21 = A21 * B11 +__kernel __attribute__((reqd_work_group_size(4, 4, 1))) +void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda, + __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart1(16, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); +} + +// B21 = -B22 * B21 +__kernel __attribute__((reqd_work_group_size(4, 4, 1))) +void TripleMatMul16Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart2(16, false, lm, n, dest, current_size, num_pages, block_size); +} + +// B21 = A21 * B11 +__kernel __attribute__((reqd_work_group_size(8, 4, 1))) +void TripleMatMul32Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda, + __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart1(32, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); +} + +// B21 = -B22 * B21 +__kernel __attribute__((reqd_work_group_size(8, 4, 1))) +void TripleMatMul32Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart2(32, false, lm, n, dest, current_size, num_pages, block_size); +} + +// B21 = A21 * B11 +__kernel __attribute__((reqd_work_group_size(16, 4, 1))) +void TripleMatMul64Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda, + __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart1(64, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); +} + +// B21 = -B22 * B21 +__kernel __attribute__((reqd_work_group_size(16, 4, 1))) +void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart2(64, false, lm, n, dest, current_size, num_pages, block_size); +} + +// ================================================================================================= + +// B12 = A12 * B22 +__kernel __attribute__((reqd_work_group_size(4, 4, 1))) +void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda, + __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart1(16, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); +} + +// B12 = -B11 * B12 +__kernel __attribute__((reqd_work_group_size(4, 4, 1))) +void TripleMatMul16Part2Upper(int n, __global real* restrict dest, int current_size, int 
num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart2(16, true, lm, n, dest, current_size, num_pages, block_size); +} + +// B12 = A12 * B22 +__kernel __attribute__((reqd_work_group_size(8, 4, 1))) +void TripleMatMul32Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda, + __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart1(32, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); +} + +// B12 = -B11 * B12 +__kernel __attribute__((reqd_work_group_size(8, 4, 1))) +void TripleMatMul32Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart2(32, true, lm, n, dest, current_size, num_pages, block_size); +} + +// B12 = A12 * B22 +__kernel __attribute__((reqd_work_group_size(16, 4, 1))) +void TripleMatMul64Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda, + __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart1(64, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); +} + +// B12 = -B11 * B12 +__kernel __attribute__((reqd_work_group_size(16, 4, 1))) +void TripleMatMul64Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart2(64, true, lm, n, dest, current_size, num_pages, block_size); +} + +#endif +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/kernels/level3/level3.opencl b/src/kernels/level3/level3.opencl index bf14ab12..5ba8cf29 100644 --- a/src/kernels/level3/level3.opencl +++ b/src/kernels/level3/level3.opencl @@ -73,6 +73,22 @@ R"( #define PADTRA_PAD 0 // Padding of the local memory to avoid bank-conflicts #endif +// ================================================================================================= +#if defined(ROUTINE_INVERT) || defined(ROUTINE_TRSM) + +__kernel __attribute__((reqd_work_group_size(8, 8, 1))) +void FillMatrix(const int m, const int n, const int ld, const int offset, + __global real* restrict dest, const real_arg arg_value) { + const real value = GetRealArg(arg_value); + const int id_one = get_global_id(0); + const int id_two = get_global_id(1); + if (id_one < m && id_two < n) { + dest[id_two*ld + id_one + offset] = value; + } +} + +#endif + // ================================================================================================= // End of the C++11 raw string literal diff --git a/src/routine.cpp b/src/routine.cpp index 3cd045c8..b5823bc9 100644 --- a/src/routine.cpp +++ b/src/routine.cpp @@ -25,16 +25,17 @@ namespace clblast { const std::vector Routine::routines_axpy = {"AXPY", "COPY", "SCAL", "SWAP"}; const std::vector Routine::routines_dot = {"AMAX", "ASUM", "DOT", "DOTC", "DOTU", "MAX", "MIN", "NRM2", "SUM"}; const std::vector Routine::routines_ger = {"GER", "GERC", "GERU", "HER", "HER2", "HPR", "HPR2", "SPR", "SPR2", "SYR", "SYR2"}; -const std::vector Routine::routines_gemv = {"GBMV", "GEMV", "HBMV", "HEMV", "HPMV", "SBMV", "SPMV", "SYMV", "TMBV", "TPMV", "TRMV"}; +const std::vector 
Routine::routines_gemv = {"GBMV", "GEMV", "HBMV", "HEMV", "HPMV", "SBMV", "SPMV", "SYMV", "TMBV", "TPMV", "TRMV", "TRSV"}; const std::vector<std::string> Routine::routines_gemm = {"GEMM", "HEMM", "SYMM", "TRMM"}; -const std::vector<std::string> Routine::routines_gemm_syrk = {"GEMM", "HEMM", "HER2K", "HERK", "SYMM", "SYR2K", "SYRK", "TRMM"}; +const std::vector<std::string> Routine::routines_gemm_syrk = {"GEMM", "HEMM", "HER2K", "HERK", "SYMM", "SYR2K", "SYRK", "TRMM", "TRSM"}; +const std::vector<std::string> Routine::routines_trsm = {"TRSM"}; const std::unordered_map<std::string, const std::vector<std::string>> Routine::routines_by_kernel = { {"Xaxpy", routines_axpy}, {"Xdot", routines_dot}, {"Xgemv", routines_gemv}, {"XgemvFast", routines_gemv}, {"XgemvFastRot", routines_gemv}, - {"Xgemv", {}}, + {"Xtrsv", routines_gemv}, {"Xger", routines_ger}, {"Copy", routines_gemm_syrk}, {"Pad", routines_gemm_syrk}, @@ -43,6 +44,7 @@ const std::unordered_map> Routine::r {"Xgemm", routines_gemm_syrk}, {"XgemmDirect", routines_gemm}, {"KernelSelection", routines_gemm}, + {"Invert", routines_trsm}, }; // ================================================================================================= diff --git a/src/routine.hpp b/src/routine.hpp index 622a1c0d..eb11b566 100644 --- a/src/routine.hpp +++ b/src/routine.hpp @@ -50,6 +50,7 @@ class Routine { static const std::vector<std::string> routines_gemv; static const std::vector<std::string> routines_gemm; static const std::vector<std::string> routines_gemm_syrk; + static const std::vector<std::string> routines_trsm; static const std::unordered_map<std::string, const std::vector<std::string>> routines_by_kernel; private: diff --git a/src/routines/common.hpp b/src/routines/common.hpp index d268d58b..be6ac4ec 100644 --- a/src/routines/common.hpp +++ b/src/routines/common.hpp @@ -33,6 +33,47 @@ void RunKernel(Kernel &kernel, Queue &queue, const Device &device, // ================================================================================================= +// Sets all elements of a matrix to a constant value +template <typename T> +void FillMatrix(Queue &queue, const Device &device, + const Program &program, const Databases &, + EventPointer event, const std::vector<Event> &waitForEvents, + const size_t m, const size_t n, const size_t ld, const size_t offset, + const Buffer<T> &dest, + const T constant_value) { + auto kernel = Kernel(program, "FillMatrix"); + kernel.SetArgument(0, static_cast<int>(m)); + kernel.SetArgument(1, static_cast<int>(n)); + kernel.SetArgument(2, static_cast<int>(ld)); + kernel.SetArgument(3, static_cast<int>(offset)); + kernel.SetArgument(4, dest()); + kernel.SetArgument(5, GetRealArg(constant_value)); + auto local = std::vector<size_t>{8, 8}; + auto global = std::vector<size_t>{Ceil(m, 8), Ceil(n, 8)}; + RunKernel(kernel, queue, device, global, local, event, waitForEvents); +} + +// Sets all elements of a vector to a constant value +template <typename T> +void FillVector(Queue &queue, const Device &device, + const Program &program, const Databases &, + EventPointer event, const std::vector<Event> &waitForEvents, + const size_t n, const size_t inc, const size_t offset, + const Buffer<T> &dest, + const T constant_value) { + auto kernel = Kernel(program, "FillVector"); + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, static_cast<int>(inc)); + kernel.SetArgument(2, static_cast<int>(offset)); + kernel.SetArgument(3, dest()); + kernel.SetArgument(4, GetRealArg(constant_value)); + auto local = std::vector<size_t>{64}; + auto global = std::vector<size_t>{Ceil(n, 64)}; + RunKernel(kernel, queue, device, global, local, event, waitForEvents); +} + +// ================================================================================================= + // Copies or transposes a matrix and optionally pads/unpads it
with zeros. This method is also able // to write to symmetric and triangular matrices through optional arguments. template diff --git a/src/routines/level2/xgemv.cpp b/src/routines/level2/xgemv.cpp index aae66798..3b5b5e8b 100644 --- a/src/routines/level2/xgemv.cpp +++ b/src/routines/level2/xgemv.cpp @@ -22,9 +22,10 @@ namespace clblast { // Constructor: forwards to base class constructor template Xgemv::Xgemv(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Xgemv", "XgemvFast", "XgemvFastRot"}, PrecisionValue(), {}, { + Routine(queue, event, name, {"Xgemv", "XgemvFast", "XgemvFastRot", "Xtrsv"}, PrecisionValue(), {}, { #include "../../kernels/level2/xgemv.opencl" #include "../../kernels/level2/xgemv_fast.opencl" + #include "../../kernels/level2/xtrsv.opencl" }) { } diff --git a/src/routines/level2/xtrsv.cpp b/src/routines/level2/xtrsv.cpp new file mode 100644 index 00000000..d5d009ff --- /dev/null +++ b/src/routines/level2/xtrsv.cpp @@ -0,0 +1,161 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xtrsv class (see the header for information about the class). +// +// ================================================================================================= + +#include "routines/level2/xtrsv.hpp" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xtrsv::Xtrsv(Queue &queue, EventPointer event, const std::string &name): + Xgemv(queue, event, name) { +} + +// ================================================================================================= + +template +void Xtrsv::Substitution(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_inc, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { + + if (n > db_["TRSV_BLOCK_SIZE"]) { throw BLASError(StatusCode::kUnexpectedError); }; + + // Translates CLBlast arguments to 0/1 integers for the OpenCL kernel + const auto is_unit_diagonal = (diagonal == Diagonal::kNonUnit) ? 0 : 1; + const auto is_transposed = ((a_transpose == Transpose::kNo && layout == Layout::kColMajor) || + (a_transpose != Transpose::kNo && layout != Layout::kColMajor)) ? 0 : 1; + const auto do_conjugate = (a_transpose == Transpose::kConjugate) ? 1 : 0; + + // The data is either in the upper or lower triangle + const auto is_upper = ((triangle == Triangle::kUpper && a_transpose == Transpose::kNo) || + (triangle == Triangle::kLower && a_transpose != Transpose::kNo)); + + // Retrieves the kernel from the compiled binary + const auto kernel_name = (is_upper) ? 
"trsv_backward" : "trsv_forward"; + auto kernel = Kernel(program_, kernel_name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast(n)); + kernel.SetArgument(1, a_buffer()); + kernel.SetArgument(2, static_cast(a_offset)); + kernel.SetArgument(3, static_cast(a_ld)); + kernel.SetArgument(4, b_buffer()); + kernel.SetArgument(5, static_cast(b_offset)); + kernel.SetArgument(6, static_cast(b_inc)); + kernel.SetArgument(7, x_buffer()); + kernel.SetArgument(8, static_cast(x_offset)); + kernel.SetArgument(9, static_cast(x_inc)); + kernel.SetArgument(10, static_cast(is_transposed)); + kernel.SetArgument(11, static_cast(is_unit_diagonal)); + kernel.SetArgument(12, static_cast(do_conjugate)); + + // Launches the kernel + const auto local = std::vector{db_["TRSV_BLOCK_SIZE"]}; + const auto global = std::vector{1}; + auto event = Event(); + RunKernel(kernel, queue_, device_, global, local, event.pointer()); + event.WaitForCompletion(); +} + +// ================================================================================================= + +// The main routine +template +void Xtrsv::DoTrsv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_inc) { + + // Makes sure all dimensions are larger than zero + if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } + + // Tests the matrix and vector + TestMatrixA(n, n, a_buffer, a_offset, a_ld); + TestVectorX(n, b_buffer, b_offset, b_inc); + + // Creates a copy of B to avoid overwriting input while computing output + // TODO: Make x with 0 offset and unit increment by creating custom copy-to and copy-from kernels + const auto x_offset = b_offset; + const auto x_inc = b_inc; + const auto x_size = n*x_inc + x_offset; + auto x_buffer = Buffer(context_, x_size); + b_buffer.CopyTo(queue_, x_size, x_buffer); + + // Fills the output buffer with zeros + auto eventWaitList = std::vector(); + auto fill_vector_event = Event(); + FillVector(queue_, device_, program_, db_, fill_vector_event.pointer(), eventWaitList, + n, x_inc, x_offset, x_buffer, ConstantZero()); + fill_vector_event.WaitForCompletion(); + + // Derives properties based on the arguments + const auto is_upper = ((triangle == Triangle::kUpper && a_transpose == Transpose::kNo) || + (triangle == Triangle::kLower && a_transpose != Transpose::kNo)); + const auto is_transposed = ((layout == Layout::kColMajor && a_transpose == Transpose::kNo) || + (layout != Layout::kColMajor && a_transpose != Transpose::kNo)); + + // Loops over the blocks + auto col = n; // the initial column position + for (auto i = size_t{0}; i < n; i += db_["TRSV_BLOCK_SIZE"]) { + const auto block_size = std::min(db_["TRSV_BLOCK_SIZE"], n - i); + + // Sets the next column position + col = (is_upper) ? col - block_size : i; + + // Sets the offsets for upper or lower triangular + const auto extra_offset_a = (is_transposed) ? + (is_upper ? col + (col+block_size)*a_ld : col) : + (is_upper ? col+block_size + col*a_ld : col*a_ld); + const auto extra_offset_x = (is_upper) ? (col+block_size)*x_inc : 0; + const auto extra_offset_b = col*x_inc; + + // Runs the GEMV routine to compute x' = A * x + if (i > 0) { + const auto gemv_m = (a_transpose == Transpose::kNo) ? block_size : i; + const auto gemv_n = (a_transpose == Transpose::kNo) ? 
i : block_size; + DoGemv(layout, a_transpose, gemv_m, gemv_n, ConstantOne(), + a_buffer, a_offset + extra_offset_a, a_ld, + x_buffer, x_offset + extra_offset_x, x_inc, ConstantOne(), + x_buffer, x_offset + extra_offset_b, x_inc ); + } + + // Runs the triangular substitution for the block size + Substitution(layout, triangle, a_transpose, diagonal, block_size, + a_buffer, a_offset + col + col*a_ld, a_ld, + b_buffer, b_offset + col*b_inc, b_inc, + x_buffer, x_offset + col*x_inc, x_inc); + } + + // Retrieves the results + x_buffer.CopyTo(queue_, x_size, b_buffer); +} + +// ================================================================================================= + +// Compiles the templated class +template class Xtrsv; +template class Xtrsv; +template class Xtrsv; +template class Xtrsv; +template class Xtrsv; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xtrsv.hpp b/src/routines/level2/xtrsv.hpp new file mode 100644 index 00000000..67e626a1 --- /dev/null +++ b/src/routines/level2/xtrsv.hpp @@ -0,0 +1,60 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xtrsv routine. It uses a block-algorithm and performs small triangular +// forward and backward substitutions on the diagonal parts of the matrix in combination with larger +// GEMV computation on the remainder of the matrix. 
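To illustrate the block algorithm described in this header, here is a minimal host-side sketch in plain C++ (not the CLBlast device code) for the lower-triangular, non-transposed, row-major case: elements solved in earlier blocks are applied as a GEMV-style update, and only the small diagonal block is handled by substitution. All names (`BlockedLowerTrsv` and friends) are illustrative, not part of the library.

```
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

// Solves L * x = b in-place (x holds b on entry and the solution on exit) with block size nb.
// L is n x n, lower triangular, stored row-major: a GEMV-like update with previously solved
// elements, followed by a small forward substitution per diagonal block.
void BlockedLowerTrsv(const std::vector<double>& L, std::vector<double>& x,
                      const size_t n, const size_t nb) {
  for (size_t i0 = 0; i0 < n; i0 += nb) {
    const size_t ib = std::min(nb, n - i0);
    for (size_t i = i0; i < i0 + ib; ++i) {        // GEMV part: x[i] -= L[i, 0:i0] * x[0:i0]
      for (size_t j = 0; j < i0; ++j) { x[i] -= L[i * n + j] * x[j]; }
    }
    for (size_t i = i0; i < i0 + ib; ++i) {        // substitution on the ib-by-ib diagonal block
      for (size_t j = i0; j < i; ++j) { x[i] -= L[i * n + j] * x[j]; }
      x[i] /= L[i * n + i];
    }
  }
}

int main() {
  const size_t n = 8, nb = 3;                      // the block size need not divide n
  std::vector<double> L(n * n, 0.0), x(n, 1.0);    // solve L * x = [1, 1, ..., 1]
  for (size_t i = 0; i < n; ++i) {
    for (size_t j = 0; j < i; ++j) { L[i * n + j] = 0.1; }
    L[i * n + i] = 2.0;                            // diagonal safely away from zero
  }
  BlockedLowerTrsv(L, x, n, nb);
  for (size_t i = 0; i < n; ++i) {                 // check the residual of L * x = b
    double sum = 0.0;
    for (size_t j = 0; j <= i; ++j) { sum += L[i * n + j] * x[j]; }
    assert(std::fabs(sum - 1.0) < 1e-12);
  }
  return 0;
}
```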
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XTRSV_H_ +#define CLBLAST_ROUTINES_XTRSV_H_ + +#include "routines/level2/xgemv.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xtrsv: public Xgemv { + public: + + // Uses the generic matrix-vector routine + using Xgemv::queue_; + using Xgemv::context_; + using Xgemv::device_; + using Xgemv::db_; + using Xgemv::program_; + using Xgemv::DoGemv; + + // Constructor + Xtrsv(Queue &queue, EventPointer event, const std::string &name = "TRSV"); + + // Templated-precision implementation of the routine + void DoTrsv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc); + + // Performs forward or backward substitution on a small triangular matrix + void Substitution(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_inc, + const Buffer &x_buffer, const size_t offset_x, const size_t x_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XTRSV_H_ +#endif diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp index 7bd388c1..dc8c64bc 100644 --- a/src/routines/level3/xgemm.cpp +++ b/src/routines/level3/xgemm.cpp @@ -33,10 +33,11 @@ Xgemm::Xgemm(Queue &queue, EventPointer event, const std::string &name): #include "../../kernels/level3/convert_symmetric.opencl" #include "../../kernels/level3/convert_triangular.opencl" #include "../../kernels/level3/convert_hermitian.opencl" + , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_direct_part1.opencl" #include "../../kernels/level3/xgemm_direct_part2.opencl" #include "../../kernels/level3/xgemm_direct_part3.opencl" - , // separated in two parts to prevent C1091 in MSVC 2013 + , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" #include "../../kernels/level3/xgemm_part3.opencl" diff --git a/src/routines/level3/xtrsm.cpp b/src/routines/level3/xtrsm.cpp new file mode 100644 index 00000000..b734dd2d --- /dev/null +++ b/src/routines/level3/xtrsm.cpp @@ -0,0 +1,227 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the triangular matrix solver (A * X = B) TRSM class. 
This code is based +// on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular +// Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek, +// and Jack Dongarra and the OpenCL implementation in clBLAS. +// +// ================================================================================================= + +#include "routines/level3/xtrsm.hpp" +#include "routines/levelx/xinvert.hpp" + +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xtrsm::Xtrsm(Queue &queue, EventPointer event, const std::string &name): + Xgemm(queue, event, name) { +} + +// ================================================================================================= + +// The entry point: transforming into col-major (if needed) and then running the col-major version +template +void Xtrsm::DoTrsm(const Layout layout, Side side, Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + size_t m, size_t n, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld) { + + // Converts row-major to a col-major problem: + // The idea is that + // B = A*X + // can be computed as + // B' = (A*X)' = X'*A' + // Since changing the order is basically a transpose on each matrix, the formula becomes: + // B = X*A + // So only the side (left/right) and the triangle (upper/lower) are changed and M/N are swapped + if (layout == Layout::kRowMajor) { + std::swap(m, n); + side = (side == Side::kLeft) ? Side::kRight : Side::kLeft; + triangle = (triangle == Triangle::kLower) ? Triangle::kUpper : Triangle::kLower; + } + + // Runs the col-major version of TRSM + TrsmColMajor(side, triangle, a_transpose, diagonal, + m, n, alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld); +} + +// ================================================================================================= + +// The main routine +template +void Xtrsm::TrsmColMajor(const Side side, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld) { + + // Settings + constexpr auto block_size = size_t{32}; // tuneable + + // Makes sure all dimensions are larger than zero + if ((m == 0) || (n == 0)) { throw BLASError(StatusCode::kInvalidDimension); } + + // Computes the k dimension. This is based on whether or not matrix is A (on the left) + // or B (on the right) in the Xgemm routine. + const auto k = (side == Side::kLeft) ? 
m : n; + + // Checks for validity of the triangular A matrix + TestMatrixA(k, k, a_buffer, a_offset, a_ld); + + // Checks for validity of the input B matrix + TestMatrixB(m, n, b_buffer, b_offset, b_ld); + + // Creates a copy of B to avoid overwriting input in GEMM while computing output + const auto b_size = b_ld * (n - 1) + m + b_offset; + const auto x_one = m; + const auto x_two = n; + const auto x_size = b_size; + const auto x_ld = b_ld; + const auto x_offset = b_offset; + auto x_buffer = Buffer(context_, x_size); + b_buffer.CopyTo(queue_, x_size, x_buffer); + + // Temporary buffer for the inverse of the A matrix + const auto a_inv_size = Ceil(k, block_size) * block_size; + auto a_inv_buffer = Buffer(context_, a_inv_size); + + // Fills the output buffer with zeros + auto eventWaitList = std::vector(); + auto fill_matrix_event = Event(); + FillMatrix(queue_, device_, program_, db_, fill_matrix_event.pointer(), eventWaitList, + x_one, x_two, x_ld, x_offset, x_buffer, ConstantZero()); + fill_matrix_event.WaitForCompletion(); + + // Inverts the diagonal blocks + auto diagonal_invert_event = Event(); + auto inverter = Xinvert(queue_, diagonal_invert_event.pointer()); + inverter.InvertMatrixDiagonalBlocks(Layout::kColMajor, triangle, diagonal, + k, block_size, a_buffer, a_offset, a_ld, a_inv_buffer); + diagonal_invert_event.WaitForCompletion(); + + // Derives properties based on the arguments + const auto condition = ((triangle == Triangle::kUpper && a_transpose != Transpose::kNo) || + (triangle == Triangle::kLower && a_transpose == Transpose::kNo)); + + // Left side + if (side == Side::kLeft) { + + // True when (lower triangular) or (upper triangular and transposed) + if (condition) { + for (auto i = size_t{0}; i < m; i += block_size) { + const auto gemm_alpha = (i == 0) ? alpha : ConstantOne(); + const auto current_block_size = std::min(m - i, block_size); + DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo, + current_block_size, n, current_block_size, gemm_alpha, + a_inv_buffer, i * block_size, block_size, + b_buffer, i, b_ld, ConstantZero(), + x_buffer, i, x_ld); + if (i + block_size >= m) { break; } + const auto this_a_offset = (a_transpose == Transpose::kNo) ? (i + block_size) + i * a_ld : i + (block_size + i) * a_ld; + DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo, + m - i - block_size, n, block_size, ConstantNegOne(), + a_buffer, this_a_offset, a_ld, + x_buffer, i, x_ld, ConstantOne(), + b_buffer, i + block_size, b_ld); + } + } + + // True when (upper triangular) or (lower triangular and transposed) + else { + const auto current_block_size = (m % block_size == 0) ? block_size : (m % block_size); + const auto i_start = static_cast(m) - static_cast(current_block_size); + for (auto i = i_start; i >= 0; i -= static_cast(block_size)) { + const auto gemm_alpha = (i == i_start) ? alpha : ConstantOne(); + DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo, + current_block_size, n, current_block_size, gemm_alpha, + a_inv_buffer, i * block_size, block_size, + b_buffer, i, b_ld, ConstantZero(), + x_buffer, i, x_ld); + if (i - static_cast(block_size) < 0) { break; } + const auto this_a_offset = (a_transpose == Transpose::kNo) ? 
i * a_ld : i; + DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo, + i, n, block_size, ConstantNegOne(), + a_buffer, this_a_offset, a_ld, + x_buffer, i, x_ld, ConstantOne(), + b_buffer, 0, b_ld); + } + } + } + + // Right side + else { + + // True when (lower triangular) or (upper triangular and transposed) + if (condition) { + const auto current_block_size = (n % block_size == 0) ? block_size : (n % block_size); + const auto i_start = static_cast(n) - static_cast(current_block_size); + for (auto i = i_start; i >= 0; i -= static_cast(block_size)) { + const auto gemm_alpha = (i == i_start) ? alpha : ConstantOne(); + DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose, + m, current_block_size, current_block_size, gemm_alpha, + b_buffer, i * b_ld, b_ld, + a_inv_buffer, i * block_size, block_size, ConstantZero(), + x_buffer, i * x_ld, x_ld); + if (i - static_cast(block_size) < 0) { break; } + const auto this_a_offset = (a_transpose == Transpose::kNo) ? i : i * a_ld; + DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose, + m, i, current_block_size, ConstantNegOne(), + x_buffer, i * x_ld, x_ld, + a_buffer, this_a_offset, a_ld, ConstantOne(), + b_buffer, 0, b_ld); + } + } + + // True when (upper triangular) or (lower triangular and transposed) + else { + for (auto i = size_t{0}; i < n; i += block_size) { + const auto gemm_alpha = (i == 0) ? alpha : ConstantOne(); + const auto current_block_size = std::min(n - i, block_size); + DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose, + m, current_block_size, current_block_size, gemm_alpha, + b_buffer, i * b_ld, b_ld, + a_inv_buffer, i * block_size, block_size, ConstantZero(), + x_buffer, i * x_ld, x_ld); + if (i + block_size >= n) { break; } + const auto this_a_offset = (a_transpose == Transpose::kNo) ? i + (block_size + i) * a_ld : (i + block_size) + i * a_ld; + DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose, + m, n - i - block_size, block_size, ConstantNegOne(), + x_buffer, i * x_ld, x_ld, + a_buffer, this_a_offset, a_ld, ConstantOne(), + b_buffer, (i + block_size) * b_ld, b_ld); + } + } + } + + // Retrieves the results + x_buffer.CopyTo(queue_, b_size, b_buffer); +} + +// ================================================================================================= + +// Compiles the templated class +template class Xtrsm; +template class Xtrsm; +template class Xtrsm; +template class Xtrsm; +template class Xtrsm; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level3/xtrsm.hpp b/src/routines/level3/xtrsm.hpp new file mode 100644 index 00000000..5b42398e --- /dev/null +++ b/src/routines/level3/xtrsm.hpp @@ -0,0 +1,60 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements the Xtrsm routine. The implementation is based on ??? (TODO). +// Therefore, this class inherits from the Xgemm class. 
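As a hedged illustration of how this routine maps onto Xgemm, below is a minimal host-side sketch (plain C++, not CLBlast code) of a blocked left/lower/non-transposed triangular solve: each diagonal block is solved for all right-hand sides, and the rows below it are then updated with a GEMM-shaped product. The actual routine additionally pre-inverts the diagonal blocks (see xinvert below) so that the per-block solve itself also becomes a GEMM; the helper names here are made up.

```
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

// Solves A * X = B in-place in B (row-major). A is n x n lower triangular, B is n x m.
// The loop structure mirrors the blocked scheme: solve one diagonal block, then apply a
// GEMM-shaped update "B_below -= A_below * X_block" to the remaining rows.
void BlockedLowerTrsm(const std::vector<double>& A, std::vector<double>& B,
                      const size_t n, const size_t m, const size_t nb) {
  for (size_t i0 = 0; i0 < n; i0 += nb) {
    const size_t ib = std::min(nb, n - i0);
    // Solve the ib x ib diagonal block for all m right-hand sides (CLBlast instead multiplies
    // with a pre-inverted diagonal block here, so this step is a GEMM as well)
    for (size_t i = i0; i < i0 + ib; ++i) {
      for (size_t c = 0; c < m; ++c) {
        for (size_t k = i0; k < i; ++k) { B[i * m + c] -= A[i * n + k] * B[k * m + c]; }
        B[i * m + c] /= A[i * n + i];
      }
    }
    // GEMM-shaped trailing update of the rows below the current block
    for (size_t i = i0 + ib; i < n; ++i) {
      for (size_t c = 0; c < m; ++c) {
        for (size_t k = i0; k < i0 + ib; ++k) { B[i * m + c] -= A[i * n + k] * B[k * m + c]; }
      }
    }
  }
}

int main() {
  const size_t n = 6, m = 3, nb = 2;
  std::vector<double> A(n * n, 0.0), B(n * m, 1.0), B_orig;
  for (size_t i = 0; i < n; ++i) {
    for (size_t j = 0; j < i; ++j) { A[i * n + j] = 0.25; }
    A[i * n + i] = 4.0;
  }
  B_orig = B;
  BlockedLowerTrsm(A, B, n, m, nb);
  for (size_t i = 0; i < n; ++i) {                 // verify A * X == B_orig
    for (size_t c = 0; c < m; ++c) {
      double sum = 0.0;
      for (size_t j = 0; j <= i; ++j) { sum += A[i * n + j] * B[j * m + c]; }
      assert(std::fabs(sum - B_orig[i * m + c]) < 1e-12);
    }
  }
  return 0;
}
```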
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XTRSM_H_ +#define CLBLAST_ROUTINES_XTRSM_H_ + +#include "routines/level3/xgemm.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class Xtrsm: public Xgemm { + public: + + // Uses methods and variables the Xgemm routine + using Xgemm::queue_; + using Xgemm::context_; + using Xgemm::device_; + using Xgemm::db_; + using Xgemm::program_; + using Xgemm::DoGemm; + + // Constructor + Xtrsm(Queue &queue, EventPointer event, const std::string &name = "TRSM"); + + // Templated-precision implementation of the routine + void DoTrsm(const Layout layout, Side side, Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + size_t m, size_t n, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld); + + // Implementation of the column-major version + void TrsmColMajor(const Side side, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XTRSM_H_ +#endif diff --git a/src/routines/levelx/xinvert.cpp b/src/routines/levelx/xinvert.cpp new file mode 100644 index 00000000..5c21d5ce --- /dev/null +++ b/src/routines/levelx/xinvert.cpp @@ -0,0 +1,151 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains all the common code to perform (partial) matrix inverting. This code is based +// on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular +// Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek, +// and Jack Dongarra. 
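For reference, a minimal host-side sketch (illustrative only, not the kernel code) of what inverting a single lower-triangular diagonal block amounts to mathematically: forward substitution against each column of the identity, after which larger block inverses can be assembled, as the TripleMatMul kernels do. The function name below is made up.

```
#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

// Inverts an nb x nb lower-triangular block (row-major) by solving L * inv_col = e_col for each
// identity column via forward substitution; the result is again lower triangular.
std::vector<double> InvertLowerTriangular(const std::vector<double>& L, const size_t nb) {
  std::vector<double> inv(nb * nb, 0.0);
  for (size_t c = 0; c < nb; ++c) {
    inv[c * nb + c] = 1.0 / L[c * nb + c];
    for (size_t r = c + 1; r < nb; ++r) {
      double sum = 0.0;
      for (size_t k = c; k < r; ++k) { sum += L[r * nb + k] * inv[k * nb + c]; }
      inv[r * nb + c] = -sum / L[r * nb + r];
    }
  }
  return inv;
}

int main() {
  const size_t nb = 4;
  std::vector<double> L(nb * nb, 0.0);
  for (size_t i = 0; i < nb; ++i) {
    for (size_t j = 0; j < i; ++j) { L[i * nb + j] = 0.5; }
    L[i * nb + i] = static_cast<double>(i + 1);
  }
  const auto inv = InvertLowerTriangular(L, nb);
  for (size_t i = 0; i < nb; ++i) {                // check that L * inv equals the identity
    for (size_t j = 0; j < nb; ++j) {
      double sum = 0.0;
      for (size_t k = 0; k < nb; ++k) { sum += L[i * nb + k] * inv[k * nb + j]; }
      assert(std::fabs(sum - ((i == j) ? 1.0 : 0.0)) < 1e-12);
    }
  }
  return 0;
}
```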
+// +// ================================================================================================= + +#include "routines/levelx/xinvert.hpp" + +#include +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xinvert::Xinvert(Queue &queue, EventPointer event, const std::string &name): + Routine(queue, event, name, {"Invert"}, PrecisionValue(), {}, { + #include "../../kernels/level3/level3.opencl" + #include "../../kernels/level3/invert_diagonal_blocks.opencl" + }) { +} + +// ================================================================================================= + +// Inverts diagonal square blocks of a matrix +template +void Xinvert::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle triangle, const Diagonal diag, + const size_t n, const size_t block_size, + const Buffer &src, const size_t offset, const size_t ld_src, + Buffer &dest) { + + // Makes sure all dimensions are larger than zero + if ((block_size == 0) || (n == 0)) { + throw BLASError(StatusCode::kInvalidDimension); + } + + // Helper variables + const auto internal_block_size = static_cast(db_["INTERNAL_BLOCK_SIZE"]); + assert(internal_block_size == 16); + const auto num_blocks = CeilDiv(n, block_size); + const auto num_internal_blocks = CeilDiv(n, internal_block_size); + const auto unit_diagonal = (diag == Diagonal::kUnit) ? true : false; + + // This routine only supports block sizes which are a multiple of the internal block size and + // block sizes up to and including 128 + if ((block_size % internal_block_size != 0) || (block_size > 128)) { + throw BLASError(StatusCode::kUnknownError); + } + + // Checks for validity of the source and destination matrices + TestMatrixA(n, n, src, offset, ld_src); + TestMatrixB(block_size, num_blocks * block_size, dest, 0, block_size); + + // Determines which kernels to run based on the layout (the kernels assume column-major as + // default) and on whether we are dealing with an upper or lower triangle of the triangular matrix + const bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + const auto name_postfix = (is_upper) ? "Upper" : "Lower"; + + // Fills the output buffer with zeros + auto event_wait_list = std::vector(); + auto fill_matrix_event = Event(); + FillMatrix(queue_, device_, program_, db_, fill_matrix_event.pointer(), event_wait_list, + block_size, num_blocks * block_size, block_size, 0, dest, ConstantZero()); + event_wait_list.push_back(fill_matrix_event); + + // Inverts the diagonal IB by IB inner blocks of the matrix: one block per work-group + auto kernel = Kernel(program_, "InvertDiagonalBlock"); + kernel.SetArgument(0, static_cast(n)); + kernel.SetArgument(1, src()); + kernel.SetArgument(2, static_cast(offset)); + kernel.SetArgument(3, static_cast(ld_src)); + kernel.SetArgument(4, dest()); + kernel.SetArgument(5, static_cast(block_size)); + kernel.SetArgument(6, static_cast(unit_diagonal)); + kernel.SetArgument(7, static_cast(is_upper)); + const auto local = std::vector{internal_block_size}; + const auto global = std::vector{num_internal_blocks * internal_block_size}; + auto base_kernel_event = Event(); + auto base_kernel_event_pointer = (internal_block_size == block_size) ? 
event_ : base_kernel_event.pointer(); + RunKernel(kernel, queue_, device_, global, local, base_kernel_event_pointer, event_wait_list); + if (internal_block_size == block_size) { event_wait_list.push_back(base_kernel_event); } + + // Builds up block_size x block_size blocks. For example, internal_block_size=16: + // use 16 x 16 blocks to build 32 x 32 blocks, 1 x (1 x npages) grid, 4 x 4 threads; + // then 32 x 32 blocks to build 64 x 64 blocks, 1 x (2 x npages) grid, 8 x 4 threads; + // then 64 x 64 blocks to build 128 x 128 blocks, 1 x (4 x npages) grid, 16 x 4 threads; + for (auto current_size = internal_block_size; current_size < block_size; current_size *= 2) { + assert(current_size == 16 || current_size == 32 || current_size == 64); + + // Emulates a 3D grid: NX * (NY * npages) + const auto npages = CeilDiv(n, current_size*2); + const auto local0 = (current_size <= 32) ? current_size/4 : 16; + const auto local = std::vector{local0, 4}; + const auto global = std::vector{(current_size/local[1]), npages*(current_size/16)*local[1]}; + + // Part 1 + auto kernel1 = Kernel(program_, "TripleMatMul" + ToString(current_size) + "Part1" + name_postfix); + kernel1.SetArgument(0, static_cast(n)); + kernel1.SetArgument(1, src()); + kernel1.SetArgument(2, static_cast(offset)); + kernel1.SetArgument(3, static_cast(ld_src)); + kernel1.SetArgument(4, dest()); + kernel1.SetArgument(5, static_cast(current_size)); + kernel1.SetArgument(6, static_cast(npages)); + kernel1.SetArgument(7, static_cast(block_size)); + auto kernel1_event = Event(); + RunKernel(kernel1, queue_, device_, global, local, kernel1_event.pointer(), event_wait_list); + event_wait_list.push_back(kernel1_event); + + // Part 2 + const bool is_last_kernel = (current_size * 2 >= block_size); + auto kernel2 = Kernel(program_, "TripleMatMul" + ToString(current_size) + "Part2" + name_postfix); + kernel2.SetArgument(0, static_cast(n)); + kernel2.SetArgument(1, dest()); + kernel2.SetArgument(2, static_cast(current_size)); + kernel2.SetArgument(3, static_cast(npages)); + kernel2.SetArgument(4, static_cast(block_size)); + auto kernel2_event = Event(); + auto kernel2_event_pointer = (is_last_kernel) ? event_ : kernel2_event.pointer(); + RunKernel(kernel2, queue_, device_, global, local, kernel2_event_pointer, event_wait_list); + if (!is_last_kernel) { event_wait_list.push_back(kernel2_event); } + + // Exit in case we reach beyond the bounds of the input matrix + if (current_size*2 >= n) { break; } + } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xinvert; +template class Xinvert; +template class Xinvert; +template class Xinvert; +template class Xinvert; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/levelx/xinvert.hpp b/src/routines/levelx/xinvert.hpp new file mode 100644 index 00000000..fa0a80e7 --- /dev/null +++ b/src/routines/levelx/xinvert.hpp @@ -0,0 +1,40 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains all the common code to perform (partial) matrix inverting. 
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XINVERT_H_ +#define CLBLAST_ROUTINES_XINVERT_H_ + +#include "routine.hpp" + +namespace clblast { +// ================================================================================================= + +template +class Xinvert: public Routine { + public: + + // Constructor + Xinvert(Queue &queue, EventPointer event, const std::string &name = "INVERT"); + + // Inverts diagonal square blocks of a matrix + void InvertMatrixDiagonalBlocks(const Layout layout, const Triangle triangle, const Diagonal diag, + const size_t n, const size_t block_size, + const Buffer &src, const size_t offset, const size_t ld_src, + Buffer &dest); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XINVERT_H_ +#endif diff --git a/src/utilities/utilities.cpp b/src/utilities/utilities.cpp index 5e445bb9..9cf75490 100644 --- a/src/utilities/utilities.cpp +++ b/src/utilities/utilities.cpp @@ -18,57 +18,74 @@ #include #include #include +#include namespace clblast { // ================================================================================================= // Returns a scalar with a default value -template -T GetScalar() { - return static_cast(2.0); -} +template T GetScalar() { return static_cast(2.0); } template float GetScalar(); template double GetScalar(); +template <> half GetScalar() { return FloatToHalf(2.0f); } +template <> float2 GetScalar() { return {2.0f, 0.5f}; } +template <> double2 GetScalar() { return {2.0, 0.5}; } -// Specialized version of the above for half-precision -template <> -half GetScalar() { - return FloatToHalf(2.0f); -} - -// Specialized versions of the above for complex data-types -template <> -float2 GetScalar() { - return {2.0f, 0.5f}; -} -template <> -double2 GetScalar() { - return {2.0, 0.5}; -} +// Returns a scalar of value 0 +template T ConstantZero() { return static_cast(0.0); } +template float ConstantZero(); +template double ConstantZero(); +template <> half ConstantZero() { return FloatToHalf(0.0f); } +template <> float2 ConstantZero() { return {0.0f, 0.0f}; } +template <> double2 ConstantZero() { return {0.0, 0.0}; } // Returns a scalar of value 1 -template -T ConstantOne() { - return static_cast(1.0); -} +template T ConstantOne() { return static_cast(1.0); } template float ConstantOne(); template double ConstantOne(); +template <> half ConstantOne() { return FloatToHalf(1.0f); } +template <> float2 ConstantOne() { return {1.0f, 0.0f}; } +template <> double2 ConstantOne() { return {1.0, 0.0}; } -// Specialized version of the above for half-precision -template <> -half ConstantOne() { - return FloatToHalf(1.0f); -} +// Returns a scalar of value -1 +template T ConstantNegOne() { return static_cast(-1.0); } +template float ConstantNegOne(); +template double ConstantNegOne(); +template <> half ConstantNegOne() { return FloatToHalf(-1.0f); } +template <> float2 ConstantNegOne() { return {-1.0f, 0.0f}; } +template <> double2 ConstantNegOne() { return {-1.0, 0.0}; } -// Specialized versions of the above for complex data-types -template <> -float2 ConstantOne() { - return {1.0f, 0.0f}; -} -template <> -double2 ConstantOne() { - return {1.0, 0.0}; -} +// Returns a scalar of value 1 +template T ConstantTwo() { return static_cast(2.0); } +template float ConstantTwo(); +template double ConstantTwo(); +template <> half ConstantTwo() { return 
FloatToHalf(2.0f); } +template <> float2 ConstantTwo() { return {2.0f, 0.0f}; } +template <> double2 ConstantTwo() { return {2.0, 0.0}; } + +// Returns a small scalar value just larger than 0 +template T SmallConstant() { return static_cast(1e7); } +template float SmallConstant(); +template double SmallConstant(); +template <> half SmallConstant() { return FloatToHalf(1e7); } +template <> float2 SmallConstant() { return {1e7, 0.0f}; } +template <> double2 SmallConstant() { return {1e7, 0.0}; } + +// Returns the absolute value of a scalar +template T AbsoluteValue(const T value) { return std::fabs(value); } +template float AbsoluteValue(const float); +template double AbsoluteValue(const double); +template <> half AbsoluteValue(const half value) { return FloatToHalf(std::fabs(HalfToFloat(value))); } +template <> float2 AbsoluteValue(const float2 value) { return std::abs(value); } +template <> double2 AbsoluteValue(const double2 value) { return std::abs(value); } + +// Returns whether a scalar is close to zero +template bool IsCloseToZero(const T value) { return (value > -SmallConstant()) && (value < SmallConstant()); } +template bool IsCloseToZero(const float); +template bool IsCloseToZero(const double); +template <> bool IsCloseToZero(const half value) { return IsCloseToZero(HalfToFloat(value)); } +template <> bool IsCloseToZero(const float2 value) { return IsCloseToZero(value.real()) || IsCloseToZero(value.imag()); } +template <> bool IsCloseToZero(const double2 value) { return IsCloseToZero(value.real()) || IsCloseToZero(value.imag()); } // ================================================================================================= @@ -79,23 +96,27 @@ std::string ToString(T value) { } template std::string ToString(int value); template std::string ToString(size_t value); -template std::string ToString(float value); -template std::string ToString(double value); +template <> +std::string ToString(float value) { + std::ostringstream result; + result << std::fixed << std::setprecision(2) << value; + return result.str(); +} +template <> +std::string ToString(double value) { + std::ostringstream result; + result << std::fixed << std::setprecision(2) << value; + return result.str(); +} // If not possible directly: special cases for complex data-types template <> std::string ToString(float2 value) { - std::ostringstream real, imag; - real << std::setprecision(2) << value.real(); - imag << std::setprecision(2) << value.imag(); - return real.str()+"+"+imag.str()+"i"; + return ToString(value.real())+"+"+ToString(value.imag())+"i"; } template <> std::string ToString(double2 value) { - std::ostringstream real, imag; - real << std::setprecision(2) << value.real(); - imag << std::setprecision(2) << value.imag(); - return real.str()+"+"+imag.str()+"i"; + return ToString(value.real())+"+"+ToString(value.imag())+"i"; } // If not possible directly: special case for half-precision diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp index a1d4e2db..044955ea 100644 --- a/src/utilities/utilities.hpp +++ b/src/utilities/utilities.hpp @@ -99,12 +99,20 @@ constexpr auto kArgNoAbbreviations = "no_abbrv"; // ================================================================================================= // Returns a scalar with a default value -template -T GetScalar(); +template T GetScalar(); -// Returns a scalar of value 1 -template -T ConstantOne(); +// Fixed value scalars +template T ConstantZero(); +template T ConstantOne(); +template T ConstantNegOne(); +template T ConstantTwo(); 
+template T SmallConstant(); + +// Returns the absolute value of a scalar +template T AbsoluteValue(const T value); + +// Returns whether a scalar is close to zero +template bool IsCloseToZero(const T value); // ================================================================================================= diff --git a/test/correctness/routines/level3/xtrsm.cpp b/test/correctness/routines/level3/xtrsm.cpp index 6119bd17..bc45a8bf 100644 --- a/test/correctness/routines/level3/xtrsm.cpp +++ b/test/correctness/routines/level3/xtrsm.cpp @@ -23,7 +23,6 @@ int main(int argc, char *argv[]) { errors += clblast::RunTests, double, double>(argc, argv, true, "DTRSM"); errors += clblast::RunTests, float2, float2>(argc, argv, true, "CTRSM"); errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZTRSM"); - errors += clblast::RunTests, half, half>(argc, argv, true, "HTRSM"); if (errors > 0) { return 1; } else { return 0; } } diff --git a/test/correctness/routines/levelx/xinvert.cpp b/test/correctness/routines/levelx/xinvert.cpp new file mode 100644 index 00000000..0ccc0829 --- /dev/null +++ b/test/correctness/routines/levelx/xinvert.cpp @@ -0,0 +1,30 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/levelx/xinvert.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests, float, float>(argc, argv, false, "SINVERT"); + errors += clblast::RunTests, double, double>(argc, argv, true, "DINVERT"); + errors += clblast::RunTests, float2, float2>(argc, argv, true, "CINVERT"); + errors += clblast::RunTests, double2, double2>(argc, argv, true, "ZINVERT"); + errors += clblast::RunTests, half, half>(argc, argv, true, "HINVERT"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/testblas.cpp b/test/correctness/testblas.cpp index 5fddb37b..7c64855a 100644 --- a/test/correctness/testblas.cpp +++ b/test/correctness/testblas.cpp @@ -138,7 +138,10 @@ void TestBlas::TestRegular(std::vector> &test_vector, const st // Don't continue with CBLAS if there are incorrect parameters if (compare_cblas_ && status2 != StatusCode::kSuccess) { - if (verbose_) { fprintf(stdout, " -> "); std::cout << std::flush; } + if (verbose_) { + fprintf(stdout, " -> %d -> ", static_cast(status2)); + std::cout << std::flush; + } TestErrorCodes(status2, status2, args); continue; } diff --git a/test/correctness/tester.cpp b/test/correctness/tester.cpp index efe49811..eb79008d 100644 --- a/test/correctness/tester.cpp +++ b/test/correctness/tester.cpp @@ -40,6 +40,12 @@ float getAbsoluteErrorMargin() { return 0.10f; // especially small values are inaccurate for half-precision } +// Error margin: numbers beyond this value are considered equal to inf or NaN +template +T getAlmostInfNumber() { + return T{1e35}; 
// used for correctness testing of TRSV and TRSM routines +} + // Maximum number of test results printed on a single line template const size_t Tester::kResultsPerLine = size_t{64}; @@ -426,6 +432,21 @@ bool TestSimilarityNear(const T val1, const T val2, if (val1 == val2) { return true; } + // Handles cases with both results NaN or inf + else if ((std::isnan(val1) && std::isnan(val2)) || (std::isinf(val1) && std::isinf(val2))) { + return true; + } + // Also considers it OK if one of the results in NaN and the other is inf + // Note: for TRSV and TRSM routines + else if ((std::isnan(val1) && std::isinf(val2)) || (std::isinf(val1) && std::isnan(val2))) { + return true; + } + // Also considers it OK if one of the values is super large and the other is inf or NaN + // Note: for TRSV and TRSM routines + else if ((std::abs(val1) > getAlmostInfNumber() && (std::isinf(val2) || std::isnan(val2))) || + (std::abs(val2) > getAlmostInfNumber() && (std::isinf(val1) || std::isnan(val1)))) { + return true; + } // The values are zero or very small: the relative error is less meaningful else if (val1 == 0 || val2 == 0 || difference < error_margin_absolute) { return (difference < error_margin_absolute); @@ -452,15 +473,21 @@ template bool TestSimilarity(const double, const double); // Specialisations for non-standard data-types template <> bool TestSimilarity(const float2 val1, const float2 val2) { - auto real = TestSimilarity(val1.real(), val2.real()); - auto imag = TestSimilarity(val1.imag(), val2.imag()); - return (real && imag); + const auto real = TestSimilarity(val1.real(), val2.real()); + const auto imag = TestSimilarity(val1.imag(), val2.imag()); + if (real && imag) { return true; } + // also OK if one is good and the combined is good (indicates a big diff between real & imag) + if (real || imag) { return TestSimilarity(val1.real() + val1.imag(), val2.real() + val2.imag()); } + return false; // neither real nor imag is good, return false } template <> bool TestSimilarity(const double2 val1, const double2 val2) { - auto real = TestSimilarity(val1.real(), val2.real()); - auto imag = TestSimilarity(val1.imag(), val2.imag()); - return (real && imag); + const auto real = TestSimilarity(val1.real(), val2.real()); + const auto imag = TestSimilarity(val1.imag(), val2.imag()); + if (real && imag) { return true; } + // also OK if one is good and the combined is good (indicates a big diff between real & imag) + if (real || imag) { return TestSimilarity(val1.real() + val1.imag(), val2.real() + val2.imag()); } + return false; // neither real nor imag is good, return false } template <> bool TestSimilarity(const half val1, const half val2) { diff --git a/test/performance/routines/level3/xtrsm.cpp b/test/performance/routines/level3/xtrsm.cpp index ef094891..342274b7 100644 --- a/test/performance/routines/level3/xtrsm.cpp +++ b/test/performance/routines/level3/xtrsm.cpp @@ -20,8 +20,7 @@ using double2 = clblast::double2; int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) { - case clblast::Precision::kHalf: - clblast::RunClient, half, half>(argc, argv); break; + case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); case clblast::Precision::kSingle: clblast::RunClient, float, float>(argc, argv); break; case clblast::Precision::kDouble: diff --git a/test/performance/routines/levelx/xinvert.cpp b/test/performance/routines/levelx/xinvert.cpp new 
file mode 100644 index 00000000..87f36b1e --- /dev/null +++ b/test/performance/routines/levelx/xinvert.cpp @@ -0,0 +1,37 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/levelx/xinvert.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); + switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: + clblast::RunClient, half, half>(argc, argv); break; + case clblast::Precision::kSingle: + clblast::RunClient, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/routines/level2/xtrsv.hpp b/test/routines/level2/xtrsv.hpp new file mode 100644 index 00000000..fed4378a --- /dev/null +++ b/test/routines/level2/xtrsv.hpp @@ -0,0 +1,184 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements a class with static methods to describe the Xtrsv routine. Examples of +// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. 
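The "static description" pattern referred to here can be sketched independently of the real testers: a templated harness only needs the routine class to expose a few static hooks, such as a size calculation and a run method. A toy version under that assumption, with made-up names and no CLBlast types:

```
#include <cstddef>
#include <cstdio>
#include <vector>

// Toy stand-in for the real Arguments machinery
struct ToyArgs { size_t n = 8; size_t x_inc = 1; size_t x_offset = 0; };

// A routine 'description': only static methods, so the harness below never needs an instance
struct ToyScalDescriptor {
  static size_t GetSizeX(const ToyArgs& args) { return args.n * args.x_inc + args.x_offset; }
  static void RunRoutine(const ToyArgs& args, std::vector<double>& x) {     // routine under test
    for (size_t i = 0; i < args.n; ++i) { x[i * args.x_inc + args.x_offset] *= 2.0; }
  }
  static void RunReference(const ToyArgs& args, std::vector<double>& x) {   // reference version
    for (size_t i = 0; i < args.n; ++i) {
      x[i * args.x_inc + args.x_offset] += x[i * args.x_inc + args.x_offset];
    }
  }
};

// Generic correctness harness: sizes the buffers, runs both versions, compares the results
template <typename Descriptor>
int RunToyTest(const ToyArgs& args) {
  std::vector<double> x1(Descriptor::GetSizeX(args), 1.5), x2 = x1;
  Descriptor::RunRoutine(args, x1);
  Descriptor::RunReference(args, x2);
  int errors = 0;
  for (size_t i = 0; i < x1.size(); ++i) { if (x1[i] != x2[i]) { ++errors; } }
  return errors;
}

int main() {
  const int errors = RunToyTest<ToyScalDescriptor>(ToyArgs{});
  std::printf("errors: %d\n", errors);
  return (errors == 0) ? 0 : 1;
}
```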
+// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XTRSV_H_ +#define CLBLAST_TEST_ROUTINES_XTRSV_H_ + +#include +#include + +#ifdef CLBLAST_REF_CLBLAS + #include "test/wrapper_clblas.hpp" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "test/wrapper_cblas.hpp" +#endif + +namespace clblast { +// ================================================================================================= + +// Prepares the data +template +void PrepareData(const Arguments &args, Buffers &buffers, Queue &queue) { + if (args.a_ld < args.n) { return; } + if (args.a_size <= 0 || args.x_size <= 0) { return; } + + // Copies input buffers to the host + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + + // Generates 'proper' input for the TRSV routine + // TODO: Improve this, currently loosely based on clBLAS's implementation + for (auto i = size_t{0}; i < args.n; ++i) { + auto diagonal = a_mat_cpu[i*args.a_ld + i + args.a_offset]; + diagonal = AbsoluteValue(diagonal) + static_cast(args.n / size_t{4}); + for (auto j = size_t{0}; j < args.n; ++j) { + a_mat_cpu[j*args.a_ld + i + args.a_offset] /= ConstantTwo(); + } + a_mat_cpu[i*args.a_ld + i + args.a_offset] = diagonal; + x_vec_cpu[i * args.x_inc + args.x_offset] /= ConstantTwo(); + } + + // Copies input buffers back to the OpenCL device + buffers.a_mat.Write(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); + return; +} + +// See comment at top of file for a description of the class +template +class TestXtrsv { + public: + + // The BLAS level: 1, 2, or 3 + static size_t BLASLevel() { return 2; } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { + return {kArgN, + kArgLayout, kArgTriangle, kArgATransp, kArgDiagonal, + kArgALeadDim, kArgXInc, + kArgAOffset, kArgXOffset}; + } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &args) { + return args.n * args.x_inc + args.x_offset; + } + static size_t GetSizeA(const Arguments &args) { + return args.n * args.a_ld + args.a_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments &args) { + args.a_size = GetSizeA(args); + args.x_size = GetSizeX(args); + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments &args) { return args.n; } + static size_t DefaultLDB(const Arguments &) { return 1; } // N/A for this routine + static size_t DefaultLDC(const Arguments &) { return 1; } // N/A for this routine + + // Describes which transpose options are relevant for this routine + using Transposes = std::vector; + static Transposes GetATransposes(const Transposes &all) { return all; } + static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments &args, Buffers &buffers, Queue &queue) { + PrepareData(args, buffers, queue); + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = Trsv(args.layout, args.triangle, args.a_transpose, args.diagonal, + args.n, + buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec(), args.x_offset, args.x_inc, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { 
clWaitForEvents(1, &event); clReleaseEvent(event); } + return status; + } + + // Describes how to run the clBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments &args, Buffers &buffers, Queue &queue) { + PrepareData(args, buffers, queue); + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = clblasXtrsv(convertToCLBLAS(args.layout), + convertToCLBLAS(args.triangle), + convertToCLBLAS(args.a_transpose), + convertToCLBLAS(args.diagonal), + args.n, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + return static_cast(status); + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments &args, Buffers &buffers, Queue &queue) { + PrepareData(args, buffers, queue); + std::vector a_mat_cpu(args.a_size, static_cast(0)); + std::vector x_vec_cpu(args.x_size, static_cast(0)); + buffers.a_mat.Read(queue, args.a_size, a_mat_cpu); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + cblasXtrsv(convertToCBLAS(args.layout), + convertToCBLAS(args.triangle), + convertToCBLAS(args.a_transpose), + convertToCBLAS(args.diagonal), + args.n, + a_mat_cpu, args.a_offset, args.a_ld, + x_vec_cpu, args.x_offset, args.x_inc); + buffers.x_vec.Write(queue, args.x_size, x_vec_cpu); + return StatusCode::kSuccess; + } + #endif + + // Describes how to download the results of the computation (more importantly: which buffer) + static std::vector DownloadResult(const Arguments &args, Buffers &buffers, Queue &queue) { + std::vector result(args.x_size, static_cast(0)); + buffers.x_vec.Read(queue, args.x_size, result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments &args) { + return args.n; + } + static size_t ResultID2(const Arguments &) { return 1; } // N/A for this routine + static size_t GetResultIndex(const Arguments &args, const size_t id1, const size_t) { + return id1*args.x_inc + args.x_offset; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments &args) { + return 2 * args.n * args.n; + } + static size_t GetBytes(const Arguments &args) { + return (args.n*args.n + 2*args.n + args.n) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XTRSV_H_ +#endif diff --git a/test/routines/level3/xtrsm.hpp b/test/routines/level3/xtrsm.hpp new file mode 100644 index 00000000..1ffaef35 --- /dev/null +++ b/test/routines/level3/xtrsm.hpp @@ -0,0 +1,194 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements a class with static methods to describe the Xtrsm routine. Examples of +// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. 
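Both data-preparation helpers (the TRSV one above and the TRSM one below) pursue the same goal: keep the diagonal of A well away from zero so that the triangular solves used for verification remain numerically stable. A tiny stand-alone illustration of why that matters, with made-up 2x2 values rather than the exact recipe used here:

```
#include <cmath>
#include <cstdio>

// Solves the 2x2 lower-triangular system {{d, 0}, {1, 1}} * x = {1, 1} and reports |x|_max.
double SolveWithDiagonal(const double d) {
  const double x0 = 1.0 / d;          // the first substitution step divides by the diagonal
  const double x1 = 1.0 - 1.0 * x0;   // second row: 1*x0 + 1*x1 = 1
  return std::fmax(std::fabs(x0), std::fabs(x1));
}

int main() {
  std::printf("near-zero diagonal : |x|_max = %.3e\n", SolveWithDiagonal(1e-8));  // ~1e8, hard to verify
  std::printf("boosted diagonal   : |x|_max = %.3e\n", SolveWithDiagonal(2.0));   // O(1), easy to verify
  return 0;
}
```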
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XTRSM_H_
+#define CLBLAST_TEST_ROUTINES_XTRSM_H_
+
+#include <vector>
+#include <string>
+
+#ifdef CLBLAST_REF_CLBLAS
+  #include "test/wrapper_clblas.hpp"
+#endif
+#ifdef CLBLAST_REF_CBLAS
+  #include "test/wrapper_cblas.hpp"
+#endif
+
+namespace clblast {
+// =================================================================================================
+
+// Prepares the data
+template <typename T>
+void PrepareData(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  const auto k = (args.side == Side::kLeft) ? args.m : args.n;
+  if (args.a_ld < k) { return; }
+  if (args.a_size <= 0 || args.b_size <= 0) { return; }
+
+  // Copies input buffers to the host
+  std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
+  std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
+  buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
+  buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
+
+  // Generates 'proper' input for the TRSM routine
+  // TODO: Improve this
+  for (auto i = size_t{0}; i < k; ++i) {
+    for (auto j = size_t{0}; j < k; ++j) {
+      auto value = a_mat_cpu[j*args.a_ld + i + args.a_offset];
+      value *= ConstantTwo<T>();
+      if (IsCloseToZero(value)) { value += ConstantOne<T>(); }
+      a_mat_cpu[j*args.a_ld + i + args.a_offset] = value;
+    }
+  }
+
+  // Copies input buffers back to the OpenCL device
+  buffers.a_mat.Write(queue, args.a_size, a_mat_cpu);
+  buffers.b_mat.Write(queue, args.b_size, b_mat_cpu);
+  return;
+}
+
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXtrsm {
+ public:
+
+  // The BLAS level: 1, 2, or 3
+  static size_t BLASLevel() { return 3; }
+
+  // The list of arguments relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgM, kArgN,
+            kArgLayout, kArgSide, kArgTriangle, kArgATransp, kArgDiagonal,
+            kArgALeadDim, kArgBLeadDim,
+            kArgAOffset, kArgBOffset,
+            kArgAlpha};
+  }
+
+  // Describes how to obtain the sizes of the buffers
+  static size_t GetSizeA(const Arguments<T> &args) {
+    const auto k = (args.side == Side::kLeft) ? args.m : args.n;
+    return k * args.a_ld + args.a_offset;
+  }
+  static size_t GetSizeB(const Arguments<T> &args) {
+    const auto b_rotated = (args.layout == Layout::kRowMajor);
+    const auto b_two = (b_rotated) ? args.m : args.n;
+    return b_two * args.b_ld + args.b_offset;
+  }
+
+  // Describes how to set the sizes of all the buffers
+  static void SetSizes(Arguments<T> &args) {
+    args.a_size = GetSizeA(args);
+    args.b_size = GetSizeB(args);
+  }
+
+  // Describes what the default values of the leading dimensions of the matrices are
+  static size_t DefaultLDA(const Arguments<T> &args) { return args.m; }
+  static size_t DefaultLDB(const Arguments<T> &args) { return args.n; }
+  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
+
+  // Describes which transpose options are relevant for this routine
+  using Transposes = std::vector<Transpose>;
+  static Transposes GetATransposes(const Transposes &all) { return all; }
+  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
+
+  // Describes how to run the CLBlast routine
+  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    PrepareData(args, buffers, queue);
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = Trsm(args.layout, args.side, args.triangle, args.a_transpose, args.diagonal,
+                       args.m, args.n, args.alpha,
+                       buffers.a_mat(), args.a_offset, args.a_ld,
+                       buffers.b_mat(), args.b_offset, args.b_ld,
+                       &queue_plain, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+    return status;
+  }
+
+  // Describes how to run the clBLAS routine (for correctness/performance comparison)
+  #ifdef CLBLAST_REF_CLBLAS
+    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+      PrepareData(args, buffers, queue);
+      auto queue_plain = queue();
+      auto event = cl_event{};
+      auto status = clblasXtrsm(convertToCLBLAS(args.layout),
+                                convertToCLBLAS(args.side),
+                                convertToCLBLAS(args.triangle),
+                                convertToCLBLAS(args.a_transpose),
+                                convertToCLBLAS(args.diagonal),
+                                args.m, args.n, args.alpha,
+                                buffers.a_mat, args.a_offset, args.a_ld,
+                                buffers.b_mat, args.b_offset, args.b_ld,
+                                1, &queue_plain, 0, nullptr, &event);
+      clWaitForEvents(1, &event);
+      return static_cast<StatusCode>(status);
+    }
+  #endif
+
+  // Describes how to run the CPU BLAS routine (for correctness/performance comparison)
+  #ifdef CLBLAST_REF_CBLAS
+    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+      PrepareData(args, buffers, queue);
+      std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
+      std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
+      buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
+      buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
+      cblasXtrsm(convertToCBLAS(args.layout),
+                 convertToCBLAS(args.side),
+                 convertToCBLAS(args.triangle),
+                 convertToCBLAS(args.a_transpose),
+                 convertToCBLAS(args.diagonal),
+                 args.m, args.n, args.alpha,
+                 a_mat_cpu, args.a_offset, args.a_ld,
+                 b_mat_cpu, args.b_offset, args.b_ld);
+      buffers.b_mat.Write(queue, args.b_size, b_mat_cpu);
+      return StatusCode::kSuccess;
+    }
+  #endif
+
+  // Describes how to download the results of the computation (more importantly: which buffer)
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    std::vector<T> result(args.b_size, static_cast<T>(0));
+    buffers.b_mat.Read(queue, args.b_size, result);
+    return result;
+  }
+
+  // Describes how to compute the indices of the result buffer
+  static size_t ResultID1(const Arguments<T> &args) { return args.m; }
+  static size_t ResultID2(const Arguments<T> &args) { return args.n; }
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
+    return (args.layout == Layout::kRowMajor) ?
+           id1*args.b_ld + id2 + args.b_offset:
+           id2*args.b_ld + id1 + args.b_offset;
+  }
+
+  // Describes how to compute performance metrics
+  static size_t GetFlops(const Arguments<T> &args) {
+    auto k = (args.side == Side::kLeft) ? args.m : args.n;
+    return args.m * args.n * k;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    auto k = (args.side == Side::kLeft) ? args.m : args.n;
+    return (k*k + 2*args.m*args.n) * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XTRSM_H_
+#endif
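For reviewers who want to see the call signature that `RunRoutine` above exercises in isolation, the snippet below is a minimal host-side sketch of invoking the new `Trsm` C++ entry point directly. It is illustrative only and not part of this patch: the context/queue/buffer setup, the variable names (`queue`, `a`, `b`) and the chosen options are assumptions.

```
#include <clblast.h>

// Solves op(A) * X = alpha * B for X, overwriting B (column-major, m x n).
// `queue`, `a` and `b` are assumed to be a valid cl_command_queue and cl_mem
// buffers created elsewhere; A is m x m lower-triangular with leading dimension m.
void SolveTriangularSystem(cl_command_queue queue, cl_mem a, cl_mem b,
                           const size_t m, const size_t n) {
  cl_event event = nullptr;
  const auto status = clblast::Trsm<float>(clblast::Layout::kColMajor, clblast::Side::kLeft,
                                           clblast::Triangle::kLower, clblast::Transpose::kNo,
                                           clblast::Diagonal::kNonUnit,
                                           m, n,
                                           1.0f,     // alpha
                                           a, 0, m,  // A buffer, offset, leading dimension
                                           b, 0, m,  // B buffer, offset, leading dimension
                                           &queue, &event);
  if (status == clblast::StatusCode::kSuccess) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
}
```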
diff --git a/test/routines/levelx/xinvert.hpp b/test/routines/levelx/xinvert.hpp
new file mode 100644
index 00000000..c6ce4b07
--- /dev/null
+++ b/test/routines/levelx/xinvert.hpp
@@ -0,0 +1,224 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren
+//
+// This file implements a class with static methods to describe the Xinvert routine. Examples of
+// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XINVERT_H_
+#define CLBLAST_TEST_ROUTINES_XINVERT_H_
+
+#include <vector>
+#include <string>
+
+#include "routines/levelx/xinvert.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+template <typename T>
+StatusCode RunReference(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+  const bool is_upper = ((args.triangle == Triangle::kUpper && args.layout != Layout::kRowMajor) ||
+                         (args.triangle == Triangle::kLower && args.layout == Layout::kRowMajor));
+
+  // Data transfer from OpenCL to std::vector
+  std::vector<T> a_mat_cpu(args.a_size, T{0.0});
+  buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
+
+  // Creates the output buffer
+  std::vector<T> b_mat_cpu(args.b_size, T{0.0});
+
+  // Helper variables
+  const auto block_size = args.m;
+  const auto num_blocks = CeilDiv(args.n, block_size);
+  const auto a_ld = args.a_ld;
+  const auto b_ld = block_size;
+
+  // Checks for valid arguments
+  if ((block_size == 0) || (args.n == 0)) {
+    return StatusCode::kInvalidDimension;
+  }
+  if ((block_size % 16 != 0) || (block_size > 128)) {
+    return StatusCode::kUnknownError;
+  }
+
+  // Loops over the amount of diagonal blocks of size args.m by args.m each
+  for (auto block_id = size_t{0}; block_id < num_blocks; ++block_id) {
+    const auto a_offset = block_id * (block_size + a_ld * block_size) + args.a_offset;
+    const auto b_offset = block_id * block_size * block_size;
+
+    // Inverts the diagonal elements of the matrix
+    for (auto i = size_t{0}; i < block_size; ++i) {
+      auto a_value = T{1.0};
+      if (args.diagonal == Diagonal::kNonUnit) {
+        if (i + block_id * block_size < args.n) {
+          if (a_mat_cpu[i * a_ld + i + a_offset] == T{0.0}) { return StatusCode::kUnknownError; }
+          a_value = T{1.0} / a_mat_cpu[i * a_ld + i + a_offset];
+        }
+      }
+      b_mat_cpu[i * b_ld + i + b_offset] = a_value;
+    }
+
+    // Inverts the upper triangle row by row
+    if (is_upper) {
+      for (int i = static_cast<int>(block_size) - 2; i >= 0; --i) {
+        for (auto j = static_cast<int>(block_size) - 1; j > i; --j) {
+          auto sum = T{0.0};
+          for (auto k = i + 1; k <= j; ++k) {
+            auto a_value = T{0.0};
+            if ((i + block_id * block_size < args.n) && (k + block_id * block_size < args.n)) {
+              a_value = a_mat_cpu[k * a_ld + i + a_offset];
+            }
+            sum += a_value * b_mat_cpu[j * b_ld + k + b_offset];
+          }
+          b_mat_cpu[j * b_ld + i + b_offset] = - sum * b_mat_cpu[i * b_ld + i + b_offset];
+        }
+      }
+    }
+
+    // Inverts the lower triangle row by row
+    else {
+      for (auto i = size_t{1}; i < block_size; ++i) {
+        for (auto j = size_t{0}; j < i; ++j) {
+          auto sum = T{0.0};
+          for (auto k = j; k < i; ++k) {
+            auto a_value = T{0.0};
+            if ((i + block_id * block_size < args.n) && (k + block_id * block_size < args.n)) {
+              a_value = a_mat_cpu[k * a_ld + i + a_offset];
+            }
+            sum += a_value * b_mat_cpu[j * b_ld + k + b_offset];
+          }
+          b_mat_cpu[j * b_ld + i + b_offset] = - sum * b_mat_cpu[i * b_ld + i + b_offset];
+        }
+      }
+    }
+  }
+
+  // Data transfer back to OpenCL
+  buffers.b_mat.Write(queue, args.b_size, b_mat_cpu);
+  return StatusCode::kSuccess;
+}
+
+// Half-precision version calling the above reference implementation after conversions
+template <>
+StatusCode RunReference<half>(const Arguments<half> &args, Buffers<half> &buffers, Queue &queue) {
+  auto a_buffer2 = HalfToFloatBuffer(buffers.a_mat, queue());
+  auto b_buffer2 = HalfToFloatBuffer(buffers.b_mat, queue());
+  auto dummy = clblast::Buffer<float>(0);
+  auto buffers2 = Buffers<float>{dummy, dummy, a_buffer2, b_buffer2, dummy, dummy, dummy};
+  auto args2 = Arguments<float>();
+  args2.a_size = args.a_size; args2.b_size = args.b_size;
+  args2.a_ld = args.a_ld; args2.m = args.m; args2.n = args.n;
+  args2.a_offset = args.a_offset;
+  args2.layout = args.layout; args2.triangle = args.triangle; args2.diagonal = args.diagonal;
+  auto status = RunReference(args2, buffers2, queue);
+  FloatToHalfBuffer(buffers.b_mat, b_buffer2, queue());
+  return status;
+}
+
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXinvert {
+ public:
+
+  // The BLAS level: 4 for the extra routines
+  static size_t BLASLevel() { return 4; }
+
+  // The list of arguments relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgN, kArgM,
+            kArgLayout, kArgTriangle, kArgDiagonal,
+            kArgALeadDim, kArgAOffset};
+  }
+
+  // Describes how to obtain the sizes of the buffers
+  static size_t GetSizeA(const Arguments<T> &args) {
+    return args.n * args.a_ld + args.a_offset;
+  }
+  static size_t GetSizeB(const Arguments<T> &args) {
+    const auto block_size = args.m;
+    const auto num_blocks = CeilDiv(args.n, block_size);
+    return num_blocks * block_size * block_size;
+  }
+
+  // Describes how to set the sizes of all the buffers
+  static void SetSizes(Arguments<T> &args) {
+    args.a_size = GetSizeA(args);
+    args.b_size = GetSizeB(args);
+  }
+
+  // Describes what the default values of the leading dimensions of the matrices are
+  static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
+  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
+
+  // Describes which transpose options are relevant for this routine
+  using Transposes = std::vector<Transpose>;
+  static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine
+  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
+
+  // Describes how to run the CLBlast routine
+  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    try {
+      auto event = cl_event{};
+      auto inverter = Xinvert<T>(queue, &event);
+      inverter.InvertMatrixDiagonalBlocks(args.layout, args.triangle, args.diagonal,
+                                          args.n, args.m,
+                                          buffers.a_mat, args.a_offset, args.a_ld,
+                                          buffers.b_mat);
+      clWaitForEvents(1, &event);
+      clReleaseEvent(event);
+    } catch (...) { return DispatchException(); }
+    return StatusCode::kSuccess;
+  }
+
+  // Describes how to run a naive version of the routine (for correctness/performance comparison).
+  // Note that a proper clBLAS or CPU BLAS comparison is not available for non-BLAS routines.
+  static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    return RunReference(args, buffers, queue);
+  }
+
+  static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    return RunReference(args, buffers, queue);
+  }
+
+  // Describes how to download the results of the computation (more importantly: which buffer)
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    std::vector<T> result(args.b_size, static_cast<T>(0));
+    buffers.b_mat.Read(queue, args.b_size, result);
+    return result;
+  }
+
+  // Describes how to compute the indices of the result buffer
+  static size_t ResultID1(const Arguments<T> &args) { return args.m; }
+  static size_t ResultID2(const Arguments<T> &args) { return Ceil(args.n, args.m); }
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
+    return id1 * Ceil(args.n, args.m) + id2;
+  }
+
+  // Describes how to compute performance metrics
+  static size_t GetFlops(const Arguments<T> &args) {
+    const auto block_size = args.m;
+    const auto num_blocks = CeilDiv(args.n, block_size);
+    return num_blocks * (block_size * (block_size / 2) * (block_size / 2));
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    return (args.a_size + args.b_size) * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XINVERT_H_
+#endif
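As an aside for reviewers, the `RunReference` function above is essentially a per-block triangular inversion by substitution: invert the diagonal first, then fill in the strict triangle using the recurrence inv(i,j) = -(sum_k L(i,k) * inv(k,j)) / L(i,i). The standalone CPU sketch below shows the same recurrence for a single dense lower-triangular block (column-major, non-unit diagonal, no bounds padding). It is purely illustrative and not part of the patch; the function and variable names are ours.

```
#include <cstddef>
#include <vector>

// Inverts a single n x n lower-triangular block L (column-major, non-unit
// diagonal). Mirrors the lower-triangle branch of RunReference: first invert
// the diagonal, then compute the strictly lower part row by row.
std::vector<float> InvertLowerTriangular(const std::vector<float>& l, const std::size_t n) {
  std::vector<float> inv(n * n, 0.0f);
  for (std::size_t i = 0; i < n; ++i) {
    inv[i * n + i] = 1.0f / l[i * n + i];        // diagonal: 1 / L(i,i)
  }
  for (std::size_t i = 1; i < n; ++i) {          // row of the result
    for (std::size_t j = 0; j < i; ++j) {        // column of the result
      auto sum = 0.0f;
      for (std::size_t k = j; k < i; ++k) {
        sum += l[k * n + i] * inv[j * n + k];    // L(i,k) * inv(k,j)
      }
      inv[j * n + i] = -sum * inv[i * n + i];    // inv(i,j)
    }
  }
  return inv;
}
```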
diff --git a/test/wrapper_cblas.hpp b/test/wrapper_cblas.hpp
index 5f1db54e..dd610a6c 100644
--- a/test/wrapper_cblas.hpp
+++ b/test/wrapper_cblas.hpp
@@ -2103,20 +2103,6 @@ void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL
                 reinterpret_cast<const double*>(&a_buffer[a_offset]), a_ld,
                 reinterpret_cast<double*>(&b_buffer[b_offset]), b_ld);
 }
-void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
-                const size_t m, const size_t n,
-                const half alpha,
-                const std::vector<half>& a_buffer, const size_t a_offset, const size_t a_ld,
-                std::vector<half>& b_buffer, const size_t b_offset, const size_t b_ld) {
-  auto a_buffer_bis = HalfToFloatBuffer(a_buffer);
-  auto b_buffer_bis = HalfToFloatBuffer(b_buffer);
-  cblasXtrsm(layout, side, triangle, a_transpose, diagonal,
-             m, n,
-             HalfToFloat(alpha),
-             a_buffer_bis, a_offset, a_ld,
-             b_buffer_bis, b_offset, b_ld);
-  FloatToHalfBuffer(b_buffer, b_buffer_bis);
-}
 // =================================================================================================
 } // namespace clblast
diff --git a/test/wrapper_clblas.hpp b/test/wrapper_clblas.hpp
index f1923784..f1b3a0c4 100644
--- a/test/wrapper_clblas.hpp
+++ b/test/wrapper_clblas.hpp
@@ -2865,24 +2865,6 @@ clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const
                          b_buffer(), b_offset, b_ld,
                          num_queues, queues, num_wait_events, wait_events, events);
 }
-clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
-                         const size_t m, const size_t n,
-                         const half alpha,
-                         const Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld,
-                         Buffer<half>& b_buffer, const size_t b_offset, const size_t b_ld,
-                         cl_uint num_queues, cl_command_queue *queues,
-                         cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
-  auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]);
-  auto b_buffer_bis = HalfToFloatBuffer(b_buffer, queues[0]);
-  auto status = clblasXtrsm(layout, side, triangle, a_transpose, diagonal,
-                            m, n,
-                            HalfToFloat(alpha),
-                            a_buffer_bis, a_offset, a_ld,
-                            b_buffer_bis, b_offset, b_ld,
-                            num_queues, queues, num_wait_events, wait_events, events);
-  FloatToHalfBuffer(b_buffer, b_buffer_bis, queues[0]);
-  return status;
-}
 // =================================================================================================
 } // namespace clblast