// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// This file implements all the BLAS API calls. In all cases, it does not much more than creating
// a new object of the appropriate type, and calling the main routine on that object. It forwards
// all status codes to the caller.
//
// =================================================================================================

// NOTE(review): in this copy of the file the first #include names no header, the `template`
// keyword is never followed by a parameter list even though T is used in several signatures, and
// no routine class, Buffer or explicit instantiation carries template arguments (so all explicit
// instantiations of one routine read identically). This looks like angle-bracket text that was
// lost when the file was extracted -- confirm against the canonical sources before compiling.
#include

#include "routines/routines.hpp"
#include "clblast.h"

namespace clblast {

// =================================================================================================
// BLAS level-1 (vector-vector) routines
// =================================================================================================

// Common shape of every implemented entry point in this file:
//   1. the caller's cl_command_queue pointer is dereferenced (without a null check) and wrapped in
//      a Queue;
//   2. a routine object (Xswap, Xscal, ...) is constructed from that Queue and the cl_event pointer;
//   3. the routine's Do...() method is called with the remaining arguments in their original order,
//      each cl_mem handle being wrapped in a Buffer;
//   4. StatusCode::kSuccess is returned if Do...() returns normally; if anything inside the try
//      block throws, the catch-all handler returns whatever DispatchException() yields.
// Each definition is followed by its explicit instantiations, all marked PUBLIC_API.

// Generate givens plane rotation: SROTG/DROTG
// Stub: the parameters are unnamed and unused; always returns StatusCode::kNotImplemented.
template StatusCode Rotg(cl_mem, const size_t,
                         cl_mem, const size_t,
                         cl_mem, const size_t,
                         cl_mem, const size_t,
                         cl_command_queue*, cl_event*) {
  return StatusCode::kNotImplemented;
}
template StatusCode PUBLIC_API Rotg(cl_mem, const size_t,
                                    cl_mem, const size_t,
                                    cl_mem, const size_t,
                                    cl_mem, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Rotg(cl_mem, const size_t,
                                    cl_mem, const size_t,
                                    cl_mem, const size_t,
                                    cl_mem, const size_t,
                                    cl_command_queue*, cl_event*);

// Generate modified givens plane rotation: SROTMG/DROTMG
// Stub: the parameters are unnamed and unused; always returns StatusCode::kNotImplemented.
template StatusCode Rotmg(cl_mem, const size_t,
                          cl_mem, const size_t,
                          cl_mem, const size_t,
                          const cl_mem, const size_t,
                          cl_mem, const size_t,
                          cl_command_queue*, cl_event*) {
  return StatusCode::kNotImplemented;
}
template StatusCode PUBLIC_API Rotmg(cl_mem, const size_t,
                                     cl_mem, const size_t,
                                     cl_mem, const size_t,
                                     const cl_mem, const size_t,
                                     cl_mem, const size_t,
                                     cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Rotmg(cl_mem, const size_t,
                                     cl_mem, const size_t,
                                     cl_mem, const size_t,
                                     const cl_mem, const size_t,
                                     cl_mem, const size_t,
                                     cl_command_queue*, cl_event*);

// Apply givens plane rotation: SROT/DROT
// Stub: the parameters are unnamed and unused; always returns StatusCode::kNotImplemented.
// Instantiated with float and with double as the two scalar arguments.
template StatusCode Rot(const size_t,
                        cl_mem, const size_t, const size_t,
                        cl_mem, const size_t, const size_t,
                        const T,
                        const T,
                        cl_command_queue*, cl_event*) {
  return StatusCode::kNotImplemented;
}
template StatusCode PUBLIC_API Rot(const size_t,
                                   cl_mem, const size_t, const size_t,
                                   cl_mem, const size_t, const size_t,
                                   const float,
                                   const float,
                                   cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Rot(const size_t,
                                   cl_mem, const size_t, const size_t,
                                   cl_mem, const size_t, const size_t,
                                   const double,
                                   const double,
                                   cl_command_queue*, cl_event*);

// Apply modified givens plane rotation: SROTM/DROTM
// Stub: the parameters are unnamed and unused; always returns StatusCode::kNotImplemented.
template StatusCode Rotm(const size_t,
                         cl_mem, const size_t, const size_t,
                         cl_mem, const size_t, const size_t,
                         cl_mem, const size_t,
                         cl_command_queue*, cl_event*) {
  return StatusCode::kNotImplemented;
}
template StatusCode PUBLIC_API Rotm(const size_t,
                                    cl_mem, const size_t, const size_t,
                                    cl_mem, const size_t, const size_t,
                                    cl_mem, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Rotm(const size_t,
                                    cl_mem, const size_t, const size_t,
                                    cl_mem, const size_t, const size_t,
                                    cl_mem, const size_t,
                                    cl_command_queue*, cl_event*);

// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP
// Calls DoSwap on an Xswap routine with n and the (buffer, offset, increment) triples of x and y.
// Both buffers are taken as non-const cl_mem handles.
template StatusCode Swap(const size_t n,
                         cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                         cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xswap(queue_cpp, event);
    routine.DoSwap(n,
                   Buffer(x_buffer), x_offset, x_inc,
                   Buffer(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Swap(const size_t,
                                    cl_mem, const size_t, const size_t,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Swap(const size_t,
                                    cl_mem, const size_t, const size_t,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Swap(const size_t,
                                    cl_mem, const size_t, const size_t,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Swap(const size_t,
                                    cl_mem, const size_t, const size_t,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Swap(const size_t,
                                    cl_mem, const size_t, const size_t,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);

// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL
// Calls DoScal on an Xscal routine with n, the scalar alpha and the x triple.
// Instantiated for alpha of type float, double, float2, double2 and half.
template StatusCode Scal(const size_t n,
                         const T alpha,
                         cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                         cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xscal(queue_cpp, event);
    routine.DoScal(n,
                   alpha,
                   Buffer(x_buffer), x_offset, x_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Scal(const size_t,
                                    const float,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Scal(const size_t,
                                    const double,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Scal(const size_t,
                                    const float2,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Scal(const size_t,
                                    const double2,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Scal(const size_t,
                                    const half,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);

// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY
// Calls DoCopy on an Xcopy routine with n and the x and y triples. x_buffer is declared
// const cl_mem and y_buffer plain cl_mem (presumably source and destination -- confirm in Xcopy).
template StatusCode Copy(const size_t n,
                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                         cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xcopy(queue_cpp, event);
    routine.DoCopy(n,
                   Buffer(x_buffer), x_offset, x_inc,
                   Buffer(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Copy(const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Copy(const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Copy(const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Copy(const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Copy(const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);

// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY
// Calls DoAxpy on an Xaxpy routine with n, alpha and the x and y triples.
// Instantiated for alpha of type float, double, float2, double2 and half.
template StatusCode Axpy(const size_t n,
                         const T alpha,
                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                         cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xaxpy(queue_cpp, event);
    routine.DoAxpy(n,
                   alpha,
                   Buffer(x_buffer), x_offset, x_inc,
                   Buffer(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Axpy(const size_t,
                                    const float,
                                    const cl_mem, const size_t, const size_t,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Axpy(const size_t,
                                    const double,
                                    const cl_mem, const size_t, const size_t,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Axpy(const size_t,
                                    const float2,
                                    const cl_mem, const size_t, const size_t,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Axpy(const size_t,
                                    const double2,
                                    const cl_mem, const size_t, const size_t,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Axpy(const size_t,
                                    const half,
                                    const cl_mem, const size_t, const size_t,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);

// Dot product of two vectors: SDOT/DDOT/HDOT
// Calls DoDot on an Xdot routine. The result location (dot_buffer, dot_offset) is passed ahead of
// the x and y triples; dot_buffer is the only non-const cl_mem argument.
template StatusCode Dot(const size_t n,
                        cl_mem dot_buffer, const size_t dot_offset,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                        cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xdot(queue_cpp, event);
    routine.DoDot(n,
                  Buffer(dot_buffer), dot_offset,
                  Buffer(x_buffer), x_offset, x_inc,
                  Buffer(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Dot(const size_t,
                                   cl_mem, const size_t,
                                   const cl_mem, const size_t, const size_t,
                                   const cl_mem, const size_t, const size_t,
                                   cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Dot(const size_t,
                                   cl_mem, const size_t,
                                   const cl_mem, const size_t, const size_t,
                                   const cl_mem, const size_t, const size_t,
                                   cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Dot(const size_t,
                                   cl_mem, const size_t,
                                   const cl_mem, const size_t, const size_t,
                                   const cl_mem, const size_t, const size_t,
                                   cl_command_queue*, cl_event*);

// Dot product of two complex vectors: CDOTU/ZDOTU
// Same argument layout as Dot, but runs DoDotu on an Xdotu routine.
template StatusCode Dotu(const size_t n,
                         cl_mem dot_buffer, const size_t dot_offset,
                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                         const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                         cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xdotu(queue_cpp, event);
    routine.DoDotu(n,
                   Buffer(dot_buffer), dot_offset,
                   Buffer(x_buffer), x_offset, x_inc,
                   Buffer(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Dotu(const size_t,
                                    cl_mem, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Dotu(const size_t,
                                    cl_mem, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);

// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC
// Same argument layout as Dot, but runs DoDotc on an Xdotc routine.
template StatusCode Dotc(const size_t n,
                         cl_mem dot_buffer, const size_t dot_offset,
                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                         const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                         cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xdotc(queue_cpp, event);
    routine.DoDotc(n,
                   Buffer(dot_buffer), dot_offset,
                   Buffer(x_buffer), x_offset, x_inc,
                   Buffer(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Dotc(const size_t,
                                    cl_mem, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Dotc(const size_t,
                                    cl_mem, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);

// Euclidean norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2
// Calls DoNrm2 on an Xnrm2 routine with n, the result location (nrm2_buffer, nrm2_offset) and the
// x triple.
template StatusCode Nrm2(const size_t n,
                         cl_mem nrm2_buffer, const size_t nrm2_offset,
                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                         cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xnrm2(queue_cpp, event);
    routine.DoNrm2(n,
                   Buffer(nrm2_buffer), nrm2_offset,
                   Buffer(x_buffer), x_offset, x_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Nrm2(const size_t,
                                    cl_mem, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Nrm2(const size_t,
                                    cl_mem, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Nrm2(const size_t,
                                    cl_mem, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Nrm2(const size_t,
                                    cl_mem, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Nrm2(const size_t,
                                    cl_mem, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);

// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM
// Calls DoAsum on an Xasum routine with n, the result location (asum_buffer, asum_offset) and the
// x triple.
template StatusCode Asum(const size_t n,
                         cl_mem asum_buffer, const size_t asum_offset,
                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                         cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xasum(queue_cpp, event);
    routine.DoAsum(n,
                   Buffer(asum_buffer), asum_offset,
                   Buffer(x_buffer), x_offset, x_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Asum(const size_t,
                                    cl_mem, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Asum(const size_t,
                                    cl_mem, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Asum(const size_t,
                                    cl_mem, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Asum(const size_t,
                                    cl_mem, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Asum(const size_t,
                                    cl_mem, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);

// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM
// Calls DoSum on an Xsum routine with n, the result location (sum_buffer, sum_offset) and the
// x triple.
template StatusCode Sum(const size_t n,
                        cl_mem sum_buffer, const size_t sum_offset,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xsum(queue_cpp, event);
    routine.DoSum(n,
                  Buffer(sum_buffer), sum_offset,
                  Buffer(x_buffer), x_offset, x_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Sum(const size_t,
                                   cl_mem, const size_t,
                                   const cl_mem, const size_t, const size_t,
                                   cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Sum(const size_t,
                                   cl_mem, const size_t,
                                   const cl_mem, const size_t, const size_t,
                                   cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Sum(const size_t,
                                   cl_mem, const size_t,
                                   const cl_mem, const size_t, const size_t,
                                   cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Sum(const size_t,
                                   cl_mem, const size_t,
                                   const cl_mem, const size_t, const size_t,
                                   cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Sum(const size_t,
                                   cl_mem, const size_t,
                                   const cl_mem, const size_t, const size_t,
                                   cl_command_queue*, cl_event*);

// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX
// Calls DoAmax on an Xamax routine with n, the result location (imax_buffer, imax_offset) and the
// x triple.
// NOTE(review): the element type of the index buffer is not visible here -- confirm in Xamax.
template StatusCode Amax(const size_t n,
                         cl_mem imax_buffer, const size_t imax_offset,
                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                         cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xamax(queue_cpp, event);
    routine.DoAmax(n,
                   Buffer(imax_buffer), imax_offset,
                   Buffer(x_buffer), x_offset, x_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Amax(const size_t,
                                    cl_mem, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Amax(const size_t,
                                    cl_mem, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Amax(const size_t,
                                    cl_mem, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Amax(const size_t,
                                    cl_mem, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Amax(const size_t,
                                    cl_mem, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);

// Index of absolute minimum value in a vector (non-BLAS function): iSAMIN/iDAMIN/iCAMIN/iZAMIN/iHAMIN
// Calls DoAmin on an Xamin routine with n, the result location (imin_buffer, imin_offset) and the
// x triple.
template StatusCode Amin(const size_t n,
                         cl_mem imin_buffer, const size_t imin_offset,
                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                         cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xamin(queue_cpp, event);
    routine.DoAmin(n,
                   Buffer(imin_buffer), imin_offset,
                   Buffer(x_buffer), x_offset, x_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Amin(const size_t,
                                    cl_mem, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Amin(const size_t,
                                    cl_mem, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Amin(const size_t,
                                    cl_mem, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Amin(const size_t,
                                    cl_mem, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Amin(const size_t,
                                    cl_mem, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);

// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX
// Calls DoMax on an Xmax routine with n, the result location (imax_buffer, imax_offset) and the
// x triple.
template StatusCode Max(const size_t n,
                        cl_mem imax_buffer, const size_t imax_offset,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xmax(queue_cpp, event);
    routine.DoMax(n,
                  Buffer(imax_buffer), imax_offset,
                  Buffer(x_buffer), x_offset, x_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Max(const size_t,
                                   cl_mem, const size_t,
                                   const cl_mem, const size_t, const size_t,
                                   cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Max(const size_t,
                                   cl_mem, const size_t,
                                   const cl_mem, const size_t, const size_t,
                                   cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Max(const size_t,
                                   cl_mem, const size_t,
                                   const cl_mem, const size_t, const size_t,
                                   cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Max(const size_t,
                                   cl_mem, const size_t,
                                   const cl_mem, const size_t, const size_t,
                                   cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Max(const size_t,
                                   cl_mem, const size_t,
                                   const cl_mem, const size_t, const size_t,
                                   cl_command_queue*, cl_event*);

// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN
// Calls DoMin on an Xmin routine with n, the result location (imin_buffer, imin_offset) and the
// x triple.
template StatusCode Min(const size_t n,
                        cl_mem imin_buffer, const size_t imin_offset,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xmin(queue_cpp, event);
    routine.DoMin(n,
                  Buffer(imin_buffer), imin_offset,
                  Buffer(x_buffer), x_offset, x_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Min(const size_t,
                                   cl_mem, const size_t,
                                   const cl_mem, const size_t, const size_t,
                                   cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Min(const size_t,
                                   cl_mem, const size_t,
                                   const cl_mem, const size_t, const size_t,
                                   cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Min(const size_t,
                                   cl_mem, const size_t,
                                   const cl_mem, const size_t, const size_t,
                                   cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Min(const size_t,
                                   cl_mem, const size_t,
                                   const cl_mem, const size_t, const size_t,
                                   cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Min(const size_t,
                                   cl_mem, const size_t,
                                   const cl_mem, const size_t, const size_t,
                                   cl_command_queue*, cl_event*);

// =================================================================================================
// BLAS level-2 (matrix-vector) routines
// =================================================================================================

// The level-2 entry points use the same wrap/construct/call/catch pattern as level-1. Matrices are
// passed as (buffer, offset, leading dimension a_ld), or as (buffer, offset) for the ap_buffer
// variants; vectors are passed as (buffer, offset, increment).

// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV
// Calls DoGemv on an Xgemv routine, forwarding layout, a_transpose, m, n, alpha, the a, x
// arguments, beta and the y arguments. Only y_buffer is a non-const cl_mem.
// Instantiated for alpha/beta of type float, double, float2, double2 and half.
template StatusCode Gemv(const Layout layout, const Transpose a_transpose,
                         const size_t m, const size_t n,
                         const T alpha,
                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                         const T beta,
                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                         cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xgemv(queue_cpp, event);
    routine.DoGemv(layout, a_transpose,
                   m, n,
                   alpha,
                   Buffer(a_buffer), a_offset, a_ld,
                   Buffer(x_buffer), x_offset, x_inc,
                   beta,
                   Buffer(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Gemv(const Layout, const Transpose,
                                    const size_t, const size_t,
                                    const float,
                                    const cl_mem, const size_t, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    const float,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Gemv(const Layout, const Transpose,
                                    const size_t, const size_t,
                                    const double,
                                    const cl_mem, const size_t, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    const double,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Gemv(const Layout, const Transpose,
                                    const size_t, const size_t,
                                    const float2,
                                    const cl_mem, const size_t, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    const float2,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Gemv(const Layout, const Transpose,
                                    const size_t, const size_t,
                                    const double2,
                                    const cl_mem, const size_t, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    const double2,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Gemv(const Layout, const Transpose,
                                    const size_t, const size_t,
                                    const half,
                                    const cl_mem, const size_t, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    const half,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);

// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV
// Calls DoGbmv on an Xgbmv routine. Same arguments as Gemv plus kl and ku, which are forwarded
// directly after m and n.
template StatusCode Gbmv(const Layout layout, const Transpose a_transpose,
                         const size_t m, const size_t n, const size_t kl, const size_t ku,
                         const T alpha,
                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                         const T beta,
                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                         cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xgbmv(queue_cpp, event);
    routine.DoGbmv(layout, a_transpose,
                   m, n, kl, ku,
                   alpha,
                   Buffer(a_buffer), a_offset, a_ld,
                   Buffer(x_buffer), x_offset, x_inc,
                   beta,
                   Buffer(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose,
                                    const size_t, const size_t, const size_t, const size_t,
                                    const float,
                                    const cl_mem, const size_t, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    const float,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose,
                                    const size_t, const size_t, const size_t, const size_t,
                                    const double,
                                    const cl_mem, const size_t, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    const double,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose,
                                    const size_t, const size_t, const size_t, const size_t,
                                    const float2,
                                    const cl_mem, const size_t, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    const float2,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose,
                                    const size_t, const size_t, const size_t, const size_t,
                                    const double2,
                                    const cl_mem, const size_t, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    const double2,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose,
                                    const size_t, const size_t, const size_t, const size_t,
                                    const half,
                                    const cl_mem, const size_t, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    const half,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);

// Hermitian matrix-vector multiplication: CHEMV/ZHEMV
// Calls DoHemv on an Xhemv routine, forwarding layout, triangle, n, alpha, the a and x arguments,
// beta and the y arguments. Instantiated for alpha/beta of type float2 and double2.
template StatusCode Hemv(const Layout layout, const Triangle triangle,
                         const size_t n,
                         const T alpha,
                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                         const T beta,
                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                         cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xhemv(queue_cpp, event);
    routine.DoHemv(layout, triangle,
                   n,
                   alpha,
                   Buffer(a_buffer), a_offset, a_ld,
                   Buffer(x_buffer), x_offset, x_inc,
                   beta,
                   Buffer(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Hemv(const Layout, const Triangle,
                                    const size_t,
                                    const float2,
                                    const cl_mem, const size_t, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    const float2,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Hemv(const Layout, const Triangle,
                                    const size_t,
                                    const double2,
                                    const cl_mem, const size_t, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    const double2,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);

// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV
// Calls DoHbmv on an Xhbmv routine. Same arguments as Hemv plus k, forwarded directly after n.
template StatusCode Hbmv(const Layout layout, const Triangle triangle,
                         const size_t n, const size_t k,
                         const T alpha,
                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                         const T beta,
                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                         cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xhbmv(queue_cpp, event);
    routine.DoHbmv(layout, triangle,
                   n, k,
                   alpha,
                   Buffer(a_buffer), a_offset, a_ld,
                   Buffer(x_buffer), x_offset, x_inc,
                   beta,
                   Buffer(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Hbmv(const Layout, const Triangle,
                                    const size_t, const size_t,
                                    const float2,
                                    const cl_mem, const size_t, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    const float2,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Hbmv(const Layout, const Triangle,
                                    const size_t, const size_t,
                                    const double2,
                                    const cl_mem, const size_t, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    const double2,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);

// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV
// Calls DoHpmv on an Xhpmv routine. The matrix is passed as (ap_buffer, ap_offset) only, with no
// leading dimension.
template StatusCode Hpmv(const Layout layout, const Triangle triangle,
                         const size_t n,
                         const T alpha,
                         const cl_mem ap_buffer, const size_t ap_offset,
                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                         const T beta,
                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                         cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xhpmv(queue_cpp, event);
    routine.DoHpmv(layout, triangle,
                   n,
                   alpha,
                   Buffer(ap_buffer), ap_offset,
                   Buffer(x_buffer), x_offset, x_inc,
                   beta,
                   Buffer(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Hpmv(const Layout, const Triangle,
                                    const size_t,
                                    const float2,
                                    const cl_mem, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    const float2,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Hpmv(const Layout, const Triangle,
                                    const size_t,
                                    const double2,
                                    const cl_mem, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    const double2,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);

// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV
// Calls DoSymv on an Xsymv routine with the same argument layout as Hemv.
// Instantiated for alpha/beta of type float, double and half.
template StatusCode Symv(const Layout layout, const Triangle triangle,
                         const size_t n,
                         const T alpha,
                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                         const T beta,
                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                         cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xsymv(queue_cpp, event);
    routine.DoSymv(layout, triangle,
                   n,
                   alpha,
                   Buffer(a_buffer), a_offset, a_ld,
                   Buffer(x_buffer), x_offset, x_inc,
                   beta,
                   Buffer(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Symv(const Layout, const Triangle,
                                    const size_t,
                                    const float,
                                    const cl_mem, const size_t, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    const float,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Symv(const Layout, const Triangle,
                                    const size_t,
                                    const double,
                                    const cl_mem, const size_t, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    const double,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Symv(const Layout, const Triangle,
                                    const size_t,
                                    const half,
                                    const cl_mem, const size_t, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    const half,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);

// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV
// Calls DoSbmv on an Xsbmv routine with the same argument layout as Hbmv.
template StatusCode Sbmv(const Layout layout, const Triangle triangle,
                         const size_t n, const size_t k,
                         const T alpha,
                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                         const T beta,
                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                         cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xsbmv(queue_cpp, event);
    routine.DoSbmv(layout, triangle,
                   n, k,
                   alpha,
                   Buffer(a_buffer), a_offset, a_ld,
                   Buffer(x_buffer), x_offset, x_inc,
                   beta,
                   Buffer(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Sbmv(const Layout, const Triangle,
                                    const size_t, const size_t,
                                    const float,
                                    const cl_mem, const size_t, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    const float,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Sbmv(const Layout, const Triangle,
                                    const size_t, const size_t,
                                    const double,
                                    const cl_mem, const size_t, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    const double,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Sbmv(const Layout, const Triangle,
                                    const size_t, const size_t,
                                    const half,
                                    const cl_mem, const size_t, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    const half,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);

// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV
// Calls DoSpmv on an Xspmv routine with the same argument layout as Hpmv.
template StatusCode Spmv(const Layout layout, const Triangle triangle,
                         const size_t n,
                         const T alpha,
                         const cl_mem ap_buffer, const size_t ap_offset,
                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                         const T beta,
                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                         cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xspmv(queue_cpp, event);
    routine.DoSpmv(layout, triangle,
                   n,
                   alpha,
                   Buffer(ap_buffer), ap_offset,
                   Buffer(x_buffer), x_offset, x_inc,
                   beta,
                   Buffer(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Spmv(const Layout, const Triangle,
                                    const size_t,
                                    const float,
                                    const cl_mem, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    const float,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Spmv(const Layout, const Triangle,
                                    const size_t,
                                    const double,
                                    const cl_mem, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    const double,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Spmv(const Layout, const Triangle,
                                    const size_t,
                                    const half,
                                    const cl_mem, const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    const half,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);

// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV
// Calls DoTrmv on an Xtrmv routine, forwarding layout, triangle, a_transpose, diagonal, n and the
// a and x arguments. There are no scalar or y arguments; x_buffer is the only non-const cl_mem.
template StatusCode Trmv(const Layout layout, const Triangle triangle,
                         const Transpose a_transpose, const Diagonal diagonal,
                         const size_t n,
                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                         cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                         cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xtrmv(queue_cpp, event);
    routine.DoTrmv(layout, triangle, a_transpose, diagonal,
                   n,
                   Buffer(a_buffer), a_offset, a_ld,
                   Buffer(x_buffer), x_offset, x_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal,
                                    const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal,
                                    const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal,
                                    const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal,
                                    const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal,
                                    const size_t,
                                    const cl_mem, const size_t, const size_t,
                                    cl_mem, const size_t, const size_t,
                                    cl_command_queue*, cl_event*);

// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV
template StatusCode Tbmv(const Layout layout, const Triangle triangle,
                         const Transpose a_transpose, const Diagonal diagonal,
                         const size_t n, const size_t k,
                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                         cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                         cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xtbmv(queue_cpp, event);
    routine.DoTbmv(layout, triangle, a_transpose, diagonal,
                   n, k,
                   Buffer(a_buffer), a_offset, a_ld,
                   Buffer(x_buffer), x_offset, x_inc);
    return StatusCode::kSuccess;
  } catch (...)
{ return DispatchException(); } } template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV template StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xtpmv(queue_cpp, event); routine.DoTpmv(layout, triangle, a_transpose, diagonal, n, Buffer(ap_buffer), ap_offset, Buffer(x_buffer), x_offset, x_inc); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV template StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xtrsv(queue_cpp, event); routine.DoTrsv(layout, triangle, a_transpose, diagonal, n, Buffer(a_buffer), a_offset, a_ld, Buffer(x_buffer), x_offset, x_inc); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV template StatusCode Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*) { return StatusCode::kNotImplemented; } template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API 
Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV template StatusCode Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*) { return StatusCode::kNotImplemented; } template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // General rank-1 matrix update: SGER/DGER/HGER template StatusCode Ger(const Layout layout, const size_t m, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xger(queue_cpp, event); routine.DoGer(layout, m, n, alpha, Buffer(x_buffer), x_offset, x_inc, Buffer(y_buffer), y_offset, y_inc, Buffer(a_buffer), a_offset, a_ld); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Ger(const Layout, const size_t, const size_t, const float, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Ger(const Layout, const size_t, const size_t, const double, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Ger(const Layout, const size_t, const size_t, const half, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // General rank-1 complex matrix update: CGERU/ZGERU template StatusCode Geru(const Layout layout, const size_t m, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xgeru(queue_cpp, event); routine.DoGeru(layout, m, n, alpha, Buffer(x_buffer), x_offset, x_inc, Buffer(y_buffer), y_offset, y_inc, Buffer(a_buffer), a_offset, a_ld); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Geru(const Layout, const size_t, const size_t, const float2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Geru(const Layout, const size_t, const size_t, const double2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // General rank-1 complex conjugated matrix update: CGERC/ZGERC template StatusCode Gerc(const Layout layout, const size_t m, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xgerc(queue_cpp, event); routine.DoGerc(layout, m, n, alpha, Buffer(x_buffer), x_offset, x_inc, Buffer(y_buffer), y_offset, y_inc, Buffer(a_buffer), a_offset, a_ld); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Gerc(const Layout, const size_t, const size_t, const float2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Gerc(const Layout, const size_t, const size_t, const double2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Hermitian rank-1 matrix update: CHER/ZHER template StatusCode Her(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xher,T>(queue_cpp, event); routine.DoHer(layout, triangle, n, alpha, Buffer>(x_buffer), x_offset, x_inc, Buffer>(a_buffer), a_offset, a_ld); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Her(const Layout, const Triangle, const size_t, const float, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Her(const Layout, const Triangle, const size_t, const double, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Hermitian packed rank-1 matrix update: CHPR/ZHPR template StatusCode Hpr(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xhpr,T>(queue_cpp, event); routine.DoHpr(layout, triangle, n, alpha, Buffer>(x_buffer), x_offset, x_inc, Buffer>(ap_buffer), ap_offset); return StatusCode::kSuccess; } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Hpr(const Layout, const Triangle, const size_t, const float, const cl_mem, const size_t, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Hpr(const Layout, const Triangle, const size_t, const double, const cl_mem, const size_t, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*); // Hermitian rank-2 matrix update: CHER2/ZHER2 template StatusCode Her2(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xher2(queue_cpp, event); routine.DoHer2(layout, triangle, n, alpha, Buffer(x_buffer), x_offset, x_inc, Buffer(y_buffer), y_offset, y_inc, Buffer(a_buffer), a_offset, a_ld); 
return StatusCode::kSuccess; } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Her2(const Layout, const Triangle, const size_t, const float2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Her2(const Layout, const Triangle, const size_t, const double2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 template StatusCode Hpr2(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xhpr2(queue_cpp, event); routine.DoHpr2(layout, triangle, n, alpha, Buffer(x_buffer), x_offset, x_inc, Buffer(y_buffer), y_offset, y_inc, Buffer(ap_buffer), ap_offset); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Hpr2(const Layout, const Triangle, const size_t, const float2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Hpr2(const Layout, const Triangle, const size_t, const double2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*); // Symmetric rank-1 matrix update: SSYR/DSYR/HSYR template StatusCode Syr(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xsyr(queue_cpp, event); routine.DoSyr(layout, triangle, n, alpha, Buffer(x_buffer), x_offset, x_inc, Buffer(a_buffer), a_offset, a_ld); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Syr(const Layout, const Triangle, const size_t, const float, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Syr(const Layout, const Triangle, const size_t, const double, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Syr(const Layout, const Triangle, const size_t, const half, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR template StatusCode Spr(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xspr(queue_cpp, event); routine.DoSpr(layout, triangle, n, alpha, Buffer(x_buffer), x_offset, x_inc, Buffer(ap_buffer), ap_offset); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Spr(const Layout, const Triangle, const size_t, const float, const cl_mem, const size_t, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Spr(const Layout, const Triangle, const size_t, const double, const cl_mem, const size_t, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Spr(const Layout, const Triangle, const size_t, const half, const cl_mem, const size_t, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*); // Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 template StatusCode Syr2(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xsyr2(queue_cpp, event); routine.DoSyr2(layout, triangle, n, alpha, Buffer(x_buffer), x_offset, x_inc, Buffer(y_buffer), y_offset, y_inc, Buffer(a_buffer), a_offset, a_ld); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Syr2(const Layout, const Triangle, const size_t, const float, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Syr2(const Layout, const Triangle, const size_t, const double, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Syr2(const Layout, const Triangle, const size_t, const half, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 template StatusCode Spr2(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xspr2(queue_cpp, event); routine.DoSpr2(layout, triangle, n, alpha, Buffer(x_buffer), x_offset, x_inc, Buffer(y_buffer), y_offset, y_inc, Buffer(ap_buffer), ap_offset); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Spr2(const Layout, const Triangle, const size_t, const float, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Spr2(const Layout, const Triangle, const size_t, const double, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Spr2(const Layout, const Triangle, const size_t, const half, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*); // ================================================================================================= // BLAS level-3 (matrix-matrix) routines // ================================================================================================= // General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM template StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const size_t m, const size_t n, const size_t k, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event, cl_mem temp_buffer) { try { auto queue_cpp = Queue(*queue); auto routine = Xgemm(queue_cpp, event); const auto temp_buffer_provided = temp_buffer != nullptr; auto temp_buffer_cpp = temp_buffer_provided ? Buffer(temp_buffer) : Buffer(nullptr); routine.DoGemm(layout, a_transpose, b_transpose, m, n, k, alpha, Buffer(a_buffer), a_offset, a_ld, Buffer(b_buffer), b_offset, b_ld, beta, Buffer(c_buffer), c_offset, c_ld, temp_buffer_cpp, temp_buffer_provided); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const float, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const float, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*, cl_mem); template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const double, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*, cl_mem); template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const float2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const float2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*, cl_mem); template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const double2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*, cl_mem); template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const half, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const half, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*, cl_mem); // Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM template StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, const size_t m, const size_t n, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta, cl_mem c_buffer, const size_t c_offset, const size_t 
c_ld, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xsymm(queue_cpp, event); routine.DoSymm(layout, side, triangle, m, n, alpha, Buffer(a_buffer), a_offset, a_ld, Buffer(b_buffer), b_offset, b_ld, beta, Buffer(c_buffer), c_offset, c_ld); return StatusCode::kSuccess; } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, const size_t, const size_t, const float, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const float, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, const size_t, const size_t, const double, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, const size_t, const size_t, const float2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const float2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, const size_t, const size_t, const double2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, const size_t, const size_t, const half, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const half, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*);

// -------------------------------------------------------------------------------------------------
// Common shape of the wrappers below: each one wraps the raw OpenCL queue in a Queue object,
// constructs the matching routine object, forwards its arguments to that routine's Do<Name>()
// method (with every cl_mem wrapped in a typed Buffer) and returns StatusCode::kSuccess. Anything
// thrown along the way is caught and the value of DispatchException() is returned instead. The
// queue pointer is dereferenced unconditionally, so callers must pass a non-null queue. Each
// wrapper is followed by its explicit instantiations, one per supported precision.
// -------------------------------------------------------------------------------------------------

// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM
template <typename T>
StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle,
                const size_t m, const size_t n,
                const T alpha,
                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                const T beta,
                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xhemm<T>(queue_cpp, event);
    routine.DoHemm(layout, side, triangle,
                   m, n,
                   alpha,
                   Buffer<T>(a_buffer), a_offset, a_ld,
                   Buffer<T>(b_buffer), b_offset, b_ld,
                   beta,
                   Buffer<T>(c_buffer), c_offset, c_ld);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Hemm<float2>(const Layout, const Side, const Triangle,
                                            const size_t, const size_t,
                                            const float2,
                                            const cl_mem, const size_t, const size_t,
                                            const cl_mem, const size_t, const size_t,
                                            const float2,
                                            cl_mem, const size_t, const size_t,
                                            cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Hemm<double2>(const Layout, const Side, const Triangle,
                                             const size_t, const size_t,
                                             const double2,
                                             const cl_mem, const size_t, const size_t,
                                             const cl_mem, const size_t, const size_t,
                                             const double2,
                                             cl_mem, const size_t, const size_t,
                                             cl_command_queue*, cl_event*);

// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK
template <typename T>
StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
                const size_t n, const size_t k,
                const T alpha,
                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                const T beta,
                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xsyrk<T>(queue_cpp, event);
    routine.DoSyrk(layout, triangle, a_transpose,
                   n, k,
                   alpha,
                   Buffer<T>(a_buffer), a_offset, a_ld,
                   beta,
                   Buffer<T>(c_buffer), c_offset, c_ld);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Syrk<float>(const Layout, const Triangle, const Transpose,
                                           const size_t, const size_t,
                                           const float,
                                           const cl_mem, const size_t, const size_t,
                                           const float,
                                           cl_mem, const size_t, const size_t,
                                           cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Syrk<double>(const Layout, const Triangle, const Transpose,
                                            const size_t, const size_t,
                                            const double,
                                            const cl_mem, const size_t, const size_t,
                                            const double,
                                            cl_mem, const size_t, const size_t,
                                            cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Syrk<float2>(const Layout, const Triangle, const Transpose,
                                            const size_t, const size_t,
                                            const float2,
                                            const cl_mem, const size_t, const size_t,
                                            const float2,
                                            cl_mem, const size_t, const size_t,
                                            cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Syrk<double2>(const Layout, const Triangle, const Transpose,
                                             const size_t, const size_t,
                                             const double2,
                                             const cl_mem, const size_t, const size_t,
                                             const double2,
                                             cl_mem, const size_t, const size_t,
                                             cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Syrk<half>(const Layout, const Triangle, const Transpose,
                                          const size_t, const size_t,
                                          const half,
                                          const cl_mem, const size_t, const size_t,
                                          const half,
                                          cl_mem, const size_t, const size_t,
                                          cl_command_queue*, cl_event*);

// Rank-K update of a hermitian matrix: CHERK/ZHERK
// Here T is the real scalar type of alpha/beta; the matrices are accessed as std::complex<T>.
template <typename T>
StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
                const size_t n, const size_t k,
                const T alpha,
                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                const T beta,
                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xherk<std::complex<T>,T>(queue_cpp, event);
    routine.DoHerk(layout, triangle, a_transpose,
                   n, k,
                   alpha,
                   Buffer<std::complex<T>>(a_buffer), a_offset, a_ld,
                   beta,
                   Buffer<std::complex<T>>(c_buffer), c_offset, c_ld);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Herk<float>(const Layout, const Triangle, const Transpose,
                                           const size_t, const size_t,
                                           const float,
                                           const cl_mem, const size_t, const size_t,
                                           const float,
                                           cl_mem, const size_t, const size_t,
                                           cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Herk<double>(const Layout, const Triangle, const Transpose,
                                            const size_t, const size_t,
                                            const double,
                                            const cl_mem, const size_t, const size_t,
                                            const double,
                                            cl_mem, const size_t, const size_t,
                                            cl_command_queue*, cl_event*);

// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K
template <typename T>
StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
                 const size_t n, const size_t k,
                 const T alpha,
                 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                 const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                 const T beta,
                 cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                 cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xsyr2k<T>(queue_cpp, event);
    routine.DoSyr2k(layout, triangle, ab_transpose,
                    n, k,
                    alpha,
                    Buffer<T>(a_buffer), a_offset, a_ld,
                    Buffer<T>(b_buffer), b_offset, b_ld,
                    beta,
                    Buffer<T>(c_buffer), c_offset, c_ld);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Syr2k<float>(const Layout, const Triangle, const Transpose,
                                            const size_t, const size_t,
                                            const float,
                                            const cl_mem, const size_t, const size_t,
                                            const cl_mem, const size_t, const size_t,
                                            const float,
                                            cl_mem, const size_t, const size_t,
                                            cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Syr2k<double>(const Layout, const Triangle, const Transpose,
                                             const size_t, const size_t,
                                             const double,
                                             const cl_mem, const size_t, const size_t,
                                             const cl_mem, const size_t, const size_t,
                                             const double,
                                             cl_mem, const size_t, const size_t,
                                             cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Syr2k<float2>(const Layout, const Triangle, const Transpose,
                                             const size_t, const size_t,
                                             const float2,
                                             const cl_mem, const size_t, const size_t,
                                             const cl_mem, const size_t, const size_t,
                                             const float2,
                                             cl_mem, const size_t, const size_t,
                                             cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Syr2k<double2>(const Layout, const Triangle, const Transpose,
                                              const size_t, const size_t,
                                              const double2,
                                              const cl_mem, const size_t, const size_t,
                                              const cl_mem, const size_t, const size_t,
                                              const double2,
                                              cl_mem, const size_t, const size_t,
                                              cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Syr2k<half>(const Layout, const Triangle, const Transpose,
                                           const size_t, const size_t,
                                           const half,
                                           const cl_mem, const size_t, const size_t,
                                           const cl_mem, const size_t, const size_t,
                                           const half,
                                           cl_mem, const size_t, const size_t,
                                           cl_command_queue*, cl_event*);

// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K
// T is the (complex) type of alpha and of the matrices; U is the type of beta.
template <typename T, typename U>
StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
                 const size_t n, const size_t k,
                 const T alpha,
                 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                 const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                 const U beta,
                 cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                 cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xher2k<T,U>(queue_cpp, event);
    routine.DoHer2k(layout, triangle, ab_transpose,
                    n, k,
                    alpha,
                    Buffer<T>(a_buffer), a_offset, a_ld,
                    Buffer<T>(b_buffer), b_offset, b_ld,
                    beta,
                    Buffer<T>(c_buffer), c_offset, c_ld);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Her2k<float2,float>(const Layout, const Triangle, const Transpose,
                                                   const size_t, const size_t,
                                                   const float2,
                                                   const cl_mem, const size_t, const size_t,
                                                   const cl_mem, const size_t, const size_t,
                                                   const float,
                                                   cl_mem, const size_t, const size_t,
                                                   cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Her2k<double2,double>(const Layout, const Triangle, const Transpose,
                                                     const size_t, const size_t,
                                                     const double2,
                                                     const cl_mem, const size_t, const size_t,
                                                     const cl_mem, const size_t, const size_t,
                                                     const double,
                                                     cl_mem, const size_t, const size_t,
                                                     cl_command_queue*, cl_event*);

// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM
// The B buffer is the only non-const cl_mem, i.e. it is the one the routine may write to.
template <typename T>
StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle,
                const Transpose a_transpose, const Diagonal diagonal,
                const size_t m, const size_t n,
                const T alpha,
                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xtrmm<T>(queue_cpp, event);
    routine.DoTrmm(layout, side, triangle, a_transpose, diagonal,
                   m, n,
                   alpha,
                   Buffer<T>(a_buffer), a_offset, a_ld,
                   Buffer<T>(b_buffer), b_offset, b_ld);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Trmm<float>(const Layout, const Side, const Triangle,
                                           const Transpose, const Diagonal,
                                           const size_t, const size_t,
                                           const float,
                                           const cl_mem, const size_t, const size_t,
                                           cl_mem, const size_t, const size_t,
                                           cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Trmm<double>(const Layout, const Side, const Triangle,
                                            const Transpose, const Diagonal,
                                            const size_t, const size_t,
                                            const double,
                                            const cl_mem, const size_t, const size_t,
                                            cl_mem, const size_t, const size_t,
                                            cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Trmm<float2>(const Layout, const Side, const Triangle,
                                            const Transpose, const Diagonal,
                                            const size_t, const size_t,
                                            const float2,
                                            const cl_mem, const size_t, const size_t,
                                            cl_mem, const size_t, const size_t,
                                            cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Trmm<double2>(const Layout, const Side, const Triangle,
                                             const Transpose, const Diagonal,
                                             const size_t, const size_t,
                                             const double2,
                                             const cl_mem, const size_t, const size_t,
                                             cl_mem, const size_t, const size_t,
                                             cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Trmm<half>(const Layout, const Side, const Triangle,
                                          const Transpose, const Diagonal,
                                          const size_t, const size_t,
                                          const half,
                                          const cl_mem, const size_t, const size_t,
                                          cl_mem, const size_t, const size_t,
                                          cl_command_queue*, cl_event*);

// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
template <typename T>
StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle,
                const Transpose a_transpose, const Diagonal diagonal,
                const size_t m, const size_t n,
                const T alpha,
                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xtrsm<T>(queue_cpp, event);
    routine.DoTrsm(layout, side, triangle, a_transpose, diagonal,
                   m, n,
                   alpha,
                   Buffer<T>(a_buffer), a_offset, a_ld,
                   Buffer<T>(b_buffer), b_offset, b_ld);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Trsm<float>(const Layout, const Side, const Triangle,
                                           const Transpose, const Diagonal,
                                           const size_t, const size_t,
                                           const float,
                                           const cl_mem, const size_t, const size_t,
                                           cl_mem, const size_t, const size_t,
                                           cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Trsm<double>(const Layout, const Side, const Triangle,
                                            const Transpose, const Diagonal,
                                            const size_t, const size_t,
                                            const double,
                                            const cl_mem, const size_t, const size_t,
                                            cl_mem, const size_t, const size_t,
                                            cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Trsm<float2>(const Layout, const Side, const Triangle,
                                            const Transpose, const Diagonal,
                                            const size_t, const size_t,
                                            const float2,
                                            const cl_mem, const size_t, const size_t,
                                            cl_mem, const size_t, const size_t,
                                            cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Trsm<double2>(const Layout, const Side, const Triangle,
                                             const Transpose, const Diagonal,
                                             const size_t, const size_t,
                                             const double2,
                                             const cl_mem, const size_t, const size_t,
                                             cl_mem, const size_t, const size_t,
                                             cl_command_queue*, cl_event*);

// =================================================================================================
// Extra non-BLAS routines (level-X)
// =================================================================================================

// Element-wise vector product (Hadamard): SHAD/DHAD/CHAD/ZHAD/HHAD
template <typename T>
StatusCode Had(const size_t n,
               const T alpha,
               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
               const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
               const T beta,
               cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
               cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xhad<T>(queue_cpp, event);
    routine.DoHad(n,
                  alpha,
                  Buffer<T>(x_buffer), x_offset, x_inc,
                  Buffer<T>(y_buffer), y_offset, y_inc,
                  beta,
                  Buffer<T>(z_buffer), z_offset, z_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Had<float>(const size_t,
                                          const float,
                                          const cl_mem, const size_t, const size_t,
                                          const cl_mem, const size_t, const size_t,
                                          const float,
                                          cl_mem, const size_t, const size_t,
                                          cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Had<double>(const size_t,
                                           const double,
                                           const cl_mem, const size_t, const size_t,
                                           const cl_mem, const size_t, const size_t,
                                           const double,
                                           cl_mem, const size_t, const size_t,
                                           cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Had<float2>(const size_t,
                                           const float2,
                                           const cl_mem, const size_t, const size_t,
                                           const cl_mem, const size_t, const size_t,
                                           const float2,
                                           cl_mem, const size_t, const size_t,
                                           cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Had<double2>(const size_t,
                                            const double2,
                                            const cl_mem, const size_t, const size_t,
                                            const cl_mem, const size_t, const size_t,
                                            const double2,
                                            cl_mem, const size_t, const size_t,
                                            cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Had<half>(const size_t,
                                         const half,
                                         const cl_mem, const size_t, const size_t,
                                         const cl_mem, const size_t, const size_t,
                                         const half,
                                         cl_mem, const size_t, const size_t,
                                         cl_command_queue*, cl_event*);

// Scaling and out-of-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY
template <typename T>
StatusCode Omatcopy(const Layout layout, const Transpose a_transpose,
                    const size_t m, const size_t n,
                    const T alpha,
                    const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                    cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                    cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xomatcopy<T>(queue_cpp, event);
    routine.DoOmatcopy(layout, a_transpose,
                       m, n,
                       alpha,
                       Buffer<T>(a_buffer), a_offset, a_ld,
                       Buffer<T>(b_buffer), b_offset, b_ld);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Omatcopy<float>(const Layout, const Transpose,
                                               const size_t, const size_t,
                                               const float,
                                               const cl_mem, const size_t, const size_t,
                                               cl_mem, const size_t, const size_t,
                                               cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Omatcopy<double>(const Layout, const Transpose,
                                                const size_t, const size_t,
                                                const double,
                                                const cl_mem, const size_t, const size_t,
                                                cl_mem, const size_t, const size_t,
                                                cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Omatcopy<float2>(const Layout, const Transpose,
                                                const size_t, const size_t,
                                                const float2,
                                                const cl_mem, const size_t, const size_t,
                                                cl_mem, const size_t, const size_t,
                                                cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Omatcopy<double2>(const Layout, const Transpose,
                                                 const size_t, const size_t,
                                                 const double2,
                                                 const cl_mem, const size_t, const size_t,
                                                 cl_mem, const size_t, const size_t,
                                                 cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Omatcopy<half>(const Layout, const Transpose,
                                              const size_t, const size_t,
                                              const half,
                                              const cl_mem, const size_t, const size_t,
                                              cl_mem, const size_t, const size_t,
                                              cl_command_queue*, cl_event*);

// Im2col function (non-BLAS function): SIM2COL/DIM2COL/CIM2COL/ZIM2COL/HIM2COL
// No parameter depends on T, so the precision must always be named explicitly (Im2col<float>(...)).
template <typename T>
StatusCode Im2col(const KernelMode kernel_mode,
                  const size_t channels, const size_t height, const size_t width,
                  const size_t kernel_h, const size_t kernel_w,
                  const size_t pad_h, const size_t pad_w,
                  const size_t stride_h, const size_t stride_w,
                  const size_t dilation_h, const size_t dilation_w,
                  const cl_mem im_buffer, const size_t im_offset,
                  cl_mem col_buffer, const size_t col_offset,
                  cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xim2col<T>(queue_cpp, event);
    routine.DoIm2col(kernel_mode,
                     channels, height, width,
                     kernel_h, kernel_w,
                     pad_h, pad_w,
                     stride_h, stride_w,
                     dilation_h, dilation_w,
                     Buffer<T>(im_buffer), im_offset,
                     Buffer<T>(col_buffer), col_offset);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Im2col<float>(const KernelMode,
                                             const size_t, const size_t, const size_t,
                                             const size_t, const size_t, const size_t, const size_t,
                                             const size_t, const size_t, const size_t, const size_t,
                                             const cl_mem, const size_t, cl_mem, const size_t,
                                             cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Im2col<double>(const KernelMode,
                                              const size_t, const size_t, const size_t,
                                              const size_t, const size_t, const size_t, const size_t,
                                              const size_t, const size_t, const size_t, const size_t,
                                              const cl_mem, const size_t, cl_mem, const size_t,
                                              cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Im2col<float2>(const KernelMode,
                                              const size_t, const size_t, const size_t,
                                              const size_t, const size_t, const size_t, const size_t,
                                              const size_t, const size_t, const size_t, const size_t,
                                              const cl_mem, const size_t, cl_mem, const size_t,
                                              cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Im2col<double2>(const KernelMode,
                                               const size_t, const size_t, const size_t,
                                               const size_t, const size_t, const size_t, const size_t,
                                               const size_t, const size_t, const size_t, const size_t,
                                               const cl_mem, const size_t, cl_mem, const size_t,
                                               cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Im2col<half>(const KernelMode,
                                            const size_t, const size_t, const size_t,
                                            const size_t, const size_t, const size_t, const size_t,
                                            const size_t, const size_t, const size_t, const size_t,
                                            const cl_mem, const size_t, cl_mem, const size_t,
                                            cl_command_queue*, cl_event*);

// Col2im function (non-BLAS function): SCOL2IM/DCOL2IM/CCOL2IM/ZCOL2IM/HCOL2IM
// Mirror image of Im2col: here the col buffer is the const input and the im buffer is writable.
template <typename T>
StatusCode Col2im(const KernelMode kernel_mode,
                  const size_t channels, const size_t height, const size_t width,
                  const size_t kernel_h, const size_t kernel_w,
                  const size_t pad_h, const size_t pad_w,
                  const size_t stride_h, const size_t stride_w,
                  const size_t dilation_h, const size_t dilation_w,
                  const cl_mem col_buffer, const size_t col_offset,
                  cl_mem im_buffer, const size_t im_offset,
                  cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xcol2im<T>(queue_cpp, event);
    routine.DoCol2im(kernel_mode,
                     channels, height, width,
                     kernel_h, kernel_w,
                     pad_h, pad_w,
                     stride_h, stride_w,
                     dilation_h, dilation_w,
                     Buffer<T>(col_buffer), col_offset,
                     Buffer<T>(im_buffer), im_offset);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Col2im<float>(const KernelMode,
                                             const size_t, const size_t, const size_t,
                                             const size_t, const size_t, const size_t, const size_t,
                                             const size_t, const size_t, const size_t, const size_t,
                                             const cl_mem, const size_t, cl_mem, const size_t,
                                             cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Col2im<double>(const KernelMode,
                                              const size_t, const size_t, const size_t,
                                              const size_t, const size_t, const size_t, const size_t,
                                              const size_t, const size_t, const size_t, const size_t,
                                              const cl_mem, const size_t, cl_mem, const size_t,
                                              cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Col2im<float2>(const KernelMode,
                                              const size_t, const size_t, const size_t,
                                              const size_t, const size_t, const size_t, const size_t,
                                              const size_t, const size_t, const size_t, const size_t,
                                              const cl_mem, const size_t, cl_mem, const size_t,
                                              cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Col2im<double2>(const KernelMode,
                                               const size_t, const size_t, const size_t,
                                               const size_t, const size_t, const size_t, const size_t,
                                               const size_t, const size_t, const size_t, const size_t,
                                               const cl_mem, const size_t, cl_mem, const size_t,
                                               cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Col2im<half>(const KernelMode,
                                            const size_t, const size_t, const size_t,
                                            const size_t, const size_t, const size_t, const size_t,
                                            const size_t, const size_t, const size_t, const size_t,
                                            const cl_mem, const size_t, cl_mem, const size_t,
                                            cl_command_queue*, cl_event*);

// Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/HCONVGEMM
// As with Im2col, T cannot be deduced from the arguments and must be named by the caller.
template <typename T>
StatusCode Convgemm(const KernelMode kernel_mode,
                    const size_t channels, const size_t height, const size_t width,
                    const size_t kernel_h, const size_t kernel_w,
                    const size_t pad_h, const size_t pad_w,
                    const size_t stride_h, const size_t stride_w,
                    const size_t dilation_h, const size_t dilation_w,
                    const size_t num_kernels, const size_t batch_count,
                    const cl_mem im_buffer, const size_t im_offset,
                    const cl_mem kernel_buffer, const size_t kernel_offset,
                    cl_mem result_buffer, const size_t result_offset,
                    cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xconvgemm<T>(queue_cpp, event);
    routine.DoConvgemm(kernel_mode,
                       channels, height, width,
                       kernel_h, kernel_w,
                       pad_h, pad_w,
                       stride_h, stride_w,
                       dilation_h, dilation_w,
                       num_kernels, batch_count,
                       Buffer<T>(im_buffer), im_offset,
                       Buffer<T>(kernel_buffer), kernel_offset,
                       Buffer<T>(result_buffer), result_offset);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Convgemm<float>(const KernelMode,
                                               const size_t, const size_t, const size_t,
                                               const size_t, const size_t, const size_t, const size_t,
                                               const size_t, const size_t, const size_t, const size_t,
                                               const size_t, const size_t,
                                               const cl_mem, const size_t,
                                               const cl_mem, const size_t,
                                               cl_mem, const size_t,
                                               cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Convgemm<double>(const KernelMode,
                                                const size_t, const size_t, const size_t,
                                                const size_t, const size_t, const size_t, const size_t,
                                                const size_t, const size_t, const size_t, const size_t,
                                                const size_t, const size_t,
                                                const cl_mem, const size_t,
                                                const cl_mem, const size_t,
                                                cl_mem, const size_t,
                                                cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Convgemm<half>(const KernelMode,
                                              const size_t, const size_t, const size_t,
                                              const size_t, const size_t, const size_t, const size_t,
                                              const size_t, const size_t, const size_t, const size_t,
                                              const size_t, const size_t,
                                              const cl_mem, const size_t,
                                              const cl_mem, const size_t,
                                              cl_mem, const size_t,
                                              cl_command_queue*, cl_event*);

// Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED
// alphas, x_offsets and y_offsets are C arrays from which exactly batch_count elements are read.
template <typename T>
StatusCode AxpyBatched(const size_t n,
                       const T *alphas,
                       const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
                       cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
                       const size_t batch_count,
                       cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = XaxpyBatched<T>(queue_cpp, event);
    // Copies the per-batch C arrays into vectors in one go (single allocation per vector)
    auto alphas_cpp = std::vector<T>(alphas, alphas + batch_count);
    auto x_offsets_cpp = std::vector<size_t>(x_offsets, x_offsets + batch_count);
    auto y_offsets_cpp = std::vector<size_t>(y_offsets, y_offsets + batch_count);
    routine.DoAxpyBatched(n,
                          alphas_cpp,
                          Buffer<T>(x_buffer), x_offsets_cpp, x_inc,
                          Buffer<T>(y_buffer), y_offsets_cpp, y_inc,
                          batch_count);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API AxpyBatched<float>(const size_t,
                                                  const float*,
                                                  const cl_mem, const size_t*, const size_t,
                                                  cl_mem, const size_t*, const size_t,
                                                  const size_t,
                                                  cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API AxpyBatched<double>(const size_t,
                                                   const double*,
                                                   const cl_mem, const size_t*, const size_t,
                                                   cl_mem, const size_t*, const size_t,
                                                   const size_t,
                                                   cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API AxpyBatched<float2>(const size_t,
                                                   const float2*,
                                                   const cl_mem, const size_t*, const size_t,
                                                   cl_mem, const size_t*, const size_t,
                                                   const size_t,
                                                   cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API AxpyBatched<double2>(const size_t,
                                                    const double2*,
                                                    const cl_mem, const size_t*, const size_t,
                                                    cl_mem, const size_t*, const size_t,
                                                    const size_t,
                                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API AxpyBatched<half>(const size_t,
                                                 const half*,
                                                 const cl_mem, const size_t*, const size_t,
                                                 cl_mem, const size_t*, const size_t,
                                                 const size_t,
                                                 cl_command_queue*, cl_event*);

// Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED
// alphas, betas and the three offset arrays are C arrays from which exactly batch_count elements
// are read.
template <typename T>
StatusCode GemmBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
                       const size_t m, const size_t n, const size_t k,
                       const T *alphas,
                       const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
                       const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
                       const T *betas,
                       cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
                       const size_t batch_count,
                       cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = XgemmBatched<T>(queue_cpp, event);
    // Copies the per-batch C arrays into vectors in one go (single allocation per vector)
    auto alphas_cpp = std::vector<T>(alphas, alphas + batch_count);
    auto betas_cpp = std::vector<T>(betas, betas + batch_count);
    auto a_offsets_cpp = std::vector<size_t>(a_offsets, a_offsets + batch_count);
    auto b_offsets_cpp = std::vector<size_t>(b_offsets, b_offsets + batch_count);
    auto c_offsets_cpp = std::vector<size_t>(c_offsets, c_offsets + batch_count);
    routine.DoGemmBatched(layout, a_transpose, b_transpose,
                          m, n, k,
                          alphas_cpp,
                          Buffer<T>(a_buffer), a_offsets_cpp, a_ld,
                          Buffer<T>(b_buffer), b_offsets_cpp, b_ld,
                          betas_cpp,
                          Buffer<T>(c_buffer), c_offsets_cpp, c_ld,
                          batch_count);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API GemmBatched<float>(const Layout, const Transpose, const Transpose,
                                                  const size_t, const size_t, const size_t,
                                                  const float*,
                                                  const cl_mem, const size_t*, const size_t,
                                                  const cl_mem, const size_t*, const size_t,
                                                  const float*,
                                                  cl_mem, const size_t*, const size_t,
                                                  const size_t,
                                                  cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API GemmBatched<double>(const Layout, const Transpose, const Transpose,
                                                   const size_t, const size_t, const size_t,
                                                   const double*,
                                                   const cl_mem, const size_t*, const size_t,
                                                   const cl_mem, const size_t*, const size_t,
                                                   const double*,
                                                   cl_mem, const size_t*, const size_t,
                                                   const size_t,
                                                   cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API GemmBatched<float2>(const Layout, const Transpose, const Transpose,
                                                   const size_t, const size_t, const size_t,
                                                   const float2*,
                                                   const cl_mem, const size_t*, const size_t,
                                                   const cl_mem, const size_t*, const size_t,
                                                   const float2*,
                                                   cl_mem, const size_t*, const size_t,
                                                   const size_t,
                                                   cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API GemmBatched<double2>(const Layout, const Transpose, const Transpose,
                                                    const size_t, const size_t, const size_t,
                                                    const double2*,
                                                    const cl_mem, const size_t*, const size_t,
                                                    const cl_mem, const size_t*, const size_t,
                                                    const double2*,
                                                    cl_mem, const size_t*, const size_t,
                                                    const size_t,
                                                    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API GemmBatched<half>(const Layout, const Transpose, const Transpose,
                                                 const size_t, const size_t, const size_t,
                                                 const half*,
                                                 const cl_mem, const size_t*, const size_t,
                                                 const cl_mem, const size_t*, const size_t,
                                                 const half*,
                                                 cl_mem, const size_t*, const size_t,
                                                 const size_t,
                                                 cl_command_queue*, cl_event*);

// StridedBatched version of GEMM: SGEMMSTRIDEDBATCHED/DGEMMSTRIDEDBATCHED/CGEMMSTRIDEDBATCHED/ZGEMMSTRIDEDBATCHED/HGEMMSTRIDEDBATCHED
// Unlike GemmBatched, a single alpha/beta and one offset + stride per matrix are passed by value.
template <typename T>
StatusCode GemmStridedBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
                              const size_t m, const size_t n, const size_t k,
                              const T alpha,
                              const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride,
                              const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride,
                              const T beta,
                              cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride,
                              const size_t batch_count,
                              cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = XgemmStridedBatched<T>(queue_cpp, event);
    routine.DoGemmStridedBatched(layout, a_transpose, b_transpose,
                                 m, n, k,
                                 alpha,
                                 Buffer<T>(a_buffer), a_offset, a_ld, a_stride,
                                 Buffer<T>(b_buffer), b_offset, b_ld, b_stride,
                                 beta,
                                 Buffer<T>(c_buffer), c_offset, c_ld, c_stride,
                                 batch_count);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API GemmStridedBatched<float>(const Layout, const Transpose, const Transpose,
                                                         const size_t, const size_t, const size_t,
                                                         const float,
                                                         const cl_mem, const size_t, const size_t, const size_t,
                                                         const cl_mem, const size_t, const size_t, const size_t,
                                                         const float,
                                                         cl_mem, const size_t, const size_t, const size_t,
                                                         const size_t,
                                                         cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API GemmStridedBatched<double>(const Layout, const Transpose, const Transpose,
                                                          const size_t, const size_t, const size_t,
                                                          const double,
                                                          const cl_mem, const size_t, const size_t, const size_t,
                                                          const cl_mem, const size_t, const size_t, const size_t,
                                                          const double,
                                                          cl_mem, const size_t, const size_t, const size_t,
                                                          const size_t,
                                                          cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API GemmStridedBatched<float2>(const Layout, const Transpose, const Transpose,
                                                          const size_t, const size_t, const size_t,
                                                          const float2,
                                                          const cl_mem, const size_t, const size_t, const size_t,
                                                          const cl_mem, const size_t, const size_t, const size_t,
                                                          const float2,
                                                          cl_mem, const size_t, const size_t, const size_t,
                                                          const size_t,
                                                          cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API GemmStridedBatched<double2>(const Layout, const Transpose, const Transpose,
                                                           const size_t, const size_t, const size_t,
                                                           const double2,
                                                           const cl_mem, const size_t, const size_t, const size_t,
                                                           const cl_mem, const size_t, const size_t, const size_t,
                                                           const double2,
                                                           cl_mem, const size_t, const size_t, const size_t,
                                                           const size_t,
                                                           cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API GemmStridedBatched<half>(const Layout, const Transpose, const Transpose,
                                                        const size_t, const size_t, const size_t,
                                                        const half,
                                                        const cl_mem, const size_t, const size_t, const size_t,
                                                        const cl_mem, const size_t, const size_t, const size_t,
                                                        const half,
                                                        cl_mem, const size_t, const size_t, const size_t,
                                                        const size_t,
                                                        cl_command_queue*, cl_event*);

// =================================================================================================

// Retrieves the required size of the temporary buffer for the GEMM kernel (optional)
// On success the size is written to temp_buffer_size in bytes: zero when the direct kernel would
// be used for this (m, n, k), otherwise Xgemm<T>::GetTempSize() elements times sizeof(T). No
// kernel is launched here; the queue is only used to look up the device's tuning parameters.
template <typename T>
StatusCode GemmTempBufferSize(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
                              const size_t m, const size_t n, const size_t k,
                              const size_t a_offset, const size_t a_ld,
                              const size_t b_offset, const size_t b_ld,
                              const size_t c_offset, const size_t c_ld,
                              cl_command_queue* queue, size_t& temp_buffer_size) {
  try {

    // Retrieves the tuning database
    const auto queue_cpp = Queue(*queue);
    const auto device = queue_cpp.GetDevice();
    const auto kernel_names = std::vector<std::string>{"Xgemm", "GemmRoutine"};
    Databases db(kernel_names);
    Routine::InitDatabase(device, kernel_names, PrecisionValue<T>(), {}, db);

    // Computes the buffer size
    if (Xgemm<T>::UseDirectKernel(m, n, k, db["XGEMM_MIN_INDIRECT_SIZE"])) {
      temp_buffer_size = 0;
    }
    else {
      temp_buffer_size = Xgemm<T>::GetTempSize(layout, a_transpose, b_transpose, m, n, k,
                                               a_offset, a_ld, b_offset, b_ld, c_offset, c_ld,
                                               db["MWG"], db["NWG"], db["KWG"] * db["KREG"],
                                               db["GEMMK"]);
    }
    temp_buffer_size *= sizeof(T); // translate from num-elements to bytes
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API GemmTempBufferSize<float>(const Layout, const Transpose, const Transpose,
                                                         const size_t, const size_t, const size_t,
                                                         const size_t, const size_t, const size_t,
                                                         const size_t, const size_t, const size_t,
                                                         cl_command_queue*, size_t&);
template StatusCode PUBLIC_API GemmTempBufferSize<double>(const Layout, const Transpose, const Transpose,
                                                          const size_t, const size_t, const size_t,
                                                          const size_t, const size_t, const size_t,
                                                          const size_t, const size_t, const size_t,
                                                          cl_command_queue*, size_t&);
template StatusCode PUBLIC_API GemmTempBufferSize<float2>(const Layout, const Transpose, const Transpose,
                                                          const size_t, const size_t, const size_t,
                                                          const size_t, const size_t, const size_t,
                                                          const size_t, const size_t, const size_t,
                                                          cl_command_queue*, size_t&);
template StatusCode PUBLIC_API GemmTempBufferSize<double2>(const Layout, const Transpose, const Transpose,
                                                           const size_t, const size_t, const size_t,
                                                           const size_t, const size_t, const size_t,
                                                           const size_t, const size_t, const size_t,
                                                           cl_command_queue*, size_t&);
template StatusCode PUBLIC_API GemmTempBufferSize<half>(const Layout, const Transpose, const Transpose,
                                                        const size_t, const size_t, const size_t,
                                                        const size_t, const size_t, const size_t,
                                                        const size_t, const size_t, const size_t,
                                                        cl_command_queue*, size_t&);

// =================================================================================================
} // namespace clblast