From a17b714c3e2fee6e8c30bc2506eb284d1ee3ce31 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 5 Oct 2016 00:09:39 +0200 Subject: [PATCH 01/15] Added first version of Netlib BLAS API header --- include/clblast_blas.h | 1077 +++++++++++++++++++++++ scripts/generator/generator.py | 12 +- scripts/generator/generator/cpp.py | 40 + scripts/generator/generator/datatype.py | 5 + scripts/generator/generator/routine.py | 59 ++ 5 files changed, 1191 insertions(+), 2 deletions(-) create mode 100644 include/clblast_blas.h mode change 100644 => 100755 scripts/generator/generator.py diff --git a/include/clblast_blas.h b/include/clblast_blas.h new file mode 100644 index 00000000..41b03446 --- /dev/null +++ b/include/clblast_blas.h @@ -0,0 +1,1077 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains the Netlib CBLAS interface to the CLBlast BLAS routines, performing all buffer +// copies automatically and running on the default OpenCL platform and device. For full control over +// performance, it is advised to use the regular clblast.h or clblast_c.h headers instead. +// +// ================================================================================================= + +#ifndef CLBLAST_CLBLAST_BLAS_H_ +#define CLBLAST_CLBLAST_BLAS_H_ + +// Exports library functions under Windows when building a DLL. 
See also: +// https://msdn.microsoft.com/en-us/library/a90k134d.aspx +#ifdef _WIN32 + #ifdef COMPILING_DLL + #define PUBLIC_API __declspec(dllexport) + #else + #define PUBLIC_API __declspec(dllimport) + #endif +#else + #define PUBLIC_API +#endif + +// The C interface +#ifdef __cplusplus +extern "C" { +#endif + +// ================================================================================================= + +// Matrix layout and transpose types +typedef enum Layout_ { kRowMajor = 101, kColMajor = 102 } Layout; +typedef enum Transpose_ { kNo = 111, kYes = 112, kConjugate = 113 } Transpose; +typedef enum Triangle_ { kUpper = 121, kLower = 122 } Triangle; +typedef enum Diagonal_ { kNonUnit = 131, kUnit = 132 } Diagonal; +typedef enum Side_ { kLeft = 141, kRight = 142 } Side; + +// ================================================================================================= +// BLAS level-1 (vector-vector) routines +// ================================================================================================= + +// Generate givens plane rotation: SROTG/DROTG +void PUBLIC_API cblas_srotg(float* sa, + float* sb, + float* sc, + float* ss); +void PUBLIC_API cblas_drotg(double* sa, + double* sb, + double* sc, + double* ss); + +// Generate modified givens plane rotation: SROTMG/DROTMG +void PUBLIC_API cblas_srotmg(float* sd1, + float* sd2, + float* sx1, + const float* sy1, + float* sparam); +void PUBLIC_API cblas_drotmg(double* sd1, + double* sd2, + double* sx1, + const double* sy1, + double* sparam); + +// Apply givens plane rotation: SROT/DROT +void PUBLIC_API cblas_srot(const int n, + float* x, const int x_inc, + float* y, const int y_inc, + const float cos, + const float sin); +void PUBLIC_API cblas_drot(const int n, + double* x, const int x_inc, + double* y, const int y_inc, + const double cos, + const double sin); + +// Apply modified givens plane rotation: SROTM/DROTM +void PUBLIC_API cblas_srotm(const int n, + float* x, const int x_inc, + float* y, 
const int y_inc, + float* sparam); +void PUBLIC_API cblas_drotm(const int n, + double* x, const int x_inc, + double* y, const int y_inc, + double* sparam); + +// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP +void PUBLIC_API cblas_sswap(const int n, + float* x, const int x_inc, + float* y, const int y_inc); +void PUBLIC_API cblas_dswap(const int n, + double* x, const int x_inc, + double* y, const int y_inc); +void PUBLIC_API cblas_cswap(const int n, + float2* x, const int x_inc, + float2* y, const int y_inc); +void PUBLIC_API cblas_zswap(const int n, + double2* x, const int x_inc, + double2* y, const int y_inc); +void PUBLIC_API cblas_hswap(const int n, + half* x, const int x_inc, + half* y, const int y_inc); + +// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL +void PUBLIC_API cblas_sscal(const int n, + const float alpha, + float* x, const int x_inc); +void PUBLIC_API cblas_dscal(const int n, + const double alpha, + double* x, const int x_inc); +void PUBLIC_API cblas_cscal(const int n, + const void* alpha, + float2* x, const int x_inc); +void PUBLIC_API cblas_zscal(const int n, + const void* alpha, + double2* x, const int x_inc); +void PUBLIC_API cblas_hscal(const int n, + const void* alpha, + half* x, const int x_inc); + +// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY +void PUBLIC_API cblas_scopy(const int n, + const float* x, const int x_inc, + float* y, const int y_inc); +void PUBLIC_API cblas_dcopy(const int n, + const double* x, const int x_inc, + double* y, const int y_inc); +void PUBLIC_API cblas_ccopy(const int n, + const float2* x, const int x_inc, + float2* y, const int y_inc); +void PUBLIC_API cblas_zcopy(const int n, + const double2* x, const int x_inc, + double2* y, const int y_inc); +void PUBLIC_API cblas_hcopy(const int n, + const half* x, const int x_inc, + half* y, const int y_inc); + +// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY +void PUBLIC_API cblas_saxpy(const int n, + const float alpha, + const float* x, const int 
x_inc, + float* y, const int y_inc); +void PUBLIC_API cblas_daxpy(const int n, + const double alpha, + const double* x, const int x_inc, + double* y, const int y_inc); +void PUBLIC_API cblas_caxpy(const int n, + const void* alpha, + const float2* x, const int x_inc, + float2* y, const int y_inc); +void PUBLIC_API cblas_zaxpy(const int n, + const void* alpha, + const double2* x, const int x_inc, + double2* y, const int y_inc); +void PUBLIC_API cblas_haxpy(const int n, + const void* alpha, + const half* x, const int x_inc, + half* y, const int y_inc); + +// Dot product of two vectors: SDOT/DDOT/HDOT +void PUBLIC_API cblas_sdot(const int n, + float* dot, + const float* x, const int x_inc, + const float* y, const int y_inc); +void PUBLIC_API cblas_ddot(const int n, + double* dot, + const double* x, const int x_inc, + const double* y, const int y_inc); +void PUBLIC_API cblas_hdot(const int n, + half* dot, + const half* x, const int x_inc, + const half* y, const int y_inc); + +// Dot product of two complex vectors: CDOTU/ZDOTU +void PUBLIC_API cblas_cdotu(const int n, + float2* dot, + const float2* x, const int x_inc, + const float2* y, const int y_inc); +void PUBLIC_API cblas_zdotu(const int n, + double2* dot, + const double2* x, const int x_inc, + const double2* y, const int y_inc); + +// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC +void PUBLIC_API cblas_cdotc(const int n, + float2* dot, + const float2* x, const int x_inc, + const float2* y, const int y_inc); +void PUBLIC_API cblas_zdotc(const int n, + double2* dot, + const double2* x, const int x_inc, + const double2* y, const int y_inc); + +// Euclidean norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 +void PUBLIC_API cblas_snrm2(const int n, + float* nrm2, + const float* x, const int x_inc); +void PUBLIC_API cblas_dnrm2(const int n, + double* nrm2, + const double* x, const int x_inc); +void PUBLIC_API cblas_scnrm2(const int n, + float2* nrm2, + const float2* x, const int x_inc); +void PUBLIC_API 
cblas_dznrm2(const int n, + double2* nrm2, + const double2* x, const int x_inc); +void PUBLIC_API cblas_hnrm2(const int n, + half* nrm2, + const half* x, const int x_inc); + +// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM +void PUBLIC_API cblas_sasum(const int n, + float* asum, + const float* x, const int x_inc); +void PUBLIC_API cblas_dasum(const int n, + double* asum, + const double* x, const int x_inc); +void PUBLIC_API cblas_scasum(const int n, + float2* asum, + const float2* x, const int x_inc); +void PUBLIC_API cblas_dzasum(const int n, + double2* asum, + const double2* x, const int x_inc); +void PUBLIC_API cblas_hasum(const int n, + half* asum, + const half* x, const int x_inc); + +// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM +void PUBLIC_API cblas_ssum(const int n, + float* sum, + const float* x, const int x_inc); +void PUBLIC_API cblas_dsum(const int n, + double* sum, + const double* x, const int x_inc); +void PUBLIC_API cblas_scsum(const int n, + float2* sum, + const float2* x, const int x_inc); +void PUBLIC_API cblas_dzsum(const int n, + double2* sum, + const double2* x, const int x_inc); +void PUBLIC_API cblas_hsum(const int n, + half* sum, + const half* x, const int x_inc); + +// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX +void PUBLIC_API cblas_isamax(const int n, + float* imax, + const float* x, const int x_inc); +void PUBLIC_API cblas_idamax(const int n, + double* imax, + const double* x, const int x_inc); +void PUBLIC_API cblas_icamax(const int n, + float2* imax, + const float2* x, const int x_inc); +void PUBLIC_API cblas_izamax(const int n, + double2* imax, + const double2* x, const int x_inc); +void PUBLIC_API cblas_ihamax(const int n, + half* imax, + const half* x, const int x_inc); + +// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX +void PUBLIC_API cblas_ismax(const int n, + float* imax, + const float* x, const 
int x_inc); +void PUBLIC_API cblas_idmax(const int n, + double* imax, + const double* x, const int x_inc); +void PUBLIC_API cblas_icmax(const int n, + float2* imax, + const float2* x, const int x_inc); +void PUBLIC_API cblas_izmax(const int n, + double2* imax, + const double2* x, const int x_inc); +void PUBLIC_API cblas_ihmax(const int n, + half* imax, + const half* x, const int x_inc); + +// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN +void PUBLIC_API cblas_ismin(const int n, + float* imin, + const float* x, const int x_inc); +void PUBLIC_API cblas_idmin(const int n, + double* imin, + const double* x, const int x_inc); +void PUBLIC_API cblas_icmin(const int n, + float2* imin, + const float2* x, const int x_inc); +void PUBLIC_API cblas_izmin(const int n, + double2* imin, + const double2* x, const int x_inc); +void PUBLIC_API cblas_ihmin(const int n, + half* imin, + const half* x, const int x_inc); + +// ================================================================================================= +// BLAS level-2 (matrix-vector) routines +// ================================================================================================= + +// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV +void PUBLIC_API cblas_sgemv(const Layout layout, const Transpose a_transpose, + const int m, const int n, + const float alpha, + const float* a, const int a_ld, + const float* x, const int x_inc, + const float beta, + float* y, const int y_inc); +void PUBLIC_API cblas_dgemv(const Layout layout, const Transpose a_transpose, + const int m, const int n, + const double alpha, + const double* a, const int a_ld, + const double* x, const int x_inc, + const double beta, + double* y, const int y_inc); +void PUBLIC_API cblas_cgemv(const Layout layout, const Transpose a_transpose, + const int m, const int n, + const void* alpha, + const float2* a, const int a_ld, + const float2* x, const int x_inc, + const void* beta, + 
float2* y, const int y_inc); +void PUBLIC_API cblas_zgemv(const Layout layout, const Transpose a_transpose, + const int m, const int n, + const void* alpha, + const double2* a, const int a_ld, + const double2* x, const int x_inc, + const void* beta, + double2* y, const int y_inc); +void PUBLIC_API cblas_hgemv(const Layout layout, const Transpose a_transpose, + const int m, const int n, + const void* alpha, + const half* a, const int a_ld, + const half* x, const int x_inc, + const void* beta, + half* y, const int y_inc); + +// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV +void PUBLIC_API cblas_sgbmv(const Layout layout, const Transpose a_transpose, + const int m, const int n, const int kl, const int ku, + const float alpha, + const float* a, const int a_ld, + const float* x, const int x_inc, + const float beta, + float* y, const int y_inc); +void PUBLIC_API cblas_dgbmv(const Layout layout, const Transpose a_transpose, + const int m, const int n, const int kl, const int ku, + const double alpha, + const double* a, const int a_ld, + const double* x, const int x_inc, + const double beta, + double* y, const int y_inc); +void PUBLIC_API cblas_cgbmv(const Layout layout, const Transpose a_transpose, + const int m, const int n, const int kl, const int ku, + const void* alpha, + const float2* a, const int a_ld, + const float2* x, const int x_inc, + const void* beta, + float2* y, const int y_inc); +void PUBLIC_API cblas_zgbmv(const Layout layout, const Transpose a_transpose, + const int m, const int n, const int kl, const int ku, + const void* alpha, + const double2* a, const int a_ld, + const double2* x, const int x_inc, + const void* beta, + double2* y, const int y_inc); +void PUBLIC_API cblas_hgbmv(const Layout layout, const Transpose a_transpose, + const int m, const int n, const int kl, const int ku, + const void* alpha, + const half* a, const int a_ld, + const half* x, const int x_inc, + const void* beta, + half* y, const int y_inc); + +// 
Hermitian matrix-vector multiplication: CHEMV/ZHEMV +void PUBLIC_API cblas_chemv(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const float2* a, const int a_ld, + const float2* x, const int x_inc, + const void* beta, + float2* y, const int y_inc); +void PUBLIC_API cblas_zhemv(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const double2* a, const int a_ld, + const double2* x, const int x_inc, + const void* beta, + double2* y, const int y_inc); + +// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV +void PUBLIC_API cblas_chbmv(const Layout layout, const Triangle triangle, + const int n, const int k, + const void* alpha, + const float2* a, const int a_ld, + const float2* x, const int x_inc, + const void* beta, + float2* y, const int y_inc); +void PUBLIC_API cblas_zhbmv(const Layout layout, const Triangle triangle, + const int n, const int k, + const void* alpha, + const double2* a, const int a_ld, + const double2* x, const int x_inc, + const void* beta, + double2* y, const int y_inc); + +// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV +void PUBLIC_API cblas_chpmv(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const float2* ap, + const float2* x, const int x_inc, + const void* beta, + float2* y, const int y_inc); +void PUBLIC_API cblas_zhpmv(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const double2* ap, + const double2* x, const int x_inc, + const void* beta, + double2* y, const int y_inc); + +// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV +void PUBLIC_API cblas_ssymv(const Layout layout, const Triangle triangle, + const int n, + const float alpha, + const float* a, const int a_ld, + const float* x, const int x_inc, + const float beta, + float* y, const int y_inc); +void PUBLIC_API cblas_dsymv(const Layout layout, const Triangle triangle, + const int n, + const double alpha, + const 
double* a, const int a_ld, + const double* x, const int x_inc, + const double beta, + double* y, const int y_inc); +void PUBLIC_API cblas_hsymv(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const half* a, const int a_ld, + const half* x, const int x_inc, + const void* beta, + half* y, const int y_inc); + +// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV +void PUBLIC_API cblas_ssbmv(const Layout layout, const Triangle triangle, + const int n, const int k, + const float alpha, + const float* a, const int a_ld, + const float* x, const int x_inc, + const float beta, + float* y, const int y_inc); +void PUBLIC_API cblas_dsbmv(const Layout layout, const Triangle triangle, + const int n, const int k, + const double alpha, + const double* a, const int a_ld, + const double* x, const int x_inc, + const double beta, + double* y, const int y_inc); +void PUBLIC_API cblas_hsbmv(const Layout layout, const Triangle triangle, + const int n, const int k, + const void* alpha, + const half* a, const int a_ld, + const half* x, const int x_inc, + const void* beta, + half* y, const int y_inc); + +// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV +void PUBLIC_API cblas_sspmv(const Layout layout, const Triangle triangle, + const int n, + const float alpha, + const float* ap, + const float* x, const int x_inc, + const float beta, + float* y, const int y_inc); +void PUBLIC_API cblas_dspmv(const Layout layout, const Triangle triangle, + const int n, + const double alpha, + const double* ap, + const double* x, const int x_inc, + const double beta, + double* y, const int y_inc); +void PUBLIC_API cblas_hspmv(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const half* ap, + const half* x, const int x_inc, + const void* beta, + half* y, const int y_inc); + +// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV +void PUBLIC_API cblas_strmv(const Layout layout, const 
Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const float* a, const int a_ld, + float* x, const int x_inc); +void PUBLIC_API cblas_dtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const double* a, const int a_ld, + double* x, const int x_inc); +void PUBLIC_API cblas_ctrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const float2* a, const int a_ld, + float2* x, const int x_inc); +void PUBLIC_API cblas_ztrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const double2* a, const int a_ld, + double2* x, const int x_inc); +void PUBLIC_API cblas_htrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const half* a, const int a_ld, + half* x, const int x_inc); + +// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV +void PUBLIC_API cblas_stbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, const int k, + const float* a, const int a_ld, + float* x, const int x_inc); +void PUBLIC_API cblas_dtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, const int k, + const double* a, const int a_ld, + double* x, const int x_inc); +void PUBLIC_API cblas_ctbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, const int k, + const float2* a, const int a_ld, + float2* x, const int x_inc); +void PUBLIC_API cblas_ztbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, const int k, + const double2* a, const int a_ld, + double2* x, const int x_inc); +void PUBLIC_API 
cblas_htbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, const int k, + const half* a, const int a_ld, + half* x, const int x_inc); + +// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV +void PUBLIC_API cblas_stpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const float* ap, + float* x, const int x_inc); +void PUBLIC_API cblas_dtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const double* ap, + double* x, const int x_inc); +void PUBLIC_API cblas_ctpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const float2* ap, + float2* x, const int x_inc); +void PUBLIC_API cblas_ztpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const double2* ap, + double2* x, const int x_inc); +void PUBLIC_API cblas_htpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const half* ap, + half* x, const int x_inc); + +// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV +void PUBLIC_API cblas_strsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const float* a, const int a_ld, + float* x, const int x_inc); +void PUBLIC_API cblas_dtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const double* a, const int a_ld, + double* x, const int x_inc); +void PUBLIC_API cblas_ctrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const float2* a, const int a_ld, + float2* x, const int x_inc); +void PUBLIC_API cblas_ztrsv(const Layout 
layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const double2* a, const int a_ld, + double2* x, const int x_inc); + +// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV +void PUBLIC_API cblas_stbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, const int k, + const float* a, const int a_ld, + float* x, const int x_inc); +void PUBLIC_API cblas_dtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, const int k, + const double* a, const int a_ld, + double* x, const int x_inc); +void PUBLIC_API cblas_ctbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, const int k, + const float2* a, const int a_ld, + float2* x, const int x_inc); +void PUBLIC_API cblas_ztbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, const int k, + const double2* a, const int a_ld, + double2* x, const int x_inc); + +// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV +void PUBLIC_API cblas_stpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const float* ap, + float* x, const int x_inc); +void PUBLIC_API cblas_dtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const double* ap, + double* x, const int x_inc); +void PUBLIC_API cblas_ctpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const float2* ap, + float2* x, const int x_inc); +void PUBLIC_API cblas_ztpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const double2* ap, + double2* x, const int x_inc); + 
+// General rank-1 matrix update: SGER/DGER/HGER +void PUBLIC_API cblas_sger(const Layout layout, + const int m, const int n, + const float alpha, + const float* x, const int x_inc, + const float* y, const int y_inc, + float* a, const int a_ld); +void PUBLIC_API cblas_dger(const Layout layout, + const int m, const int n, + const double alpha, + const double* x, const int x_inc, + const double* y, const int y_inc, + double* a, const int a_ld); +void PUBLIC_API cblas_hger(const Layout layout, + const int m, const int n, + const void* alpha, + const half* x, const int x_inc, + const half* y, const int y_inc, + half* a, const int a_ld); + +// General rank-1 complex matrix update: CGERU/ZGERU +void PUBLIC_API cblas_cgeru(const Layout layout, + const int m, const int n, + const void* alpha, + const float2* x, const int x_inc, + const float2* y, const int y_inc, + float2* a, const int a_ld); +void PUBLIC_API cblas_zgeru(const Layout layout, + const int m, const int n, + const void* alpha, + const double2* x, const int x_inc, + const double2* y, const int y_inc, + double2* a, const int a_ld); + +// General rank-1 complex conjugated matrix update: CGERC/ZGERC +void PUBLIC_API cblas_cgerc(const Layout layout, + const int m, const int n, + const void* alpha, + const float2* x, const int x_inc, + const float2* y, const int y_inc, + float2* a, const int a_ld); +void PUBLIC_API cblas_zgerc(const Layout layout, + const int m, const int n, + const void* alpha, + const double2* x, const int x_inc, + const double2* y, const int y_inc, + double2* a, const int a_ld); + +// Hermitian rank-1 matrix update: CHER/ZHER +void PUBLIC_API cblas_cher(const Layout layout, const Triangle triangle, + const int n, + const float alpha, + const float2* x, const int x_inc, + float2* a, const int a_ld); +void PUBLIC_API cblas_zher(const Layout layout, const Triangle triangle, + const int n, + const double alpha, + const double2* x, const int x_inc, + double2* a, const int a_ld); + +// Hermitian packed 
rank-1 matrix update: CHPR/ZHPR +void PUBLIC_API cblas_chpr(const Layout layout, const Triangle triangle, + const int n, + const float alpha, + const float2* x, const int x_inc, + float2* ap); +void PUBLIC_API cblas_zhpr(const Layout layout, const Triangle triangle, + const int n, + const double alpha, + const double2* x, const int x_inc, + double2* ap); + +// Hermitian rank-2 matrix update: CHER2/ZHER2 +void PUBLIC_API cblas_cher2(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const float2* x, const int x_inc, + const float2* y, const int y_inc, + float2* a, const int a_ld); +void PUBLIC_API cblas_zher2(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const double2* x, const int x_inc, + const double2* y, const int y_inc, + double2* a, const int a_ld); + +// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 +void PUBLIC_API cblas_chpr2(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const float2* x, const int x_inc, + const float2* y, const int y_inc, + float2* ap); +void PUBLIC_API cblas_zhpr2(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const double2* x, const int x_inc, + const double2* y, const int y_inc, + double2* ap); + +// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR +void PUBLIC_API cblas_ssyr(const Layout layout, const Triangle triangle, + const int n, + const float alpha, + const float* x, const int x_inc, + float* a, const int a_ld); +void PUBLIC_API cblas_dsyr(const Layout layout, const Triangle triangle, + const int n, + const double alpha, + const double* x, const int x_inc, + double* a, const int a_ld); +void PUBLIC_API cblas_hsyr(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const half* x, const int x_inc, + half* a, const int a_ld); + +// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR +void PUBLIC_API cblas_sspr(const Layout layout, const Triangle 
triangle, + const int n, + const float alpha, + const float* x, const int x_inc, + float* ap); +void PUBLIC_API cblas_dspr(const Layout layout, const Triangle triangle, + const int n, + const double alpha, + const double* x, const int x_inc, + double* ap); +void PUBLIC_API cblas_hspr(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const half* x, const int x_inc, + half* ap); + +// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 +void PUBLIC_API cblas_ssyr2(const Layout layout, const Triangle triangle, + const int n, + const float alpha, + const float* x, const int x_inc, + const float* y, const int y_inc, + float* a, const int a_ld); +void PUBLIC_API cblas_dsyr2(const Layout layout, const Triangle triangle, + const int n, + const double alpha, + const double* x, const int x_inc, + const double* y, const int y_inc, + double* a, const int a_ld); +void PUBLIC_API cblas_hsyr2(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const half* x, const int x_inc, + const half* y, const int y_inc, + half* a, const int a_ld); + +// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 +void PUBLIC_API cblas_sspr2(const Layout layout, const Triangle triangle, + const int n, + const float alpha, + const float* x, const int x_inc, + const float* y, const int y_inc, + float* ap); +void PUBLIC_API cblas_dspr2(const Layout layout, const Triangle triangle, + const int n, + const double alpha, + const double* x, const int x_inc, + const double* y, const int y_inc, + double* ap); +void PUBLIC_API cblas_hspr2(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const half* x, const int x_inc, + const half* y, const int y_inc, + half* ap); + +// ================================================================================================= +// BLAS level-3 (matrix-matrix) routines +// ================================================================================================= + 
+// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM +void PUBLIC_API cblas_sgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const int m, const int n, const int k, + const float alpha, + const float* a, const int a_ld, + const float* b, const int b_ld, + const float beta, + float* c, const int c_ld); +void PUBLIC_API cblas_dgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const int m, const int n, const int k, + const double alpha, + const double* a, const int a_ld, + const double* b, const int b_ld, + const double beta, + double* c, const int c_ld); +void PUBLIC_API cblas_cgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const int m, const int n, const int k, + const void* alpha, + const float2* a, const int a_ld, + const float2* b, const int b_ld, + const void* beta, + float2* c, const int c_ld); +void PUBLIC_API cblas_zgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const int m, const int n, const int k, + const void* alpha, + const double2* a, const int a_ld, + const double2* b, const int b_ld, + const void* beta, + double2* c, const int c_ld); +void PUBLIC_API cblas_hgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const int m, const int n, const int k, + const void* alpha, + const half* a, const int a_ld, + const half* b, const int b_ld, + const void* beta, + half* c, const int c_ld); + +// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM +void PUBLIC_API cblas_ssymm(const Layout layout, const Side side, const Triangle triangle, + const int m, const int n, + const float alpha, + const float* a, const int a_ld, + const float* b, const int b_ld, + const float beta, + float* c, const int c_ld); +void PUBLIC_API cblas_dsymm(const Layout layout, const Side side, const Triangle triangle, + const int m, const int n, + const double alpha, + 
const double* a, const int a_ld, + const double* b, const int b_ld, + const double beta, + double* c, const int c_ld); +void PUBLIC_API cblas_csymm(const Layout layout, const Side side, const Triangle triangle, + const int m, const int n, + const void* alpha, + const float2* a, const int a_ld, + const float2* b, const int b_ld, + const void* beta, + float2* c, const int c_ld); +void PUBLIC_API cblas_zsymm(const Layout layout, const Side side, const Triangle triangle, + const int m, const int n, + const void* alpha, + const double2* a, const int a_ld, + const double2* b, const int b_ld, + const void* beta, + double2* c, const int c_ld); +void PUBLIC_API cblas_hsymm(const Layout layout, const Side side, const Triangle triangle, + const int m, const int n, + const void* alpha, + const half* a, const int a_ld, + const half* b, const int b_ld, + const void* beta, + half* c, const int c_ld); + +// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM +void PUBLIC_API cblas_chemm(const Layout layout, const Side side, const Triangle triangle, + const int m, const int n, + const void* alpha, + const float2* a, const int a_ld, + const float2* b, const int b_ld, + const void* beta, + float2* c, const int c_ld); +void PUBLIC_API cblas_zhemm(const Layout layout, const Side side, const Triangle triangle, + const int m, const int n, + const void* alpha, + const double2* a, const int a_ld, + const double2* b, const int b_ld, + const void* beta, + double2* c, const int c_ld); + +// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK +void PUBLIC_API cblas_ssyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const int n, const int k, + const float alpha, + const float* a, const int a_ld, + const float beta, + float* c, const int c_ld); +void PUBLIC_API cblas_dsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const int n, const int k, + const double alpha, + const double* a, const int a_ld, + const double 
beta, + double* c, const int c_ld); +void PUBLIC_API cblas_csyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const int n, const int k, + const void* alpha, + const float2* a, const int a_ld, + const void* beta, + float2* c, const int c_ld); +void PUBLIC_API cblas_zsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const int n, const int k, + const void* alpha, + const double2* a, const int a_ld, + const void* beta, + double2* c, const int c_ld); +void PUBLIC_API cblas_hsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const int n, const int k, + const void* alpha, + const half* a, const int a_ld, + const void* beta, + half* c, const int c_ld); + +// Rank-K update of a hermitian matrix: CHERK/ZHERK +void PUBLIC_API cblas_cherk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const int n, const int k, + const float alpha, + const float2* a, const int a_ld, + const float beta, + float2* c, const int c_ld); +void PUBLIC_API cblas_zherk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const int n, const int k, + const double alpha, + const double2* a, const int a_ld, + const double beta, + double2* c, const int c_ld); + +// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K +void PUBLIC_API cblas_ssyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const int n, const int k, + const float alpha, + const float* a, const int a_ld, + const float* b, const int b_ld, + const float beta, + float* c, const int c_ld); +void PUBLIC_API cblas_dsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const int n, const int k, + const double alpha, + const double* a, const int a_ld, + const double* b, const int b_ld, + const double beta, + double* c, const int c_ld); +void PUBLIC_API cblas_csyr2k(const Layout layout, const Triangle triangle, const 
Transpose ab_transpose, + const int n, const int k, + const void* alpha, + const float2* a, const int a_ld, + const float2* b, const int b_ld, + const void* beta, + float2* c, const int c_ld); +void PUBLIC_API cblas_zsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const int n, const int k, + const void* alpha, + const double2* a, const int a_ld, + const double2* b, const int b_ld, + const void* beta, + double2* c, const int c_ld); +void PUBLIC_API cblas_hsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const int n, const int k, + const void* alpha, + const half* a, const int a_ld, + const half* b, const int b_ld, + const void* beta, + half* c, const int c_ld); + +// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K +void PUBLIC_API cblas_cher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const int n, const int k, + const void* alpha, + const float2* a, const int a_ld, + const float2* b, const int b_ld, + const float beta, + float2* c, const int c_ld); +void PUBLIC_API cblas_zher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const int n, const int k, + const void* alpha, + const double2* a, const int a_ld, + const double2* b, const int b_ld, + const double beta, + double2* c, const int c_ld); + +// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM +void PUBLIC_API cblas_strmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int m, const int n, + const float alpha, + const float* a, const int a_ld, + float* b, const int b_ld); +void PUBLIC_API cblas_dtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int m, const int n, + const double alpha, + const double* a, const int a_ld, + double* b, const int b_ld); +void PUBLIC_API cblas_ctrmm(const Layout layout, 
const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int m, const int n, + const void* alpha, + const float2* a, const int a_ld, + float2* b, const int b_ld); +void PUBLIC_API cblas_ztrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int m, const int n, + const void* alpha, + const double2* a, const int a_ld, + double2* b, const int b_ld); +void PUBLIC_API cblas_htrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int m, const int n, + const void* alpha, + const half* a, const int a_ld, + half* b, const int b_ld); + +// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM +void PUBLIC_API cblas_strsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int m, const int n, + const float alpha, + const float* a, const int a_ld, + float* b, const int b_ld); +void PUBLIC_API cblas_dtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int m, const int n, + const double alpha, + const double* a, const int a_ld, + double* b, const int b_ld); +void PUBLIC_API cblas_ctrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int m, const int n, + const void* alpha, + const float2* a, const int a_ld, + float2* b, const int b_ld); +void PUBLIC_API cblas_ztrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int m, const int n, + const void* alpha, + const double2* a, const int a_ld, + double2* b, const int b_ld); +void PUBLIC_API cblas_htrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const 
int m, const int n, + const void* alpha, + const half* a, const int a_ld, + half* b, const int b_ld); + +// ================================================================================================= +// Extra non-BLAS routines (level-X) +// ================================================================================================= + +// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY +void PUBLIC_API cblas_somatcopy(const Layout layout, const Transpose a_transpose, + const int m, const int n, + const float alpha, + const float* a, const int a_ld, + float* b, const int b_ld); +void PUBLIC_API cblas_domatcopy(const Layout layout, const Transpose a_transpose, + const int m, const int n, + const double alpha, + const double* a, const int a_ld, + double* b, const int b_ld); +void PUBLIC_API cblas_comatcopy(const Layout layout, const Transpose a_transpose, + const int m, const int n, + const void* alpha, + const float2* a, const int a_ld, + float2* b, const int b_ld); +void PUBLIC_API cblas_zomatcopy(const Layout layout, const Transpose a_transpose, + const int m, const int n, + const void* alpha, + const double2* a, const int a_ld, + double2* b, const int b_ld); +void PUBLIC_API cblas_homatcopy(const Layout layout, const Transpose a_transpose, + const int m, const int n, + const void* alpha, + const half* a, const int a_ld, + half* b, const int b_ld); + half* b, const size_t b_offset, const size_t b_ld); + +// ================================================================================================= + +#ifdef __cplusplus +} // extern "C" +#endif + +// CLBLAST_CLBLAST_BLAS_H_ +#endif diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py old mode 100644 new mode 100755 index d82b13a6..68ae9cbe --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -12,6 +12,8 @@ # clblast.cpp # clblast_c.h # clblast_c.cpp +# clblast_blas.h +# clblast_blas.cpp # 
wrapper_clblas.h # wrapper_cblas.h # It also generates the main functions for the correctness and performance tests as found in @@ -30,8 +32,8 @@ from generator.routine import Routine from generator.datatype import H, S, D, C, Z, Sc, Dz, iH, iS, iD, iC, iZ, Css, Zdd, Ccs, Zzd, T, Tc, TU -HEADER_LINES = [96, 73, 97, 22, 29, 41] -FOOTER_LINES = [17, 75, 19, 14, 6, 6] +HEADER_LINES = [96, 73, 97, 22, 29, 41, 43, 1] +FOOTER_LINES = [17, 75, 19, 14, 6, 6, 10, 1] # Different possibilities for requirements ald_m = "The value of `a_ld` must be at least `m`." @@ -132,6 +134,8 @@ def main(argv): library_root + "/src/clblast_c.cpp", library_root + "/test/wrapper_clblas.hpp", library_root + "/test/wrapper_cblas.hpp", + library_root + "/include/clblast_blas.h", + library_root + "/src/clblast_blas.cpp", ] # Checks whether the command-line arguments are valid; exists otherwise @@ -168,6 +172,10 @@ def main(argv): body += cpp.wrapper_clblas(routine) if i == 5: body += cpp.wrapper_cblas(routine) + if i == 6: + body += cpp.clblast_blas_h(routine) + if i == 7: + body += cpp.clblast_blas_cc(routine) f.write("".join(file_header)) f.write(body) f.write("".join(file_footer)) diff --git a/scripts/generator/generator/cpp.py b/scripts/generator/generator/cpp.py index 427eb180..83ddbcb2 100644 --- a/scripts/generator/generator/cpp.py +++ b/scripts/generator/generator/cpp.py @@ -90,6 +90,46 @@ def clblast_c_cc(routine): return result +def clblast_blas_h(routine): + """The Netlib CBLAS API header (.h)""" + result = NL + "// " + routine.description + ": " + routine.short_names() + NL + for flavour in routine.flavours: + result += routine.routine_header_netlib(flavour, 24, " PUBLIC_API") + ";" + NL + return result + + +def clblast_blas_cc(routine): + """The Netlib CBLAS API implementation (.cpp)""" + result = NL + "// " + routine.name.upper() + NL + for flavour in routine.flavours: + template = "<" + flavour.template + ">" if routine.no_scalars() else "" + indent = " " * (26 + routine.length() + 
len(template)) + result += routine.routine_header_netlib(flavour, 13, "") + " {" + NL + + # Initialize OpenCL + result += " auto platform = Platform(size_t{0});" + NL + result += " auto device = Device(platform, size_t{0});" + NL + result += " auto context = Context(device);" + NL + result += " auto queue = Queue(context, device);" + NL + + # Copy data structures to the device + for name in routine.inputs + routine.outputs: + result += " " + routine.create_buffer(name, flavour.template, "0") + NL + for name in routine.inputs + routine.outputs: + result += " " + routine.write_buffer(name, "0") + NL + + # The function call + result += " auto status = clblast::" + routine.name.capitalize() + template + "(" + result += ("," + NL + indent).join([a for a in routine.arguments_cast(flavour, indent)]) + result += "," + NL + indent + "queue, event);" + NL + + # Copy back and clean-up + for name in routine.outputs: + result += " " + routine.read_buffer(name, "0") + NL + result += " return;" + NL + "}" + NL + return result + + def wrapper_clblas(routine): """The wrapper to the reference clBLAS routines (for performance/correctness testing)""" result = "" diff --git a/scripts/generator/generator/datatype.py b/scripts/generator/generator/datatype.py index 9a6c6c02..29acc744 100644 --- a/scripts/generator/generator/datatype.py +++ b/scripts/generator/generator/datatype.py @@ -65,6 +65,11 @@ class DataType: return ((scalar == "alpha" and self.alpha_cpp in [D_FLOAT2, D_DOUBLE2]) or (scalar == "beta" and self.beta_cpp in [D_FLOAT2, D_DOUBLE2])) + def is_non_standard(self, scalar): + """Current scalar is of a non-standard type""" + return ((scalar == "alpha" and self.alpha_cpp in [D_HALF, D_FLOAT2, D_DOUBLE2]) or + (scalar == "beta" and self.beta_cpp in [D_HALF, D_FLOAT2, D_DOUBLE2])) + # Regular data-types H = DataType("H", "H", D_HALF, [D_HALF] * 2 + [D_HALF_OPENCL] * 2, D_HALF) # half (16) diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py 
index a4e682c2..4870b861 100644 --- a/scripts/generator/generator/routine.py +++ b/scripts/generator/generator/routine.py @@ -65,6 +65,21 @@ class Routine: """Distinguish between vectors and matrices""" return ["a", "b", "c", "ap"] + @staticmethod + def create_buffer(name, template, size): + """Creates a new CLCudaAPI buffer""" + return "auto " + name + "_buffer = Buffer<" + template + ">(context, " + size + ");" + + @staticmethod + def write_buffer(name, size): + """Writes to a CLCudaAPI buffer""" + return name + "_buffer.Write(queue, " + size + ", " + name + ");" + + @staticmethod + def read_buffer(name, size): + """Reads from a CLCudaAPI buffer""" + return name + "_buffer.Read(queue, " + size + ", " + name + ");" + def non_index_inputs(self): """Lists of input/output buffers not index (integer)""" buffers = self.inputs[:] # make a copy @@ -163,6 +178,16 @@ class Routine: return [", ".join(a + b + c)] return [] + def buffer_def_pointer(self, name, flavour): + """As above but as plain C pointer""" + prefix = "const " if name in self.inputs else "" + if name in self.inputs or name in self.outputs: + data_type = "void" if flavour.is_non_standard(name) else flavour.buffer_type + a = [prefix + data_type + "* " + name + ""] + c = ["const int " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else [] + return [", ".join(a + c)] + return [] + def buffer_clcudaapi(self, name): """As above but with CLCudaAPI buffers""" if name in self.inputs or name in self.outputs: @@ -288,6 +313,16 @@ class Routine: return ["const " + flavour.beta_cpp + " " + name] return [] + def scalar_def_void(self, name, flavour): + """Retrieves the definition of a scalar (alpha/beta) but make it a void pointer in case of non-standard types""" + if name in self.scalars: + if name == "alpha": + data_type = "void*" if flavour.is_non_standard(name) else flavour.alpha_cpp + return ["const " + data_type + " " + name] + data_type = "void*" if flavour.is_non_standard(name) 
else flavour.beta_cpp + return ["const " + data_type + " " + name] + return [] + def scalar_type(self, name, flavour): """Retrieves the type of a scalar (alpha/beta)""" if name in self.scalars: @@ -316,6 +351,12 @@ class Routine: return [", ".join(["const size_t " + s for s in self.sizes])] return [] + def sizes_def_netlib(self): + """Retrieves the definition of the sizes (m,n,k) for the CBLAS API""" + if self.sizes: + return [", ".join(["const int " + s for s in self.sizes])] + return [] + def sizes_type(self): """Retrieves the types of the sizes (m,n,k)""" if self.sizes: @@ -453,6 +494,17 @@ class Routine: list(chain(*[self.buffer_def(b) for b in self.scalar_buffers_second()])) + list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()]))) + def arguments_def_netlib(self, flavour): + """As above, but for the Netlib CBLAS API""" + return (self.options_def() + self.sizes_def_netlib() + + list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.scalar_buffers_first()])) + + self.scalar_def_void("alpha", flavour) + + list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.buffers_first()])) + + self.scalar_def_void("beta", flavour) + + list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.buffers_second()])) + + list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.scalar_buffers_second()])) + + list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()]))) + def arguments_def_wrapper_clblas(self, flavour): """As above, but clBLAS wrapper plain data-types""" return (self.options_def_wrapper_clblas() + self.sizes_def() + @@ -528,6 +580,13 @@ class Routine: result += ",\n" + indent + "cl_command_queue* queue, cl_event* event)" return result + def routine_header_netlib(self, flavour, spaces, extra_qualifier): + """As above, but now for the original Netlib CBLAS API""" + indent = " " * (spaces + self.length()) + result = "void" + extra_qualifier + " cblas_" + flavour.name.lower() + self.name + "(" + result += 
(",\n" + indent).join([a for a in self.arguments_def_netlib(flavour)]) + ")" + return result + def routine_header_wrapper_clblas(self, flavour, def_only, spaces): """As above, but now for the clBLAS wrapper""" template = "<" + flavour.template + ">" if self.no_scalars() and not def_only else "" From 8d5747aa54b88812ef4060328e3befdb13f3f45a Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 5 Oct 2016 08:23:54 +0200 Subject: [PATCH 02/15] Made non-standard types void-pointers in the Netlib BLAS interface --- include/clblast_blas.h | 538 ++++++++++++------------ scripts/generator/generator/datatype.py | 7 +- scripts/generator/generator/routine.py | 6 +- 3 files changed, 275 insertions(+), 276 deletions(-) diff --git a/include/clblast_blas.h b/include/clblast_blas.h index 41b03446..a5d0cc9c 100644 --- a/include/clblast_blas.h +++ b/include/clblast_blas.h @@ -98,14 +98,14 @@ void PUBLIC_API cblas_dswap(const int n, double* x, const int x_inc, double* y, const int y_inc); void PUBLIC_API cblas_cswap(const int n, - float2* x, const int x_inc, - float2* y, const int y_inc); + void* x, const int x_inc, + void* y, const int y_inc); void PUBLIC_API cblas_zswap(const int n, - double2* x, const int x_inc, - double2* y, const int y_inc); + void* x, const int x_inc, + void* y, const int y_inc); void PUBLIC_API cblas_hswap(const int n, - half* x, const int x_inc, - half* y, const int y_inc); + void* x, const int x_inc, + void* y, const int y_inc); // Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL void PUBLIC_API cblas_sscal(const int n, @@ -116,13 +116,13 @@ void PUBLIC_API cblas_dscal(const int n, double* x, const int x_inc); void PUBLIC_API cblas_cscal(const int n, const void* alpha, - float2* x, const int x_inc); + void* x, const int x_inc); void PUBLIC_API cblas_zscal(const int n, const void* alpha, - double2* x, const int x_inc); + void* x, const int x_inc); void PUBLIC_API cblas_hscal(const int n, const void* alpha, - half* x, const int x_inc); + void* x, const int 
x_inc); // Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY void PUBLIC_API cblas_scopy(const int n, @@ -132,14 +132,14 @@ void PUBLIC_API cblas_dcopy(const int n, const double* x, const int x_inc, double* y, const int y_inc); void PUBLIC_API cblas_ccopy(const int n, - const float2* x, const int x_inc, - float2* y, const int y_inc); + const void* x, const int x_inc, + void* y, const int y_inc); void PUBLIC_API cblas_zcopy(const int n, - const double2* x, const int x_inc, - double2* y, const int y_inc); + const void* x, const int x_inc, + void* y, const int y_inc); void PUBLIC_API cblas_hcopy(const int n, - const half* x, const int x_inc, - half* y, const int y_inc); + const void* x, const int x_inc, + void* y, const int y_inc); // Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY void PUBLIC_API cblas_saxpy(const int n, @@ -152,16 +152,16 @@ void PUBLIC_API cblas_daxpy(const int n, double* y, const int y_inc); void PUBLIC_API cblas_caxpy(const int n, const void* alpha, - const float2* x, const int x_inc, - float2* y, const int y_inc); + const void* x, const int x_inc, + void* y, const int y_inc); void PUBLIC_API cblas_zaxpy(const int n, const void* alpha, - const double2* x, const int x_inc, - double2* y, const int y_inc); + const void* x, const int x_inc, + void* y, const int y_inc); void PUBLIC_API cblas_haxpy(const int n, const void* alpha, - const half* x, const int x_inc, - half* y, const int y_inc); + const void* x, const int x_inc, + void* y, const int y_inc); // Dot product of two vectors: SDOT/DDOT/HDOT void PUBLIC_API cblas_sdot(const int n, @@ -173,29 +173,29 @@ void PUBLIC_API cblas_ddot(const int n, const double* x, const int x_inc, const double* y, const int y_inc); void PUBLIC_API cblas_hdot(const int n, - half* dot, - const half* x, const int x_inc, - const half* y, const int y_inc); + void* dot, + const void* x, const int x_inc, + const void* y, const int y_inc); // Dot product of two complex vectors: CDOTU/ZDOTU void PUBLIC_API 
cblas_cdotu(const int n, - float2* dot, - const float2* x, const int x_inc, - const float2* y, const int y_inc); + void* dot, + const void* x, const int x_inc, + const void* y, const int y_inc); void PUBLIC_API cblas_zdotu(const int n, - double2* dot, - const double2* x, const int x_inc, - const double2* y, const int y_inc); + void* dot, + const void* x, const int x_inc, + const void* y, const int y_inc); // Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC void PUBLIC_API cblas_cdotc(const int n, - float2* dot, - const float2* x, const int x_inc, - const float2* y, const int y_inc); + void* dot, + const void* x, const int x_inc, + const void* y, const int y_inc); void PUBLIC_API cblas_zdotc(const int n, - double2* dot, - const double2* x, const int x_inc, - const double2* y, const int y_inc); + void* dot, + const void* x, const int x_inc, + const void* y, const int y_inc); // Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 void PUBLIC_API cblas_snrm2(const int n, @@ -205,14 +205,14 @@ void PUBLIC_API cblas_dnrm2(const int n, double* nrm2, const double* x, const int x_inc); void PUBLIC_API cblas_scnrm2(const int n, - float2* nrm2, - const float2* x, const int x_inc); + void* nrm2, + const void* x, const int x_inc); void PUBLIC_API cblas_dznrm2(const int n, - double2* nrm2, - const double2* x, const int x_inc); + void* nrm2, + const void* x, const int x_inc); void PUBLIC_API cblas_hnrm2(const int n, - half* nrm2, - const half* x, const int x_inc); + void* nrm2, + const void* x, const int x_inc); // Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM void PUBLIC_API cblas_sasum(const int n, @@ -222,14 +222,14 @@ void PUBLIC_API cblas_dasum(const int n, double* asum, const double* x, const int x_inc); void PUBLIC_API cblas_scasum(const int n, - float2* asum, - const float2* x, const int x_inc); + void* asum, + const void* x, const int x_inc); void PUBLIC_API cblas_dzasum(const int n, - double2* asum, - const double2* x, const 
int x_inc); + void* asum, + const void* x, const int x_inc); void PUBLIC_API cblas_hasum(const int n, - half* asum, - const half* x, const int x_inc); + void* asum, + const void* x, const int x_inc); // Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM void PUBLIC_API cblas_ssum(const int n, @@ -239,14 +239,14 @@ void PUBLIC_API cblas_dsum(const int n, double* sum, const double* x, const int x_inc); void PUBLIC_API cblas_scsum(const int n, - float2* sum, - const float2* x, const int x_inc); + void* sum, + const void* x, const int x_inc); void PUBLIC_API cblas_dzsum(const int n, - double2* sum, - const double2* x, const int x_inc); + void* sum, + const void* x, const int x_inc); void PUBLIC_API cblas_hsum(const int n, - half* sum, - const half* x, const int x_inc); + void* sum, + const void* x, const int x_inc); // Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX void PUBLIC_API cblas_isamax(const int n, @@ -256,14 +256,14 @@ void PUBLIC_API cblas_idamax(const int n, double* imax, const double* x, const int x_inc); void PUBLIC_API cblas_icamax(const int n, - float2* imax, - const float2* x, const int x_inc); + void* imax, + const void* x, const int x_inc); void PUBLIC_API cblas_izamax(const int n, - double2* imax, - const double2* x, const int x_inc); + void* imax, + const void* x, const int x_inc); void PUBLIC_API cblas_ihamax(const int n, - half* imax, - const half* x, const int x_inc); + void* imax, + const void* x, const int x_inc); // Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX void PUBLIC_API cblas_ismax(const int n, @@ -273,14 +273,14 @@ void PUBLIC_API cblas_idmax(const int n, double* imax, const double* x, const int x_inc); void PUBLIC_API cblas_icmax(const int n, - float2* imax, - const float2* x, const int x_inc); + void* imax, + const void* x, const int x_inc); void PUBLIC_API cblas_izmax(const int n, - double2* imax, - const double2* x, const int x_inc); + 
void* imax, + const void* x, const int x_inc); void PUBLIC_API cblas_ihmax(const int n, - half* imax, - const half* x, const int x_inc); + void* imax, + const void* x, const int x_inc); // Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN void PUBLIC_API cblas_ismin(const int n, @@ -290,14 +290,14 @@ void PUBLIC_API cblas_idmin(const int n, double* imin, const double* x, const int x_inc); void PUBLIC_API cblas_icmin(const int n, - float2* imin, - const float2* x, const int x_inc); + void* imin, + const void* x, const int x_inc); void PUBLIC_API cblas_izmin(const int n, - double2* imin, - const double2* x, const int x_inc); + void* imin, + const void* x, const int x_inc); void PUBLIC_API cblas_ihmin(const int n, - half* imin, - const half* x, const int x_inc); + void* imin, + const void* x, const int x_inc); // ================================================================================================= // BLAS level-2 (matrix-vector) routines @@ -321,24 +321,24 @@ void PUBLIC_API cblas_dgemv(const Layout layout, const Transpose a_transpose, void PUBLIC_API cblas_cgemv(const Layout layout, const Transpose a_transpose, const int m, const int n, const void* alpha, - const float2* a, const int a_ld, - const float2* x, const int x_inc, + const void* a, const int a_ld, + const void* x, const int x_inc, const void* beta, - float2* y, const int y_inc); + void* y, const int y_inc); void PUBLIC_API cblas_zgemv(const Layout layout, const Transpose a_transpose, const int m, const int n, const void* alpha, - const double2* a, const int a_ld, - const double2* x, const int x_inc, + const void* a, const int a_ld, + const void* x, const int x_inc, const void* beta, - double2* y, const int y_inc); + void* y, const int y_inc); void PUBLIC_API cblas_hgemv(const Layout layout, const Transpose a_transpose, const int m, const int n, const void* alpha, - const half* a, const int a_ld, - const half* x, const int x_inc, + const void* a, const int 
a_ld, + const void* x, const int x_inc, const void* beta, - half* y, const int y_inc); + void* y, const int y_inc); // General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV void PUBLIC_API cblas_sgbmv(const Layout layout, const Transpose a_transpose, @@ -358,72 +358,72 @@ void PUBLIC_API cblas_dgbmv(const Layout layout, const Transpose a_transpose, void PUBLIC_API cblas_cgbmv(const Layout layout, const Transpose a_transpose, const int m, const int n, const int kl, const int ku, const void* alpha, - const float2* a, const int a_ld, - const float2* x, const int x_inc, + const void* a, const int a_ld, + const void* x, const int x_inc, const void* beta, - float2* y, const int y_inc); + void* y, const int y_inc); void PUBLIC_API cblas_zgbmv(const Layout layout, const Transpose a_transpose, const int m, const int n, const int kl, const int ku, const void* alpha, - const double2* a, const int a_ld, - const double2* x, const int x_inc, + const void* a, const int a_ld, + const void* x, const int x_inc, const void* beta, - double2* y, const int y_inc); + void* y, const int y_inc); void PUBLIC_API cblas_hgbmv(const Layout layout, const Transpose a_transpose, const int m, const int n, const int kl, const int ku, const void* alpha, - const half* a, const int a_ld, - const half* x, const int x_inc, + const void* a, const int a_ld, + const void* x, const int x_inc, const void* beta, - half* y, const int y_inc); + void* y, const int y_inc); // Hermitian matrix-vector multiplication: CHEMV/ZHEMV void PUBLIC_API cblas_chemv(const Layout layout, const Triangle triangle, const int n, const void* alpha, - const float2* a, const int a_ld, - const float2* x, const int x_inc, + const void* a, const int a_ld, + const void* x, const int x_inc, const void* beta, - float2* y, const int y_inc); + void* y, const int y_inc); void PUBLIC_API cblas_zhemv(const Layout layout, const Triangle triangle, const int n, const void* alpha, - const double2* a, const int a_ld, - const 
double2* x, const int x_inc, + const void* a, const int a_ld, + const void* x, const int x_inc, const void* beta, - double2* y, const int y_inc); + void* y, const int y_inc); // Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV void PUBLIC_API cblas_chbmv(const Layout layout, const Triangle triangle, const int n, const int k, const void* alpha, - const float2* a, const int a_ld, - const float2* x, const int x_inc, + const void* a, const int a_ld, + const void* x, const int x_inc, const void* beta, - float2* y, const int y_inc); + void* y, const int y_inc); void PUBLIC_API cblas_zhbmv(const Layout layout, const Triangle triangle, const int n, const int k, const void* alpha, - const double2* a, const int a_ld, - const double2* x, const int x_inc, + const void* a, const int a_ld, + const void* x, const int x_inc, const void* beta, - double2* y, const int y_inc); + void* y, const int y_inc); // Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV void PUBLIC_API cblas_chpmv(const Layout layout, const Triangle triangle, const int n, const void* alpha, - const float2* ap, - const float2* x, const int x_inc, + const void* ap, + const void* x, const int x_inc, const void* beta, - float2* y, const int y_inc); + void* y, const int y_inc); void PUBLIC_API cblas_zhpmv(const Layout layout, const Triangle triangle, const int n, const void* alpha, - const double2* ap, - const double2* x, const int x_inc, + const void* ap, + const void* x, const int x_inc, const void* beta, - double2* y, const int y_inc); + void* y, const int y_inc); // Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV void PUBLIC_API cblas_ssymv(const Layout layout, const Triangle triangle, @@ -443,10 +443,10 @@ void PUBLIC_API cblas_dsymv(const Layout layout, const Triangle triangle, void PUBLIC_API cblas_hsymv(const Layout layout, const Triangle triangle, const int n, const void* alpha, - const half* a, const int a_ld, - const half* x, const int x_inc, + const void* a, const int a_ld, + 
const void* x, const int x_inc, const void* beta, - half* y, const int y_inc); + void* y, const int y_inc); // Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV void PUBLIC_API cblas_ssbmv(const Layout layout, const Triangle triangle, @@ -466,10 +466,10 @@ void PUBLIC_API cblas_dsbmv(const Layout layout, const Triangle triangle, void PUBLIC_API cblas_hsbmv(const Layout layout, const Triangle triangle, const int n, const int k, const void* alpha, - const half* a, const int a_ld, - const half* x, const int x_inc, + const void* a, const int a_ld, + const void* x, const int x_inc, const void* beta, - half* y, const int y_inc); + void* y, const int y_inc); // Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV void PUBLIC_API cblas_sspmv(const Layout layout, const Triangle triangle, @@ -489,10 +489,10 @@ void PUBLIC_API cblas_dspmv(const Layout layout, const Triangle triangle, void PUBLIC_API cblas_hspmv(const Layout layout, const Triangle triangle, const int n, const void* alpha, - const half* ap, - const half* x, const int x_inc, + const void* ap, + const void* x, const int x_inc, const void* beta, - half* y, const int y_inc); + void* y, const int y_inc); // Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV void PUBLIC_API cblas_strmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, @@ -505,16 +505,16 @@ void PUBLIC_API cblas_dtrmv(const Layout layout, const Triangle triangle, const double* x, const int x_inc); void PUBLIC_API cblas_ctrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int n, - const float2* a, const int a_ld, - float2* x, const int x_inc); + const void* a, const int a_ld, + void* x, const int x_inc); void PUBLIC_API cblas_ztrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int n, - const double2* a, const int a_ld, - double2* x, const 
int x_inc); + const void* a, const int a_ld, + void* x, const int x_inc); void PUBLIC_API cblas_htrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int n, - const half* a, const int a_ld, - half* x, const int x_inc); + const void* a, const int a_ld, + void* x, const int x_inc); // Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV void PUBLIC_API cblas_stbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, @@ -527,16 +527,16 @@ void PUBLIC_API cblas_dtbmv(const Layout layout, const Triangle triangle, const double* x, const int x_inc); void PUBLIC_API cblas_ctbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int n, const int k, - const float2* a, const int a_ld, - float2* x, const int x_inc); + const void* a, const int a_ld, + void* x, const int x_inc); void PUBLIC_API cblas_ztbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int n, const int k, - const double2* a, const int a_ld, - double2* x, const int x_inc); + const void* a, const int a_ld, + void* x, const int x_inc); void PUBLIC_API cblas_htbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int n, const int k, - const half* a, const int a_ld, - half* x, const int x_inc); + const void* a, const int a_ld, + void* x, const int x_inc); // Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV void PUBLIC_API cblas_stpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, @@ -549,16 +549,16 @@ void PUBLIC_API cblas_dtpmv(const Layout layout, const Triangle triangle, const double* x, const int x_inc); void PUBLIC_API cblas_ctpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const 
int n, - const float2* ap, - float2* x, const int x_inc); + const void* ap, + void* x, const int x_inc); void PUBLIC_API cblas_ztpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int n, - const double2* ap, - double2* x, const int x_inc); + const void* ap, + void* x, const int x_inc); void PUBLIC_API cblas_htpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int n, - const half* ap, - half* x, const int x_inc); + const void* ap, + void* x, const int x_inc); // Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV void PUBLIC_API cblas_strsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, @@ -571,12 +571,12 @@ void PUBLIC_API cblas_dtrsv(const Layout layout, const Triangle triangle, const double* x, const int x_inc); void PUBLIC_API cblas_ctrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int n, - const float2* a, const int a_ld, - float2* x, const int x_inc); + const void* a, const int a_ld, + void* x, const int x_inc); void PUBLIC_API cblas_ztrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int n, - const double2* a, const int a_ld, - double2* x, const int x_inc); + const void* a, const int a_ld, + void* x, const int x_inc); // Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV void PUBLIC_API cblas_stbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, @@ -589,12 +589,12 @@ void PUBLIC_API cblas_dtbsv(const Layout layout, const Triangle triangle, const double* x, const int x_inc); void PUBLIC_API cblas_ctbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int n, const int k, - const float2* a, const int a_ld, - float2* x, const int 
x_inc); + const void* a, const int a_ld, + void* x, const int x_inc); void PUBLIC_API cblas_ztbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int n, const int k, - const double2* a, const int a_ld, - double2* x, const int x_inc); + const void* a, const int a_ld, + void* x, const int x_inc); // Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV void PUBLIC_API cblas_stpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, @@ -607,12 +607,12 @@ void PUBLIC_API cblas_dtpsv(const Layout layout, const Triangle triangle, const double* x, const int x_inc); void PUBLIC_API cblas_ctpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int n, - const float2* ap, - float2* x, const int x_inc); + const void* ap, + void* x, const int x_inc); void PUBLIC_API cblas_ztpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int n, - const double2* ap, - double2* x, const int x_inc); + const void* ap, + void* x, const int x_inc); // General rank-1 matrix update: SGER/DGER/HGER void PUBLIC_API cblas_sger(const Layout layout, @@ -630,89 +630,89 @@ void PUBLIC_API cblas_dger(const Layout layout, void PUBLIC_API cblas_hger(const Layout layout, const int m, const int n, const void* alpha, - const half* x, const int x_inc, - const half* y, const int y_inc, - half* a, const int a_ld); + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld); // General rank-1 complex matrix update: CGERU/ZGERU void PUBLIC_API cblas_cgeru(const Layout layout, const int m, const int n, const void* alpha, - const float2* x, const int x_inc, - const float2* y, const int y_inc, - float2* a, const int a_ld); + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld); void PUBLIC_API cblas_zgeru(const 
Layout layout, const int m, const int n, const void* alpha, - const double2* x, const int x_inc, - const double2* y, const int y_inc, - double2* a, const int a_ld); + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld); // General rank-1 complex conjugated matrix update: CGERC/ZGERC void PUBLIC_API cblas_cgerc(const Layout layout, const int m, const int n, const void* alpha, - const float2* x, const int x_inc, - const float2* y, const int y_inc, - float2* a, const int a_ld); + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld); void PUBLIC_API cblas_zgerc(const Layout layout, const int m, const int n, const void* alpha, - const double2* x, const int x_inc, - const double2* y, const int y_inc, - double2* a, const int a_ld); + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld); // Hermitian rank-1 matrix update: CHER/ZHER void PUBLIC_API cblas_cher(const Layout layout, const Triangle triangle, const int n, - const float alpha, - const float2* x, const int x_inc, - float2* a, const int a_ld); + const void* alpha, + const void* x, const int x_inc, + void* a, const int a_ld); void PUBLIC_API cblas_zher(const Layout layout, const Triangle triangle, const int n, - const double alpha, - const double2* x, const int x_inc, - double2* a, const int a_ld); + const void* alpha, + const void* x, const int x_inc, + void* a, const int a_ld); // Hermitian packed rank-1 matrix update: CHPR/ZHPR void PUBLIC_API cblas_chpr(const Layout layout, const Triangle triangle, const int n, - const float alpha, - const float2* x, const int x_inc, - float2* ap); + const void* alpha, + const void* x, const int x_inc, + void* ap); void PUBLIC_API cblas_zhpr(const Layout layout, const Triangle triangle, const int n, - const double alpha, - const double2* x, const int x_inc, - double2* ap); + const void* alpha, + const void* x, const int x_inc, + void* ap); // Hermitian rank-2 
matrix update: CHER2/ZHER2 void PUBLIC_API cblas_cher2(const Layout layout, const Triangle triangle, const int n, const void* alpha, - const float2* x, const int x_inc, - const float2* y, const int y_inc, - float2* a, const int a_ld); + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld); void PUBLIC_API cblas_zher2(const Layout layout, const Triangle triangle, const int n, const void* alpha, - const double2* x, const int x_inc, - const double2* y, const int y_inc, - double2* a, const int a_ld); + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld); // Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 void PUBLIC_API cblas_chpr2(const Layout layout, const Triangle triangle, const int n, const void* alpha, - const float2* x, const int x_inc, - const float2* y, const int y_inc, - float2* ap); + const void* x, const int x_inc, + const void* y, const int y_inc, + void* ap); void PUBLIC_API cblas_zhpr2(const Layout layout, const Triangle triangle, const int n, const void* alpha, - const double2* x, const int x_inc, - const double2* y, const int y_inc, - double2* ap); + const void* x, const int x_inc, + const void* y, const int y_inc, + void* ap); // Symmetric rank-1 matrix update: SSYR/DSYR/HSYR void PUBLIC_API cblas_ssyr(const Layout layout, const Triangle triangle, @@ -728,8 +728,8 @@ void PUBLIC_API cblas_dsyr(const Layout layout, const Triangle triangle, void PUBLIC_API cblas_hsyr(const Layout layout, const Triangle triangle, const int n, const void* alpha, - const half* x, const int x_inc, - half* a, const int a_ld); + const void* x, const int x_inc, + void* a, const int a_ld); // Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR void PUBLIC_API cblas_sspr(const Layout layout, const Triangle triangle, @@ -745,8 +745,8 @@ void PUBLIC_API cblas_dspr(const Layout layout, const Triangle triangle, void PUBLIC_API cblas_hspr(const Layout layout, const Triangle triangle, const int n, 
const void* alpha, - const half* x, const int x_inc, - half* ap); + const void* x, const int x_inc, + void* ap); // Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 void PUBLIC_API cblas_ssyr2(const Layout layout, const Triangle triangle, @@ -764,9 +764,9 @@ void PUBLIC_API cblas_dsyr2(const Layout layout, const Triangle triangle, void PUBLIC_API cblas_hsyr2(const Layout layout, const Triangle triangle, const int n, const void* alpha, - const half* x, const int x_inc, - const half* y, const int y_inc, - half* a, const int a_ld); + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld); // Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 void PUBLIC_API cblas_sspr2(const Layout layout, const Triangle triangle, @@ -784,9 +784,9 @@ void PUBLIC_API cblas_dspr2(const Layout layout, const Triangle triangle, void PUBLIC_API cblas_hspr2(const Layout layout, const Triangle triangle, const int n, const void* alpha, - const half* x, const int x_inc, - const half* y, const int y_inc, - half* ap); + const void* x, const int x_inc, + const void* y, const int y_inc, + void* ap); // ================================================================================================= // BLAS level-3 (matrix-matrix) routines @@ -810,24 +810,24 @@ void PUBLIC_API cblas_dgemm(const Layout layout, const Transpose a_transpose, co void PUBLIC_API cblas_cgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const int m, const int n, const int k, const void* alpha, - const float2* a, const int a_ld, - const float2* b, const int b_ld, + const void* a, const int a_ld, + const void* b, const int b_ld, const void* beta, - float2* c, const int c_ld); + void* c, const int c_ld); void PUBLIC_API cblas_zgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const int m, const int n, const int k, const void* alpha, - const double2* a, const int a_ld, - const double2* b, const int b_ld, + const 
void* a, const int a_ld, + const void* b, const int b_ld, const void* beta, - double2* c, const int c_ld); + void* c, const int c_ld); void PUBLIC_API cblas_hgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const int m, const int n, const int k, const void* alpha, - const half* a, const int a_ld, - const half* b, const int b_ld, + const void* a, const int a_ld, + const void* b, const int b_ld, const void* beta, - half* c, const int c_ld); + void* c, const int c_ld); // Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM void PUBLIC_API cblas_ssymm(const Layout layout, const Side side, const Triangle triangle, @@ -847,40 +847,40 @@ void PUBLIC_API cblas_dsymm(const Layout layout, const Side side, const Triangle void PUBLIC_API cblas_csymm(const Layout layout, const Side side, const Triangle triangle, const int m, const int n, const void* alpha, - const float2* a, const int a_ld, - const float2* b, const int b_ld, + const void* a, const int a_ld, + const void* b, const int b_ld, const void* beta, - float2* c, const int c_ld); + void* c, const int c_ld); void PUBLIC_API cblas_zsymm(const Layout layout, const Side side, const Triangle triangle, const int m, const int n, const void* alpha, - const double2* a, const int a_ld, - const double2* b, const int b_ld, + const void* a, const int a_ld, + const void* b, const int b_ld, const void* beta, - double2* c, const int c_ld); + void* c, const int c_ld); void PUBLIC_API cblas_hsymm(const Layout layout, const Side side, const Triangle triangle, const int m, const int n, const void* alpha, - const half* a, const int a_ld, - const half* b, const int b_ld, + const void* a, const int a_ld, + const void* b, const int b_ld, const void* beta, - half* c, const int c_ld); + void* c, const int c_ld); // Hermitian matrix-matrix multiplication: CHEMM/ZHEMM void PUBLIC_API cblas_chemm(const Layout layout, const Side side, const Triangle triangle, const int m, const int n, const void* 
alpha, - const float2* a, const int a_ld, - const float2* b, const int b_ld, + const void* a, const int a_ld, + const void* b, const int b_ld, const void* beta, - float2* c, const int c_ld); + void* c, const int c_ld); void PUBLIC_API cblas_zhemm(const Layout layout, const Side side, const Triangle triangle, const int m, const int n, const void* alpha, - const double2* a, const int a_ld, - const double2* b, const int b_ld, + const void* a, const int a_ld, + const void* b, const int b_ld, const void* beta, - double2* c, const int c_ld); + void* c, const int c_ld); // Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK void PUBLIC_API cblas_ssyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, @@ -898,35 +898,35 @@ void PUBLIC_API cblas_dsyrk(const Layout layout, const Triangle triangle, const void PUBLIC_API cblas_csyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const int n, const int k, const void* alpha, - const float2* a, const int a_ld, + const void* a, const int a_ld, const void* beta, - float2* c, const int c_ld); + void* c, const int c_ld); void PUBLIC_API cblas_zsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const int n, const int k, const void* alpha, - const double2* a, const int a_ld, + const void* a, const int a_ld, const void* beta, - double2* c, const int c_ld); + void* c, const int c_ld); void PUBLIC_API cblas_hsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const int n, const int k, const void* alpha, - const half* a, const int a_ld, + const void* a, const int a_ld, const void* beta, - half* c, const int c_ld); + void* c, const int c_ld); // Rank-K update of a hermitian matrix: CHERK/ZHERK void PUBLIC_API cblas_cherk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const int n, const int k, - const float alpha, - const float2* a, const int a_ld, - const float beta, - float2* c, const int 
c_ld); + const void* alpha, + const void* a, const int a_ld, + const void* beta, + void* c, const int c_ld); void PUBLIC_API cblas_zherk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const int n, const int k, - const double alpha, - const double2* a, const int a_ld, - const double beta, - double2* c, const int c_ld); + const void* alpha, + const void* a, const int a_ld, + const void* beta, + void* c, const int c_ld); // Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K void PUBLIC_API cblas_ssyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, @@ -946,40 +946,40 @@ void PUBLIC_API cblas_dsyr2k(const Layout layout, const Triangle triangle, const void PUBLIC_API cblas_csyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, const int n, const int k, const void* alpha, - const float2* a, const int a_ld, - const float2* b, const int b_ld, + const void* a, const int a_ld, + const void* b, const int b_ld, const void* beta, - float2* c, const int c_ld); + void* c, const int c_ld); void PUBLIC_API cblas_zsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, const int n, const int k, const void* alpha, - const double2* a, const int a_ld, - const double2* b, const int b_ld, + const void* a, const int a_ld, + const void* b, const int b_ld, const void* beta, - double2* c, const int c_ld); + void* c, const int c_ld); void PUBLIC_API cblas_hsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, const int n, const int k, const void* alpha, - const half* a, const int a_ld, - const half* b, const int b_ld, + const void* a, const int a_ld, + const void* b, const int b_ld, const void* beta, - half* c, const int c_ld); + void* c, const int c_ld); // Rank-2K update of a hermitian matrix: CHER2K/ZHER2K void PUBLIC_API cblas_cher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, const int n, const int 
k, const void* alpha, - const float2* a, const int a_ld, - const float2* b, const int b_ld, - const float beta, - float2* c, const int c_ld); + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld); void PUBLIC_API cblas_zher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, const int n, const int k, const void* alpha, - const double2* a, const int a_ld, - const double2* b, const int b_ld, - const double beta, - double2* c, const int c_ld); + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld); // Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM void PUBLIC_API cblas_strmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, @@ -995,18 +995,18 @@ void PUBLIC_API cblas_dtrmm(const Layout layout, const Side side, const Triangle void PUBLIC_API cblas_ctrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int m, const int n, const void* alpha, - const float2* a, const int a_ld, - float2* b, const int b_ld); + const void* a, const int a_ld, + void* b, const int b_ld); void PUBLIC_API cblas_ztrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int m, const int n, const void* alpha, - const double2* a, const int a_ld, - double2* b, const int b_ld); + const void* a, const int a_ld, + void* b, const int b_ld); void PUBLIC_API cblas_htrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int m, const int n, const void* alpha, - const half* a, const int a_ld, - half* b, const int b_ld); + const void* a, const int a_ld, + void* b, const int b_ld); // Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM void 
PUBLIC_API cblas_strsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, @@ -1022,18 +1022,18 @@ void PUBLIC_API cblas_dtrsm(const Layout layout, const Side side, const Triangle void PUBLIC_API cblas_ctrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int m, const int n, const void* alpha, - const float2* a, const int a_ld, - float2* b, const int b_ld); + const void* a, const int a_ld, + void* b, const int b_ld); void PUBLIC_API cblas_ztrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int m, const int n, const void* alpha, - const double2* a, const int a_ld, - double2* b, const int b_ld); + const void* a, const int a_ld, + void* b, const int b_ld); void PUBLIC_API cblas_htrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int m, const int n, const void* alpha, - const half* a, const int a_ld, - half* b, const int b_ld); + const void* a, const int a_ld, + void* b, const int b_ld); // ================================================================================================= // Extra non-BLAS routines (level-X) @@ -1053,18 +1053,18 @@ void PUBLIC_API cblas_domatcopy(const Layout layout, const Transpose a_transpose void PUBLIC_API cblas_comatcopy(const Layout layout, const Transpose a_transpose, const int m, const int n, const void* alpha, - const float2* a, const int a_ld, - float2* b, const int b_ld); + const void* a, const int a_ld, + void* b, const int b_ld); void PUBLIC_API cblas_zomatcopy(const Layout layout, const Transpose a_transpose, const int m, const int n, const void* alpha, - const double2* a, const int a_ld, - double2* b, const int b_ld); + const void* a, const int a_ld, + void* b, const int b_ld); void PUBLIC_API cblas_homatcopy(const Layout layout, 
const Transpose a_transpose, const int m, const int n, const void* alpha, - const half* a, const int a_ld, - half* b, const int b_ld); + const void* a, const int a_ld, + void* b, const int b_ld); half* b, const size_t b_offset, const size_t b_ld); // ================================================================================================= diff --git a/scripts/generator/generator/datatype.py b/scripts/generator/generator/datatype.py index 29acc744..01f32dd8 100644 --- a/scripts/generator/generator/datatype.py +++ b/scripts/generator/generator/datatype.py @@ -65,10 +65,9 @@ class DataType: return ((scalar == "alpha" and self.alpha_cpp in [D_FLOAT2, D_DOUBLE2]) or (scalar == "beta" and self.beta_cpp in [D_FLOAT2, D_DOUBLE2])) - def is_non_standard(self, scalar): - """Current scalar is of a non-standard type""" - return ((scalar == "alpha" and self.alpha_cpp in [D_HALF, D_FLOAT2, D_DOUBLE2]) or - (scalar == "beta" and self.beta_cpp in [D_HALF, D_FLOAT2, D_DOUBLE2])) + def is_non_standard(self): + """Current type is of a non-standard type""" + return self.buffer_type in [D_HALF, D_FLOAT2, D_DOUBLE2] # Regular data-types diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py index 4870b861..126d64ce 100644 --- a/scripts/generator/generator/routine.py +++ b/scripts/generator/generator/routine.py @@ -182,7 +182,7 @@ class Routine: """As above but as plain C pointer""" prefix = "const " if name in self.inputs else "" if name in self.inputs or name in self.outputs: - data_type = "void" if flavour.is_non_standard(name) else flavour.buffer_type + data_type = "void" if flavour.is_non_standard() else flavour.buffer_type a = [prefix + data_type + "* " + name + ""] c = ["const int " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else [] return [", ".join(a + c)] @@ -317,9 +317,9 @@ class Routine: """Retrieves the definition of a scalar (alpha/beta) but make it a void pointer in case of non-standard 
types""" if name in self.scalars: if name == "alpha": - data_type = "void*" if flavour.is_non_standard(name) else flavour.alpha_cpp + data_type = "void*" if flavour.is_non_standard() else flavour.alpha_cpp return ["const " + data_type + " " + name] - data_type = "void*" if flavour.is_non_standard(name) else flavour.beta_cpp + data_type = "void*" if flavour.is_non_standard() else flavour.beta_cpp return ["const " + data_type + " " + name] return [] From f96fd372bc3087938572ebc55bd1d8e1b7e6f18a Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Tue, 25 Oct 2016 14:28:52 +0200 Subject: [PATCH 03/15] Added initial version of a Netlib CBLAS implementation. TODO: Set correct buffer sizes --- CMakeLists.txt | 2 + include/clblast_blas.h | 158 +- include/clblast_c.h | 5 - scripts/generator/generator.py | 108 +- scripts/generator/generator/cpp.py | 58 +- scripts/generator/generator/datatype.py | 16 + scripts/generator/generator/routine.py | 59 +- src/clblast_blas.cpp | 4651 +++++++++++++++++++++++ 8 files changed, 4814 insertions(+), 243 deletions(-) create mode 100644 src/clblast_blas.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index f5edbd75..d2034617 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -163,6 +163,7 @@ set(PRECISIONS 32 64 3232 6464 16) # Gathers all source-files set(SOURCES + src/clblast_blas.cpp src/database/database.cpp src/routines/common.cpp src/utilities/clblast_exceptions.cpp @@ -213,6 +214,7 @@ install(TARGETS clblast EXPORT CLBlast DESTINATION lib) install(FILES include/clblast.h DESTINATION include) install(FILES include/clblast_c.h DESTINATION include) install(FILES include/clblast_half.h DESTINATION include) +install(FILES include/clblast_blas.h DESTINATION include) # Installs the config for find_package in dependent projects install(EXPORT CLBlast DESTINATION lib/cmake/CLBLast FILE CLBlastConfig.cmake) diff --git a/include/clblast_blas.h b/include/clblast_blas.h index a5d0cc9c..b4db4192 100644 --- a/include/clblast_blas.h +++ 
b/include/clblast_blas.h @@ -18,8 +18,8 @@ // Exports library functions under Windows when building a DLL. See also: // https://msdn.microsoft.com/en-us/library/a90k134d.aspx -#ifdef _WIN32 - #ifdef COMPILING_DLL +#if defined(_WIN32) && defined(CLBLAST_DLL) + #if defined(COMPILING_DLL) #define PUBLIC_API __declspec(dllexport) #else #define PUBLIC_API __declspec(dllimport) @@ -42,6 +42,7 @@ typedef enum Triangle_ { kUpper = 121, kLower = 122 } Triangle; typedef enum Diagonal_ { kNonUnit = 131, kUnit = 132 } Diagonal; typedef enum Side_ { kLeft = 141, kRight = 142 } Side; + // ================================================================================================= // BLAS level-1 (vector-vector) routines // ================================================================================================= @@ -103,9 +104,6 @@ void PUBLIC_API cblas_cswap(const int n, void PUBLIC_API cblas_zswap(const int n, void* x, const int x_inc, void* y, const int y_inc); -void PUBLIC_API cblas_hswap(const int n, - void* x, const int x_inc, - void* y, const int y_inc); // Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL void PUBLIC_API cblas_sscal(const int n, @@ -120,9 +118,6 @@ void PUBLIC_API cblas_cscal(const int n, void PUBLIC_API cblas_zscal(const int n, const void* alpha, void* x, const int x_inc); -void PUBLIC_API cblas_hscal(const int n, - const void* alpha, - void* x, const int x_inc); // Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY void PUBLIC_API cblas_scopy(const int n, @@ -137,9 +132,6 @@ void PUBLIC_API cblas_ccopy(const int n, void PUBLIC_API cblas_zcopy(const int n, const void* x, const int x_inc, void* y, const int y_inc); -void PUBLIC_API cblas_hcopy(const int n, - const void* x, const int x_inc, - void* y, const int y_inc); // Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY void PUBLIC_API cblas_saxpy(const int n, @@ -158,10 +150,6 @@ void PUBLIC_API cblas_zaxpy(const int n, const void* alpha, const void* x, const int x_inc, void* y, const int 
y_inc); -void PUBLIC_API cblas_haxpy(const int n, - const void* alpha, - const void* x, const int x_inc, - void* y, const int y_inc); // Dot product of two vectors: SDOT/DDOT/HDOT void PUBLIC_API cblas_sdot(const int n, @@ -172,10 +160,6 @@ void PUBLIC_API cblas_ddot(const int n, double* dot, const double* x, const int x_inc, const double* y, const int y_inc); -void PUBLIC_API cblas_hdot(const int n, - void* dot, - const void* x, const int x_inc, - const void* y, const int y_inc); // Dot product of two complex vectors: CDOTU/ZDOTU void PUBLIC_API cblas_cdotu(const int n, @@ -210,9 +194,6 @@ void PUBLIC_API cblas_scnrm2(const int n, void PUBLIC_API cblas_dznrm2(const int n, void* nrm2, const void* x, const int x_inc); -void PUBLIC_API cblas_hnrm2(const int n, - void* nrm2, - const void* x, const int x_inc); // Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM void PUBLIC_API cblas_sasum(const int n, @@ -227,9 +208,6 @@ void PUBLIC_API cblas_scasum(const int n, void PUBLIC_API cblas_dzasum(const int n, void* asum, const void* x, const int x_inc); -void PUBLIC_API cblas_hasum(const int n, - void* asum, - const void* x, const int x_inc); // Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM void PUBLIC_API cblas_ssum(const int n, @@ -244,9 +222,6 @@ void PUBLIC_API cblas_scsum(const int n, void PUBLIC_API cblas_dzsum(const int n, void* sum, const void* x, const int x_inc); -void PUBLIC_API cblas_hsum(const int n, - void* sum, - const void* x, const int x_inc); // Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX void PUBLIC_API cblas_isamax(const int n, @@ -261,9 +236,6 @@ void PUBLIC_API cblas_icamax(const int n, void PUBLIC_API cblas_izamax(const int n, void* imax, const void* x, const int x_inc); -void PUBLIC_API cblas_ihamax(const int n, - void* imax, - const void* x, const int x_inc); // Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX void PUBLIC_API 
cblas_ismax(const int n, @@ -278,9 +250,6 @@ void PUBLIC_API cblas_icmax(const int n, void PUBLIC_API cblas_izmax(const int n, void* imax, const void* x, const int x_inc); -void PUBLIC_API cblas_ihmax(const int n, - void* imax, - const void* x, const int x_inc); // Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN void PUBLIC_API cblas_ismin(const int n, @@ -295,9 +264,6 @@ void PUBLIC_API cblas_icmin(const int n, void PUBLIC_API cblas_izmin(const int n, void* imin, const void* x, const int x_inc); -void PUBLIC_API cblas_ihmin(const int n, - void* imin, - const void* x, const int x_inc); // ================================================================================================= // BLAS level-2 (matrix-vector) routines @@ -332,13 +298,6 @@ void PUBLIC_API cblas_zgemv(const Layout layout, const Transpose a_transpose, const void* x, const int x_inc, const void* beta, void* y, const int y_inc); -void PUBLIC_API cblas_hgemv(const Layout layout, const Transpose a_transpose, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc); // General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV void PUBLIC_API cblas_sgbmv(const Layout layout, const Transpose a_transpose, @@ -369,13 +328,6 @@ void PUBLIC_API cblas_zgbmv(const Layout layout, const Transpose a_transpose, const void* x, const int x_inc, const void* beta, void* y, const int y_inc); -void PUBLIC_API cblas_hgbmv(const Layout layout, const Transpose a_transpose, - const int m, const int n, const int kl, const int ku, - const void* alpha, - const void* a, const int a_ld, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc); // Hermitian matrix-vector multiplication: CHEMV/ZHEMV void PUBLIC_API cblas_chemv(const Layout layout, const Triangle triangle, @@ -440,13 +392,6 @@ void PUBLIC_API cblas_dsymv(const Layout layout, 
const Triangle triangle, const double* x, const int x_inc, const double beta, double* y, const int y_inc); -void PUBLIC_API cblas_hsymv(const Layout layout, const Triangle triangle, - const int n, - const void* alpha, - const void* a, const int a_ld, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc); // Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV void PUBLIC_API cblas_ssbmv(const Layout layout, const Triangle triangle, @@ -463,13 +408,6 @@ void PUBLIC_API cblas_dsbmv(const Layout layout, const Triangle triangle, const double* x, const int x_inc, const double beta, double* y, const int y_inc); -void PUBLIC_API cblas_hsbmv(const Layout layout, const Triangle triangle, - const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc); // Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV void PUBLIC_API cblas_sspmv(const Layout layout, const Triangle triangle, @@ -486,13 +424,6 @@ void PUBLIC_API cblas_dspmv(const Layout layout, const Triangle triangle, const double* x, const int x_inc, const double beta, double* y, const int y_inc); -void PUBLIC_API cblas_hspmv(const Layout layout, const Triangle triangle, - const int n, - const void* alpha, - const void* ap, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc); // Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV void PUBLIC_API cblas_strmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, @@ -511,10 +442,6 @@ void PUBLIC_API cblas_ztrmv(const Layout layout, const Triangle triangle, const const int n, const void* a, const int a_ld, void* x, const int x_inc); -void PUBLIC_API cblas_htrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const int n, - const void* a, const int a_ld, - void* x, const int 
x_inc); // Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV void PUBLIC_API cblas_stbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, @@ -533,10 +460,6 @@ void PUBLIC_API cblas_ztbmv(const Layout layout, const Triangle triangle, const const int n, const int k, const void* a, const int a_ld, void* x, const int x_inc); -void PUBLIC_API cblas_htbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const int n, const int k, - const void* a, const int a_ld, - void* x, const int x_inc); // Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV void PUBLIC_API cblas_stpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, @@ -555,10 +478,6 @@ void PUBLIC_API cblas_ztpmv(const Layout layout, const Triangle triangle, const const int n, const void* ap, void* x, const int x_inc); -void PUBLIC_API cblas_htpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const int n, - const void* ap, - void* x, const int x_inc); // Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV void PUBLIC_API cblas_strsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, @@ -627,12 +546,6 @@ void PUBLIC_API cblas_dger(const Layout layout, const double* x, const int x_inc, const double* y, const int y_inc, double* a, const int a_ld); -void PUBLIC_API cblas_hger(const Layout layout, - const int m, const int n, - const void* alpha, - const void* x, const int x_inc, - const void* y, const int y_inc, - void* a, const int a_ld); // General rank-1 complex matrix update: CGERU/ZGERU void PUBLIC_API cblas_cgeru(const Layout layout, @@ -725,11 +638,6 @@ void PUBLIC_API cblas_dsyr(const Layout layout, const Triangle triangle, const double alpha, const double* x, const int x_inc, double* a, const 
int a_ld); -void PUBLIC_API cblas_hsyr(const Layout layout, const Triangle triangle, - const int n, - const void* alpha, - const void* x, const int x_inc, - void* a, const int a_ld); // Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR void PUBLIC_API cblas_sspr(const Layout layout, const Triangle triangle, @@ -742,11 +650,6 @@ void PUBLIC_API cblas_dspr(const Layout layout, const Triangle triangle, const double alpha, const double* x, const int x_inc, double* ap); -void PUBLIC_API cblas_hspr(const Layout layout, const Triangle triangle, - const int n, - const void* alpha, - const void* x, const int x_inc, - void* ap); // Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 void PUBLIC_API cblas_ssyr2(const Layout layout, const Triangle triangle, @@ -761,12 +664,6 @@ void PUBLIC_API cblas_dsyr2(const Layout layout, const Triangle triangle, const double* x, const int x_inc, const double* y, const int y_inc, double* a, const int a_ld); -void PUBLIC_API cblas_hsyr2(const Layout layout, const Triangle triangle, - const int n, - const void* alpha, - const void* x, const int x_inc, - const void* y, const int y_inc, - void* a, const int a_ld); // Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 void PUBLIC_API cblas_sspr2(const Layout layout, const Triangle triangle, @@ -781,12 +678,6 @@ void PUBLIC_API cblas_dspr2(const Layout layout, const Triangle triangle, const double* x, const int x_inc, const double* y, const int y_inc, double* ap); -void PUBLIC_API cblas_hspr2(const Layout layout, const Triangle triangle, - const int n, - const void* alpha, - const void* x, const int x_inc, - const void* y, const int y_inc, - void* ap); // ================================================================================================= // BLAS level-3 (matrix-matrix) routines @@ -821,13 +712,6 @@ void PUBLIC_API cblas_zgemm(const Layout layout, const Transpose a_transpose, co const void* b, const int b_ld, const void* beta, void* c, const int c_ld); -void PUBLIC_API 
cblas_hgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, - const int m, const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* b, const int b_ld, - const void* beta, - void* c, const int c_ld); // Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM void PUBLIC_API cblas_ssymm(const Layout layout, const Side side, const Triangle triangle, @@ -858,13 +742,6 @@ void PUBLIC_API cblas_zsymm(const Layout layout, const Side side, const Triangle const void* b, const int b_ld, const void* beta, void* c, const int c_ld); -void PUBLIC_API cblas_hsymm(const Layout layout, const Side side, const Triangle triangle, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - const void* b, const int b_ld, - const void* beta, - void* c, const int c_ld); // Hermitian matrix-matrix multiplication: CHEMM/ZHEMM void PUBLIC_API cblas_chemm(const Layout layout, const Side side, const Triangle triangle, @@ -907,12 +784,6 @@ void PUBLIC_API cblas_zsyrk(const Layout layout, const Triangle triangle, const const void* a, const int a_ld, const void* beta, void* c, const int c_ld); -void PUBLIC_API cblas_hsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* beta, - void* c, const int c_ld); // Rank-K update of a hermitian matrix: CHERK/ZHERK void PUBLIC_API cblas_cherk(const Layout layout, const Triangle triangle, const Transpose a_transpose, @@ -957,13 +828,6 @@ void PUBLIC_API cblas_zsyr2k(const Layout layout, const Triangle triangle, const const void* b, const int b_ld, const void* beta, void* c, const int c_ld); -void PUBLIC_API cblas_hsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* b, const int b_ld, - const void* beta, - void* 
c, const int c_ld); // Rank-2K update of a hermitian matrix: CHER2K/ZHER2K void PUBLIC_API cblas_cher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, @@ -1002,11 +866,6 @@ void PUBLIC_API cblas_ztrmm(const Layout layout, const Side side, const Triangle const void* alpha, const void* a, const int a_ld, void* b, const int b_ld); -void PUBLIC_API cblas_htrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - void* b, const int b_ld); // Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM void PUBLIC_API cblas_strsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, @@ -1029,11 +888,6 @@ void PUBLIC_API cblas_ztrsm(const Layout layout, const Side side, const Triangle const void* alpha, const void* a, const int a_ld, void* b, const int b_ld); -void PUBLIC_API cblas_htrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - void* b, const int b_ld); // ================================================================================================= // Extra non-BLAS routines (level-X) @@ -1060,12 +914,6 @@ void PUBLIC_API cblas_zomatcopy(const Layout layout, const Transpose a_transpose const void* alpha, const void* a, const int a_ld, void* b, const int b_ld); -void PUBLIC_API cblas_homatcopy(const Layout layout, const Transpose a_transpose, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - void* b, const int b_ld); - half* b, const size_t b_offset, const size_t b_ld); // ================================================================================================= diff --git a/include/clblast_c.h b/include/clblast_c.h index 
81f093cd..72f50d83 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -117,11 +117,6 @@ typedef enum CLBlastDiagonal_ { CLBlastDiagonalNonUnit = 131, CLBlastDiagonalUnit = 132 } CLBlastDiagonal; typedef enum CLBlastSide_ { CLBlastSideLeft = 141, CLBlastSideRight = 142 } CLBlastSide; -// Precision scoped enum (values in bits) -typedef enum CLBlastPrecision_ { CLBlastPrecisionHalf = 16, CLBlastPrecisionSingle = 32, - CLBlastPrecisionDouble = 64, CLBlastPrecisionComplexSingle = 3232, - CLBlastPrecisionComplexDouble = 6464 } CLBlastPrecision; - // ================================================================================================= // BLAS level-1 (vector-vector) routines // ================================================================================================= diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 220b314d..4ba97ff8 100755 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -41,8 +41,8 @@ FILES = [ "/include/clblast_blas.h", "/src/clblast_blas.cpp", ] -HEADER_LINES = [117, 73, 118, 22, 29, 41, 43, 1] -FOOTER_LINES = [17, 80, 19, 18, 6, 6, 10, 1] +HEADER_LINES = [117, 73, 118, 22, 29, 41, 44, 32] +FOOTER_LINES = [17, 80, 19, 18, 6, 6, 9, 3] # Different possibilities for requirements ald_m = "The value of `a_ld` must be at least `m`." @@ -64,65 +64,65 @@ cld_n = "The value of `c_ld` must be at least `n`." 
# Populates a list of routines ROUTINES = [ [ # Level 1: vector-vector - Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation", "", []), - Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], [], "", "Generate modified givens plane rotation", "", []), - Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation", "", []), - Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [], "", "Apply modified givens plane rotation", "", []), - Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []), - Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []), - Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []), - Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []), - Routine(True, True, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two vectors", "Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. 
The sum is stored in the _dot_ buffer.", []), - Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []), - Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []), - Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []), - Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []), - Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []), - Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []), - Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. 
This routine is the non-absolute version of the IxAMAX BLAS routine.", []), - Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []), + Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], ["1","1","1","1"], [], "", "Generate givens plane rotation", "", []), + Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], ["1","1","1","1","1"], [], "", "Generate modified givens plane rotation", "", []), + Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["n","n"], ["cos","sin"], "", "Apply givens plane rotation", "", []), + Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], ["n","n","1"], [], "", "Apply modified givens plane rotation", "", []), + Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], ["n","n"], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []), + Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], ["n"], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []), + Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["n","n"], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []), + Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["n","n"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []), + Routine(True, True, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], ["n","n","1"], [], "n", "Dot product of two vectors", "Multiplies _n_ 
elements of the vectors _x_ and _y_ element-wise and accumulates the results. The sum is stored in the _dot_ buffer.", []), + Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], ["n","n","1"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []), + Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], ["n","n","1"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []), + Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], ["n","1"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []), + Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], ["n","1"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []), + Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], ["n","1"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []), + Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], ["n","1"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []), + Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], ["n","1"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. 
This routine is the non-absolute version of the IxAMAX BLAS routine.", []), + Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], ["n","1"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []), ], [ # Level 2: matrix-vector - Routine(True, True, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]), - Routine(True, True, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]), - Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]), - Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]), - Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2a", 
"symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]), - Routine(True, True, "2a", "sbmv", T, [S,D,H], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]), - Routine(True, True, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]), - Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]), - Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []), - Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a triangular system of equations", "", []), - Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]), - Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], 
["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "", "Solves a packed triangular system of equations", "", []), + Routine(True, True, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]), + Routine(True, True, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]), + Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]), + Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]), + Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2a", "symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]), + Routine(True, True, "2a", "sbmv", T, [S,D,H], 
["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]), + Routine(True, True, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], ["n","n"], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]), + Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], ["n","n"], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]), + Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], ["n","n"], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []), + Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], ["n","n"], [], "", "Solves a triangular system of equations", "", []), + Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], ["n","n"], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]), + Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], ["n","n"], [], "", "Solves a packed triangular system of equations", "", []), # Level 2: matrix update - 
Routine(True, True, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]), - Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]), - Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]), - Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermetian matrix to be updated, and alpha is a scalar value.", [ald_n]), - Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]), - Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", 
"Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]), - Routine(True, True, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]), - Routine(True, True, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], ["n","n","n"], ["alpha"], "", "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]), + Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["n","n","n"], ["alpha"], "", "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]), + Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["n","n","n"], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]), + 
Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["n","n"], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermetian matrix to be updated, and alpha is a scalar value.", [ald_n]), + Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["n","n"], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["n","n","n"], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]), + Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["n","n","n"], ["alpha"], "", "Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], ["n","n"], ["alpha"], "", "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]), + Routine(True, True, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], ["n","n"], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["n","n","n"], 
["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]), + Routine(True, True, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["n","n","n"], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), ], [ # Level 3: matrix-matrix - Routine(True, True, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]), - Routine(True, True, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. 
Otherwise, in case of `side == kRight`, _A_ is a symmtric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]), - Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]), - Routine(True, True, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]), - Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]), - Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]), - Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]), - 
Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]), - Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Solves a triangular system of equations", "", []), + Routine(True, True, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]), + Routine(True, True, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. 
Otherwise, in case of `side == kRight`, _A_ is a symmtric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]), + Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]), + Routine(True, True, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["n","n"], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]), + Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["n","n"], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]), + Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]), + Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian 
matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]), + Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["n","n"], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]), + Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["n","n"], ["alpha"], "", "Solves a triangular system of equations", "", []), ], [ # Level X: extra routines (not part of BLAS) - Routine(True, True, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]), + Routine(True, True, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], ["n","n"], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. 
The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]), ]] diff --git a/scripts/generator/generator/cpp.py b/scripts/generator/generator/cpp.py index 61730fdb..23a2207c 100644 --- a/scripts/generator/generator/cpp.py +++ b/scripts/generator/generator/cpp.py @@ -99,7 +99,8 @@ def clblast_blas_h(routine): """The Netlib CBLAS API header (.h)""" result = NL + "// " + routine.description + ": " + routine.short_names() + NL for flavour in routine.flavours: - result += routine.routine_header_netlib(flavour, 24, " PUBLIC_API") + ";" + NL + if flavour.precision_name in ["S", "D", "C", "Z"]: + result += routine.routine_header_netlib(flavour, 24, " PUBLIC_API") + ";" + NL return result @@ -107,31 +108,44 @@ def clblast_blas_cc(routine): """The Netlib CBLAS API implementation (.cpp)""" result = NL + "// " + routine.name.upper() + NL for flavour in routine.flavours: - template = "<" + flavour.template + ">" if routine.no_scalars() else "" - indent = " " * (26 + routine.length() + len(template)) - result += routine.routine_header_netlib(flavour, 13, "") + " {" + NL - # Initialize OpenCL - result += " auto platform = Platform(size_t{0});" + NL - result += " auto device = Device(platform, size_t{0});" + NL - result += " auto context = Context(device);" + NL - result += " auto queue = Queue(context, device);" + NL + # There is a version available in CBLAS + if flavour.precision_name in ["S", "D", "C", "Z"]: + template = "<" + flavour.template + ">" if routine.no_scalars() else "" + indent = " " * (12 + routine.length() + len(template)) + result += routine.routine_header_netlib(flavour, 13, "") + " {" + NL - # Copy data structures to the device - for name in routine.inputs + routine.outputs: - result += " " + routine.create_buffer(name, flavour.template, "0") + NL - for name in routine.inputs + routine.outputs: - result += " " + routine.write_buffer(name, "0") + NL + # Initialize OpenCL + result += " auto device = 
get_device();" + NL + result += " auto context = Context(device);" + NL + result += " auto queue = Queue(context, device);" + NL - # The function call - result += " auto status = clblast::" + routine.name.capitalize() + template + "(" - result += ("," + NL + indent).join([a for a in routine.arguments_cast(flavour, indent)]) - result += "," + NL + indent + "queue, event);" + NL + # Set alpha and beta + result += "".join(" " + s + NL for s in routine.scalar_create_cpp(flavour)) - # Copy back and clean-up - for name in routine.outputs: - result += " " + routine.read_buffer(name, "0") + NL - result += " return;" + NL + "}" + NL + # Copy data structures to the device + for i, name in enumerate(routine.inputs + routine.outputs): + result += " " + routine.set_size(name, routine.buffer_sizes[i]) + NL + result += " " + routine.create_buffer(name, flavour.buffer_type) + NL + for name in routine.inputs + routine.outputs: + prefix = "" if name in routine.outputs else "const " + result += " " + routine.write_buffer(name, prefix + flavour.buffer_type) + NL + + # The function call + result += " auto queue_cl = queue();" + NL + result += " auto s = " + routine.name.capitalize() + template + "(" + result += ("," + NL + indent).join([a for a in routine.arguments_netlib(flavour, indent)]) + result += "," + NL + indent + "&queue_cl);" + NL + + # Error handling + result += " if (s != StatusCode::kSuccess) {" + NL + result += " throw std::runtime_error(\"CLBlast returned with error code \" + ToString(s));" + NL + result += " }" + NL + + # Copy back and clean-up + for name in routine.outputs: + result += " " + routine.read_buffer(name, flavour.buffer_type) + NL + result += "}" + NL return result diff --git a/scripts/generator/generator/datatype.py b/scripts/generator/generator/datatype.py index 01f32dd8..98874174 100644 --- a/scripts/generator/generator/datatype.py +++ b/scripts/generator/generator/datatype.py @@ -54,6 +54,22 @@ class DataType: return self.beta_cl + "{{beta.real(), 
beta.imag()}}" return "beta" + def use_alpha_clblast(self): + """Transforms a Netlib CBLAS parameter to CLBlast style""" + if self.alpha_cpp == D_FLOAT2: + return self.alpha_cpp + "{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}" + elif self.alpha_cpp == D_DOUBLE2: + return self.alpha_cpp + "{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}" + return "alpha" + + def use_beta_clblast(self): + """As above, but for beta instead of alpha""" + if self.beta_cpp == D_FLOAT2: + return self.beta_cpp + "{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}" + elif self.beta_cpp == D_DOUBLE2: + return self.beta_cpp + "{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}" + return "beta" + def test_template(self): """Returns the template as used in the correctness/performance tests""" if self.buffer_type != self.beta_cpp: diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py index 795fc532..b988c91a 100644 --- a/scripts/generator/generator/routine.py +++ b/scripts/generator/generator/routine.py @@ -13,7 +13,8 @@ import generator.convert as convert class Routine: """Class holding routine-specific information (e.g. name, which arguments, which precisions)""" def __init__(self, implemented, has_tests, level, name, template, flavours, sizes, options, - inputs, outputs, scalars, scratch, description, details, requirements): + inputs, outputs, buffer_sizes, scalars, scratch, + description, details, requirements): self.implemented = implemented self.has_tests = has_tests self.level = level @@ -24,6 +25,7 @@ class Routine: self.options = options self.inputs = inputs self.outputs = outputs + self.buffer_sizes = buffer_sizes self.scalars = scalars self.scratch = scratch # Scratch buffer (e.g. 
for xDOT) self.description = description @@ -66,19 +68,26 @@ class Routine: return ["a", "b", "c", "ap"] @staticmethod - def create_buffer(name, template, size): + def set_size(name, size): + """Sets the size of a buffer""" + return "const auto " + name + "_size = " + size + ";" + + @staticmethod + def create_buffer(name, template): """Creates a new CLCudaAPI buffer""" - return "auto " + name + "_buffer = Buffer<" + template + ">(context, " + size + ");" + return "auto " + name + "_buffer = Buffer<" + template + ">(context, " + name + "_size);" @staticmethod - def write_buffer(name, size): + def write_buffer(name, template): """Writes to a CLCudaAPI buffer""" - return name + "_buffer.Write(queue, " + size + ", " + name + ");" + data_structure = "reinterpret_cast<" + template + "*>(" + name + ")" + return name + "_buffer.Write(queue, " + name + "_size, " + data_structure + ");" @staticmethod - def read_buffer(name, size): + def read_buffer(name, template): """Reads from a CLCudaAPI buffer""" - return name + "_buffer.Read(queue, " + size + ", " + name + ");" + data_structure = "reinterpret_cast<" + template + "*>(" + name + ")" + return name + "_buffer.Read(queue, " + name + "_size, " + data_structure + ");" def non_index_inputs(self): """Lists of input/output buffers not index (integer)""" @@ -148,6 +157,15 @@ class Routine: return [", ".join(a + b + c)] return [] + def buffer_zero_offset(self, name): + """As above, but with an offset value of zero""" + if name in self.inputs or name in self.outputs: + a = [name + "_buffer()"] + b = ["0"] + c = [name + "_" + self.postfix(name)] if (name not in self.buffers_without_ld_inc()) else [] + return [", ".join(a + b + c)] + return [] + def buffer_def(self, name): """As above but with data-types""" prefix = "const " if name in self.inputs else "" @@ -263,6 +281,12 @@ class Routine: return [name] return [] + def scalar_cpp(self, name): + """As above, but with _cpp as a suffix""" + if name in self.scalars: + return [name + 
"_cpp"] + return [] + def scalar_half_to_float(self, name): """As above, but converts from float to half""" if name in self.scalars: @@ -339,6 +363,16 @@ class Routine: return ["`const " + self.template.beta_cpp + " " + name + "`: Input scalar constant."] return [] + def scalar_create_cpp(self, flavour): + """Creates a C++ version of a scalar based on a void*""" + result = [] + for name in self.scalars: + if name == "alpha": + result.append("const auto alpha_cpp = " + flavour.use_alpha_clblast() + ";") + elif name == "beta": + result.append("const auto beta_cpp = " + flavour.use_beta_clblast() + ";") + return result + def sizes_list(self): """Retrieves a list of comma-separated sizes (m, n, k)""" if self.sizes: @@ -469,6 +503,17 @@ class Routine: list(chain(*[self.buffer(b) for b in self.scalar_buffers_second()])) + list(chain(*[self.scalar_use(s, flavour) for s in self.other_scalars()]))) + def arguments_netlib(self, flavour, indent): + """As above, but for the Netlib CBLAS API""" + return (self.options_cast(indent) + self.sizes_list() + + list(chain(*[self.buffer_zero_offset(b) for b in self.scalar_buffers_first()])) + + self.scalar_cpp("alpha") + + list(chain(*[self.buffer_zero_offset(b) for b in self.buffers_first()])) + + self.scalar_cpp("beta") + + list(chain(*[self.buffer_zero_offset(b) for b in self.buffers_second()])) + + list(chain(*[self.buffer_zero_offset(b) for b in self.scalar_buffers_second()])) + + list(chain(*[self.scalar(s) for s in self.other_scalars()]))) + def arguments_wrapper_clblas(self, flavour): """As above, but for the clBLAS wrapper""" return (self.options_list() + self.sizes_list() + diff --git a/src/clblast_blas.cpp b/src/clblast_blas.cpp new file mode 100644 index 00000000..286b1ba8 --- /dev/null +++ b/src/clblast_blas.cpp @@ -0,0 +1,4651 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. 
The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains the Netlib CBLAS implementations to the CLBlast BLAS routines, performing buffer +// copies automatically and running on the default OpenCL platform and device. For full control over +// performance, it is advised to use the regular clblast.h or clblast_c.h headers instead. +// +// ================================================================================================= + +#include + +#include "clblast_blas.h" +#include "clblast.h" +#include "utilities/utilities.hpp" + +namespace clblast { + +// ================================================================================================= + +// Helper function to get a default OpenCL platform and device +Device get_device() { + auto platform_id = ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0}); + auto device_id = ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0}); + auto platform = Platform(platform_id); + return Device(platform, device_id); +} + +// ================================================================================================= +// BLAS level-1 (vector-vector) routines +// ================================================================================================= + +// ROTG +void cblas_srotg(float* sa, + float* sb, + float* sc, + float* ss) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto sa_size = 1; + auto sa_buffer = Buffer(context, sa_size); + const auto sb_size = 1; + auto sb_buffer = Buffer(context, sb_size); + const auto sc_size = 1; + auto sc_buffer = Buffer(context, sc_size); + const auto ss_size = 1; + auto ss_buffer = Buffer(context, ss_size); + sa_buffer.Write(queue, sa_size, reinterpret_cast(sa)); + sb_buffer.Write(queue, 
sb_size, reinterpret_cast(sb)); + sc_buffer.Write(queue, sc_size, reinterpret_cast(sc)); + ss_buffer.Write(queue, ss_size, reinterpret_cast(ss)); + auto queue_cl = queue(); + auto s = Rotg(sa_buffer(), 0, + sb_buffer(), 0, + sc_buffer(), 0, + ss_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + sa_buffer.Read(queue, sa_size, reinterpret_cast(sa)); + sb_buffer.Read(queue, sb_size, reinterpret_cast(sb)); + sc_buffer.Read(queue, sc_size, reinterpret_cast(sc)); + ss_buffer.Read(queue, ss_size, reinterpret_cast(ss)); +} +void cblas_drotg(double* sa, + double* sb, + double* sc, + double* ss) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto sa_size = 1; + auto sa_buffer = Buffer(context, sa_size); + const auto sb_size = 1; + auto sb_buffer = Buffer(context, sb_size); + const auto sc_size = 1; + auto sc_buffer = Buffer(context, sc_size); + const auto ss_size = 1; + auto ss_buffer = Buffer(context, ss_size); + sa_buffer.Write(queue, sa_size, reinterpret_cast(sa)); + sb_buffer.Write(queue, sb_size, reinterpret_cast(sb)); + sc_buffer.Write(queue, sc_size, reinterpret_cast(sc)); + ss_buffer.Write(queue, ss_size, reinterpret_cast(ss)); + auto queue_cl = queue(); + auto s = Rotg(sa_buffer(), 0, + sb_buffer(), 0, + sc_buffer(), 0, + ss_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + sa_buffer.Read(queue, sa_size, reinterpret_cast(sa)); + sb_buffer.Read(queue, sb_size, reinterpret_cast(sb)); + sc_buffer.Read(queue, sc_size, reinterpret_cast(sc)); + ss_buffer.Read(queue, ss_size, reinterpret_cast(ss)); +} + +// ROTMG +void cblas_srotmg(float* sd1, + float* sd2, + float* sx1, + const float* sy1, + float* sparam) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const 
auto sy1_size = 1; + auto sy1_buffer = Buffer(context, sy1_size); + const auto sd1_size = 1; + auto sd1_buffer = Buffer(context, sd1_size); + const auto sd2_size = 1; + auto sd2_buffer = Buffer(context, sd2_size); + const auto sx1_size = 1; + auto sx1_buffer = Buffer(context, sx1_size); + const auto sparam_size = 1; + auto sparam_buffer = Buffer(context, sparam_size); + sy1_buffer.Write(queue, sy1_size, reinterpret_cast(sy1)); + sd1_buffer.Write(queue, sd1_size, reinterpret_cast(sd1)); + sd2_buffer.Write(queue, sd2_size, reinterpret_cast(sd2)); + sx1_buffer.Write(queue, sx1_size, reinterpret_cast(sx1)); + sparam_buffer.Write(queue, sparam_size, reinterpret_cast(sparam)); + auto queue_cl = queue(); + auto s = Rotmg(sd1_buffer(), 0, + sd2_buffer(), 0, + sx1_buffer(), 0, + sy1_buffer(), 0, + sparam_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + sd1_buffer.Read(queue, sd1_size, reinterpret_cast(sd1)); + sd2_buffer.Read(queue, sd2_size, reinterpret_cast(sd2)); + sx1_buffer.Read(queue, sx1_size, reinterpret_cast(sx1)); + sparam_buffer.Read(queue, sparam_size, reinterpret_cast(sparam)); +} +void cblas_drotmg(double* sd1, + double* sd2, + double* sx1, + const double* sy1, + double* sparam) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto sy1_size = 1; + auto sy1_buffer = Buffer(context, sy1_size); + const auto sd1_size = 1; + auto sd1_buffer = Buffer(context, sd1_size); + const auto sd2_size = 1; + auto sd2_buffer = Buffer(context, sd2_size); + const auto sx1_size = 1; + auto sx1_buffer = Buffer(context, sx1_size); + const auto sparam_size = 1; + auto sparam_buffer = Buffer(context, sparam_size); + sy1_buffer.Write(queue, sy1_size, reinterpret_cast(sy1)); + sd1_buffer.Write(queue, sd1_size, reinterpret_cast(sd1)); + sd2_buffer.Write(queue, sd2_size, reinterpret_cast(sd2)); + sx1_buffer.Write(queue, 
sx1_size, reinterpret_cast(sx1)); + sparam_buffer.Write(queue, sparam_size, reinterpret_cast(sparam)); + auto queue_cl = queue(); + auto s = Rotmg(sd1_buffer(), 0, + sd2_buffer(), 0, + sx1_buffer(), 0, + sy1_buffer(), 0, + sparam_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + sd1_buffer.Read(queue, sd1_size, reinterpret_cast(sd1)); + sd2_buffer.Read(queue, sd2_size, reinterpret_cast(sd2)); + sx1_buffer.Read(queue, sx1_size, reinterpret_cast(sx1)); + sparam_buffer.Read(queue, sparam_size, reinterpret_cast(sparam)); +} + +// ROT +void cblas_srot(const int n, + float* x, const int x_inc, + float* y, const int y_inc, + const float cos, + const float sin) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Rot(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + cos, + sin, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_drot(const int n, + double* x, const int x_inc, + double* y, const int y_inc, + const double cos, + const double sin) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Rot(n, + x_buffer(), 0, 
x_inc, + y_buffer(), 0, y_inc, + cos, + sin, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + +// ROTM +void cblas_srotm(const int n, + float* x, const int x_inc, + float* y, const int y_inc, + float* sparam) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto sparam_size = 1; + auto sparam_buffer = Buffer(context, sparam_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + sparam_buffer.Write(queue, sparam_size, reinterpret_cast(sparam)); + auto queue_cl = queue(); + auto s = Rotm(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + sparam_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); + y_buffer.Read(queue, y_size, reinterpret_cast(y)); + sparam_buffer.Read(queue, sparam_size, reinterpret_cast(sparam)); +} +void cblas_drotm(const int n, + double* x, const int x_inc, + double* y, const int y_inc, + double* sparam) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto sparam_size = 1; + auto sparam_buffer = Buffer(context, sparam_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + sparam_buffer.Write(queue, sparam_size, reinterpret_cast(sparam)); + auto queue_cl = queue(); + auto s = Rotm(n, + x_buffer(), 
0, x_inc, + y_buffer(), 0, y_inc, + sparam_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); + y_buffer.Read(queue, y_size, reinterpret_cast(y)); + sparam_buffer.Read(queue, sparam_size, reinterpret_cast(sparam)); +} + +// SWAP +void cblas_sswap(const int n, + float* x, const int x_inc, + float* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Swap(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_dswap(const int n, + double* x, const int x_inc, + double* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Swap(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_cswap(const int n, + void* x, const int x_inc, + void* y, const 
int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Swap(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_zswap(const int n, + void* x, const int x_inc, + void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Swap(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + +// SCAL +void cblas_sscal(const int n, + const float alpha, + float* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Scal(n, + alpha_cpp, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with 
error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_dscal(const int n, + const double alpha, + double* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Scal(n, + alpha_cpp, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_cscal(const int n, + const void* alpha, + void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Scal(n, + alpha_cpp, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_zscal(const int n, + const void* alpha, + void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Scal(n, + alpha_cpp, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + 
x_buffer.Read(queue, x_size, reinterpret_cast(x));
+}
+
+// COPY: uploads x and y, runs Copy on the device and downloads y. A non-success status is reported
+// by throwing std::runtime_error.
+void cblas_scopy(const int n,
+                 const float* x, const int x_inc,
+                 float* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto x_size = n * x_inc;  // n elements spaced x_inc apart
+  auto x_buffer = Buffer(context, x_size);
+  const auto y_size = n * y_inc;  // n elements spaced y_inc apart
+  auto y_buffer = Buffer(context, y_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast(y));
+  auto queue_cl = queue();
+  auto s = Copy(n,
+                x_buffer(), 0, x_inc,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast(y));
+}
+void cblas_dcopy(const int n,
+                 const double* x, const int x_inc,
+                 double* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto x_size = n * x_inc;  // n elements spaced x_inc apart
+  auto x_buffer = Buffer(context, x_size);
+  const auto y_size = n * y_inc;  // n elements spaced y_inc apart
+  auto y_buffer = Buffer(context, y_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast(y));
+  auto queue_cl = queue();
+  auto s = Copy(n,
+                x_buffer(), 0, x_inc,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  y_buffer.Read(queue, y_size, reinterpret_cast(y));
+}
+void cblas_ccopy(const int n,
+                 const void* x, const int x_inc,
+                 void* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto x_size = n * x_inc;  // n elements spaced x_inc apart
+  auto x_buffer = Buffer(context, x_size);
+  const auto y_size = n * y_inc;  // n elements spaced y_inc apart
+  auto y_buffer = Buffer(context, y_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast(y));
+  auto queue_cl = queue();
+  auto s =
Copy(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_zcopy(const int n, + const void* x, const int x_inc, + void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Copy(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + +// AXPY +void cblas_saxpy(const int n, + const float alpha, + const float* x, const int x_inc, + float* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Axpy(n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_daxpy(const int n, + const double alpha, + const double* x, const int x_inc, + double* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + 
const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Axpy(n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_caxpy(const int n, + const void* alpha, + const void* x, const int x_inc, + void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Axpy(n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_zaxpy(const int n, + const void* alpha, + const void* x, const int x_inc, + void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + 
auto s = Axpy(n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + +// DOT +void cblas_sdot(const int n, + float* dot, + const float* x, const int x_inc, + const float* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto dot_size = 1; + auto dot_buffer = Buffer(context, dot_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); + auto queue_cl = queue(); + auto s = Dot(n, + dot_buffer(), 0, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); +} +void cblas_ddot(const int n, + double* dot, + const double* x, const int x_inc, + const double* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto dot_size = 1; + auto dot_buffer = Buffer(context, dot_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); + auto queue_cl = queue(); + auto s = Dot(n, + dot_buffer(), 0, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error 
code " + ToString(s)); + } + dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); +} + +// DOTU +void cblas_cdotu(const int n, + void* dot, + const void* x, const int x_inc, + const void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto dot_size = 1; + auto dot_buffer = Buffer(context, dot_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); + auto queue_cl = queue(); + auto s = Dotu(n, + dot_buffer(), 0, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); +} +void cblas_zdotu(const int n, + void* dot, + const void* x, const int x_inc, + const void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto dot_size = 1; + auto dot_buffer = Buffer(context, dot_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); + auto queue_cl = queue(); + auto s = Dotu(n, + dot_buffer(), 0, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); +} + +// DOTC +void cblas_cdotc(const int n, + void* dot, + const void* x, const int x_inc, + 
const void* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto x_size = n * x_inc;  // n elements spaced x_inc apart
+  auto x_buffer = Buffer(context, x_size);
+  const auto y_size = n * y_inc;  // n elements spaced y_inc apart
+  auto y_buffer = Buffer(context, y_size);
+  const auto dot_size = 1;
+  auto dot_buffer = Buffer(context, dot_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast(y));
+  dot_buffer.Write(queue, dot_size, reinterpret_cast(dot));
+  auto queue_cl = queue();
+  auto s = Dotc(n,
+                dot_buffer(), 0,
+                x_buffer(), 0, x_inc,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  dot_buffer.Read(queue, dot_size, reinterpret_cast(dot));
+}
+void cblas_zdotc(const int n,
+                 void* dot,
+                 const void* x, const int x_inc,
+                 const void* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto x_size = n * x_inc;  // n elements spaced x_inc apart
+  auto x_buffer = Buffer(context, x_size);
+  const auto y_size = n * y_inc;  // n elements spaced y_inc apart
+  auto y_buffer = Buffer(context, y_size);
+  const auto dot_size = 1;
+  auto dot_buffer = Buffer(context, dot_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  y_buffer.Write(queue, y_size, reinterpret_cast(y));
+  dot_buffer.Write(queue, dot_size, reinterpret_cast(dot));
+  auto queue_cl = queue();
+  auto s = Dotc(n,
+                dot_buffer(), 0,
+                x_buffer(), 0, x_inc,
+                y_buffer(), 0, y_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  dot_buffer.Read(queue, dot_size, reinterpret_cast(dot));
+}
+
+// NRM2: uploads x and the one-element result slot, runs Nrm2 on the device and downloads the
+// result into nrm2. A non-success status is reported by throwing std::runtime_error.
+void cblas_snrm2(const int n,
+                 float* nrm2,
+                 const float* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto x_size = n * x_inc;  // n elements spaced x_inc apart
+  auto x_buffer = Buffer(context, x_size);
+  const
auto nrm2_size = 1; + auto nrm2_buffer = Buffer(context, nrm2_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); + auto queue_cl = queue(); + auto s = Nrm2(n, + nrm2_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2)); +} +void cblas_dnrm2(const int n, + double* nrm2, + const double* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto nrm2_size = 1; + auto nrm2_buffer = Buffer(context, nrm2_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); + auto queue_cl = queue(); + auto s = Nrm2(n, + nrm2_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2)); +} +void cblas_scnrm2(const int n, + void* nrm2, + const void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto nrm2_size = 1; + auto nrm2_buffer = Buffer(context, nrm2_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); + auto queue_cl = queue(); + auto s = Nrm2(n, + nrm2_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2)); +} +void cblas_dznrm2(const int n, + void* nrm2, + const 
void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto x_size = n * x_inc;  // n elements spaced x_inc apart
+  auto x_buffer = Buffer(context, x_size);
+  const auto nrm2_size = 1;
+  auto nrm2_buffer = Buffer(context, nrm2_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2));
+  auto queue_cl = queue();
+  auto s = Nrm2(n,
+                nrm2_buffer(), 0,
+                x_buffer(), 0, x_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2));
+}
+
+// ASUM: uploads x and the one-element result slot, runs Asum on the device and downloads the
+// result into asum. A non-success status is reported by throwing std::runtime_error.
+void cblas_sasum(const int n,
+                 float* asum,
+                 const float* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto x_size = n * x_inc;  // n elements spaced x_inc apart
+  auto x_buffer = Buffer(context, x_size);
+  const auto asum_size = 1;
+  auto asum_buffer = Buffer(context, asum_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  asum_buffer.Write(queue, asum_size, reinterpret_cast(asum));
+  auto queue_cl = queue();
+  auto s = Asum(n,
+                asum_buffer(), 0,
+                x_buffer(), 0, x_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  asum_buffer.Read(queue, asum_size, reinterpret_cast(asum));
+}
+void cblas_dasum(const int n,
+                 double* asum,
+                 const double* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto x_size = n * x_inc;  // n elements spaced x_inc apart
+  auto x_buffer = Buffer(context, x_size);
+  const auto asum_size = 1;
+  auto asum_buffer = Buffer(context, asum_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  asum_buffer.Write(queue, asum_size, reinterpret_cast(asum));
+  auto queue_cl = queue();
+  auto s = Asum(n,
+                asum_buffer(), 0,
+                x_buffer(), 0, x_inc,
+                &queue_cl);
+  if (s !=
StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  asum_buffer.Read(queue, asum_size, reinterpret_cast(asum));
+}
+void cblas_scasum(const int n,
+                  void* asum,
+                  const void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto x_size = n * x_inc;  // n elements spaced x_inc apart
+  auto x_buffer = Buffer(context, x_size);
+  const auto asum_size = 1;
+  auto asum_buffer = Buffer(context, asum_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  asum_buffer.Write(queue, asum_size, reinterpret_cast(asum));
+  auto queue_cl = queue();
+  auto s = Asum(n,
+                asum_buffer(), 0,
+                x_buffer(), 0, x_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  asum_buffer.Read(queue, asum_size, reinterpret_cast(asum));
+}
+void cblas_dzasum(const int n,
+                  void* asum,
+                  const void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto x_size = n * x_inc;  // n elements spaced x_inc apart
+  auto x_buffer = Buffer(context, x_size);
+  const auto asum_size = 1;
+  auto asum_buffer = Buffer(context, asum_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  asum_buffer.Write(queue, asum_size, reinterpret_cast(asum));
+  auto queue_cl = queue();
+  auto s = Asum(n,
+                asum_buffer(), 0,
+                x_buffer(), 0, x_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  asum_buffer.Read(queue, asum_size, reinterpret_cast(asum));
+}
+
+// SUM: uploads x and the one-element result slot, runs Sum on the device and downloads the result
+// into sum. A non-success status is reported by throwing std::runtime_error.
+void cblas_ssum(const int n,
+                float* sum,
+                const float* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto x_size = n * x_inc;  // n elements spaced x_inc apart
+  auto x_buffer = Buffer(context, x_size);
+  const auto sum_size = 1;
+  auto sum_buffer = Buffer(context, sum_size);
+  x_buffer.Write(queue,
x_size, reinterpret_cast(x)); + sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); + auto queue_cl = queue(); + auto s = Sum(n, + sum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + sum_buffer.Read(queue, sum_size, reinterpret_cast(sum)); +} +void cblas_dsum(const int n, + double* sum, + const double* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto sum_size = 1; + auto sum_buffer = Buffer(context, sum_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); + auto queue_cl = queue(); + auto s = Sum(n, + sum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + sum_buffer.Read(queue, sum_size, reinterpret_cast(sum)); +} +void cblas_scsum(const int n, + void* sum, + const void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto sum_size = 1; + auto sum_buffer = Buffer(context, sum_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); + auto queue_cl = queue(); + auto s = Sum(n, + sum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + sum_buffer.Read(queue, sum_size, reinterpret_cast(sum)); +} +void cblas_dzsum(const int n, + void* sum, + const void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); 
+ const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto sum_size = 1; + auto sum_buffer = Buffer(context, sum_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); + auto queue_cl = queue(); + auto s = Sum(n, + sum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + sum_buffer.Read(queue, sum_size, reinterpret_cast(sum)); +} + +// AMAX +void cblas_isamax(const int n, + float* imax, + const float* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto imax_size = 1; + auto imax_buffer = Buffer(context, imax_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); + auto queue_cl = queue(); + auto s = Amax(n, + imax_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); +} +void cblas_idamax(const int n, + double* imax, + const double* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto imax_size = 1; + auto imax_buffer = Buffer(context, imax_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); + auto queue_cl = queue(); + auto s = Amax(n, + imax_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + imax_buffer.Read(queue, imax_size, 
reinterpret_cast(imax));
+}
+void cblas_icamax(const int n,
+                  void* imax,
+                  const void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto x_size = n * x_inc;  // n elements spaced x_inc apart
+  auto x_buffer = Buffer(context, x_size);
+  const auto imax_size = 1;
+  auto imax_buffer = Buffer(context, imax_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  imax_buffer.Write(queue, imax_size, reinterpret_cast(imax));
+  auto queue_cl = queue();
+  auto s = Amax(n,
+                imax_buffer(), 0,
+                x_buffer(), 0, x_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  imax_buffer.Read(queue, imax_size, reinterpret_cast(imax));
+}
+void cblas_izamax(const int n,
+                  void* imax,
+                  const void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto x_size = n * x_inc;  // n elements spaced x_inc apart
+  auto x_buffer = Buffer(context, x_size);
+  const auto imax_size = 1;
+  auto imax_buffer = Buffer(context, imax_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  imax_buffer.Write(queue, imax_size, reinterpret_cast(imax));
+  auto queue_cl = queue();
+  auto s = Amax(n,
+                imax_buffer(), 0,
+                x_buffer(), 0, x_inc,
+                &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  imax_buffer.Read(queue, imax_size, reinterpret_cast(imax));
+}
+
+// MAX: uploads x and the one-element result slot, runs Max on the device and downloads the result
+// into imax. A non-success status is reported by throwing std::runtime_error.
+void cblas_ismax(const int n,
+                 float* imax,
+                 const float* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto x_size = n * x_inc;  // n elements spaced x_inc apart
+  auto x_buffer = Buffer(context, x_size);
+  const auto imax_size = 1;
+  auto imax_buffer = Buffer(context, imax_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  imax_buffer.Write(queue, imax_size, reinterpret_cast(imax));
+  auto queue_cl = queue();
+  auto s = Max(n,
+ imax_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); +} +void cblas_idmax(const int n, + double* imax, + const double* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto imax_size = 1; + auto imax_buffer = Buffer(context, imax_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); + auto queue_cl = queue(); + auto s = Max(n, + imax_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); +} +void cblas_icmax(const int n, + void* imax, + const void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto imax_size = 1; + auto imax_buffer = Buffer(context, imax_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); + auto queue_cl = queue(); + auto s = Max(n, + imax_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); +} +void cblas_izmax(const int n, + void* imax, + const void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto imax_size = 1; + auto 
imax_buffer = Buffer(context, imax_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); + auto queue_cl = queue(); + auto s = Max(n, + imax_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); +} + +// MIN +void cblas_ismin(const int n, + float* imin, + const float* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto imin_size = 1; + auto imin_buffer = Buffer(context, imin_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + imin_buffer.Write(queue, imin_size, reinterpret_cast(imin)); + auto queue_cl = queue(); + auto s = Min(n, + imin_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); +} +void cblas_idmin(const int n, + double* imin, + const double* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto imin_size = 1; + auto imin_buffer = Buffer(context, imin_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + imin_buffer.Write(queue, imin_size, reinterpret_cast(imin)); + auto queue_cl = queue(); + auto s = Min(n, + imin_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); +} +void cblas_icmin(const int n, + void* imin, + const void* x, const int 
x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto x_size = n * x_inc;  // n elements spaced x_inc apart
+  auto x_buffer = Buffer(context, x_size);
+  const auto imin_size = 1;
+  auto imin_buffer = Buffer(context, imin_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  imin_buffer.Write(queue, imin_size, reinterpret_cast(imin));
+  auto queue_cl = queue();
+  auto s = Min(n,
+               imin_buffer(), 0,
+               x_buffer(), 0, x_inc,
+               &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  imin_buffer.Read(queue, imin_size, reinterpret_cast(imin));
+}
+void cblas_izmin(const int n,
+                 void* imin,
+                 const void* x, const int x_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const auto x_size = n * x_inc;  // n elements spaced x_inc apart
+  auto x_buffer = Buffer(context, x_size);
+  const auto imin_size = 1;
+  auto imin_buffer = Buffer(context, imin_size);
+  x_buffer.Write(queue, x_size, reinterpret_cast(x));
+  imin_buffer.Write(queue, imin_size, reinterpret_cast(imin));
+  auto queue_cl = queue();
+  auto s = Min(n,
+               imin_buffer(), 0,
+               x_buffer(), 0, x_inc,
+               &queue_cl);
+  if (s != StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + ToString(s));
+  }
+  imin_buffer.Read(queue, imin_size, reinterpret_cast(imin));
+}
+
+// =================================================================================================
+// BLAS level-2 (matrix-vector) routines
+// =================================================================================================
+
+// GEMV
+void cblas_sgemv(const Layout layout, const Transpose a_transpose,
+                 const int m, const int n,
+                 const float alpha,
+                 const float* a, const int a_ld,
+                 const float* x, const int x_inc,
+                 const float beta,
+                 float* y, const int y_inc) {
+  auto device = get_device();
+  auto context = Context(device);
+  auto queue = Queue(context, device);
+  const
auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Gemv(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_dgemv(const Layout layout, const Transpose a_transpose, + const int m, const int n, + const double alpha, + const double* a, const int a_ld, + const double* x, const int x_inc, + const double beta, + double* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Gemv(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_cgemv(const Layout layout, 
const Transpose a_transpose, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Gemv(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_zgemv(const Layout layout, const Transpose a_transpose, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + 
x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Gemv(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + +// GBMV +void cblas_sgbmv(const Layout layout, const Transpose a_transpose, + const int m, const int n, const int kl, const int ku, + const float alpha, + const float* a, const int a_ld, + const float* x, const int x_inc, + const float beta, + float* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Gbmv(static_cast(layout), + static_cast(a_transpose), + m, n, kl, ku, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_dgbmv(const Layout layout, const Transpose a_transpose, + const int m, const int n, const int kl, const int ku, + const double alpha, + const double* a, const int a_ld, + const double* x, const int x_inc, + const double beta, + double* y, const int y_inc) { + auto device = get_device(); + auto 
context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Gbmv(static_cast(layout), + static_cast(a_transpose), + m, n, kl, ku, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_cgbmv(const Layout layout, const Transpose a_transpose, + const int m, const int n, const int kl, const int ku, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Gbmv(static_cast(layout), + static_cast(a_transpose), + m, n, kl, ku, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + 
&queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_zgbmv(const Layout layout, const Transpose a_transpose, + const int m, const int n, const int kl, const int ku, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Gbmv(static_cast(layout), + static_cast(a_transpose), + m, n, kl, ku, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + +// HEMV +void cblas_chemv(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const 
auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Hemv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_zhemv(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Hemv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + +// HBMV +void 
cblas_chbmv(const Layout layout, const Triangle triangle, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Hbmv(static_cast(layout), + static_cast(triangle), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_zhbmv(const Layout layout, const Triangle triangle, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + a_buffer.Write(queue, a_size, 
reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Hbmv(static_cast(layout), + static_cast(triangle), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + +// HPMV +void cblas_chpmv(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const void* ap, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Hpmv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_zhpmv(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const void* ap, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc) { + auto device = get_device(); + auto context = 
Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Hpmv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + +// SYMV +void cblas_ssymv(const Layout layout, const Triangle triangle, + const int n, + const float alpha, + const float* a, const int a_ld, + const float* x, const int x_inc, + const float beta, + float* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Symv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw 
std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_dsymv(const Layout layout, const Triangle triangle, + const int n, + const double alpha, + const double* a, const int a_ld, + const double* x, const int x_inc, + const double beta, + double* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Symv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + +// SBMV +void cblas_ssbmv(const Layout layout, const Triangle triangle, + const int n, const int k, + const float alpha, + const float* a, const int a_ld, + const float* x, const int x_inc, + const float beta, + float* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + 
y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Sbmv(static_cast(layout), + static_cast(triangle), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_dsbmv(const Layout layout, const Triangle triangle, + const int n, const int k, + const double alpha, + const double* a, const int a_ld, + const double* x, const int x_inc, + const double beta, + double* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Sbmv(static_cast(layout), + static_cast(triangle), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + +// SPMV +void cblas_sspmv(const Layout layout, const Triangle triangle, + const int n, + const float alpha, + const float* ap, + const float* x, const int x_inc, + const float beta, + float* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto ap_size = n; + auto 
ap_buffer = Buffer(context, ap_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Spmv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_dspmv(const Layout layout, const Triangle triangle, + const int n, + const double alpha, + const double* ap, + const double* x, const int x_inc, + const double beta, + double* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Spmv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + +// TRMV +void cblas_strmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const float* a, const 
int a_ld, + float* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Trmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_dtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const double* a, const int a_ld, + double* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Trmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ctrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const void* a, const int a_ld, + void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + 
const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Trmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ztrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const void* a, const int a_ld, + void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Trmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} + +// TBMV +void cblas_stbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, const int k, + const float* a, const int a_ld, + float* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = 
Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tbmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_dtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, const int k, + const double* a, const int a_ld, + double* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tbmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ctbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, const int k, + const void* a, const int a_ld, + void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, 
reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tbmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ztbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, const int k, + const void* a, const int a_ld, + void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tbmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} + +// TPMV +void cblas_stpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const float* ap, + float* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tpmv(static_cast(layout), + static_cast(triangle), + 
static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_dtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const double* ap, + double* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tpmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ctpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const void* ap, + void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tpmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw 
std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ztpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const void* ap, + void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tpmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} + +// TRSV +void cblas_strsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const float* a, const int a_ld, + float* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Trsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void 
cblas_dtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const double* a, const int a_ld, + double* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Trsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ctrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const void* a, const int a_ld, + void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Trsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ztrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const void* a, 
const int a_ld, + void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Trsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} + +// TBSV +void cblas_stbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, const int k, + const float* a, const int a_ld, + float* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tbsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_dtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, const int k, + const double* a, const int a_ld, + double* x, const int x_inc) { + auto device = get_device(); + auto context = 
Context(device); + auto queue = Queue(context, device); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tbsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ctbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, const int k, + const void* a, const int a_ld, + void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tbsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ztbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, const int k, + const void* a, const int a_ld, + void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto a_size = n; + auto a_buffer = Buffer(context, 
a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tbsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} + +// TPSV +void cblas_stpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const float* ap, + float* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tpsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_dtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const double* ap, + double* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + 
x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tpsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ctpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const void* ap, + void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tpsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ztpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const void* ap, + void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tpsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + 
static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} + +// GER +void cblas_sger(const Layout layout, + const int m, const int n, + const float alpha, + const float* x, const int x_inc, + const float* y, const int y_inc, + float* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = Ger(static_cast(layout), + m, n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} +void cblas_dger(const Layout layout, + const int m, const int n, + const double alpha, + const double* x, const int x_inc, + const double* y, const int y_inc, + double* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto 
queue_cl = queue(); + auto s = Ger(static_cast(layout), + m, n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} + +// GERU +void cblas_cgeru(const Layout layout, + const int m, const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = Geru(static_cast(layout), + m, n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} +void cblas_zgeru(const Layout layout, + const int m, const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto a_size = n; + auto a_buffer = 
Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = Geru(static_cast(layout), + m, n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} + +// GERC +void cblas_cgerc(const Layout layout, + const int m, const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = Gerc(static_cast(layout), + m, n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} +void cblas_zgerc(const Layout layout, + const int m, const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], 
reinterpret_cast(alpha)[1]}; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = Gerc(static_cast(layout), + m, n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} + +// HER +void cblas_cher(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const void* x, const int x_inc, + void* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = Her(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} +void cblas_zher(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const void* x, const int x_inc, + void* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + 
const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = Her(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} + +// HPR +void cblas_chpr(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const void* x, const int x_inc, + void* ap) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + auto queue_cl = queue(); + auto s = Hpr(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + ap_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); +} +void cblas_zhpr(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const void* x, const int x_inc, + void* ap) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + auto queue_cl = queue(); + auto s = Hpr(static_cast(layout), + 
static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + ap_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); +} + +// HER2 +void cblas_cher2(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = Her2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} +void cblas_zher2(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); 
+ x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = Her2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} + +// HPR2 +void cblas_chpr2(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* ap) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + auto queue_cl = queue(); + auto s = Hpr2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + ap_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); +} +void cblas_zhpr2(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* ap) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], 
reinterpret_cast(alpha)[1]}; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + auto queue_cl = queue(); + auto s = Hpr2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + ap_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); +} + +// SYR +void cblas_ssyr(const Layout layout, const Triangle triangle, + const int n, + const float alpha, + const float* x, const int x_inc, + float* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = Syr(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} +void cblas_dsyr(const Layout layout, const Triangle triangle, + const int n, + const double alpha, + const double* x, const int x_inc, + double* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto 
x_buffer = Buffer(context, x_size); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = Syr(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} + +// SPR +void cblas_sspr(const Layout layout, const Triangle triangle, + const int n, + const float alpha, + const float* x, const int x_inc, + float* ap) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + auto queue_cl = queue(); + auto s = Spr(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + ap_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); +} +void cblas_dspr(const Layout layout, const Triangle triangle, + const int n, + const double alpha, + const double* x, const int x_inc, + double* ap) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + auto queue_cl = queue(); 
+ auto s = Spr(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + ap_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); +} + +// SYR2 +void cblas_ssyr2(const Layout layout, const Triangle triangle, + const int n, + const float alpha, + const float* x, const int x_inc, + const float* y, const int y_inc, + float* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = Syr2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} +void cblas_dsyr2(const Layout layout, const Triangle triangle, + const int n, + const double alpha, + const double* x, const int x_inc, + const double* y, const int y_inc, + double* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + 
y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = Syr2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} + +// SPR2 +void cblas_sspr2(const Layout layout, const Triangle triangle, + const int n, + const float alpha, + const float* x, const int x_inc, + const float* y, const int y_inc, + float* ap) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + auto queue_cl = queue(); + auto s = Spr2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + ap_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); +} +void cblas_dspr2(const Layout layout, const Triangle triangle, + const int n, + const double alpha, + const double* x, const int x_inc, + const double* y, const int y_inc, + double* ap) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); 
+ const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + auto queue_cl = queue(); + auto s = Spr2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + ap_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); +} + +// ================================================================================================= +// BLAS level-3 (matrix-matrix) routines +// ================================================================================================= + +// GEMM +void cblas_sgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const int m, const int n, const int k, + const float alpha, + const float* a, const int a_ld, + const float* b, const int b_ld, + const float beta, + float* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Gemm(static_cast(layout), + static_cast(a_transpose), + static_cast(b_transpose), + m, n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error 
code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_dgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const int m, const int n, const int k, + const double alpha, + const double* a, const int a_ld, + const double* b, const int b_ld, + const double beta, + double* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Gemm(static_cast(layout), + static_cast(a_transpose), + static_cast(b_transpose), + m, n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_cgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const int m, const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + 
const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Gemm(static_cast(layout), + static_cast(a_transpose), + static_cast(b_transpose), + m, n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_zgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const int m, const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Gemm(static_cast(layout), + static_cast(a_transpose), + static_cast(b_transpose), + m, n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} + +// SYMM +void 
cblas_ssymm(const Layout layout, const Side side, const Triangle triangle, + const int m, const int n, + const float alpha, + const float* a, const int a_ld, + const float* b, const int b_ld, + const float beta, + float* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Symm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_dsymm(const Layout layout, const Side side, const Triangle triangle, + const int m, const int n, + const double alpha, + const double* a, const int a_ld, + const double* b, const int b_ld, + const double beta, + double* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = 
Symm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_csymm(const Layout layout, const Side side, const Triangle triangle, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Symm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_zsymm(const Layout layout, const Side side, const Triangle triangle, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); 
+ const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Symm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} + +// HEMM +void cblas_chemm(const Layout layout, const Side side, const Triangle triangle, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Hemm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + 
b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_zhemm(const Layout layout, const Side side, const Triangle triangle, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Hemm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} + +// SYRK +void cblas_ssyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const int n, const int k, + const float alpha, + const float* a, const int a_ld, + const float beta, + float* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, 
a_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Syrk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_dsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const int n, const int k, + const double alpha, + const double* a, const int a_ld, + const double beta, + double* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Syrk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_csyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], 
reinterpret_cast(alpha)[1]}; + const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Syrk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_zsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Syrk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} + +// HERK +void cblas_cherk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const int n, const 
int k, + const void* alpha, + const void* a, const int a_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Herk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_zherk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Herk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} + +// SYR2K +void cblas_ssyr2k(const Layout layout, const Triangle triangle, 
const Transpose ab_transpose, + const int n, const int k, + const float alpha, + const float* a, const int a_ld, + const float* b, const int b_ld, + const float beta, + float* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Syr2k(static_cast(layout), + static_cast(triangle), + static_cast(ab_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_dsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const int n, const int k, + const double alpha, + const double* a, const int a_ld, + const double* b, const int b_ld, + const double beta, + double* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Syr2k(static_cast(layout), 
+ static_cast(triangle), + static_cast(ab_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_csyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Syr2k(static_cast(layout), + static_cast(triangle), + static_cast(ab_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_zsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = 
Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Syr2k(static_cast(layout), + static_cast(triangle), + static_cast(ab_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} + +// HER2K +void cblas_cher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Her2k(static_cast(layout), + static_cast(triangle), + static_cast(ab_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + 
b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_zher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Her2k(static_cast(layout), + static_cast(triangle), + static_cast(ab_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} + +// TRMM +void cblas_strmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int m, const int n, + const float alpha, + const float* a, const int a_ld, + float* b, const int b_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto 
b_buffer = Buffer(context, b_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + auto queue_cl = queue(); + auto s = Trmm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + b_buffer.Read(queue, b_size, reinterpret_cast(b)); +} +void cblas_dtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int m, const int n, + const double alpha, + const double* a, const int a_ld, + double* b, const int b_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + auto queue_cl = queue(); + auto s = Trmm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + b_buffer.Read(queue, b_size, reinterpret_cast(b)); +} +void cblas_ctrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + void* b, const int b_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = 
float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + auto queue_cl = queue(); + auto s = Trmm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + b_buffer.Read(queue, b_size, reinterpret_cast(b)); +} +void cblas_ztrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + void* b, const int b_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + auto queue_cl = queue(); + auto s = Trmm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + b_buffer.Read(queue, b_size, reinterpret_cast(b)); +} + +// TRSM +void cblas_strsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int m, const int 
n, + const float alpha, + const float* a, const int a_ld, + float* b, const int b_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + auto queue_cl = queue(); + auto s = Trsm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + b_buffer.Read(queue, b_size, reinterpret_cast(b)); +} +void cblas_dtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int m, const int n, + const double alpha, + const double* a, const int a_ld, + double* b, const int b_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + auto queue_cl = queue(); + auto s = Trsm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + b_buffer.Read(queue, b_size, reinterpret_cast(b)); +} +void cblas_ctrsm(const Layout layout, const Side side, const 
Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + void* b, const int b_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + auto queue_cl = queue(); + auto s = Trsm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + b_buffer.Read(queue, b_size, reinterpret_cast(b)); +} +void cblas_ztrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + void* b, const int b_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + auto queue_cl = queue(); + auto s = Trsm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { 
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + b_buffer.Read(queue, b_size, reinterpret_cast(b)); +} + +// ================================================================================================= +// Extra non-BLAS routines (level-X) +// ================================================================================================= + +// OMATCOPY +void cblas_somatcopy(const Layout layout, const Transpose a_transpose, + const int m, const int n, + const float alpha, + const float* a, const int a_ld, + float* b, const int b_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + auto queue_cl = queue(); + auto s = Omatcopy(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + b_buffer.Read(queue, b_size, reinterpret_cast(b)); +} +void cblas_domatcopy(const Layout layout, const Transpose a_transpose, + const int m, const int n, + const double alpha, + const double* a, const int a_ld, + double* b, const int b_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + auto queue_cl = queue(); + auto s = Omatcopy(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + 
a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + b_buffer.Read(queue, b_size, reinterpret_cast(b)); +} +void cblas_comatcopy(const Layout layout, const Transpose a_transpose, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + void* b, const int b_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + auto queue_cl = queue(); + auto s = Omatcopy(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + b_buffer.Read(queue, b_size, reinterpret_cast(b)); +} +void cblas_zomatcopy(const Layout layout, const Transpose a_transpose, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + void* b, const int b_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + auto queue_cl = queue(); + auto s = Omatcopy(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); 
+ if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + b_buffer.Read(queue, b_size, reinterpret_cast(b)); +} + +// ================================================================================================= +} // namespace clblast From 59183b7d79b70d918562d5048e521633d425ca1c Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Tue, 25 Oct 2016 19:21:49 +0200 Subject: [PATCH 04/15] Sets the proper sizes for the buffers for the Netlib CBLAS API --- scripts/generator/generator.py | 127 ++++++--- src/clblast_blas.cpp | 500 ++++++++++++++++----------------- 2 files changed, 331 insertions(+), 296 deletions(-) diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 4ba97ff8..99edf355 100755 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -59,6 +59,41 @@ bld_trans_n_k = "When `transpose == Transpose::kNo`, then `b_ld` must be at leas cld_m = "The value of `c_ld` must be at least `m`." cld_n = "The value of `c_ld` must be at least `n`." + +# Helper functions to compute vector and matrix sizes +def size_helper(condition, size_one, size_two, multiplier): + length = "(" + condition + ")" + " ? 
" + size_one + " * " + multiplier + " : " + size_two + " * " + multiplier + return length + + +def layout_transpose_condition(prefix): + return "(layout == Layout::kColMajor && " + prefix + "_transpose != Transpose::kNo) || " +\ + "(layout == Layout::kRowMajor && " + prefix + "_transpose == Transpose::kNo)" + + +# Different possibilities for the vector and matrix sizes +xn = "n * x_inc" +xm = "m * x_inc" +yn = "n * y_inc" +ym = "m * y_inc" +an = "n * a_ld" +apn = "((n*(n+1)) / 2)" +cn = "n * c_ld" +xmn = size_helper("a_transpose != Transpose::kNo", "m", "n", "x_inc") +ynm = size_helper("a_transpose != Transpose::kNo", "n", "m", "y_inc") +amn = size_helper("layout == Layout::kRowMajor", "m", "n", "a_ld") +amns = size_helper("side == Side::kLeft", "m", "n", "a_ld") +amk = size_helper(layout_transpose_condition("a"), "m", "k", "a_ld") +ank = size_helper(layout_transpose_condition("a"), "n", "k", "a_ld") +ankab = size_helper(layout_transpose_condition("ab"), "n", "k", "a_ld") +bkn = size_helper(layout_transpose_condition("b"), "k", "n", "b_ld") +bnkab = size_helper(layout_transpose_condition("ab"), "n", "k", "b_ld") +bmn = size_helper("layout == Layout::kRowMajor", "m", "n", "b_ld") +bnma = size_helper(layout_transpose_condition("a"), "n", "m", "b_ld") +cmn = size_helper("layout == Layout::kRowMajor", "m", "n", "c_ld") +ammn = size_helper("layout == Layout::kRowMajor", "m", "((side == Side::kLeft) ? m : n)", "a_ld") +bmnn = size_helper("layout == Layout::kRowMajor", "((side == Side::kLeft) ? 
m : n)", "n", "b_ld") + # ================================================================================================== # Populates a list of routines @@ -66,63 +101,63 @@ ROUTINES = [ [ # Level 1: vector-vector Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], ["1","1","1","1"], [], "", "Generate givens plane rotation", "", []), Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], ["1","1","1","1","1"], [], "", "Generate modified givens plane rotation", "", []), - Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["n","n"], ["cos","sin"], "", "Apply givens plane rotation", "", []), + Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["n","n"], ["cos","sin"],"", "Apply givens plane rotation", "", []), Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], ["n","n","1"], [], "", "Apply modified givens plane rotation", "", []), - Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], ["n","n"], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []), - Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], ["n"], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []), - Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["n","n"], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []), - Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["n","n"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []), + Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], ["n","n"], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []), + Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], 
[], ["x"], ["n"], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []), + Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["n","n"], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []), + Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["n","n"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []), Routine(True, True, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], ["n","n","1"], [], "n", "Dot product of two vectors", "Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. The sum is stored in the _dot_ buffer.", []), Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], ["n","n","1"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []), Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], ["n","n","1"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []), - Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], ["n","1"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []), - Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], ["n","1"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []), - Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], ["n","1"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. 
This routine is the non-absolute version of the xASUM BLAS routine.", []), - Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], ["n","1"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []), - Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], ["n","1"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []), - Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], ["n","1"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []), + Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], ["n","1"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []), + Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], ["n","1"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []), + Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], ["n","1"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. 
This routine is the non-absolute version of the xASUM BLAS routine.", []), + Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], ["n","1"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []), + Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], ["n","1"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []), + Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], ["n","1"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []), ], [ # Level 2: matrix-vector - Routine(True, True, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. 
The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]), - Routine(True, True, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]), - Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]), - Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]), - Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2a", "symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]), - Routine(True, True, "2a", "sbmv", T, [S,D,H], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]), - Routine(True, True, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and 
represented as _AP_.", []), - Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], ["n","n"], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]), - Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], ["n","n"], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]), - Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], ["n","n"], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []), - Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], ["n","n"], [], "", "Solves a triangular system of equations", "", []), - Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], ["n","n"], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]), - Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], ["n","n"], [], "", "Solves a packed triangular system of equations", "", []), + Routine(True, True, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], [amn,xmn,ynm], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. 
The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]), + Routine(True, True, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], [amn,xmn,ynm], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]), + Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]), + Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]), + Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], [apn,xn,yn], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2a", "symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]), + Routine(True, True, "2a", "sbmv", T, [S,D,H], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]), + Routine(True, True, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], [apn,xn,yn], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", 
[]), + Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]), + Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]), + Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [apn,xn], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []), + Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a triangular system of equations", "", []), + Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]), + Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [apn,xn], [], "", "Solves a packed triangular system of equations", "", []), # Level 2: matrix update - Routine(True, True, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], ["n","n","n"], ["alpha"], "", "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]), - Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["n","n","n"], ["alpha"], "", "General rank-1 complex matrix update", "Same operation 
as xGER, but with complex data-types.", [ald_m]), - Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["n","n","n"], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]), - Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["n","n"], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermetian matrix to be updated, and alpha is a scalar value.", [ald_n]), - Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["n","n"], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["n","n","n"], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]), - Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["n","n","n"], ["alpha"], "", "Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], ["n","n"], ["alpha"], "", "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]), - Routine(True, True, "2b", "spr", T, [S,D,H], 
["n"], ["layout","triangle"], ["x"], ["ap"], ["n","n"], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["n","n","n"], ["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]), - Routine(True, True, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["n","n","n"], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]), + Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]), + Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]), + Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], [xn,an], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermetian matrix to be updated, and alpha is a scalar value.", [ald_n]), + Routine(True, True, "2b", 
"hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], [xn,apn], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], [xn,yn,an], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]), + Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], [xn,yn,apn], ["alpha"], "", "Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], [xn,an], ["alpha"], "", "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]), + Routine(True, True, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], [xn,apn], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], [xn,yn,an], ["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]), + Routine(True, True, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], [xn,yn,apn], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), 
], [ # Level 3: matrix-matrix - Routine(True, True, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]), - Routine(True, True, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. Otherwise, in case of `side == kRight`, _A_ is a symmtric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]), - Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]), - Routine(True, True, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["n","n"], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]), - Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["n","n"], 
["alpha","beta"], "", "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]), - Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]), - Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]), - Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["n","n"], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]), - Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["n","n"], ["alpha"], "", "Solves a triangular system of equations", "", []), + Routine(True, True, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], [amk,bkn,cmn], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general 
rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]), + Routine(True, True, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], [ammn,bmnn,cmn], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. Otherwise, in case of `side == kRight`, _A_ is a symmtric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]), + Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], [ammn,bmnn,cmn], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]), + Routine(True, True, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], [ank,cn], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]), + Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], [ank,cn], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]), + Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], [ankab,bnkab,cn],["alpha","beta"], "", "Rank-2K update of a symmetric matrix", 
"Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]), + Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], [ankab,bnkab,cn],["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]), + Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]), + Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Solves a triangular system of equations", "", []), ], [ # Level X: extra routines (not part of BLAS) - Routine(True, True, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], ["n","n"], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. 
The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]), + Routine(True, True, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], [amn,bnma], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]), ]] diff --git a/src/clblast_blas.cpp b/src/clblast_blas.cpp index 286b1ba8..b5451049 100644 --- a/src/clblast_blas.cpp +++ b/src/clblast_blas.cpp @@ -1390,11 +1390,11 @@ void cblas_sgemv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1426,11 +1426,11 @@ void cblas_dgemv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = (a_transpose != Transpose::kNo) ? 
m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1462,11 +1462,11 @@ void cblas_cgemv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1498,11 +1498,11 @@ void cblas_zgemv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = (a_transpose != Transpose::kNo) ? 
n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1536,11 +1536,11 @@ void cblas_sgbmv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1572,11 +1572,11 @@ void cblas_dgbmv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1608,11 +1608,11 @@ void cblas_cgbmv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? 
m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1644,11 +1644,11 @@ void cblas_zgbmv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = (a_transpose != Transpose::kNo) ? 
n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1682,11 +1682,11 @@ void cblas_chemv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1718,11 +1718,11 @@ void cblas_zhemv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1756,11 +1756,11 @@ void cblas_chbmv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = 
n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1792,11 +1792,11 @@ void cblas_zhbmv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1830,11 +1830,11 @@ void cblas_chpmv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1866,11 +1866,11 @@ void cblas_zhpmv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto ap_size = n; + const auto 
ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1904,11 +1904,11 @@ void cblas_ssymv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1940,11 +1940,11 @@ void cblas_dsymv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1978,11 +1978,11 @@ void cblas_ssbmv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = 
Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2014,11 +2014,11 @@ void cblas_dsbmv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2052,11 +2052,11 @@ void cblas_sspmv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2088,11 +2088,11 @@ void cblas_dspmv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2121,9 +2121,9 @@ void cblas_strmv(const Layout layout, const Triangle triangle, const 
Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2148,9 +2148,9 @@ void cblas_dtrmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2175,9 +2175,9 @@ void cblas_ctrmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2202,9 +2202,9 @@ void cblas_ztrmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2231,9 +2231,9 @@ void cblas_stbmv(const Layout layout, const Triangle triangle, 
const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2258,9 +2258,9 @@ void cblas_dtbmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2285,9 +2285,9 @@ void cblas_ctbmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2312,9 +2312,9 @@ void cblas_ztbmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2341,9 +2341,9 @@ void cblas_stpmv(const Layout layout, const Triangle 
triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2368,9 +2368,9 @@ void cblas_dtpmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2395,9 +2395,9 @@ void cblas_ctpmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2422,9 +2422,9 @@ void cblas_ztpmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2451,9 
+2451,9 @@ void cblas_strsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2478,9 +2478,9 @@ void cblas_dtrsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2505,9 +2505,9 @@ void cblas_ctrsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2532,9 +2532,9 @@ void cblas_ztrsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ 
-2561,9 +2561,9 @@ void cblas_stbsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2588,9 +2588,9 @@ void cblas_dtbsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2615,9 +2615,9 @@ void cblas_ctbsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2642,9 +2642,9 @@ void cblas_ztbsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, 
reinterpret_cast(x)); @@ -2671,9 +2671,9 @@ void cblas_stpsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2698,9 +2698,9 @@ void cblas_dtpsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2725,9 +2725,9 @@ void cblas_ctpsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2752,9 +2752,9 @@ void cblas_ztpsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, 
ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2784,11 +2784,11 @@ void cblas_sger(const Layout layout, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = m * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -2816,11 +2816,11 @@ void cblas_dger(const Layout layout, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = m * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -2850,11 +2850,11 @@ void cblas_cgeru(const Layout layout, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n; + const auto x_size = m * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? 
m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -2882,11 +2882,11 @@ void cblas_zgeru(const Layout layout, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n; + const auto x_size = m * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -2916,11 +2916,11 @@ void cblas_cgerc(const Layout layout, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n; + const auto x_size = m * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -2948,11 +2948,11 @@ void cblas_zgerc(const Layout layout, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n; + const auto x_size = m * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? 
m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -2981,9 +2981,9 @@ void cblas_cher(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); @@ -3009,9 +3009,9 @@ void cblas_zher(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); @@ -3039,9 +3039,9 @@ void cblas_chpr(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); @@ -3067,9 +3067,9 @@ void cblas_zhpr(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = 
Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); @@ -3098,11 +3098,11 @@ void cblas_cher2(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3131,11 +3131,11 @@ void cblas_zher2(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3166,11 +3166,11 @@ void cblas_chpr2(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); x_buffer.Write(queue, x_size, 
reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3199,11 +3199,11 @@ void cblas_zhpr2(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3233,9 +3233,9 @@ void cblas_ssyr(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); @@ -3261,9 +3261,9 @@ void cblas_dsyr(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); @@ -3291,9 +3291,9 @@ void cblas_sspr(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto ap_size 
= n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); @@ -3319,9 +3319,9 @@ void cblas_dspr(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); @@ -3350,11 +3350,11 @@ void cblas_ssyr2(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3383,11 +3383,11 @@ void cblas_dsyr2(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3418,11 +3418,11 @@ void cblas_sspr2(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = 
Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3451,11 +3451,11 @@ void cblas_dspr2(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3492,11 +3492,11 @@ void cblas_sgemm(const Layout layout, const Transpose a_transpose, const Transpo auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? m * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && b_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && b_transpose == Transpose::kNo)) ? k * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? 
m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3529,11 +3529,11 @@ void cblas_dgemm(const Layout layout, const Transpose a_transpose, const Transpo auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? m * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && b_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && b_transpose == Transpose::kNo)) ? k * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3566,11 +3566,11 @@ void cblas_cgemm(const Layout layout, const Transpose a_transpose, const Transpo auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? m * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && b_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && b_transpose == Transpose::kNo)) ? k * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? 
m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3603,11 +3603,11 @@ void cblas_zgemm(const Layout layout, const Transpose a_transpose, const Transpo auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? m * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && b_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && b_transpose == Transpose::kNo)) ? k * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3642,11 +3642,11 @@ void cblas_ssymm(const Layout layout, const Side side, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? 
m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3679,11 +3679,11 @@ void cblas_dsymm(const Layout layout, const Side side, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3716,11 +3716,11 @@ void cblas_csymm(const Layout layout, const Side side, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? 
m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3753,11 +3753,11 @@ void cblas_zsymm(const Layout layout, const Side side, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3792,11 +3792,11 @@ void cblas_chemm(const Layout layout, const Side side, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? 
m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3829,11 +3829,11 @@ void cblas_zhemm(const Layout layout, const Side side, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3867,9 +3867,9 @@ void cblas_ssyrk(const Layout layout, const Triangle triangle, const Transpose a auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? 
n * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); @@ -3899,9 +3899,9 @@ void cblas_dsyrk(const Layout layout, const Triangle triangle, const Transpose a auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); @@ -3931,9 +3931,9 @@ void cblas_csyrk(const Layout layout, const Triangle triangle, const Transpose a auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? 
n * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); @@ -3963,9 +3963,9 @@ void cblas_zsyrk(const Layout layout, const Triangle triangle, const Transpose a auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); @@ -3997,9 +3997,9 @@ void cblas_cherk(const Layout layout, const Triangle triangle, const Transpose a auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? 
n * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); @@ -4029,9 +4029,9 @@ void cblas_zherk(const Layout layout, const Triangle triangle, const Transpose a auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); @@ -4064,11 +4064,11 @@ void cblas_ssyr2k(const Layout layout, const Triangle triangle, const Transpose auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? 
n * b_ld : k * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4101,11 +4101,11 @@ void cblas_dsyr2k(const Layout layout, const Triangle triangle, const Transpose auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * b_ld : k * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4138,11 +4138,11 @@ void cblas_csyr2k(const Layout layout, const Triangle triangle, const Transpose auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? 
n * b_ld : k * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4175,11 +4175,11 @@ void cblas_zsyr2k(const Layout layout, const Triangle triangle, const Transpose auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * b_ld : k * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4214,11 +4214,11 @@ void cblas_cher2k(const Layout layout, const Triangle triangle, const Transpose auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? 
n * b_ld : k * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4251,11 +4251,11 @@ void cblas_zher2k(const Layout layout, const Triangle triangle, const Transpose auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * b_ld : k * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4287,9 +4287,9 @@ void cblas_strmm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto a_size = n; + const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? 
m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4318,9 +4318,9 @@ void cblas_dtrmm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto a_size = n; + const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4349,9 +4349,9 @@ void cblas_ctrmm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto a_size = n; + const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4380,9 +4380,9 @@ void cblas_ztrmm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto a_size = n; + const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? 
m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4413,9 +4413,9 @@ void cblas_strsm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto a_size = n; + const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4444,9 +4444,9 @@ void cblas_dtrsm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto a_size = n; + const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4475,9 +4475,9 @@ void cblas_ctrsm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto a_size = n; + const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? 
m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4506,9 +4506,9 @@ void cblas_ztrsm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto a_size = n; + const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4543,9 +4543,9 @@ void cblas_somatcopy(const Layout layout, const Transpose a_transpose, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * b_ld : m * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4571,9 +4571,9 @@ void cblas_domatcopy(const Layout layout, const Transpose a_transpose, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? 
n * b_ld : m * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4599,9 +4599,9 @@ void cblas_comatcopy(const Layout layout, const Transpose a_transpose, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * b_ld : m * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4627,9 +4627,9 @@ void cblas_zomatcopy(const Layout layout, const Transpose a_transpose, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? 
n * b_ld : m * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); From 926aca53a0de9250a9f7d42026fb54995668dc5b Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Tue, 25 Oct 2016 19:45:57 +0200 Subject: [PATCH 05/15] Made the Netlib CBLAS API use the same enums with prefixes as the regular C API of CLBlast --- include/clblast_blas.h | 215 ++++---- scripts/generator/generator.py | 22 +- scripts/generator/generator/cpp.py | 1 + scripts/generator/generator/routine.py | 2 +- src/clblast_blas.cpp | 732 ++++++++++++------------- 5 files changed, 488 insertions(+), 484 deletions(-) diff --git a/include/clblast_blas.h b/include/clblast_blas.h index b4db4192..927f84cd 100644 --- a/include/clblast_blas.h +++ b/include/clblast_blas.h @@ -36,12 +36,15 @@ extern "C" { // ================================================================================================= // Matrix layout and transpose types -typedef enum Layout_ { kRowMajor = 101, kColMajor = 102 } Layout; -typedef enum Transpose_ { kNo = 111, kYes = 112, kConjugate = 113 } Transpose; -typedef enum Triangle_ { kUpper = 121, kLower = 122 } Triangle; -typedef enum Diagonal_ { kNonUnit = 131, kUnit = 132 } Diagonal; -typedef enum Side_ { kLeft = 141, kRight = 142 } Side; - +typedef enum CLBlastLayout_ { CLBlastLayoutRowMajor = 101, + CLBlastLayoutColMajor = 102 } CLBlastLayout; +typedef enum CLBlastTranspose_ { CLBlastTransposeNo = 111, CLBlastTransposeYes = 112, + CLBlastTransposeConjugate = 113 } CLBlastTranspose; +typedef enum CLBlastTriangle_ { CLBlastTriangleUpper = 121, + CLBlastTriangleLower = 122 } CLBlastTriangle; +typedef enum CLBlastDiagonal_ { CLBlastDiagonalNonUnit = 131, + CLBlastDiagonalUnit = 132 } CLBlastDiagonal; +typedef enum CLBlastSide_ { CLBlastSideLeft = 141, CLBlastSideRight = 142 } CLBlastSide; // 
================================================================================================= // BLAS level-1 (vector-vector) routines @@ -270,28 +273,28 @@ void PUBLIC_API cblas_izmin(const int n, // ================================================================================================= // General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV -void PUBLIC_API cblas_sgemv(const Layout layout, const Transpose a_transpose, +void PUBLIC_API cblas_sgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const float alpha, const float* a, const int a_ld, const float* x, const int x_inc, const float beta, float* y, const int y_inc); -void PUBLIC_API cblas_dgemv(const Layout layout, const Transpose a_transpose, +void PUBLIC_API cblas_dgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const double alpha, const double* a, const int a_ld, const double* x, const int x_inc, const double beta, double* y, const int y_inc); -void PUBLIC_API cblas_cgemv(const Layout layout, const Transpose a_transpose, +void PUBLIC_API cblas_cgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const void* alpha, const void* a, const int a_ld, const void* x, const int x_inc, const void* beta, void* y, const int y_inc); -void PUBLIC_API cblas_zgemv(const Layout layout, const Transpose a_transpose, +void PUBLIC_API cblas_zgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const void* alpha, const void* a, const int a_ld, @@ -300,28 +303,28 @@ void PUBLIC_API cblas_zgemv(const Layout layout, const Transpose a_transpose, void* y, const int y_inc); // General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV -void PUBLIC_API cblas_sgbmv(const Layout layout, const Transpose a_transpose, +void PUBLIC_API cblas_sgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, 
const int n, const int kl, const int ku, const float alpha, const float* a, const int a_ld, const float* x, const int x_inc, const float beta, float* y, const int y_inc); -void PUBLIC_API cblas_dgbmv(const Layout layout, const Transpose a_transpose, +void PUBLIC_API cblas_dgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const int kl, const int ku, const double alpha, const double* a, const int a_ld, const double* x, const int x_inc, const double beta, double* y, const int y_inc); -void PUBLIC_API cblas_cgbmv(const Layout layout, const Transpose a_transpose, +void PUBLIC_API cblas_cgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const int kl, const int ku, const void* alpha, const void* a, const int a_ld, const void* x, const int x_inc, const void* beta, void* y, const int y_inc); -void PUBLIC_API cblas_zgbmv(const Layout layout, const Transpose a_transpose, +void PUBLIC_API cblas_zgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const int kl, const int ku, const void* alpha, const void* a, const int a_ld, @@ -330,14 +333,14 @@ void PUBLIC_API cblas_zgbmv(const Layout layout, const Transpose a_transpose, void* y, const int y_inc); // Hermitian matrix-vector multiplication: CHEMV/ZHEMV -void PUBLIC_API cblas_chemv(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_chemv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* a, const int a_ld, const void* x, const int x_inc, const void* beta, void* y, const int y_inc); -void PUBLIC_API cblas_zhemv(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_zhemv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* a, const int a_ld, @@ -346,14 +349,14 @@ void PUBLIC_API cblas_zhemv(const Layout layout, const Triangle triangle, void* y, const int y_inc); // 
Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV -void PUBLIC_API cblas_chbmv(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_chbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const int k, const void* alpha, const void* a, const int a_ld, const void* x, const int x_inc, const void* beta, void* y, const int y_inc); -void PUBLIC_API cblas_zhbmv(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_zhbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const int k, const void* alpha, const void* a, const int a_ld, @@ -362,14 +365,14 @@ void PUBLIC_API cblas_zhbmv(const Layout layout, const Triangle triangle, void* y, const int y_inc); // Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV -void PUBLIC_API cblas_chpmv(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_chpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* ap, const void* x, const int x_inc, const void* beta, void* y, const int y_inc); -void PUBLIC_API cblas_zhpmv(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_zhpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* ap, @@ -378,14 +381,14 @@ void PUBLIC_API cblas_zhpmv(const Layout layout, const Triangle triangle, void* y, const int y_inc); // Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV -void PUBLIC_API cblas_ssymv(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_ssymv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const float alpha, const float* a, const int a_ld, const float* x, const int x_inc, const float beta, float* y, const int y_inc); -void PUBLIC_API cblas_dsymv(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_dsymv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const double alpha, const 
double* a, const int a_ld, @@ -394,14 +397,14 @@ void PUBLIC_API cblas_dsymv(const Layout layout, const Triangle triangle, double* y, const int y_inc); // Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV -void PUBLIC_API cblas_ssbmv(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_ssbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const int k, const float alpha, const float* a, const int a_ld, const float* x, const int x_inc, const float beta, float* y, const int y_inc); -void PUBLIC_API cblas_dsbmv(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_dsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const int k, const double alpha, const double* a, const int a_ld, @@ -410,14 +413,14 @@ void PUBLIC_API cblas_dsbmv(const Layout layout, const Triangle triangle, double* y, const int y_inc); // Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV -void PUBLIC_API cblas_sspmv(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_sspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const float alpha, const float* ap, const float* x, const int x_inc, const float beta, float* y, const int y_inc); -void PUBLIC_API cblas_dspmv(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_dspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const double alpha, const double* ap, @@ -426,121 +429,121 @@ void PUBLIC_API cblas_dspmv(const Layout layout, const Triangle triangle, double* y, const int y_inc); // Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV -void PUBLIC_API cblas_strmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_strmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const float* a, 
const int a_ld, float* x, const int x_inc); -void PUBLIC_API cblas_dtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_dtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const double* a, const int a_ld, double* x, const int x_inc); -void PUBLIC_API cblas_ctrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_ctrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const void* a, const int a_ld, void* x, const int x_inc); -void PUBLIC_API cblas_ztrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_ztrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const void* a, const int a_ld, void* x, const int x_inc); // Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV -void PUBLIC_API cblas_stbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_stbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const int k, const float* a, const int a_ld, float* x, const int x_inc); -void PUBLIC_API cblas_dtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_dtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const int k, const double* a, const int a_ld, double* x, const int x_inc); -void PUBLIC_API cblas_ctbmv(const Layout layout, 
const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_ctbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const int k, const void* a, const int a_ld, void* x, const int x_inc); -void PUBLIC_API cblas_ztbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_ztbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const int k, const void* a, const int a_ld, void* x, const int x_inc); // Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV -void PUBLIC_API cblas_stpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_stpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const float* ap, float* x, const int x_inc); -void PUBLIC_API cblas_dtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_dtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const double* ap, double* x, const int x_inc); -void PUBLIC_API cblas_ctpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_ctpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const void* ap, void* x, const int x_inc); -void PUBLIC_API cblas_ztpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_ztpmv(const CLBlastLayout layout, const 
CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const void* ap, void* x, const int x_inc); // Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV -void PUBLIC_API cblas_strsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_strsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const float* a, const int a_ld, float* x, const int x_inc); -void PUBLIC_API cblas_dtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_dtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const double* a, const int a_ld, double* x, const int x_inc); -void PUBLIC_API cblas_ctrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_ctrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const void* a, const int a_ld, void* x, const int x_inc); -void PUBLIC_API cblas_ztrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_ztrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const void* a, const int a_ld, void* x, const int x_inc); // Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV -void PUBLIC_API cblas_stbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_stbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const 
CLBlastDiagonal diagonal, const int n, const int k, const float* a, const int a_ld, float* x, const int x_inc); -void PUBLIC_API cblas_dtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_dtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const int k, const double* a, const int a_ld, double* x, const int x_inc); -void PUBLIC_API cblas_ctbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_ctbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const int k, const void* a, const int a_ld, void* x, const int x_inc); -void PUBLIC_API cblas_ztbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_ztbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const int k, const void* a, const int a_ld, void* x, const int x_inc); // Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV -void PUBLIC_API cblas_stpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_stpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const float* ap, float* x, const int x_inc); -void PUBLIC_API cblas_dtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_dtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const double* ap, double* x, const int x_inc); -void 
PUBLIC_API cblas_ctpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_ctpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const void* ap, void* x, const int x_inc); -void PUBLIC_API cblas_ztpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_ztpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const void* ap, void* x, const int x_inc); // General rank-1 matrix update: SGER/DGER/HGER -void PUBLIC_API cblas_sger(const Layout layout, +void PUBLIC_API cblas_sger(const CLBlastLayout layout, const int m, const int n, const float alpha, const float* x, const int x_inc, const float* y, const int y_inc, float* a, const int a_ld); -void PUBLIC_API cblas_dger(const Layout layout, +void PUBLIC_API cblas_dger(const CLBlastLayout layout, const int m, const int n, const double alpha, const double* x, const int x_inc, @@ -548,13 +551,13 @@ void PUBLIC_API cblas_dger(const Layout layout, double* a, const int a_ld); // General rank-1 complex matrix update: CGERU/ZGERU -void PUBLIC_API cblas_cgeru(const Layout layout, +void PUBLIC_API cblas_cgeru(const CLBlastLayout layout, const int m, const int n, const void* alpha, const void* x, const int x_inc, const void* y, const int y_inc, void* a, const int a_ld); -void PUBLIC_API cblas_zgeru(const Layout layout, +void PUBLIC_API cblas_zgeru(const CLBlastLayout layout, const int m, const int n, const void* alpha, const void* x, const int x_inc, @@ -562,13 +565,13 @@ void PUBLIC_API cblas_zgeru(const Layout layout, void* a, const int a_ld); // General rank-1 complex conjugated matrix update: CGERC/ZGERC -void PUBLIC_API cblas_cgerc(const Layout layout, +void PUBLIC_API cblas_cgerc(const CLBlastLayout layout, 
const int m, const int n, const void* alpha, const void* x, const int x_inc, const void* y, const int y_inc, void* a, const int a_ld); -void PUBLIC_API cblas_zgerc(const Layout layout, +void PUBLIC_API cblas_zgerc(const CLBlastLayout layout, const int m, const int n, const void* alpha, const void* x, const int x_inc, @@ -576,37 +579,37 @@ void PUBLIC_API cblas_zgerc(const Layout layout, void* a, const int a_ld); // Hermitian rank-1 matrix update: CHER/ZHER -void PUBLIC_API cblas_cher(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_cher(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, void* a, const int a_ld); -void PUBLIC_API cblas_zher(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_zher(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, void* a, const int a_ld); // Hermitian packed rank-1 matrix update: CHPR/ZHPR -void PUBLIC_API cblas_chpr(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_chpr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, void* ap); -void PUBLIC_API cblas_zhpr(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_zhpr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, void* ap); // Hermitian rank-2 matrix update: CHER2/ZHER2 -void PUBLIC_API cblas_cher2(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_cher2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, const void* y, const int y_inc, void* a, const int a_ld); -void PUBLIC_API cblas_zher2(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_zher2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int 
n, const void* alpha, const void* x, const int x_inc, @@ -614,13 +617,13 @@ void PUBLIC_API cblas_zher2(const Layout layout, const Triangle triangle, void* a, const int a_ld); // Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 -void PUBLIC_API cblas_chpr2(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_chpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, const void* y, const int y_inc, void* ap); -void PUBLIC_API cblas_zhpr2(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_zhpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, @@ -628,37 +631,37 @@ void PUBLIC_API cblas_zhpr2(const Layout layout, const Triangle triangle, void* ap); // Symmetric rank-1 matrix update: SSYR/DSYR/HSYR -void PUBLIC_API cblas_ssyr(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_ssyr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const float alpha, const float* x, const int x_inc, float* a, const int a_ld); -void PUBLIC_API cblas_dsyr(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_dsyr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const double alpha, const double* x, const int x_inc, double* a, const int a_ld); // Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR -void PUBLIC_API cblas_sspr(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_sspr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const float alpha, const float* x, const int x_inc, float* ap); -void PUBLIC_API cblas_dspr(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_dspr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const double alpha, const double* x, const int x_inc, double* ap); // Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 -void 
PUBLIC_API cblas_ssyr2(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_ssyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const float alpha, const float* x, const int x_inc, const float* y, const int y_inc, float* a, const int a_ld); -void PUBLIC_API cblas_dsyr2(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_dsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const double alpha, const double* x, const int x_inc, @@ -666,13 +669,13 @@ void PUBLIC_API cblas_dsyr2(const Layout layout, const Triangle triangle, double* a, const int a_ld); // Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 -void PUBLIC_API cblas_sspr2(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_sspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const float alpha, const float* x, const int x_inc, const float* y, const int y_inc, float* ap); -void PUBLIC_API cblas_dspr2(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_dspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const double alpha, const double* x, const int x_inc, @@ -684,28 +687,28 @@ void PUBLIC_API cblas_dspr2(const Layout layout, const Triangle triangle, // ================================================================================================= // General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM -void PUBLIC_API cblas_sgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, +void PUBLIC_API cblas_sgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const int m, const int n, const int k, const float alpha, const float* a, const int a_ld, const float* b, const int b_ld, const float beta, float* c, const int c_ld); -void PUBLIC_API cblas_dgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, +void PUBLIC_API 
cblas_dgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const int m, const int n, const int k, const double alpha, const double* a, const int a_ld, const double* b, const int b_ld, const double beta, double* c, const int c_ld); -void PUBLIC_API cblas_cgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, +void PUBLIC_API cblas_cgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const int m, const int n, const int k, const void* alpha, const void* a, const int a_ld, const void* b, const int b_ld, const void* beta, void* c, const int c_ld); -void PUBLIC_API cblas_zgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, +void PUBLIC_API cblas_zgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const int m, const int n, const int k, const void* alpha, const void* a, const int a_ld, @@ -714,28 +717,28 @@ void PUBLIC_API cblas_zgemm(const Layout layout, const Transpose a_transpose, co void* c, const int c_ld); // Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM -void PUBLIC_API cblas_ssymm(const Layout layout, const Side side, const Triangle triangle, +void PUBLIC_API cblas_ssymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const int m, const int n, const float alpha, const float* a, const int a_ld, const float* b, const int b_ld, const float beta, float* c, const int c_ld); -void PUBLIC_API cblas_dsymm(const Layout layout, const Side side, const Triangle triangle, +void PUBLIC_API cblas_dsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const int m, const int n, const double alpha, const double* a, const int a_ld, const double* b, const int b_ld, const double beta, double* c, const int c_ld); -void PUBLIC_API cblas_csymm(const Layout layout, const Side 
side, const Triangle triangle, +void PUBLIC_API cblas_csymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const int m, const int n, const void* alpha, const void* a, const int a_ld, const void* b, const int b_ld, const void* beta, void* c, const int c_ld); -void PUBLIC_API cblas_zsymm(const Layout layout, const Side side, const Triangle triangle, +void PUBLIC_API cblas_zsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const int m, const int n, const void* alpha, const void* a, const int a_ld, @@ -744,14 +747,14 @@ void PUBLIC_API cblas_zsymm(const Layout layout, const Side side, const Triangle void* c, const int c_ld); // Hermitian matrix-matrix multiplication: CHEMM/ZHEMM -void PUBLIC_API cblas_chemm(const Layout layout, const Side side, const Triangle triangle, +void PUBLIC_API cblas_chemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const int m, const int n, const void* alpha, const void* a, const int a_ld, const void* b, const int b_ld, const void* beta, void* c, const int c_ld); -void PUBLIC_API cblas_zhemm(const Layout layout, const Side side, const Triangle triangle, +void PUBLIC_API cblas_zhemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const int m, const int n, const void* alpha, const void* a, const int a_ld, @@ -760,25 +763,25 @@ void PUBLIC_API cblas_zhemm(const Layout layout, const Side side, const Triangle void* c, const int c_ld); // Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK -void PUBLIC_API cblas_ssyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, +void PUBLIC_API cblas_ssyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, const float alpha, const float* a, const int a_ld, const float beta, float* c, const int c_ld); -void PUBLIC_API cblas_dsyrk(const Layout layout, 
const Triangle triangle, const Transpose a_transpose, +void PUBLIC_API cblas_dsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, const double alpha, const double* a, const int a_ld, const double beta, double* c, const int c_ld); -void PUBLIC_API cblas_csyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, +void PUBLIC_API cblas_csyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, const void* beta, void* c, const int c_ld); -void PUBLIC_API cblas_zsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, +void PUBLIC_API cblas_zsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, @@ -786,13 +789,13 @@ void PUBLIC_API cblas_zsyrk(const Layout layout, const Triangle triangle, const void* c, const int c_ld); // Rank-K update of a hermitian matrix: CHERK/ZHERK -void PUBLIC_API cblas_cherk(const Layout layout, const Triangle triangle, const Transpose a_transpose, +void PUBLIC_API cblas_cherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, const void* beta, void* c, const int c_ld); -void PUBLIC_API cblas_zherk(const Layout layout, const Triangle triangle, const Transpose a_transpose, +void PUBLIC_API cblas_zherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, @@ -800,28 +803,28 @@ void PUBLIC_API cblas_zherk(const Layout layout, const Triangle triangle, const void* c, const int c_ld); // Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K 
-void PUBLIC_API cblas_ssyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, +void PUBLIC_API cblas_ssyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const int n, const int k, const float alpha, const float* a, const int a_ld, const float* b, const int b_ld, const float beta, float* c, const int c_ld); -void PUBLIC_API cblas_dsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, +void PUBLIC_API cblas_dsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const int n, const int k, const double alpha, const double* a, const int a_ld, const double* b, const int b_ld, const double beta, double* c, const int c_ld); -void PUBLIC_API cblas_csyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, +void PUBLIC_API cblas_csyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, const void* b, const int b_ld, const void* beta, void* c, const int c_ld); -void PUBLIC_API cblas_zsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, +void PUBLIC_API cblas_zsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, @@ -830,14 +833,14 @@ void PUBLIC_API cblas_zsyr2k(const Layout layout, const Triangle triangle, const void* c, const int c_ld); // Rank-2K update of a hermitian matrix: CHER2K/ZHER2K -void PUBLIC_API cblas_cher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, +void PUBLIC_API cblas_cher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, const void* b, const int b_ld, 
const void* beta, void* c, const int c_ld); -void PUBLIC_API cblas_zher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, +void PUBLIC_API cblas_zher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, @@ -846,44 +849,44 @@ void PUBLIC_API cblas_zher2k(const Layout layout, const Triangle triangle, const void* c, const int c_ld); // Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM -void PUBLIC_API cblas_strmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_strmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const float alpha, const float* a, const int a_ld, float* b, const int b_ld); -void PUBLIC_API cblas_dtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_dtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const double alpha, const double* a, const int a_ld, double* b, const int b_ld); -void PUBLIC_API cblas_ctrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_ctrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const void* alpha, const void* a, const int a_ld, void* b, const int b_ld); -void PUBLIC_API cblas_ztrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, 
+void PUBLIC_API cblas_ztrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const void* alpha, const void* a, const int a_ld, void* b, const int b_ld); // Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM -void PUBLIC_API cblas_strsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_strsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const float alpha, const float* a, const int a_ld, float* b, const int b_ld); -void PUBLIC_API cblas_dtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_dtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const double alpha, const double* a, const int a_ld, double* b, const int b_ld); -void PUBLIC_API cblas_ctrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_ctrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const void* alpha, const void* a, const int a_ld, void* b, const int b_ld); -void PUBLIC_API cblas_ztrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_ztrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const 
int n, const void* alpha, const void* a, const int a_ld, @@ -894,22 +897,22 @@ void PUBLIC_API cblas_ztrsm(const Layout layout, const Side side, const Triangle // ================================================================================================= // Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY -void PUBLIC_API cblas_somatcopy(const Layout layout, const Transpose a_transpose, +void PUBLIC_API cblas_somatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const float alpha, const float* a, const int a_ld, float* b, const int b_ld); -void PUBLIC_API cblas_domatcopy(const Layout layout, const Transpose a_transpose, +void PUBLIC_API cblas_domatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const double alpha, const double* a, const int a_ld, double* b, const int b_ld); -void PUBLIC_API cblas_comatcopy(const Layout layout, const Transpose a_transpose, +void PUBLIC_API cblas_comatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const void* alpha, const void* a, const int a_ld, void* b, const int b_ld); -void PUBLIC_API cblas_zomatcopy(const Layout layout, const Transpose a_transpose, +void PUBLIC_API cblas_zomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const void* alpha, const void* a, const int a_ld, diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 99edf355..a9169872 100755 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -41,7 +41,7 @@ FILES = [ "/include/clblast_blas.h", "/src/clblast_blas.cpp", ] -HEADER_LINES = [117, 73, 118, 22, 29, 41, 44, 32] +HEADER_LINES = [117, 73, 118, 22, 29, 41, 47, 32] FOOTER_LINES = [17, 80, 19, 18, 6, 6, 9, 3] # Different possibilities for requirements @@ -67,8 +67,8 @@ def size_helper(condition, size_one, size_two, 
multiplier): def layout_transpose_condition(prefix): - return "(layout == Layout::kColMajor && " + prefix + "_transpose != Transpose::kNo) || " +\ - "(layout == Layout::kRowMajor && " + prefix + "_transpose == Transpose::kNo)" + return "(layout == CLBlastLayoutColMajor && " + prefix + "_transpose != CLBlastTransposeNo) || " +\ + "(layout == CLBlastLayoutRowMajor && " + prefix + "_transpose == CLBlastTransposeNo)" # Different possibilities for the vector and matrix sizes @@ -79,20 +79,20 @@ ym = "m * y_inc" an = "n * a_ld" apn = "((n*(n+1)) / 2)" cn = "n * c_ld" -xmn = size_helper("a_transpose != Transpose::kNo", "m", "n", "x_inc") -ynm = size_helper("a_transpose != Transpose::kNo", "n", "m", "y_inc") -amn = size_helper("layout == Layout::kRowMajor", "m", "n", "a_ld") -amns = size_helper("side == Side::kLeft", "m", "n", "a_ld") +xmn = size_helper("a_transpose != CLBlastTransposeNo", "m", "n", "x_inc") +ynm = size_helper("a_transpose != CLBlastTransposeNo", "n", "m", "y_inc") +amn = size_helper("layout == CLBlastLayoutRowMajor", "m", "n", "a_ld") +amns = size_helper("side == CLBlastSideLeft", "m", "n", "a_ld") amk = size_helper(layout_transpose_condition("a"), "m", "k", "a_ld") ank = size_helper(layout_transpose_condition("a"), "n", "k", "a_ld") ankab = size_helper(layout_transpose_condition("ab"), "n", "k", "a_ld") bkn = size_helper(layout_transpose_condition("b"), "k", "n", "b_ld") bnkab = size_helper(layout_transpose_condition("ab"), "n", "k", "b_ld") -bmn = size_helper("layout == Layout::kRowMajor", "m", "n", "b_ld") +bmn = size_helper("layout == CLBlastLayoutRowMajor", "m", "n", "b_ld") bnma = size_helper(layout_transpose_condition("a"), "n", "m", "b_ld") -cmn = size_helper("layout == Layout::kRowMajor", "m", "n", "c_ld") -ammn = size_helper("layout == Layout::kRowMajor", "m", "((side == Side::kLeft) ? m : n)", "a_ld") -bmnn = size_helper("layout == Layout::kRowMajor", "((side == Side::kLeft) ? 
m : n)", "n", "b_ld") +cmn = size_helper("layout == CLBlastLayoutRowMajor", "m", "n", "c_ld") +ammn = size_helper("layout == CLBlastLayoutRowMajor", "m", "((side == CLBlastSideLeft) ? m : n)", "a_ld") +bmnn = size_helper("layout == CLBlastLayoutRowMajor", "((side == CLBlastSideLeft) ? m : n)", "n", "b_ld") # ================================================================================================== diff --git a/scripts/generator/generator/cpp.py b/scripts/generator/generator/cpp.py index 23a2207c..eafbea30 100644 --- a/scripts/generator/generator/cpp.py +++ b/scripts/generator/generator/cpp.py @@ -126,6 +126,7 @@ def clblast_blas_cc(routine): # Copy data structures to the device for i, name in enumerate(routine.inputs + routine.outputs): result += " " + routine.set_size(name, routine.buffer_sizes[i]) + NL + for i, name in enumerate(routine.inputs + routine.outputs): result += " " + routine.create_buffer(name, flavour.buffer_type) + NL for name in routine.inputs + routine.outputs: prefix = "" if name in routine.outputs else "const " diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py index b988c91a..c35f5b4c 100644 --- a/scripts/generator/generator/routine.py +++ b/scripts/generator/generator/routine.py @@ -548,7 +548,7 @@ class Routine: def arguments_def_netlib(self, flavour): """As above, but for the Netlib CBLAS API""" - return (self.options_def() + self.sizes_def_netlib() + + return (self.options_def_c() + self.sizes_def_netlib() + list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.scalar_buffers_first()])) + self.scalar_def_void("alpha", flavour) + list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.buffers_first()])) + diff --git a/src/clblast_blas.cpp b/src/clblast_blas.cpp index b5451049..974dc21a 100644 --- a/src/clblast_blas.cpp +++ b/src/clblast_blas.cpp @@ -44,12 +44,12 @@ void cblas_srotg(float* sa, auto context = Context(device); auto queue = Queue(context, device); const auto 
sa_size = 1; - auto sa_buffer = Buffer(context, sa_size); const auto sb_size = 1; - auto sb_buffer = Buffer(context, sb_size); const auto sc_size = 1; - auto sc_buffer = Buffer(context, sc_size); const auto ss_size = 1; + auto sa_buffer = Buffer(context, sa_size); + auto sb_buffer = Buffer(context, sb_size); + auto sc_buffer = Buffer(context, sc_size); auto ss_buffer = Buffer(context, ss_size); sa_buffer.Write(queue, sa_size, reinterpret_cast(sa)); sb_buffer.Write(queue, sb_size, reinterpret_cast(sb)); @@ -77,12 +77,12 @@ void cblas_drotg(double* sa, auto context = Context(device); auto queue = Queue(context, device); const auto sa_size = 1; - auto sa_buffer = Buffer(context, sa_size); const auto sb_size = 1; - auto sb_buffer = Buffer(context, sb_size); const auto sc_size = 1; - auto sc_buffer = Buffer(context, sc_size); const auto ss_size = 1; + auto sa_buffer = Buffer(context, sa_size); + auto sb_buffer = Buffer(context, sb_size); + auto sc_buffer = Buffer(context, sc_size); auto ss_buffer = Buffer(context, ss_size); sa_buffer.Write(queue, sa_size, reinterpret_cast(sa)); sb_buffer.Write(queue, sb_size, reinterpret_cast(sb)); @@ -113,14 +113,14 @@ void cblas_srotmg(float* sd1, auto context = Context(device); auto queue = Queue(context, device); const auto sy1_size = 1; - auto sy1_buffer = Buffer(context, sy1_size); const auto sd1_size = 1; - auto sd1_buffer = Buffer(context, sd1_size); const auto sd2_size = 1; - auto sd2_buffer = Buffer(context, sd2_size); const auto sx1_size = 1; - auto sx1_buffer = Buffer(context, sx1_size); const auto sparam_size = 1; + auto sy1_buffer = Buffer(context, sy1_size); + auto sd1_buffer = Buffer(context, sd1_size); + auto sd2_buffer = Buffer(context, sd2_size); + auto sx1_buffer = Buffer(context, sx1_size); auto sparam_buffer = Buffer(context, sparam_size); sy1_buffer.Write(queue, sy1_size, reinterpret_cast(sy1)); sd1_buffer.Write(queue, sd1_size, reinterpret_cast(sd1)); @@ -151,14 +151,14 @@ void cblas_drotmg(double* sd1, auto 
context = Context(device); auto queue = Queue(context, device); const auto sy1_size = 1; - auto sy1_buffer = Buffer(context, sy1_size); const auto sd1_size = 1; - auto sd1_buffer = Buffer(context, sd1_size); const auto sd2_size = 1; - auto sd2_buffer = Buffer(context, sd2_size); const auto sx1_size = 1; - auto sx1_buffer = Buffer(context, sx1_size); const auto sparam_size = 1; + auto sy1_buffer = Buffer(context, sy1_size); + auto sd1_buffer = Buffer(context, sd1_size); + auto sd2_buffer = Buffer(context, sd2_size); + auto sx1_buffer = Buffer(context, sx1_size); auto sparam_buffer = Buffer(context, sparam_size); sy1_buffer.Write(queue, sy1_size, reinterpret_cast(sy1)); sd1_buffer.Write(queue, sd1_size, reinterpret_cast(sd1)); @@ -191,8 +191,8 @@ void cblas_srot(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -218,8 +218,8 @@ void cblas_drot(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -246,10 +246,10 @@ void cblas_srotm(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; - auto y_buffer = Buffer(context, y_size); const auto sparam_size = 1; + auto x_buffer = Buffer(context, x_size); + auto y_buffer = Buffer(context, y_size); auto sparam_buffer = Buffer(context, sparam_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); 
y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -275,10 +275,10 @@ void cblas_drotm(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; - auto y_buffer = Buffer(context, y_size); const auto sparam_size = 1; + auto x_buffer = Buffer(context, x_size); + auto y_buffer = Buffer(context, y_size); auto sparam_buffer = Buffer(context, sparam_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -305,8 +305,8 @@ void cblas_sswap(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -328,8 +328,8 @@ void cblas_dswap(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -351,8 +351,8 @@ void cblas_cswap(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -374,8 +374,8 @@ void cblas_zswap(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; + auto x_buffer = 
Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -481,8 +481,8 @@ void cblas_scopy(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -503,8 +503,8 @@ void cblas_dcopy(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -525,8 +525,8 @@ void cblas_ccopy(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -547,8 +547,8 @@ void cblas_zcopy(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -573,8 +573,8 @@ void cblas_saxpy(const int n, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; + auto x_buffer = Buffer(context, 
x_size); auto y_buffer = Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -598,8 +598,8 @@ void cblas_daxpy(const int n, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -623,8 +623,8 @@ void cblas_caxpy(const int n, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -648,8 +648,8 @@ void cblas_zaxpy(const int n, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -674,10 +674,10 @@ void cblas_sdot(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; - auto y_buffer = Buffer(context, y_size); const auto dot_size = 1; + auto x_buffer = Buffer(context, x_size); + auto y_buffer = Buffer(context, y_size); auto dot_buffer = Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -701,10 +701,10 @@ void 
cblas_ddot(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; - auto y_buffer = Buffer(context, y_size); const auto dot_size = 1; + auto x_buffer = Buffer(context, x_size); + auto y_buffer = Buffer(context, y_size); auto dot_buffer = Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -730,10 +730,10 @@ void cblas_cdotu(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; - auto y_buffer = Buffer(context, y_size); const auto dot_size = 1; + auto x_buffer = Buffer(context, x_size); + auto y_buffer = Buffer(context, y_size); auto dot_buffer = Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -757,10 +757,10 @@ void cblas_zdotu(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; - auto y_buffer = Buffer(context, y_size); const auto dot_size = 1; + auto x_buffer = Buffer(context, x_size); + auto y_buffer = Buffer(context, y_size); auto dot_buffer = Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -786,10 +786,10 @@ void cblas_cdotc(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; - auto y_buffer = Buffer(context, y_size); const auto dot_size = 1; + auto x_buffer = Buffer(context, x_size); + auto y_buffer = Buffer(context, y_size); auto dot_buffer = Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, 
y_size, reinterpret_cast(y)); @@ -813,10 +813,10 @@ void cblas_zdotc(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; - auto y_buffer = Buffer(context, y_size); const auto dot_size = 1; + auto x_buffer = Buffer(context, x_size); + auto y_buffer = Buffer(context, y_size); auto dot_buffer = Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -841,8 +841,8 @@ void cblas_snrm2(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto nrm2_size = 1; + auto x_buffer = Buffer(context, x_size); auto nrm2_buffer = Buffer(context, nrm2_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); @@ -863,8 +863,8 @@ void cblas_dnrm2(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto nrm2_size = 1; + auto x_buffer = Buffer(context, x_size); auto nrm2_buffer = Buffer(context, nrm2_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); @@ -885,8 +885,8 @@ void cblas_scnrm2(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto nrm2_size = 1; + auto x_buffer = Buffer(context, x_size); auto nrm2_buffer = Buffer(context, nrm2_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); @@ -907,8 +907,8 @@ void cblas_dznrm2(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto nrm2_size = 1; + 
auto x_buffer = Buffer(context, x_size); auto nrm2_buffer = Buffer(context, nrm2_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); @@ -931,8 +931,8 @@ void cblas_sasum(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto asum_size = 1; + auto x_buffer = Buffer(context, x_size); auto asum_buffer = Buffer(context, asum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); @@ -953,8 +953,8 @@ void cblas_dasum(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto asum_size = 1; + auto x_buffer = Buffer(context, x_size); auto asum_buffer = Buffer(context, asum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); @@ -975,8 +975,8 @@ void cblas_scasum(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto asum_size = 1; + auto x_buffer = Buffer(context, x_size); auto asum_buffer = Buffer(context, asum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); @@ -997,8 +997,8 @@ void cblas_dzasum(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto asum_size = 1; + auto x_buffer = Buffer(context, x_size); auto asum_buffer = Buffer(context, asum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); @@ -1021,8 +1021,8 @@ void cblas_ssum(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = 
n; - auto x_buffer = Buffer(context, x_size); const auto sum_size = 1; + auto x_buffer = Buffer(context, x_size); auto sum_buffer = Buffer(context, sum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); @@ -1043,8 +1043,8 @@ void cblas_dsum(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto sum_size = 1; + auto x_buffer = Buffer(context, x_size); auto sum_buffer = Buffer(context, sum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); @@ -1065,8 +1065,8 @@ void cblas_scsum(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto sum_size = 1; + auto x_buffer = Buffer(context, x_size); auto sum_buffer = Buffer(context, sum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); @@ -1087,8 +1087,8 @@ void cblas_dzsum(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto sum_size = 1; + auto x_buffer = Buffer(context, x_size); auto sum_buffer = Buffer(context, sum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); @@ -1111,8 +1111,8 @@ void cblas_isamax(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto imax_size = 1; + auto x_buffer = Buffer(context, x_size); auto imax_buffer = Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); @@ -1133,8 +1133,8 @@ void cblas_idamax(const int n, auto context = Context(device); 
auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto imax_size = 1; + auto x_buffer = Buffer(context, x_size); auto imax_buffer = Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); @@ -1155,8 +1155,8 @@ void cblas_icamax(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto imax_size = 1; + auto x_buffer = Buffer(context, x_size); auto imax_buffer = Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); @@ -1177,8 +1177,8 @@ void cblas_izamax(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto imax_size = 1; + auto x_buffer = Buffer(context, x_size); auto imax_buffer = Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); @@ -1201,8 +1201,8 @@ void cblas_ismax(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto imax_size = 1; + auto x_buffer = Buffer(context, x_size); auto imax_buffer = Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); @@ -1223,8 +1223,8 @@ void cblas_idmax(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto imax_size = 1; + auto x_buffer = Buffer(context, x_size); auto imax_buffer = Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); 
@@ -1245,8 +1245,8 @@ void cblas_icmax(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto imax_size = 1; + auto x_buffer = Buffer(context, x_size); auto imax_buffer = Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); @@ -1267,8 +1267,8 @@ void cblas_izmax(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto imax_size = 1; + auto x_buffer = Buffer(context, x_size); auto imax_buffer = Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); @@ -1291,8 +1291,8 @@ void cblas_ismin(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto imin_size = 1; + auto x_buffer = Buffer(context, x_size); auto imin_buffer = Buffer(context, imin_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imin_buffer.Write(queue, imin_size, reinterpret_cast(imin)); @@ -1313,8 +1313,8 @@ void cblas_idmin(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto imin_size = 1; + auto x_buffer = Buffer(context, x_size); auto imin_buffer = Buffer(context, imin_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imin_buffer.Write(queue, imin_size, reinterpret_cast(imin)); @@ -1335,8 +1335,8 @@ void cblas_icmin(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto imin_size = 1; + auto x_buffer = Buffer(context, x_size); auto imin_buffer = Buffer(context, imin_size); x_buffer.Write(queue, x_size, 
reinterpret_cast(x)); imin_buffer.Write(queue, imin_size, reinterpret_cast(imin)); @@ -1357,8 +1357,8 @@ void cblas_izmin(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto imin_size = 1; + auto x_buffer = Buffer(context, x_size); auto imin_buffer = Buffer(context, imin_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imin_buffer.Write(queue, imin_size, reinterpret_cast(imin)); @@ -1378,7 +1378,7 @@ void cblas_izmin(const int n, // ================================================================================================= // GEMV -void cblas_sgemv(const Layout layout, const Transpose a_transpose, +void cblas_sgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const float alpha, const float* a, const int a_ld, @@ -1390,11 +1390,11 @@ void cblas_sgemv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; + const auto y_size = (a_transpose != CLBlastTransposeNo) ? n * y_inc : m * y_inc; auto a_buffer = Buffer(context, a_size); - const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = (a_transpose != Transpose::kNo) ? 
n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1414,7 +1414,7 @@ void cblas_sgemv(const Layout layout, const Transpose a_transpose, } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } -void cblas_dgemv(const Layout layout, const Transpose a_transpose, +void cblas_dgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const double alpha, const double* a, const int a_ld, @@ -1426,11 +1426,11 @@ void cblas_dgemv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; + const auto y_size = (a_transpose != CLBlastTransposeNo) ? n * y_inc : m * y_inc; auto a_buffer = Buffer(context, a_size); - const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = (a_transpose != Transpose::kNo) ? 
n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1450,7 +1450,7 @@ void cblas_dgemv(const Layout layout, const Transpose a_transpose, } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } -void cblas_cgemv(const Layout layout, const Transpose a_transpose, +void cblas_cgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const void* alpha, const void* a, const int a_ld, @@ -1462,11 +1462,11 @@ void cblas_cgemv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; + const auto y_size = (a_transpose != CLBlastTransposeNo) ? n * y_inc : m * y_inc; auto a_buffer = Buffer(context, a_size); - const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = (a_transpose != Transpose::kNo) ? 
n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1486,7 +1486,7 @@ void cblas_cgemv(const Layout layout, const Transpose a_transpose, } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } -void cblas_zgemv(const Layout layout, const Transpose a_transpose, +void cblas_zgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const void* alpha, const void* a, const int a_ld, @@ -1498,11 +1498,11 @@ void cblas_zgemv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; + const auto y_size = (a_transpose != CLBlastTransposeNo) ? n * y_inc : m * y_inc; auto a_buffer = Buffer(context, a_size); - const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = (a_transpose != Transpose::kNo) ? 
n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1524,7 +1524,7 @@ void cblas_zgemv(const Layout layout, const Transpose a_transpose, } // GBMV -void cblas_sgbmv(const Layout layout, const Transpose a_transpose, +void cblas_sgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const int kl, const int ku, const float alpha, const float* a, const int a_ld, @@ -1536,11 +1536,11 @@ void cblas_sgbmv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; + const auto y_size = (a_transpose != CLBlastTransposeNo) ? n * y_inc : m * y_inc; auto a_buffer = Buffer(context, a_size); - const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = (a_transpose != Transpose::kNo) ? 
n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1560,7 +1560,7 @@ void cblas_sgbmv(const Layout layout, const Transpose a_transpose, } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } -void cblas_dgbmv(const Layout layout, const Transpose a_transpose, +void cblas_dgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const int kl, const int ku, const double alpha, const double* a, const int a_ld, @@ -1572,11 +1572,11 @@ void cblas_dgbmv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; + const auto y_size = (a_transpose != CLBlastTransposeNo) ? n * y_inc : m * y_inc; auto a_buffer = Buffer(context, a_size); - const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = (a_transpose != Transpose::kNo) ? 
n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1596,7 +1596,7 @@ void cblas_dgbmv(const Layout layout, const Transpose a_transpose, } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } -void cblas_cgbmv(const Layout layout, const Transpose a_transpose, +void cblas_cgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const int kl, const int ku, const void* alpha, const void* a, const int a_ld, @@ -1608,11 +1608,11 @@ void cblas_cgbmv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; + const auto y_size = (a_transpose != CLBlastTransposeNo) ? n * y_inc : m * y_inc; auto a_buffer = Buffer(context, a_size); - const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = (a_transpose != Transpose::kNo) ? 
n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1632,7 +1632,7 @@ void cblas_cgbmv(const Layout layout, const Transpose a_transpose, } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } -void cblas_zgbmv(const Layout layout, const Transpose a_transpose, +void cblas_zgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const int kl, const int ku, const void* alpha, const void* a, const int a_ld, @@ -1644,11 +1644,11 @@ void cblas_zgbmv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; + const auto y_size = (a_transpose != CLBlastTransposeNo) ? n * y_inc : m * y_inc; auto a_buffer = Buffer(context, a_size); - const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = (a_transpose != Transpose::kNo) ? 
n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1670,7 +1670,7 @@ void cblas_zgbmv(const Layout layout, const Transpose a_transpose, } // HEMV -void cblas_chemv(const Layout layout, const Triangle triangle, +void cblas_chemv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* a, const int a_ld, @@ -1683,10 +1683,10 @@ void cblas_chemv(const Layout layout, const Triangle triangle, const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; + auto a_buffer = Buffer(context, a_size); + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1706,7 +1706,7 @@ void cblas_chemv(const Layout layout, const Triangle triangle, } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } -void cblas_zhemv(const Layout layout, const Triangle triangle, +void cblas_zhemv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* a, const int a_ld, @@ -1719,10 +1719,10 @@ void cblas_zhemv(const Layout layout, const Triangle triangle, const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; + auto a_buffer = Buffer(context, a_size); + auto x_buffer = Buffer(context, 
x_size); auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1744,7 +1744,7 @@ void cblas_zhemv(const Layout layout, const Triangle triangle, } // HBMV -void cblas_chbmv(const Layout layout, const Triangle triangle, +void cblas_chbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const int k, const void* alpha, const void* a, const int a_ld, @@ -1757,10 +1757,10 @@ void cblas_chbmv(const Layout layout, const Triangle triangle, const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; + auto a_buffer = Buffer(context, a_size); + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1780,7 +1780,7 @@ void cblas_chbmv(const Layout layout, const Triangle triangle, } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } -void cblas_zhbmv(const Layout layout, const Triangle triangle, +void cblas_zhbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const int k, const void* alpha, const void* a, const int a_ld, @@ -1793,10 +1793,10 @@ void cblas_zhbmv(const Layout layout, const Triangle triangle, const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; + auto a_buffer = Buffer(context, a_size); + auto x_buffer = 
Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1818,7 +1818,7 @@ void cblas_zhbmv(const Layout layout, const Triangle triangle, } // HPMV -void cblas_chpmv(const Layout layout, const Triangle triangle, +void cblas_chpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* ap, @@ -1831,10 +1831,10 @@ void cblas_chpmv(const Layout layout, const Triangle triangle, const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto ap_size = ((n*(n+1)) / 2); - auto ap_buffer = Buffer(context, ap_size); const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; + auto ap_buffer = Buffer(context, ap_size); + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1854,7 +1854,7 @@ void cblas_chpmv(const Layout layout, const Triangle triangle, } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } -void cblas_zhpmv(const Layout layout, const Triangle triangle, +void cblas_zhpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* ap, @@ -1867,10 +1867,10 @@ void cblas_zhpmv(const Layout layout, const Triangle triangle, const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto ap_size = ((n*(n+1)) / 2); - auto ap_buffer = Buffer(context, ap_size); const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; + auto ap_buffer = Buffer(context, ap_size); + auto x_buffer = Buffer(context, 
x_size); auto y_buffer = Buffer(context, y_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1892,7 +1892,7 @@ void cblas_zhpmv(const Layout layout, const Triangle triangle, } // SYMV -void cblas_ssymv(const Layout layout, const Triangle triangle, +void cblas_ssymv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const float alpha, const float* a, const int a_ld, @@ -1905,10 +1905,10 @@ void cblas_ssymv(const Layout layout, const Triangle triangle, const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; + auto a_buffer = Buffer(context, a_size); + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1928,7 +1928,7 @@ void cblas_ssymv(const Layout layout, const Triangle triangle, } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } -void cblas_dsymv(const Layout layout, const Triangle triangle, +void cblas_dsymv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const double alpha, const double* a, const int a_ld, @@ -1941,10 +1941,10 @@ void cblas_dsymv(const Layout layout, const Triangle triangle, const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; + auto a_buffer = Buffer(context, a_size); + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1966,7 +1966,7 @@ void cblas_dsymv(const Layout layout, const Triangle triangle, 
} // SBMV -void cblas_ssbmv(const Layout layout, const Triangle triangle, +void cblas_ssbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const int k, const float alpha, const float* a, const int a_ld, @@ -1979,10 +1979,10 @@ void cblas_ssbmv(const Layout layout, const Triangle triangle, const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; + auto a_buffer = Buffer(context, a_size); + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2002,7 +2002,7 @@ void cblas_ssbmv(const Layout layout, const Triangle triangle, } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } -void cblas_dsbmv(const Layout layout, const Triangle triangle, +void cblas_dsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const int k, const double alpha, const double* a, const int a_ld, @@ -2015,10 +2015,10 @@ void cblas_dsbmv(const Layout layout, const Triangle triangle, const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; + auto a_buffer = Buffer(context, a_size); + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2040,7 +2040,7 @@ void cblas_dsbmv(const Layout layout, const Triangle triangle, } // SPMV -void cblas_sspmv(const Layout layout, const Triangle triangle, +void cblas_sspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const float alpha, const float* ap, @@ -2053,10 
+2053,10 @@ void cblas_sspmv(const Layout layout, const Triangle triangle, const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto ap_size = ((n*(n+1)) / 2); - auto ap_buffer = Buffer(context, ap_size); const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; + auto ap_buffer = Buffer(context, ap_size); + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2076,7 +2076,7 @@ void cblas_sspmv(const Layout layout, const Triangle triangle, } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } -void cblas_dspmv(const Layout layout, const Triangle triangle, +void cblas_dspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const double alpha, const double* ap, @@ -2089,10 +2089,10 @@ void cblas_dspmv(const Layout layout, const Triangle triangle, const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto ap_size = ((n*(n+1)) / 2); - auto ap_buffer = Buffer(context, ap_size); const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; + auto ap_buffer = Buffer(context, ap_size); + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2114,7 +2114,7 @@ void cblas_dspmv(const Layout layout, const Triangle triangle, } // TRMV -void cblas_strmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_strmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const float* a, const int a_ld, float* x, const int x_inc) { @@ -2122,8 +2122,8 @@ void cblas_strmv(const Layout layout, const Triangle triangle, const Transpose a auto 
context = Context(device); auto queue = Queue(context, device); const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; + auto a_buffer = Buffer(context, a_size); auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2141,7 +2141,7 @@ void cblas_strmv(const Layout layout, const Triangle triangle, const Transpose a } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } -void cblas_dtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_dtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const double* a, const int a_ld, double* x, const int x_inc) { @@ -2149,8 +2149,8 @@ void cblas_dtrmv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; + auto a_buffer = Buffer(context, a_size); auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2168,7 +2168,7 @@ void cblas_dtrmv(const Layout layout, const Triangle triangle, const Transpose a } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } -void cblas_ctrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_ctrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const void* a, const int a_ld, void* x, const int x_inc) { @@ -2176,8 +2176,8 @@ void cblas_ctrmv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto 
a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; + auto a_buffer = Buffer(context, a_size); auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2195,7 +2195,7 @@ void cblas_ctrmv(const Layout layout, const Triangle triangle, const Transpose a } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } -void cblas_ztrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_ztrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const void* a, const int a_ld, void* x, const int x_inc) { @@ -2203,8 +2203,8 @@ void cblas_ztrmv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; + auto a_buffer = Buffer(context, a_size); auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2224,7 +2224,7 @@ void cblas_ztrmv(const Layout layout, const Triangle triangle, const Transpose a } // TBMV -void cblas_stbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_stbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const int k, const float* a, const int a_ld, float* x, const int x_inc) { @@ -2232,8 +2232,8 @@ void cblas_stbmv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; + auto a_buffer = 
Buffer(context, a_size); auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2251,7 +2251,7 @@ void cblas_stbmv(const Layout layout, const Triangle triangle, const Transpose a } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } -void cblas_dtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_dtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const int k, const double* a, const int a_ld, double* x, const int x_inc) { @@ -2259,8 +2259,8 @@ void cblas_dtbmv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; + auto a_buffer = Buffer(context, a_size); auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2278,7 +2278,7 @@ void cblas_dtbmv(const Layout layout, const Triangle triangle, const Transpose a } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } -void cblas_ctbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_ctbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const int k, const void* a, const int a_ld, void* x, const int x_inc) { @@ -2286,8 +2286,8 @@ void cblas_ctbmv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; + auto a_buffer = Buffer(context, a_size); auto x_buffer = 
Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2305,7 +2305,7 @@ void cblas_ctbmv(const Layout layout, const Triangle triangle, const Transpose a } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } -void cblas_ztbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_ztbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const int k, const void* a, const int a_ld, void* x, const int x_inc) { @@ -2313,8 +2313,8 @@ void cblas_ztbmv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; + auto a_buffer = Buffer(context, a_size); auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2334,7 +2334,7 @@ void cblas_ztbmv(const Layout layout, const Triangle triangle, const Transpose a } // TPMV -void cblas_stpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_stpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const float* ap, float* x, const int x_inc) { @@ -2342,8 +2342,8 @@ void cblas_stpmv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); - auto ap_buffer = Buffer(context, ap_size); const auto x_size = n * x_inc; + auto ap_buffer = Buffer(context, ap_size); auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, 
reinterpret_cast(x)); @@ -2361,7 +2361,7 @@ void cblas_stpmv(const Layout layout, const Triangle triangle, const Transpose a } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } -void cblas_dtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_dtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const double* ap, double* x, const int x_inc) { @@ -2369,8 +2369,8 @@ void cblas_dtpmv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); - auto ap_buffer = Buffer(context, ap_size); const auto x_size = n * x_inc; + auto ap_buffer = Buffer(context, ap_size); auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2388,7 +2388,7 @@ void cblas_dtpmv(const Layout layout, const Triangle triangle, const Transpose a } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } -void cblas_ctpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_ctpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const void* ap, void* x, const int x_inc) { @@ -2396,8 +2396,8 @@ void cblas_ctpmv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); - auto ap_buffer = Buffer(context, ap_size); const auto x_size = n * x_inc; + auto ap_buffer = Buffer(context, ap_size); auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2415,7 +2415,7 @@ void cblas_ctpmv(const Layout 
layout, const Triangle triangle, const Transpose a } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } -void cblas_ztpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_ztpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const void* ap, void* x, const int x_inc) { @@ -2423,8 +2423,8 @@ void cblas_ztpmv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); - auto ap_buffer = Buffer(context, ap_size); const auto x_size = n * x_inc; + auto ap_buffer = Buffer(context, ap_size); auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2444,7 +2444,7 @@ void cblas_ztpmv(const Layout layout, const Triangle triangle, const Transpose a } // TRSV -void cblas_strsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_strsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const float* a, const int a_ld, float* x, const int x_inc) { @@ -2452,8 +2452,8 @@ void cblas_strsv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; + auto a_buffer = Buffer(context, a_size); auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2471,7 +2471,7 @@ void cblas_strsv(const Layout layout, const Triangle triangle, const Transpose a } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } -void 
cblas_dtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_dtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const double* a, const int a_ld, double* x, const int x_inc) { @@ -2479,8 +2479,8 @@ void cblas_dtrsv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; + auto a_buffer = Buffer(context, a_size); auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2498,7 +2498,7 @@ void cblas_dtrsv(const Layout layout, const Triangle triangle, const Transpose a } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } -void cblas_ctrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_ctrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const void* a, const int a_ld, void* x, const int x_inc) { @@ -2506,8 +2506,8 @@ void cblas_ctrsv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; + auto a_buffer = Buffer(context, a_size); auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2525,7 +2525,7 @@ void cblas_ctrsv(const Layout layout, const Triangle triangle, const Transpose a } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } -void cblas_ztrsv(const Layout layout, const Triangle triangle, const Transpose 
a_transpose, const Diagonal diagonal, +void cblas_ztrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const void* a, const int a_ld, void* x, const int x_inc) { @@ -2533,8 +2533,8 @@ void cblas_ztrsv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; + auto a_buffer = Buffer(context, a_size); auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2554,7 +2554,7 @@ void cblas_ztrsv(const Layout layout, const Triangle triangle, const Transpose a } // TBSV -void cblas_stbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_stbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const int k, const float* a, const int a_ld, float* x, const int x_inc) { @@ -2562,8 +2562,8 @@ void cblas_stbsv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; + auto a_buffer = Buffer(context, a_size); auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2581,7 +2581,7 @@ void cblas_stbsv(const Layout layout, const Triangle triangle, const Transpose a } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } -void cblas_dtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_dtbsv(const CLBlastLayout layout, const CLBlastTriangle 
triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const int k, const double* a, const int a_ld, double* x, const int x_inc) { @@ -2589,8 +2589,8 @@ void cblas_dtbsv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; + auto a_buffer = Buffer(context, a_size); auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2608,7 +2608,7 @@ void cblas_dtbsv(const Layout layout, const Triangle triangle, const Transpose a } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } -void cblas_ctbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_ctbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const int k, const void* a, const int a_ld, void* x, const int x_inc) { @@ -2616,8 +2616,8 @@ void cblas_ctbsv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; + auto a_buffer = Buffer(context, a_size); auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2635,7 +2635,7 @@ void cblas_ctbsv(const Layout layout, const Triangle triangle, const Transpose a } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } -void cblas_ztbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_ztbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const 
CLBlastDiagonal diagonal, const int n, const int k, const void* a, const int a_ld, void* x, const int x_inc) { @@ -2643,8 +2643,8 @@ void cblas_ztbsv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; + auto a_buffer = Buffer(context, a_size); auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2664,7 +2664,7 @@ void cblas_ztbsv(const Layout layout, const Triangle triangle, const Transpose a } // TPSV -void cblas_stpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_stpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const float* ap, float* x, const int x_inc) { @@ -2672,8 +2672,8 @@ void cblas_stpsv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); - auto ap_buffer = Buffer(context, ap_size); const auto x_size = n * x_inc; + auto ap_buffer = Buffer(context, ap_size); auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2691,7 +2691,7 @@ void cblas_stpsv(const Layout layout, const Triangle triangle, const Transpose a } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } -void cblas_dtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_dtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const double* ap, double* x, const int x_inc) { @@ -2699,8 +2699,8 @@ void 
cblas_dtpsv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); - auto ap_buffer = Buffer(context, ap_size); const auto x_size = n * x_inc; + auto ap_buffer = Buffer(context, ap_size); auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2718,7 +2718,7 @@ void cblas_dtpsv(const Layout layout, const Triangle triangle, const Transpose a } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } -void cblas_ctpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_ctpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const void* ap, void* x, const int x_inc) { @@ -2726,8 +2726,8 @@ void cblas_ctpsv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); - auto ap_buffer = Buffer(context, ap_size); const auto x_size = n * x_inc; + auto ap_buffer = Buffer(context, ap_size); auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2745,7 +2745,7 @@ void cblas_ctpsv(const Layout layout, const Triangle triangle, const Transpose a } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } -void cblas_ztpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_ztpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const void* ap, void* x, const int x_inc) { @@ -2753,8 +2753,8 @@ void cblas_ztpsv(const Layout layout, const Triangle triangle, const Transpose a auto 
context = Context(device); auto queue = Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); - auto ap_buffer = Buffer(context, ap_size); const auto x_size = n * x_inc; + auto ap_buffer = Buffer(context, ap_size); auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2774,7 +2774,7 @@ void cblas_ztpsv(const Layout layout, const Triangle triangle, const Transpose a } // GER -void cblas_sger(const Layout layout, +void cblas_sger(const CLBlastLayout layout, const int m, const int n, const float alpha, const float* x, const int x_inc, @@ -2785,10 +2785,10 @@ void cblas_sger(const Layout layout, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = m * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -2806,7 +2806,7 @@ void cblas_sger(const Layout layout, } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } -void cblas_dger(const Layout layout, +void cblas_dger(const CLBlastLayout layout, const int m, const int n, const double alpha, const double* x, const int x_inc, @@ -2817,10 +2817,10 @@ void cblas_dger(const Layout layout, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = m * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); - const auto a_size = (layout == Layout::kRowMajor) ? 
m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -2840,7 +2840,7 @@ void cblas_dger(const Layout layout, } // GERU -void cblas_cgeru(const Layout layout, +void cblas_cgeru(const CLBlastLayout layout, const int m, const int n, const void* alpha, const void* x, const int x_inc, @@ -2851,10 +2851,10 @@ void cblas_cgeru(const Layout layout, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = m * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -2872,7 +2872,7 @@ void cblas_cgeru(const Layout layout, } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } -void cblas_zgeru(const Layout layout, +void cblas_zgeru(const CLBlastLayout layout, const int m, const int n, const void* alpha, const void* x, const int x_inc, @@ -2883,10 +2883,10 @@ void cblas_zgeru(const Layout layout, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = m * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); - const auto a_size = (layout == Layout::kRowMajor) ? 
m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -2906,7 +2906,7 @@ void cblas_zgeru(const Layout layout, } // GERC -void cblas_cgerc(const Layout layout, +void cblas_cgerc(const CLBlastLayout layout, const int m, const int n, const void* alpha, const void* x, const int x_inc, @@ -2917,10 +2917,10 @@ void cblas_cgerc(const Layout layout, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = m * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -2938,7 +2938,7 @@ void cblas_cgerc(const Layout layout, } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } -void cblas_zgerc(const Layout layout, +void cblas_zgerc(const CLBlastLayout layout, const int m, const int n, const void* alpha, const void* x, const int x_inc, @@ -2949,10 +2949,10 @@ void cblas_zgerc(const Layout layout, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = m * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); - const auto a_size = (layout == Layout::kRowMajor) ? 
m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -2972,7 +2972,7 @@ void cblas_zgerc(const Layout layout, } // HER -void cblas_cher(const Layout layout, const Triangle triangle, +void cblas_cher(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, @@ -2982,8 +2982,8 @@ void cblas_cher(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto a_size = n * a_ld; + auto x_buffer = Buffer(context, x_size); auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); @@ -3000,7 +3000,7 @@ void cblas_cher(const Layout layout, const Triangle triangle, } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } -void cblas_zher(const Layout layout, const Triangle triangle, +void cblas_zher(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, @@ -3010,8 +3010,8 @@ void cblas_zher(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto a_size = n * a_ld; + auto x_buffer = Buffer(context, x_size); auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); @@ -3030,7 +3030,7 @@ void cblas_zher(const Layout layout, const Triangle triangle, } // HPR -void cblas_chpr(const Layout layout, const Triangle triangle, +void cblas_chpr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, @@ -3040,8 +3040,8 @@ void 
cblas_chpr(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto ap_size = ((n*(n+1)) / 2); + auto x_buffer = Buffer(context, x_size); auto ap_buffer = Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); @@ -3058,7 +3058,7 @@ void cblas_chpr(const Layout layout, const Triangle triangle, } ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); } -void cblas_zhpr(const Layout layout, const Triangle triangle, +void cblas_zhpr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, @@ -3068,8 +3068,8 @@ void cblas_zhpr(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto ap_size = ((n*(n+1)) / 2); + auto x_buffer = Buffer(context, x_size); auto ap_buffer = Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); @@ -3088,7 +3088,7 @@ void cblas_zhpr(const Layout layout, const Triangle triangle, } // HER2 -void cblas_cher2(const Layout layout, const Triangle triangle, +void cblas_cher2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, @@ -3099,10 +3099,10 @@ void cblas_cher2(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; - auto y_buffer = Buffer(context, y_size); const auto a_size = n * a_ld; + auto x_buffer = Buffer(context, x_size); + auto y_buffer = Buffer(context, 
y_size); auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3121,7 +3121,7 @@ void cblas_cher2(const Layout layout, const Triangle triangle, } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } -void cblas_zher2(const Layout layout, const Triangle triangle, +void cblas_zher2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, @@ -3132,10 +3132,10 @@ void cblas_zher2(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; - auto y_buffer = Buffer(context, y_size); const auto a_size = n * a_ld; + auto x_buffer = Buffer(context, x_size); + auto y_buffer = Buffer(context, y_size); auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3156,7 +3156,7 @@ void cblas_zher2(const Layout layout, const Triangle triangle, } // HPR2 -void cblas_chpr2(const Layout layout, const Triangle triangle, +void cblas_chpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, @@ -3167,10 +3167,10 @@ void cblas_chpr2(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; - auto y_buffer = Buffer(context, y_size); const auto ap_size = ((n*(n+1)) / 2); + auto x_buffer = Buffer(context, x_size); + auto y_buffer = Buffer(context, y_size); auto ap_buffer = Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); 
y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3189,7 +3189,7 @@ void cblas_chpr2(const Layout layout, const Triangle triangle, } ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); } -void cblas_zhpr2(const Layout layout, const Triangle triangle, +void cblas_zhpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, @@ -3200,10 +3200,10 @@ void cblas_zhpr2(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; - auto y_buffer = Buffer(context, y_size); const auto ap_size = ((n*(n+1)) / 2); + auto x_buffer = Buffer(context, x_size); + auto y_buffer = Buffer(context, y_size); auto ap_buffer = Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3224,7 +3224,7 @@ void cblas_zhpr2(const Layout layout, const Triangle triangle, } // SYR -void cblas_ssyr(const Layout layout, const Triangle triangle, +void cblas_ssyr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const float alpha, const float* x, const int x_inc, @@ -3234,8 +3234,8 @@ void cblas_ssyr(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto a_size = n * a_ld; + auto x_buffer = Buffer(context, x_size); auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); @@ -3252,7 +3252,7 @@ void cblas_ssyr(const Layout layout, const Triangle triangle, } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } -void cblas_dsyr(const Layout layout, const Triangle triangle, +void cblas_dsyr(const 
CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const double alpha, const double* x, const int x_inc, @@ -3262,8 +3262,8 @@ void cblas_dsyr(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto a_size = n * a_ld; + auto x_buffer = Buffer(context, x_size); auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); @@ -3282,7 +3282,7 @@ void cblas_dsyr(const Layout layout, const Triangle triangle, } // SPR -void cblas_sspr(const Layout layout, const Triangle triangle, +void cblas_sspr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const float alpha, const float* x, const int x_inc, @@ -3292,8 +3292,8 @@ void cblas_sspr(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto ap_size = ((n*(n+1)) / 2); + auto x_buffer = Buffer(context, x_size); auto ap_buffer = Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); @@ -3310,7 +3310,7 @@ void cblas_sspr(const Layout layout, const Triangle triangle, } ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); } -void cblas_dspr(const Layout layout, const Triangle triangle, +void cblas_dspr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const double alpha, const double* x, const int x_inc, @@ -3320,8 +3320,8 @@ void cblas_dspr(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto ap_size = ((n*(n+1)) / 2); + auto x_buffer = Buffer(context, x_size); auto ap_buffer = 
Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); @@ -3340,7 +3340,7 @@ void cblas_dspr(const Layout layout, const Triangle triangle, } // SYR2 -void cblas_ssyr2(const Layout layout, const Triangle triangle, +void cblas_ssyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const float alpha, const float* x, const int x_inc, @@ -3351,10 +3351,10 @@ void cblas_ssyr2(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; - auto y_buffer = Buffer(context, y_size); const auto a_size = n * a_ld; + auto x_buffer = Buffer(context, x_size); + auto y_buffer = Buffer(context, y_size); auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3373,7 +3373,7 @@ void cblas_ssyr2(const Layout layout, const Triangle triangle, } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } -void cblas_dsyr2(const Layout layout, const Triangle triangle, +void cblas_dsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const double alpha, const double* x, const int x_inc, @@ -3384,10 +3384,10 @@ void cblas_dsyr2(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; - auto y_buffer = Buffer(context, y_size); const auto a_size = n * a_ld; + auto x_buffer = Buffer(context, x_size); + auto y_buffer = Buffer(context, y_size); auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3408,7 +3408,7 @@ void cblas_dsyr2(const Layout layout, const Triangle triangle, } // 
SPR2 -void cblas_sspr2(const Layout layout, const Triangle triangle, +void cblas_sspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const float alpha, const float* x, const int x_inc, @@ -3419,10 +3419,10 @@ void cblas_sspr2(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; - auto y_buffer = Buffer(context, y_size); const auto ap_size = ((n*(n+1)) / 2); + auto x_buffer = Buffer(context, x_size); + auto y_buffer = Buffer(context, y_size); auto ap_buffer = Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3441,7 +3441,7 @@ void cblas_sspr2(const Layout layout, const Triangle triangle, } ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); } -void cblas_dspr2(const Layout layout, const Triangle triangle, +void cblas_dspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const double alpha, const double* x, const int x_inc, @@ -3452,10 +3452,10 @@ void cblas_dspr2(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; - auto y_buffer = Buffer(context, y_size); const auto ap_size = ((n*(n+1)) / 2); + auto x_buffer = Buffer(context, x_size); + auto y_buffer = Buffer(context, y_size); auto ap_buffer = Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3480,7 +3480,7 @@ void cblas_dspr2(const Layout layout, const Triangle triangle, // ================================================================================================= // GEMM -void cblas_sgemm(const Layout layout, const Transpose a_transpose, const Transpose 
b_transpose, +void cblas_sgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const int m, const int n, const int k, const float alpha, const float* a, const int a_ld, @@ -3492,11 +3492,11 @@ void cblas_sgemm(const Layout layout, const Transpose a_transpose, const Transpo auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? m * a_ld : k * a_ld; + const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? m * a_ld : k * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && b_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && b_transpose == CLBlastTransposeNo)) ? k * b_ld : n * b_ld; + const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = ((layout == Layout::kColMajor && b_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && b_transpose == Transpose::kNo)) ? k * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = (layout == Layout::kRowMajor) ? 
m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3517,7 +3517,7 @@ void cblas_sgemm(const Layout layout, const Transpose a_transpose, const Transpo } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } -void cblas_dgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, +void cblas_dgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const int m, const int n, const int k, const double alpha, const double* a, const int a_ld, @@ -3529,11 +3529,11 @@ void cblas_dgemm(const Layout layout, const Transpose a_transpose, const Transpo auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? m * a_ld : k * a_ld; + const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? m * a_ld : k * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && b_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && b_transpose == CLBlastTransposeNo)) ? k * b_ld : n * b_ld; + const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = ((layout == Layout::kColMajor && b_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && b_transpose == Transpose::kNo)) ? k * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = (layout == Layout::kRowMajor) ? 
m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3554,7 +3554,7 @@ void cblas_dgemm(const Layout layout, const Transpose a_transpose, const Transpo } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } -void cblas_cgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, +void cblas_cgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const int m, const int n, const int k, const void* alpha, const void* a, const int a_ld, @@ -3566,11 +3566,11 @@ void cblas_cgemm(const Layout layout, const Transpose a_transpose, const Transpo auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? m * a_ld : k * a_ld; + const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? m * a_ld : k * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && b_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && b_transpose == CLBlastTransposeNo)) ? k * b_ld : n * b_ld; + const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = ((layout == Layout::kColMajor && b_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && b_transpose == Transpose::kNo)) ? k * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = (layout == Layout::kRowMajor) ? 
m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3591,7 +3591,7 @@ void cblas_cgemm(const Layout layout, const Transpose a_transpose, const Transpo } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } -void cblas_zgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, +void cblas_zgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const int m, const int n, const int k, const void* alpha, const void* a, const int a_ld, @@ -3603,11 +3603,11 @@ void cblas_zgemm(const Layout layout, const Transpose a_transpose, const Transpo auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? m * a_ld : k * a_ld; + const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? m * a_ld : k * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && b_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && b_transpose == CLBlastTransposeNo)) ? k * b_ld : n * b_ld; + const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = ((layout == Layout::kColMajor && b_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && b_transpose == Transpose::kNo)) ? k * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = (layout == Layout::kRowMajor) ? 
m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3630,7 +3630,7 @@ void cblas_zgemm(const Layout layout, const Transpose a_transpose, const Transpo } // SYMM -void cblas_ssymm(const Layout layout, const Side side, const Triangle triangle, +void cblas_ssymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const int m, const int n, const float alpha, const float* a, const int a_ld, @@ -3642,11 +3642,11 @@ void cblas_ssymm(const Layout layout, const Side side, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld; + const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = (layout == Layout::kRowMajor) ? 
m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3667,7 +3667,7 @@ void cblas_ssymm(const Layout layout, const Side side, const Triangle triangle, } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } -void cblas_dsymm(const Layout layout, const Side side, const Triangle triangle, +void cblas_dsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const int m, const int n, const double alpha, const double* a, const int a_ld, @@ -3679,11 +3679,11 @@ void cblas_dsymm(const Layout layout, const Side side, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld; + const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = (layout == Layout::kRowMajor) ? 
m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3704,7 +3704,7 @@ void cblas_dsymm(const Layout layout, const Side side, const Triangle triangle, } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } -void cblas_csymm(const Layout layout, const Side side, const Triangle triangle, +void cblas_csymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const int m, const int n, const void* alpha, const void* a, const int a_ld, @@ -3716,11 +3716,11 @@ void cblas_csymm(const Layout layout, const Side side, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld; + const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = (layout == Layout::kRowMajor) ? 
m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3741,7 +3741,7 @@ void cblas_csymm(const Layout layout, const Side side, const Triangle triangle, } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } -void cblas_zsymm(const Layout layout, const Side side, const Triangle triangle, +void cblas_zsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const int m, const int n, const void* alpha, const void* a, const int a_ld, @@ -3753,11 +3753,11 @@ void cblas_zsymm(const Layout layout, const Side side, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld; + const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = (layout == Layout::kRowMajor) ? 
m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3780,7 +3780,7 @@ void cblas_zsymm(const Layout layout, const Side side, const Triangle triangle, } // HEMM -void cblas_chemm(const Layout layout, const Side side, const Triangle triangle, +void cblas_chemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const int m, const int n, const void* alpha, const void* a, const int a_ld, @@ -3792,11 +3792,11 @@ void cblas_chemm(const Layout layout, const Side side, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld; + const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = (layout == Layout::kRowMajor) ? 
m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3817,7 +3817,7 @@ void cblas_chemm(const Layout layout, const Side side, const Triangle triangle, } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } -void cblas_zhemm(const Layout layout, const Side side, const Triangle triangle, +void cblas_zhemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const int m, const int n, const void* alpha, const void* a, const int a_ld, @@ -3829,11 +3829,11 @@ void cblas_zhemm(const Layout layout, const Side side, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld; + const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = (layout == Layout::kRowMajor) ? 
m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3856,7 +3856,7 @@ void cblas_zhemm(const Layout layout, const Side side, const Triangle triangle, } // SYRK -void cblas_ssyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, +void cblas_ssyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, const float alpha, const float* a, const int a_ld, @@ -3867,9 +3867,9 @@ void cblas_ssyrk(const Layout layout, const Triangle triangle, const Transpose a auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; - auto a_buffer = Buffer(context, a_size); + const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * a_ld : k * a_ld; const auto c_size = n * c_ld; + auto a_buffer = Buffer(context, a_size); auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); @@ -3888,7 +3888,7 @@ void cblas_ssyrk(const Layout layout, const Triangle triangle, const Transpose a } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } -void cblas_dsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, +void cblas_dsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, const double alpha, const double* a, const int a_ld, @@ -3899,9 +3899,9 @@ void cblas_dsyrk(const Layout layout, const Triangle triangle, const Transpose a auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; - auto a_buffer = Buffer(context, a_size); + const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * a_ld : k * a_ld; const auto c_size = n * c_ld; + auto a_buffer = Buffer(context, a_size); auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); @@ -3920,7 +3920,7 @@ void cblas_dsyrk(const Layout layout, const Triangle triangle, const Transpose a } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } -void cblas_csyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, +void cblas_csyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, @@ -3931,9 +3931,9 @@ void cblas_csyrk(const Layout layout, const Triangle triangle, const Transpose a auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; - auto a_buffer = Buffer(context, a_size); + const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * a_ld : k * a_ld; const auto c_size = n * c_ld; + auto a_buffer = Buffer(context, a_size); auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); @@ -3952,7 +3952,7 @@ void cblas_csyrk(const Layout layout, const Triangle triangle, const Transpose a } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } -void cblas_zsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, +void cblas_zsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, @@ -3963,9 +3963,9 @@ void cblas_zsyrk(const Layout layout, const Triangle triangle, const Transpose a auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; - auto a_buffer = Buffer(context, a_size); + const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * a_ld : k * a_ld; const auto c_size = n * c_ld; + auto a_buffer = Buffer(context, a_size); auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); @@ -3986,7 +3986,7 @@ void cblas_zsyrk(const Layout layout, const Triangle triangle, const Transpose a } // HERK -void cblas_cherk(const Layout layout, const Triangle triangle, const Transpose a_transpose, +void cblas_cherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, @@ -3997,9 +3997,9 @@ void cblas_cherk(const Layout layout, const Triangle triangle, const Transpose a auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; - auto a_buffer = Buffer(context, a_size); + const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * a_ld : k * a_ld; const auto c_size = n * c_ld; + auto a_buffer = Buffer(context, a_size); auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); @@ -4018,7 +4018,7 @@ void cblas_cherk(const Layout layout, const Triangle triangle, const Transpose a } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } -void cblas_zherk(const Layout layout, const Triangle triangle, const Transpose a_transpose, +void cblas_zherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, @@ -4029,9 +4029,9 @@ void cblas_zherk(const Layout layout, const Triangle triangle, const Transpose a auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; - auto a_buffer = Buffer(context, a_size); + const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * a_ld : k * a_ld; const auto c_size = n * c_ld; + auto a_buffer = Buffer(context, a_size); auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); @@ -4052,7 +4052,7 @@ void cblas_zherk(const Layout layout, const Triangle triangle, const Transpose a } // SYR2K -void cblas_ssyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, +void cblas_ssyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const int n, const int k, const float alpha, const float* a, const int a_ld, @@ -4064,11 +4064,11 @@ void cblas_ssyr2k(const Layout layout, const Triangle triangle, const Transpose auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; - auto a_buffer = Buffer(context, a_size); - const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * b_ld : k * b_ld; - auto b_buffer = Buffer(context, b_size); + const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? 
n * b_ld : k * b_ld; const auto c_size = n * c_ld; + auto a_buffer = Buffer(context, a_size); + auto b_buffer = Buffer(context, b_size); auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4089,7 +4089,7 @@ void cblas_ssyr2k(const Layout layout, const Triangle triangle, const Transpose } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } -void cblas_dsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, +void cblas_dsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const int n, const int k, const double alpha, const double* a, const int a_ld, @@ -4101,11 +4101,11 @@ void cblas_dsyr2k(const Layout layout, const Triangle triangle, const Transpose auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; - auto a_buffer = Buffer(context, a_size); - const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * b_ld : k * b_ld; - auto b_buffer = Buffer(context, b_size); + const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? 
n * b_ld : k * b_ld; const auto c_size = n * c_ld; + auto a_buffer = Buffer(context, a_size); + auto b_buffer = Buffer(context, b_size); auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4126,7 +4126,7 @@ void cblas_dsyr2k(const Layout layout, const Triangle triangle, const Transpose } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } -void cblas_csyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, +void cblas_csyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, @@ -4138,11 +4138,11 @@ void cblas_csyr2k(const Layout layout, const Triangle triangle, const Transpose auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; - auto a_buffer = Buffer(context, a_size); - const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * b_ld : k * b_ld; - auto b_buffer = Buffer(context, b_size); + const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? 
n * b_ld : k * b_ld; const auto c_size = n * c_ld; + auto a_buffer = Buffer(context, a_size); + auto b_buffer = Buffer(context, b_size); auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4163,7 +4163,7 @@ void cblas_csyr2k(const Layout layout, const Triangle triangle, const Transpose } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } -void cblas_zsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, +void cblas_zsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, @@ -4175,11 +4175,11 @@ void cblas_zsyr2k(const Layout layout, const Triangle triangle, const Transpose auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; - auto a_buffer = Buffer(context, a_size); - const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * b_ld : k * b_ld; - auto b_buffer = Buffer(context, b_size); + const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? 
n * b_ld : k * b_ld; const auto c_size = n * c_ld; + auto a_buffer = Buffer(context, a_size); + auto b_buffer = Buffer(context, b_size); auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4202,7 +4202,7 @@ void cblas_zsyr2k(const Layout layout, const Triangle triangle, const Transpose } // HER2K -void cblas_cher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, +void cblas_cher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, @@ -4214,11 +4214,11 @@ void cblas_cher2k(const Layout layout, const Triangle triangle, const Transpose auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = beta; - const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; - auto a_buffer = Buffer(context, a_size); - const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * b_ld : k * b_ld; - auto b_buffer = Buffer(context, b_size); + const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? 
n * b_ld : k * b_ld; const auto c_size = n * c_ld; + auto a_buffer = Buffer(context, a_size); + auto b_buffer = Buffer(context, b_size); auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4239,7 +4239,7 @@ void cblas_cher2k(const Layout layout, const Triangle triangle, const Transpose } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } -void cblas_zher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, +void cblas_zher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, @@ -4251,11 +4251,11 @@ void cblas_zher2k(const Layout layout, const Triangle triangle, const Transpose auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = beta; - const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; - auto a_buffer = Buffer(context, a_size); - const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * b_ld : k * b_ld; - auto b_buffer = Buffer(context, b_size); + const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? 
n * b_ld : k * b_ld; const auto c_size = n * c_ld; + auto a_buffer = Buffer(context, a_size); + auto b_buffer = Buffer(context, b_size); auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4278,7 +4278,7 @@ void cblas_zher2k(const Layout layout, const Triangle triangle, const Transpose } // TRMM -void cblas_strmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_strmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const float alpha, const float* a, const int a_ld, @@ -4287,9 +4287,9 @@ void cblas_strmm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; + const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? m * b_ld : n * b_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = (layout == Layout::kRowMajor) ? 
m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4309,7 +4309,7 @@ void cblas_strmm(const Layout layout, const Side side, const Triangle triangle, } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } -void cblas_dtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_dtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const double alpha, const double* a, const int a_ld, @@ -4318,9 +4318,9 @@ void cblas_dtrmm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; + const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? m * b_ld : n * b_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = (layout == Layout::kRowMajor) ? 
m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4340,7 +4340,7 @@ void cblas_dtrmm(const Layout layout, const Side side, const Triangle triangle, } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } -void cblas_ctrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_ctrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const void* alpha, const void* a, const int a_ld, @@ -4349,9 +4349,9 @@ void cblas_ctrmm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; + const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? m * b_ld : n * b_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = (layout == Layout::kRowMajor) ? 
m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4371,7 +4371,7 @@ void cblas_ctrmm(const Layout layout, const Side side, const Triangle triangle, } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } -void cblas_ztrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_ztrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const void* alpha, const void* a, const int a_ld, @@ -4380,9 +4380,9 @@ void cblas_ztrmm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; + const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? m * b_ld : n * b_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = (layout == Layout::kRowMajor) ? 
m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4404,7 +4404,7 @@ void cblas_ztrmm(const Layout layout, const Side side, const Triangle triangle, } // TRSM -void cblas_strsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_strsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const float alpha, const float* a, const int a_ld, @@ -4413,9 +4413,9 @@ void cblas_strsm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; + const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? m * b_ld : n * b_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = (layout == Layout::kRowMajor) ? 
m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4435,7 +4435,7 @@ void cblas_strsm(const Layout layout, const Side side, const Triangle triangle, } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } -void cblas_dtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_dtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const double alpha, const double* a, const int a_ld, @@ -4444,9 +4444,9 @@ void cblas_dtrsm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; + const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? m * b_ld : n * b_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = (layout == Layout::kRowMajor) ? 
m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4466,7 +4466,7 @@ void cblas_dtrsm(const Layout layout, const Side side, const Triangle triangle, } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } -void cblas_ctrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_ctrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const void* alpha, const void* a, const int a_ld, @@ -4475,9 +4475,9 @@ void cblas_ctrsm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; + const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? m * b_ld : n * b_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = (layout == Layout::kRowMajor) ? 
m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4497,7 +4497,7 @@ void cblas_ctrsm(const Layout layout, const Side side, const Triangle triangle, } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } -void cblas_ztrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_ztrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const void* alpha, const void* a, const int a_ld, @@ -4506,9 +4506,9 @@ void cblas_ztrsm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; + const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? m * b_ld : n * b_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = (layout == Layout::kRowMajor) ? 
m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4534,7 +4534,7 @@ void cblas_ztrsm(const Layout layout, const Side side, const Triangle triangle, // ================================================================================================= // OMATCOPY -void cblas_somatcopy(const Layout layout, const Transpose a_transpose, +void cblas_somatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const float alpha, const float* a, const int a_ld, @@ -4543,9 +4543,9 @@ void cblas_somatcopy(const Layout layout, const Transpose a_transpose, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? n * b_ld : m * b_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? 
n * b_ld : m * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4562,7 +4562,7 @@ void cblas_somatcopy(const Layout layout, const Transpose a_transpose, } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } -void cblas_domatcopy(const Layout layout, const Transpose a_transpose, +void cblas_domatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const double alpha, const double* a, const int a_ld, @@ -4571,9 +4571,9 @@ void cblas_domatcopy(const Layout layout, const Transpose a_transpose, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? n * b_ld : m * b_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? 
n * b_ld : m * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4590,7 +4590,7 @@ void cblas_domatcopy(const Layout layout, const Transpose a_transpose, } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } -void cblas_comatcopy(const Layout layout, const Transpose a_transpose, +void cblas_comatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const void* alpha, const void* a, const int a_ld, @@ -4599,9 +4599,9 @@ void cblas_comatcopy(const Layout layout, const Transpose a_transpose, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? n * b_ld : m * b_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? 
n * b_ld : m * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4618,7 +4618,7 @@ void cblas_comatcopy(const Layout layout, const Transpose a_transpose, } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } -void cblas_zomatcopy(const Layout layout, const Transpose a_transpose, +void cblas_zomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const void* alpha, const void* a, const int a_ld, @@ -4627,9 +4627,9 @@ void cblas_zomatcopy(const Layout layout, const Transpose a_transpose, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? n * b_ld : m * b_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? 
n * b_ld : m * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); From 729862e87338dbd275f90d61d52803892fe3648e Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Tue, 25 Oct 2016 19:56:42 +0200 Subject: [PATCH 06/15] Fixed some issues preventing the Netlib CBLAS API from linking correctly --- include/clblast_blas.h | 20 ++++++++++---------- scripts/generator/generator/routine.py | 4 ++-- src/clblast_blas.cpp | 20 ++++++++++---------- src/utilities/utilities.cpp | 4 ++++ 4 files changed, 26 insertions(+), 22 deletions(-) diff --git a/include/clblast_blas.h b/include/clblast_blas.h index 927f84cd..ff560712 100644 --- a/include/clblast_blas.h +++ b/include/clblast_blas.h @@ -581,24 +581,24 @@ void PUBLIC_API cblas_zgerc(const CLBlastLayout layout, // Hermitian rank-1 matrix update: CHER/ZHER void PUBLIC_API cblas_cher(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, - const void* alpha, + const float alpha, const void* x, const int x_inc, void* a, const int a_ld); void PUBLIC_API cblas_zher(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, - const void* alpha, + const double alpha, const void* x, const int x_inc, void* a, const int a_ld); // Hermitian packed rank-1 matrix update: CHPR/ZHPR void PUBLIC_API cblas_chpr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, - const void* alpha, + const float alpha, const void* x, const int x_inc, void* ap); void PUBLIC_API cblas_zhpr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, - const void* alpha, + const double alpha, const void* x, const int x_inc, void* ap); @@ -791,15 +791,15 @@ void PUBLIC_API cblas_zsyrk(const CLBlastLayout layout, const CLBlastTriangle tr // Rank-K update of a hermitian matrix: CHERK/ZHERK void PUBLIC_API cblas_cherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose 
a_transpose, const int n, const int k, - const void* alpha, + const float alpha, const void* a, const int a_ld, - const void* beta, + const float beta, void* c, const int c_ld); void PUBLIC_API cblas_zherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, - const void* alpha, + const double alpha, const void* a, const int a_ld, - const void* beta, + const double beta, void* c, const int c_ld); // Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K @@ -838,14 +838,14 @@ void PUBLIC_API cblas_cher2k(const CLBlastLayout layout, const CLBlastTriangle t const void* alpha, const void* a, const int a_ld, const void* b, const int b_ld, - const void* beta, + const float beta, void* c, const int c_ld); void PUBLIC_API cblas_zher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, const void* b, const int b_ld, - const void* beta, + const double beta, void* c, const int c_ld); // Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py index c35f5b4c..085845a8 100644 --- a/scripts/generator/generator/routine.py +++ b/scripts/generator/generator/routine.py @@ -341,9 +341,9 @@ class Routine: """Retrieves the definition of a scalar (alpha/beta) but make it a void pointer in case of non-standard types""" if name in self.scalars: if name == "alpha": - data_type = "void*" if flavour.is_non_standard() else flavour.alpha_cpp + data_type = "void*" if flavour.is_complex("alpha") else flavour.alpha_cpp return ["const " + data_type + " " + name] - data_type = "void*" if flavour.is_non_standard() else flavour.beta_cpp + data_type = "void*" if flavour.is_complex("beta") else flavour.beta_cpp return ["const " + data_type + " " + name] return [] diff --git 
a/src/clblast_blas.cpp b/src/clblast_blas.cpp index 974dc21a..9b59a20d 100644 --- a/src/clblast_blas.cpp +++ b/src/clblast_blas.cpp @@ -2974,7 +2974,7 @@ void cblas_zgerc(const CLBlastLayout layout, // HER void cblas_cher(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, - const void* alpha, + const float alpha, const void* x, const int x_inc, void* a, const int a_ld) { auto device = get_device(); @@ -3002,7 +3002,7 @@ void cblas_cher(const CLBlastLayout layout, const CLBlastTriangle triangle, } void cblas_zher(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, - const void* alpha, + const double alpha, const void* x, const int x_inc, void* a, const int a_ld) { auto device = get_device(); @@ -3032,7 +3032,7 @@ void cblas_zher(const CLBlastLayout layout, const CLBlastTriangle triangle, // HPR void cblas_chpr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, - const void* alpha, + const float alpha, const void* x, const int x_inc, void* ap) { auto device = get_device(); @@ -3060,7 +3060,7 @@ void cblas_chpr(const CLBlastLayout layout, const CLBlastTriangle triangle, } void cblas_zhpr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, - const void* alpha, + const double alpha, const void* x, const int x_inc, void* ap) { auto device = get_device(); @@ -3988,9 +3988,9 @@ void cblas_zsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, con // HERK void cblas_cherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, - const void* alpha, + const float alpha, const void* a, const int a_ld, - const void* beta, + const float beta, void* c, const int c_ld) { auto device = get_device(); auto context = Context(device); @@ -4020,9 +4020,9 @@ void cblas_cherk(const CLBlastLayout layout, const CLBlastTriangle triangle, con } void cblas_zherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const 
CLBlastTranspose a_transpose, const int n, const int k, - const void* alpha, + const double alpha, const void* a, const int a_ld, - const void* beta, + const double beta, void* c, const int c_ld) { auto device = get_device(); auto context = Context(device); @@ -4207,7 +4207,7 @@ void cblas_cher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co const void* alpha, const void* a, const int a_ld, const void* b, const int b_ld, - const void* beta, + const float beta, void* c, const int c_ld) { auto device = get_device(); auto context = Context(device); @@ -4244,7 +4244,7 @@ void cblas_zher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co const void* alpha, const void* a, const int a_ld, const void* b, const int b_ld, - const void* beta, + const double beta, void* c, const int c_ld) { auto device = get_device(); auto context = Context(device); diff --git a/src/utilities/utilities.cpp b/src/utilities/utilities.cpp index b4a18311..24456252 100644 --- a/src/utilities/utilities.cpp +++ b/src/utilities/utilities.cpp @@ -151,6 +151,10 @@ std::string ToString(Precision value) { case Precision::kComplexDouble: return ToString(static_cast(value))+" (complex-double)"; } } +template <> +std::string ToString(StatusCode value) { + return std::to_string(static_cast(value)); +} // ================================================================================================= From 140121ef91cc13892711f57da0d046f88cf55301 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Tue, 25 Oct 2016 20:21:50 +0200 Subject: [PATCH 07/15] Removed the clblast namespace from the Netlib C API source file to ensure proper linking --- CMakeLists.txt | 2 +- scripts/generator/generator.py | 2 +- scripts/generator/generator/cpp.py | 12 +- scripts/generator/generator/routine.py | 2 +- src/clblast_blas.cpp | 4181 ++++++++++++------------ 5 files changed, 2099 insertions(+), 2100 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d2034617..1fff1a3a 100644 --- 
a/CMakeLists.txt +++ b/CMakeLists.txt @@ -163,7 +163,6 @@ set(PRECISIONS 32 64 3232 6464 16) # Gathers all source-files set(SOURCES - src/clblast_blas.cpp src/database/database.cpp src/routines/common.cpp src/utilities/clblast_exceptions.cpp @@ -171,6 +170,7 @@ set(SOURCES src/cache.cpp src/clblast.cpp src/clblast_c.cpp + src/clblast_blas.cpp src/routine.cpp ) foreach(ROUTINE ${LEVEL1_ROUTINES}) diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index a9169872..65d40877 100755 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -42,7 +42,7 @@ FILES = [ "/src/clblast_blas.cpp", ] HEADER_LINES = [117, 73, 118, 22, 29, 41, 47, 32] -FOOTER_LINES = [17, 80, 19, 18, 6, 6, 9, 3] +FOOTER_LINES = [17, 80, 19, 18, 6, 6, 9, 2] # Different possibilities for requirements ald_m = "The value of `a_ld` must be at least `m`." diff --git a/scripts/generator/generator/cpp.py b/scripts/generator/generator/cpp.py index eafbea30..60e29a07 100644 --- a/scripts/generator/generator/cpp.py +++ b/scripts/generator/generator/cpp.py @@ -112,13 +112,13 @@ def clblast_blas_cc(routine): # There is a version available in CBLAS if flavour.precision_name in ["S", "D", "C", "Z"]: template = "<" + flavour.template + ">" if routine.no_scalars() else "" - indent = " " * (12 + routine.length() + len(template)) + indent = " " * (21 + routine.length() + len(template)) result += routine.routine_header_netlib(flavour, 13, "") + " {" + NL # Initialize OpenCL result += " auto device = get_device();" + NL - result += " auto context = Context(device);" + NL - result += " auto queue = Queue(context, device);" + NL + result += " auto context = clblast::Context(device);" + NL + result += " auto queue = clblast::Queue(context, device);" + NL # Set alpha and beta result += "".join(" " + s + NL for s in routine.scalar_create_cpp(flavour)) @@ -134,13 +134,13 @@ def clblast_blas_cc(routine): # The function call result += " auto queue_cl = queue();" + NL - result += " 
auto s = " + routine.name.capitalize() + template + "(" + result += " auto s = clblast::" + routine.name.capitalize() + template + "(" result += ("," + NL + indent).join([a for a in routine.arguments_netlib(flavour, indent)]) result += "," + NL + indent + "&queue_cl);" + NL # Error handling - result += " if (s != StatusCode::kSuccess) {" + NL - result += " throw std::runtime_error(\"CLBlast returned with error code \" + ToString(s));" + NL + result += " if (s != clblast::StatusCode::kSuccess) {" + NL + result += " throw std::runtime_error(\"CLBlast returned with error code \" + clblast::ToString(s));" + NL result += " }" + NL # Copy back and clean-up diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py index 085845a8..097376ad 100644 --- a/scripts/generator/generator/routine.py +++ b/scripts/generator/generator/routine.py @@ -75,7 +75,7 @@ class Routine: @staticmethod def create_buffer(name, template): """Creates a new CLCudaAPI buffer""" - return "auto " + name + "_buffer = Buffer<" + template + ">(context, " + name + "_size);" + return "auto " + name + "_buffer = clblast::Buffer<" + template + ">(context, " + name + "_size);" @staticmethod def write_buffer(name, template): diff --git a/src/clblast_blas.cpp b/src/clblast_blas.cpp index 9b59a20d..6cc14583 100644 --- a/src/clblast_blas.cpp +++ b/src/clblast_blas.cpp @@ -19,16 +19,16 @@ #include "clblast.h" #include "utilities/utilities.hpp" -namespace clblast { - -// ================================================================================================= +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; // Helper function to get a default OpenCL platform and device -Device get_device() { - auto platform_id = ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0}); - auto device_id = ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0}); - auto platform = Platform(platform_id); - return Device(platform, 
device_id); +clblast::Device get_device() { + auto platform_id = clblast::ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0}); + auto device_id = clblast::ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0}); + auto platform = clblast::Platform(platform_id); + return clblast::Device(platform, device_id); } // ================================================================================================= @@ -41,28 +41,28 @@ void cblas_srotg(float* sa, float* sc, float* ss) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto sa_size = 1; const auto sb_size = 1; const auto sc_size = 1; const auto ss_size = 1; - auto sa_buffer = Buffer(context, sa_size); - auto sb_buffer = Buffer(context, sb_size); - auto sc_buffer = Buffer(context, sc_size); - auto ss_buffer = Buffer(context, ss_size); + auto sa_buffer = clblast::Buffer(context, sa_size); + auto sb_buffer = clblast::Buffer(context, sb_size); + auto sc_buffer = clblast::Buffer(context, sc_size); + auto ss_buffer = clblast::Buffer(context, ss_size); sa_buffer.Write(queue, sa_size, reinterpret_cast(sa)); sb_buffer.Write(queue, sb_size, reinterpret_cast(sb)); sc_buffer.Write(queue, sc_size, reinterpret_cast(sc)); ss_buffer.Write(queue, ss_size, reinterpret_cast(ss)); auto queue_cl = queue(); - auto s = Rotg(sa_buffer(), 0, - sb_buffer(), 0, - sc_buffer(), 0, - ss_buffer(), 0, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Rotg(sa_buffer(), 0, + sb_buffer(), 0, + sc_buffer(), 0, + ss_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } sa_buffer.Read(queue, sa_size, reinterpret_cast(sa)); sb_buffer.Read(queue, sb_size, reinterpret_cast(sb)); @@ -74,28 
+74,28 @@ void cblas_drotg(double* sa, double* sc, double* ss) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto sa_size = 1; const auto sb_size = 1; const auto sc_size = 1; const auto ss_size = 1; - auto sa_buffer = Buffer(context, sa_size); - auto sb_buffer = Buffer(context, sb_size); - auto sc_buffer = Buffer(context, sc_size); - auto ss_buffer = Buffer(context, ss_size); + auto sa_buffer = clblast::Buffer(context, sa_size); + auto sb_buffer = clblast::Buffer(context, sb_size); + auto sc_buffer = clblast::Buffer(context, sc_size); + auto ss_buffer = clblast::Buffer(context, ss_size); sa_buffer.Write(queue, sa_size, reinterpret_cast(sa)); sb_buffer.Write(queue, sb_size, reinterpret_cast(sb)); sc_buffer.Write(queue, sc_size, reinterpret_cast(sc)); ss_buffer.Write(queue, ss_size, reinterpret_cast(ss)); auto queue_cl = queue(); - auto s = Rotg(sa_buffer(), 0, - sb_buffer(), 0, - sc_buffer(), 0, - ss_buffer(), 0, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Rotg(sa_buffer(), 0, + sb_buffer(), 0, + sc_buffer(), 0, + ss_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } sa_buffer.Read(queue, sa_size, reinterpret_cast(sa)); sb_buffer.Read(queue, sb_size, reinterpret_cast(sb)); @@ -110,32 +110,32 @@ void cblas_srotmg(float* sd1, const float* sy1, float* sparam) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto sy1_size = 1; const auto sd1_size = 1; const auto sd2_size = 1; const auto sx1_size = 1; const auto sparam_size = 1; - auto sy1_buffer = 
Buffer(context, sy1_size); - auto sd1_buffer = Buffer(context, sd1_size); - auto sd2_buffer = Buffer(context, sd2_size); - auto sx1_buffer = Buffer(context, sx1_size); - auto sparam_buffer = Buffer(context, sparam_size); + auto sy1_buffer = clblast::Buffer(context, sy1_size); + auto sd1_buffer = clblast::Buffer(context, sd1_size); + auto sd2_buffer = clblast::Buffer(context, sd2_size); + auto sx1_buffer = clblast::Buffer(context, sx1_size); + auto sparam_buffer = clblast::Buffer(context, sparam_size); sy1_buffer.Write(queue, sy1_size, reinterpret_cast(sy1)); sd1_buffer.Write(queue, sd1_size, reinterpret_cast(sd1)); sd2_buffer.Write(queue, sd2_size, reinterpret_cast(sd2)); sx1_buffer.Write(queue, sx1_size, reinterpret_cast(sx1)); sparam_buffer.Write(queue, sparam_size, reinterpret_cast(sparam)); auto queue_cl = queue(); - auto s = Rotmg(sd1_buffer(), 0, - sd2_buffer(), 0, - sx1_buffer(), 0, - sy1_buffer(), 0, - sparam_buffer(), 0, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Rotmg(sd1_buffer(), 0, + sd2_buffer(), 0, + sx1_buffer(), 0, + sy1_buffer(), 0, + sparam_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } sd1_buffer.Read(queue, sd1_size, reinterpret_cast(sd1)); sd2_buffer.Read(queue, sd2_size, reinterpret_cast(sd2)); @@ -148,32 +148,32 @@ void cblas_drotmg(double* sd1, const double* sy1, double* sparam) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto sy1_size = 1; const auto sd1_size = 1; const auto sd2_size = 1; const auto sx1_size = 1; const auto sparam_size = 1; - auto sy1_buffer = Buffer(context, sy1_size); - auto sd1_buffer = Buffer(context, sd1_size); - auto sd2_buffer = Buffer(context, 
sd2_size); - auto sx1_buffer = Buffer(context, sx1_size); - auto sparam_buffer = Buffer(context, sparam_size); + auto sy1_buffer = clblast::Buffer(context, sy1_size); + auto sd1_buffer = clblast::Buffer(context, sd1_size); + auto sd2_buffer = clblast::Buffer(context, sd2_size); + auto sx1_buffer = clblast::Buffer(context, sx1_size); + auto sparam_buffer = clblast::Buffer(context, sparam_size); sy1_buffer.Write(queue, sy1_size, reinterpret_cast(sy1)); sd1_buffer.Write(queue, sd1_size, reinterpret_cast(sd1)); sd2_buffer.Write(queue, sd2_size, reinterpret_cast(sd2)); sx1_buffer.Write(queue, sx1_size, reinterpret_cast(sx1)); sparam_buffer.Write(queue, sparam_size, reinterpret_cast(sparam)); auto queue_cl = queue(); - auto s = Rotmg(sd1_buffer(), 0, - sd2_buffer(), 0, - sx1_buffer(), 0, - sy1_buffer(), 0, - sparam_buffer(), 0, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Rotmg(sd1_buffer(), 0, + sd2_buffer(), 0, + sx1_buffer(), 0, + sy1_buffer(), 0, + sparam_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } sd1_buffer.Read(queue, sd1_size, reinterpret_cast(sd1)); sd2_buffer.Read(queue, sd2_size, reinterpret_cast(sd2)); @@ -188,23 +188,23 @@ void cblas_srot(const int n, const float cos, const float sin) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto y_size = n; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s 
= Rot(n, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - cos, - sin, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Rot(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + cos, + sin, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); y_buffer.Read(queue, y_size, reinterpret_cast(y)); @@ -215,23 +215,23 @@ void cblas_drot(const int n, const double cos, const double sin) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto y_size = n; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Rot(n, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - cos, - sin, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Rot(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + cos, + sin, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); y_buffer.Read(queue, y_size, reinterpret_cast(y)); @@ -243,25 +243,25 @@ void cblas_srotm(const int n, float* y, const int y_inc, float* sparam) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = 
clblast::Queue(context, device); const auto x_size = n; const auto y_size = n; const auto sparam_size = 1; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto sparam_buffer = Buffer(context, sparam_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto sparam_buffer = clblast::Buffer(context, sparam_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); sparam_buffer.Write(queue, sparam_size, reinterpret_cast(sparam)); auto queue_cl = queue(); - auto s = Rotm(n, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - sparam_buffer(), 0, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Rotm(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + sparam_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); y_buffer.Read(queue, y_size, reinterpret_cast(y)); @@ -272,25 +272,25 @@ void cblas_drotm(const int n, double* y, const int y_inc, double* sparam) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto y_size = n; const auto sparam_size = 1; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto sparam_buffer = Buffer(context, sparam_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto sparam_buffer = clblast::Buffer(context, sparam_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); sparam_buffer.Write(queue, sparam_size, 
reinterpret_cast(sparam)); auto queue_cl = queue(); - auto s = Rotm(n, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - sparam_buffer(), 0, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Rotm(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + sparam_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); y_buffer.Read(queue, y_size, reinterpret_cast(y)); @@ -302,21 +302,21 @@ void cblas_sswap(const int n, float* x, const int x_inc, float* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto y_size = n; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Swap(n, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Swap(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); y_buffer.Read(queue, y_size, reinterpret_cast(y)); @@ -325,21 +325,21 @@ void cblas_dswap(const int n, double* x, const int x_inc, double* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = 
Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto y_size = n; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Swap(n, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Swap(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); y_buffer.Read(queue, y_size, reinterpret_cast(y)); @@ -348,21 +348,21 @@ void cblas_cswap(const int n, void* x, const int x_inc, void* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto y_size = n; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Swap(n, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Swap(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + 
throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); y_buffer.Read(queue, y_size, reinterpret_cast(y)); @@ -371,21 +371,21 @@ void cblas_zswap(const int n, void* x, const int x_inc, void* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto y_size = n; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Swap(n, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Swap(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); y_buffer.Read(queue, y_size, reinterpret_cast(y)); @@ -396,19 +396,19 @@ void cblas_sscal(const int n, const float alpha, float* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n; - auto x_buffer = Buffer(context, x_size); + auto x_buffer = clblast::Buffer(context, x_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Scal(n, - alpha_cpp, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - 
throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Scal(n, + alpha_cpp, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -416,19 +416,19 @@ void cblas_dscal(const int n, const double alpha, double* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n; - auto x_buffer = Buffer(context, x_size); + auto x_buffer = clblast::Buffer(context, x_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Scal(n, - alpha_cpp, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Scal(n, + alpha_cpp, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -436,19 +436,19 @@ void cblas_cscal(const int n, const void* alpha, void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n; - auto x_buffer = Buffer(context, x_size); + auto x_buffer = clblast::Buffer(context, x_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Scal(n, - alpha_cpp, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw 
std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Scal(n, + alpha_cpp, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -456,19 +456,19 @@ void cblas_zscal(const int n, const void* alpha, void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n; - auto x_buffer = Buffer(context, x_size); + auto x_buffer = clblast::Buffer(context, x_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Scal(n, - alpha_cpp, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Scal(n, + alpha_cpp, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -478,21 +478,21 @@ void cblas_scopy(const int n, const float* x, const int x_inc, float* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto y_size = n; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, 
reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Copy(n, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Copy(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -500,21 +500,21 @@ void cblas_dcopy(const int n, const double* x, const int x_inc, double* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto y_size = n; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Copy(n, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Copy(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -522,21 +522,21 @@ void cblas_ccopy(const int n, const void* x, const int x_inc, void* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const 
auto y_size = n; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Copy(n, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Copy(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -544,21 +544,21 @@ void cblas_zcopy(const int n, const void* x, const int x_inc, void* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto y_size = n; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Copy(n, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Copy(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -569,23 +569,23 @@ void cblas_saxpy(const int 
n, const float* x, const int x_inc, float* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n; const auto y_size = n; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Axpy(n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Axpy(n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -594,23 +594,23 @@ void cblas_daxpy(const int n, const double* x, const int x_inc, double* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n; const auto y_size = n; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Axpy(n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw 
std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Axpy(n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -619,23 +619,23 @@ void cblas_caxpy(const int n, const void* x, const int x_inc, void* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n; const auto y_size = n; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Axpy(n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Axpy(n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -644,23 +644,23 @@ void cblas_zaxpy(const int n, const void* x, const int x_inc, void* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], 
reinterpret_cast(alpha)[1]}; const auto x_size = n; const auto y_size = n; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Axpy(n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Axpy(n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -671,25 +671,25 @@ void cblas_sdot(const int n, const float* x, const int x_inc, const float* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto y_size = n; const auto dot_size = 1; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto dot_buffer = Buffer(context, dot_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto dot_buffer = clblast::Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); auto queue_cl = queue(); - auto s = Dot(n, - dot_buffer(), 0, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Dot(n, + 
dot_buffer(), 0, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); } @@ -698,25 +698,25 @@ void cblas_ddot(const int n, const double* x, const int x_inc, const double* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto y_size = n; const auto dot_size = 1; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto dot_buffer = Buffer(context, dot_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto dot_buffer = clblast::Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); auto queue_cl = queue(); - auto s = Dot(n, - dot_buffer(), 0, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Dot(n, + dot_buffer(), 0, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); } @@ -727,25 +727,25 @@ void cblas_cdotu(const int n, const void* x, const int x_inc, const void* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto 
y_size = n; const auto dot_size = 1; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto dot_buffer = Buffer(context, dot_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto dot_buffer = clblast::Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); auto queue_cl = queue(); - auto s = Dotu(n, - dot_buffer(), 0, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Dotu(n, + dot_buffer(), 0, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); } @@ -754,25 +754,25 @@ void cblas_zdotu(const int n, const void* x, const int x_inc, const void* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto y_size = n; const auto dot_size = 1; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto dot_buffer = Buffer(context, dot_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto dot_buffer = clblast::Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); auto queue_cl = queue(); - auto s = Dotu(n, - dot_buffer(), 0, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if 
(s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Dotu(n, + dot_buffer(), 0, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); } @@ -783,25 +783,25 @@ void cblas_cdotc(const int n, const void* x, const int x_inc, const void* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto y_size = n; const auto dot_size = 1; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto dot_buffer = Buffer(context, dot_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto dot_buffer = clblast::Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); auto queue_cl = queue(); - auto s = Dotc(n, - dot_buffer(), 0, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Dotc(n, + dot_buffer(), 0, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); } @@ -810,25 +810,25 @@ void cblas_zdotc(const int n, const void* x, const int x_inc, const void* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, 
device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto y_size = n; const auto dot_size = 1; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto dot_buffer = Buffer(context, dot_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto dot_buffer = clblast::Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); auto queue_cl = queue(); - auto s = Dotc(n, - dot_buffer(), 0, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Dotc(n, + dot_buffer(), 0, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); } @@ -838,21 +838,21 @@ void cblas_snrm2(const int n, float* nrm2, const float* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto nrm2_size = 1; - auto x_buffer = Buffer(context, x_size); - auto nrm2_buffer = Buffer(context, nrm2_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto nrm2_buffer = clblast::Buffer(context, nrm2_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); auto queue_cl = queue(); - auto s = Nrm2(n, - nrm2_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast 
returned with error code " + ToString(s)); + auto s = clblast::Nrm2(n, + nrm2_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2)); } @@ -860,21 +860,21 @@ void cblas_dnrm2(const int n, double* nrm2, const double* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto nrm2_size = 1; - auto x_buffer = Buffer(context, x_size); - auto nrm2_buffer = Buffer(context, nrm2_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto nrm2_buffer = clblast::Buffer(context, nrm2_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); auto queue_cl = queue(); - auto s = Nrm2(n, - nrm2_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Nrm2(n, + nrm2_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2)); } @@ -882,21 +882,21 @@ void cblas_scnrm2(const int n, void* nrm2, const void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto nrm2_size = 1; - auto x_buffer = Buffer(context, x_size); - auto nrm2_buffer = Buffer(context, nrm2_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto nrm2_buffer = 
clblast::Buffer(context, nrm2_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); auto queue_cl = queue(); - auto s = Nrm2(n, - nrm2_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Nrm2(n, + nrm2_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2)); } @@ -904,21 +904,21 @@ void cblas_dznrm2(const int n, void* nrm2, const void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto nrm2_size = 1; - auto x_buffer = Buffer(context, x_size); - auto nrm2_buffer = Buffer(context, nrm2_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto nrm2_buffer = clblast::Buffer(context, nrm2_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); auto queue_cl = queue(); - auto s = Nrm2(n, - nrm2_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Nrm2(n, + nrm2_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2)); } @@ -928,21 +928,21 @@ void cblas_sasum(const int n, float* asum, const float* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, 
device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto asum_size = 1; - auto x_buffer = Buffer(context, x_size); - auto asum_buffer = Buffer(context, asum_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto asum_buffer = clblast::Buffer(context, asum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); auto queue_cl = queue(); - auto s = Asum(n, - asum_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Asum(n, + asum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } asum_buffer.Read(queue, asum_size, reinterpret_cast(asum)); } @@ -950,21 +950,21 @@ void cblas_dasum(const int n, double* asum, const double* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto asum_size = 1; - auto x_buffer = Buffer(context, x_size); - auto asum_buffer = Buffer(context, asum_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto asum_buffer = clblast::Buffer(context, asum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); auto queue_cl = queue(); - auto s = Asum(n, - asum_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Asum(n, + asum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw 
std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } asum_buffer.Read(queue, asum_size, reinterpret_cast(asum)); } @@ -972,21 +972,21 @@ void cblas_scasum(const int n, void* asum, const void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto asum_size = 1; - auto x_buffer = Buffer(context, x_size); - auto asum_buffer = Buffer(context, asum_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto asum_buffer = clblast::Buffer(context, asum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); auto queue_cl = queue(); - auto s = Asum(n, - asum_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Asum(n, + asum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } asum_buffer.Read(queue, asum_size, reinterpret_cast(asum)); } @@ -994,21 +994,21 @@ void cblas_dzasum(const int n, void* asum, const void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto asum_size = 1; - auto x_buffer = Buffer(context, x_size); - auto asum_buffer = Buffer(context, asum_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto asum_buffer = clblast::Buffer(context, asum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); auto queue_cl = queue(); - auto s = Asum(n, - 
asum_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Asum(n, + asum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } asum_buffer.Read(queue, asum_size, reinterpret_cast(asum)); } @@ -1018,21 +1018,21 @@ void cblas_ssum(const int n, float* sum, const float* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto sum_size = 1; - auto x_buffer = Buffer(context, x_size); - auto sum_buffer = Buffer(context, sum_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto sum_buffer = clblast::Buffer(context, sum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); auto queue_cl = queue(); - auto s = Sum(n, - sum_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Sum(n, + sum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } sum_buffer.Read(queue, sum_size, reinterpret_cast(sum)); } @@ -1040,21 +1040,21 @@ void cblas_dsum(const int n, double* sum, const double* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto sum_size = 1; - auto x_buffer = Buffer(context, x_size); - auto sum_buffer = 
Buffer(context, sum_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto sum_buffer = clblast::Buffer(context, sum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); auto queue_cl = queue(); - auto s = Sum(n, - sum_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Sum(n, + sum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } sum_buffer.Read(queue, sum_size, reinterpret_cast(sum)); } @@ -1062,21 +1062,21 @@ void cblas_scsum(const int n, void* sum, const void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto sum_size = 1; - auto x_buffer = Buffer(context, x_size); - auto sum_buffer = Buffer(context, sum_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto sum_buffer = clblast::Buffer(context, sum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); auto queue_cl = queue(); - auto s = Sum(n, - sum_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Sum(n, + sum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } sum_buffer.Read(queue, sum_size, reinterpret_cast(sum)); } @@ -1084,21 +1084,21 @@ void cblas_dzsum(const int n, void* sum, const void* x, const int x_inc) { auto device = 
get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto sum_size = 1; - auto x_buffer = Buffer(context, x_size); - auto sum_buffer = Buffer(context, sum_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto sum_buffer = clblast::Buffer(context, sum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); auto queue_cl = queue(); - auto s = Sum(n, - sum_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Sum(n, + sum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } sum_buffer.Read(queue, sum_size, reinterpret_cast(sum)); } @@ -1108,21 +1108,21 @@ void cblas_isamax(const int n, float* imax, const float* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imax_size = 1; - auto x_buffer = Buffer(context, x_size); - auto imax_buffer = Buffer(context, imax_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); auto queue_cl = queue(); - auto s = Amax(n, - imax_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Amax(n, + imax_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != 
clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); } @@ -1130,21 +1130,21 @@ void cblas_idamax(const int n, double* imax, const double* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imax_size = 1; - auto x_buffer = Buffer(context, x_size); - auto imax_buffer = Buffer(context, imax_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); auto queue_cl = queue(); - auto s = Amax(n, - imax_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Amax(n, + imax_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); } @@ -1152,21 +1152,21 @@ void cblas_icamax(const int n, void* imax, const void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imax_size = 1; - auto x_buffer = Buffer(context, x_size); - auto imax_buffer = Buffer(context, imax_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imax_buffer.Write(queue, imax_size, 
reinterpret_cast(imax)); auto queue_cl = queue(); - auto s = Amax(n, - imax_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Amax(n, + imax_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); } @@ -1174,21 +1174,21 @@ void cblas_izamax(const int n, void* imax, const void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imax_size = 1; - auto x_buffer = Buffer(context, x_size); - auto imax_buffer = Buffer(context, imax_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); auto queue_cl = queue(); - auto s = Amax(n, - imax_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Amax(n, + imax_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); } @@ -1198,21 +1198,21 @@ void cblas_ismax(const int n, float* imax, const float* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; 
const auto imax_size = 1; - auto x_buffer = Buffer(context, x_size); - auto imax_buffer = Buffer(context, imax_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); auto queue_cl = queue(); - auto s = Max(n, - imax_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Max(n, + imax_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); } @@ -1220,21 +1220,21 @@ void cblas_idmax(const int n, double* imax, const double* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imax_size = 1; - auto x_buffer = Buffer(context, x_size); - auto imax_buffer = Buffer(context, imax_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); auto queue_cl = queue(); - auto s = Max(n, - imax_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Max(n, + imax_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); } @@ 
-1242,21 +1242,21 @@ void cblas_icmax(const int n, void* imax, const void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imax_size = 1; - auto x_buffer = Buffer(context, x_size); - auto imax_buffer = Buffer(context, imax_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); auto queue_cl = queue(); - auto s = Max(n, - imax_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Max(n, + imax_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); } @@ -1264,21 +1264,21 @@ void cblas_izmax(const int n, void* imax, const void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imax_size = 1; - auto x_buffer = Buffer(context, x_size); - auto imax_buffer = Buffer(context, imax_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); auto queue_cl = queue(); - auto s = Max(n, - imax_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error 
code " + ToString(s)); + auto s = clblast::Max(n, + imax_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); } @@ -1288,21 +1288,21 @@ void cblas_ismin(const int n, float* imin, const float* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imin_size = 1; - auto x_buffer = Buffer(context, x_size); - auto imin_buffer = Buffer(context, imin_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto imin_buffer = clblast::Buffer(context, imin_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imin_buffer.Write(queue, imin_size, reinterpret_cast(imin)); auto queue_cl = queue(); - auto s = Min(n, - imin_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Min(n, + imin_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); } @@ -1310,21 +1310,21 @@ void cblas_idmin(const int n, double* imin, const double* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imin_size = 1; - auto x_buffer = Buffer(context, x_size); - auto imin_buffer = Buffer(context, imin_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto imin_buffer = clblast::Buffer(context, 
imin_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imin_buffer.Write(queue, imin_size, reinterpret_cast(imin)); auto queue_cl = queue(); - auto s = Min(n, - imin_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Min(n, + imin_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); } @@ -1332,21 +1332,21 @@ void cblas_icmin(const int n, void* imin, const void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imin_size = 1; - auto x_buffer = Buffer(context, x_size); - auto imin_buffer = Buffer(context, imin_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto imin_buffer = clblast::Buffer(context, imin_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imin_buffer.Write(queue, imin_size, reinterpret_cast(imin)); auto queue_cl = queue(); - auto s = Min(n, - imin_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Min(n, + imin_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); } @@ -1354,21 +1354,21 @@ void cblas_izmin(const int n, void* imin, const void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = 
clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imin_size = 1; - auto x_buffer = Buffer(context, x_size); - auto imin_buffer = Buffer(context, imin_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto imin_buffer = clblast::Buffer(context, imin_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imin_buffer.Write(queue, imin_size, reinterpret_cast(imin)); auto queue_cl = queue(); - auto s = Min(n, - imin_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Min(n, + imin_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); } @@ -1386,31 +1386,31 @@ void cblas_sgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const float beta, float* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; const auto y_size = (a_transpose != CLBlastTransposeNo) ? 
n * y_inc : m * y_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Gemv(static_cast(layout), - static_cast(a_transpose), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Gemv(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -1422,31 +1422,31 @@ void cblas_dgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const double beta, double* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; const auto y_size = (a_transpose != CLBlastTransposeNo) ? 
n * y_inc : m * y_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Gemv(static_cast(layout), - static_cast(a_transpose), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Gemv(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -1458,31 +1458,31 @@ void cblas_cgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const void* beta, void* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; const auto y_size = (a_transpose != CLBlastTransposeNo) ? 
n * y_inc : m * y_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Gemv(static_cast(layout), - static_cast(a_transpose), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Gemv(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -1494,31 +1494,31 @@ void cblas_zgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const void* beta, void* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; const auto y_size = (a_transpose != CLBlastTransposeNo) ? 
n * y_inc : m * y_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Gemv(static_cast(layout), - static_cast(a_transpose), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Gemv(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -1532,31 +1532,31 @@ void cblas_sgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const float beta, float* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; const auto y_size = (a_transpose != CLBlastTransposeNo) ? 
n * y_inc : m * y_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Gbmv(static_cast(layout), - static_cast(a_transpose), - m, n, kl, ku, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Gbmv(static_cast(layout), + static_cast(a_transpose), + m, n, kl, ku, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -1568,31 +1568,31 @@ void cblas_dgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const double beta, double* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; const auto y_size = (a_transpose != CLBlastTransposeNo) ? 
n * y_inc : m * y_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Gbmv(static_cast(layout), - static_cast(a_transpose), - m, n, kl, ku, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Gbmv(static_cast(layout), + static_cast(a_transpose), + m, n, kl, ku, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -1604,31 +1604,31 @@ void cblas_cgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const void* beta, void* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; const auto y_size = (a_transpose != CLBlastTransposeNo) ? 
n * y_inc : m * y_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Gbmv(static_cast(layout), - static_cast(a_transpose), - m, n, kl, ku, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Gbmv(static_cast(layout), + static_cast(a_transpose), + m, n, kl, ku, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -1640,31 +1640,31 @@ void cblas_zgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const void* beta, void* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; const auto y_size = (a_transpose != CLBlastTransposeNo) ? 
n * y_inc : m * y_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Gbmv(static_cast(layout), - static_cast(a_transpose), - m, n, kl, ku, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Gbmv(static_cast(layout), + static_cast(a_transpose), + m, n, kl, ku, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -1678,31 +1678,31 @@ void cblas_chemv(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* beta, void* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = n * a_ld; const auto x_size = n * x_inc; const auto y_size = n * y_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = 
clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Hemv(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Hemv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -1714,31 +1714,31 @@ void cblas_zhemv(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* beta, void* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = n * a_ld; const auto x_size = n * x_inc; const auto y_size = n * y_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = 
queue(); - auto s = Hemv(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Hemv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -1752,31 +1752,31 @@ void cblas_chbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* beta, void* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = n * a_ld; const auto x_size = n * x_inc; const auto y_size = n * y_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Hbmv(static_cast(layout), - static_cast(triangle), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast 
returned with error code " + ToString(s)); + auto s = clblast::Hbmv(static_cast(layout), + static_cast(triangle), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -1788,31 +1788,31 @@ void cblas_zhbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* beta, void* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = n * a_ld; const auto x_size = n * x_inc; const auto y_size = n * y_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Hbmv(static_cast(layout), - static_cast(triangle), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Hbmv(static_cast(layout), + static_cast(triangle), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != 
clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -1826,31 +1826,31 @@ void cblas_chpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* beta, void* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto ap_size = ((n*(n+1)) / 2); const auto x_size = n * x_inc; const auto y_size = n * y_inc; - auto ap_buffer = Buffer(context, ap_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto ap_buffer = clblast::Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Hpmv(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - ap_buffer(), 0, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Hpmv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -1862,31 +1862,31 @@ void cblas_zhpmv(const CLBlastLayout layout, const 
CLBlastTriangle triangle, const void* beta, void* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto ap_size = ((n*(n+1)) / 2); const auto x_size = n * x_inc; const auto y_size = n * y_inc; - auto ap_buffer = Buffer(context, ap_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto ap_buffer = clblast::Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Hpmv(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - ap_buffer(), 0, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Hpmv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -1900,31 +1900,31 @@ void cblas_ssymv(const CLBlastLayout layout, const CLBlastTriangle triangle, const float beta, float* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, 
device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = n * a_ld; const auto x_size = n * x_inc; const auto y_size = n * y_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Symv(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Symv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -1936,31 +1936,31 @@ void cblas_dsymv(const CLBlastLayout layout, const CLBlastTriangle triangle, const double beta, double* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = n * a_ld; const auto x_size = n * x_inc; const auto y_size = n * y_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = 
clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Symv(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Symv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -1974,31 +1974,31 @@ void cblas_ssbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const float beta, float* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = n * a_ld; const auto x_size = n * x_inc; const auto y_size = n * y_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Sbmv(static_cast(layout), - static_cast(triangle), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - 
x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Sbmv(static_cast(layout), + static_cast(triangle), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -2010,31 +2010,31 @@ void cblas_dsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const double beta, double* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = n * a_ld; const auto x_size = n * x_inc; const auto y_size = n * y_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Sbmv(static_cast(layout), - static_cast(triangle), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Sbmv(static_cast(layout), + static_cast(triangle), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + 
&queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -2048,31 +2048,31 @@ void cblas_sspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const float beta, float* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto ap_size = ((n*(n+1)) / 2); const auto x_size = n * x_inc; const auto y_size = n * y_inc; - auto ap_buffer = Buffer(context, ap_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto ap_buffer = clblast::Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Spmv(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - ap_buffer(), 0, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Spmv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -2084,31 +2084,31 @@ void cblas_dspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const double beta, double* y, const int y_inc) { auto device = 
get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto ap_size = ((n*(n+1)) / 2); const auto x_size = n * x_inc; const auto y_size = n * y_inc; - auto ap_buffer = Buffer(context, ap_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto ap_buffer = clblast::Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Spmv(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - ap_buffer(), 0, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Spmv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -2119,25 +2119,25 @@ void cblas_strmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const float* a, const int a_ld, float* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); + auto a_buffer = 
clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Trmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Trmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2146,25 +2146,25 @@ void cblas_dtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const double* a, const int a_ld, double* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Trmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Trmv(static_cast(layout), + static_cast(triangle), + 
static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2173,25 +2173,25 @@ void cblas_ctrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const void* a, const int a_ld, void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Trmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Trmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2200,25 +2200,25 @@ void cblas_ztrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const void* a, const int a_ld, void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = 
clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Trmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Trmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2229,25 +2229,25 @@ void cblas_stbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const float* a, const int a_ld, float* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Tbmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, k, - 
a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Tbmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2256,25 +2256,25 @@ void cblas_dtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const double* a, const int a_ld, double* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Tbmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, k, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Tbmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2283,25 +2283,25 @@ void 
cblas_ctbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const void* a, const int a_ld, void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Tbmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, k, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Tbmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2310,25 +2310,25 @@ void cblas_ztbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const void* a, const int a_ld, void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); 
a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Tbmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, k, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Tbmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2339,25 +2339,25 @@ void cblas_stpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const float* ap, float* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); const auto x_size = n * x_inc; - auto ap_buffer = Buffer(context, ap_size); - auto x_buffer = Buffer(context, x_size); + auto ap_buffer = clblast::Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Tpmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - ap_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Tpmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, 
x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2366,25 +2366,25 @@ void cblas_dtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const double* ap, double* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); const auto x_size = n * x_inc; - auto ap_buffer = Buffer(context, ap_size); - auto x_buffer = Buffer(context, x_size); + auto ap_buffer = clblast::Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Tpmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - ap_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Tpmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2393,25 +2393,25 @@ void cblas_ctpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const void* ap, void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); const auto 
x_size = n * x_inc; - auto ap_buffer = Buffer(context, ap_size); - auto x_buffer = Buffer(context, x_size); + auto ap_buffer = clblast::Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Tpmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - ap_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Tpmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2420,25 +2420,25 @@ void cblas_ztpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const void* ap, void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); const auto x_size = n * x_inc; - auto ap_buffer = Buffer(context, ap_size); - auto x_buffer = Buffer(context, x_size); + auto ap_buffer = clblast::Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Tpmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - ap_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned 
with error code " + ToString(s)); + auto s = clblast::Tpmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2449,25 +2449,25 @@ void cblas_strsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const float* a, const int a_ld, float* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Trsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Trsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2476,25 +2476,25 @@ void cblas_dtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const double* a, const int a_ld, double* x, const int x_inc) { auto device = 
get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Trsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Trsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2503,25 +2503,25 @@ void cblas_ctrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const void* a, const int a_ld, void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = 
Trsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Trsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2530,25 +2530,25 @@ void cblas_ztrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const void* a, const int a_ld, void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Trsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Trsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + 
clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2559,25 +2559,25 @@ void cblas_stbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const float* a, const int a_ld, float* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Tbsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, k, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Tbsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2586,25 +2586,25 @@ void cblas_dtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const double* a, const int a_ld, double* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, 
x_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Tbsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, k, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Tbsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2613,25 +2613,25 @@ void cblas_ctbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const void* a, const int a_ld, void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Tbsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, k, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Tbsv(static_cast(layout), 
+ static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2640,25 +2640,25 @@ void cblas_ztbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const void* a, const int a_ld, void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Tbsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, k, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Tbsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2669,25 +2669,25 @@ void cblas_stpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const float* ap, float* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); 
+ auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); const auto x_size = n * x_inc; - auto ap_buffer = Buffer(context, ap_size); - auto x_buffer = Buffer(context, x_size); + auto ap_buffer = clblast::Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Tpsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - ap_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Tpsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2696,25 +2696,25 @@ void cblas_dtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const double* ap, double* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); const auto x_size = n * x_inc; - auto ap_buffer = Buffer(context, ap_size); - auto x_buffer = Buffer(context, x_size); + auto ap_buffer = clblast::Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Tpsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - 
static_cast(diagonal), - n, - ap_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Tpsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2723,25 +2723,25 @@ void cblas_ctpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const void* ap, void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); const auto x_size = n * x_inc; - auto ap_buffer = Buffer(context, ap_size); - auto x_buffer = Buffer(context, x_size); + auto ap_buffer = clblast::Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Tpsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - ap_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Tpsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2750,25 +2750,25 @@ void 
cblas_ztpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const void* ap, void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); const auto x_size = n * x_inc; - auto ap_buffer = Buffer(context, ap_size); - auto x_buffer = Buffer(context, x_size); + auto ap_buffer = clblast::Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Tpsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - ap_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Tpsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2781,28 +2781,28 @@ void cblas_sger(const CLBlastLayout layout, const float* y, const int y_inc, float* a, const int a_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = m * x_inc; const auto y_size = n * y_inc; const auto a_size = (layout == CLBlastLayoutRowMajor) ? 
m * a_ld : n * a_ld; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto a_buffer = Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); auto queue_cl = queue(); - auto s = Ger(static_cast(layout), - m, n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Ger(static_cast(layout), + m, n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } @@ -2813,28 +2813,28 @@ void cblas_dger(const CLBlastLayout layout, const double* y, const int y_inc, double* a, const int a_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = m * x_inc; const auto y_size = n * y_inc; const auto a_size = (layout == CLBlastLayoutRowMajor) ? 
m * a_ld : n * a_ld; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto a_buffer = Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); auto queue_cl = queue(); - auto s = Ger(static_cast(layout), - m, n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Ger(static_cast(layout), + m, n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } @@ -2847,28 +2847,28 @@ void cblas_cgeru(const CLBlastLayout layout, const void* y, const int y_inc, void* a, const int a_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = m * x_inc; const auto y_size = n * y_inc; const auto a_size = (layout == CLBlastLayoutRowMajor) ? 
m * a_ld : n * a_ld; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto a_buffer = Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); auto queue_cl = queue(); - auto s = Geru(static_cast(layout), - m, n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Geru(static_cast(layout), + m, n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } @@ -2879,28 +2879,28 @@ void cblas_zgeru(const CLBlastLayout layout, const void* y, const int y_inc, void* a, const int a_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = m * x_inc; const auto y_size = n * y_inc; const auto a_size = (layout == CLBlastLayoutRowMajor) ? 
m * a_ld : n * a_ld; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto a_buffer = Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); auto queue_cl = queue(); - auto s = Geru(static_cast(layout), - m, n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Geru(static_cast(layout), + m, n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } @@ -2913,28 +2913,28 @@ void cblas_cgerc(const CLBlastLayout layout, const void* y, const int y_inc, void* a, const int a_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = m * x_inc; const auto y_size = n * y_inc; const auto a_size = (layout == CLBlastLayoutRowMajor) ? 
m * a_ld : n * a_ld; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto a_buffer = Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); auto queue_cl = queue(); - auto s = Gerc(static_cast(layout), - m, n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Gerc(static_cast(layout), + m, n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } @@ -2945,28 +2945,28 @@ void cblas_zgerc(const CLBlastLayout layout, const void* y, const int y_inc, void* a, const int a_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = m * x_inc; const auto y_size = n * y_inc; const auto a_size = (layout == CLBlastLayoutRowMajor) ? 
m * a_ld : n * a_ld; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto a_buffer = Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); auto queue_cl = queue(); - auto s = Gerc(static_cast(layout), - m, n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Gerc(static_cast(layout), + m, n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } @@ -2978,25 +2978,25 @@ void cblas_cher(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* x, const int x_inc, void* a, const int a_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; const auto a_size = n * a_ld; - auto x_buffer = Buffer(context, x_size); - auto a_buffer = Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); auto queue_cl = queue(); - auto s = Her(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != 
StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Her(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } @@ -3006,25 +3006,25 @@ void cblas_zher(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* x, const int x_inc, void* a, const int a_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; const auto a_size = n * a_ld; - auto x_buffer = Buffer(context, x_size); - auto a_buffer = Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); auto queue_cl = queue(); - auto s = Her(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Her(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } @@ -3036,25 +3036,25 @@ void cblas_chpr(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* x, const int x_inc, void* ap) { auto device = get_device(); - auto context = Context(device); - 
auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; const auto ap_size = ((n*(n+1)) / 2); - auto x_buffer = Buffer(context, x_size); - auto ap_buffer = Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto ap_buffer = clblast::Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); auto queue_cl = queue(); - auto s = Hpr(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - ap_buffer(), 0, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Hpr(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + ap_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); } @@ -3064,25 +3064,25 @@ void cblas_zhpr(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* x, const int x_inc, void* ap) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; const auto ap_size = ((n*(n+1)) / 2); - auto x_buffer = Buffer(context, x_size); - auto ap_buffer = Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto ap_buffer = clblast::Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); auto queue_cl = queue(); - auto s = Hpr(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, 
- ap_buffer(), 0, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Hpr(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + ap_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); } @@ -3095,29 +3095,29 @@ void cblas_cher2(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* y, const int y_inc, void* a, const int a_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n * x_inc; const auto y_size = n * y_inc; const auto a_size = n * a_ld; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto a_buffer = Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); auto queue_cl = queue(); - auto s = Her2(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Her2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw 
std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } @@ -3128,29 +3128,29 @@ void cblas_zher2(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* y, const int y_inc, void* a, const int a_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n * x_inc; const auto y_size = n * y_inc; const auto a_size = n * a_ld; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto a_buffer = Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); auto queue_cl = queue(); - auto s = Her2(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Her2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } @@ -3163,29 +3163,29 @@ void cblas_chpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* y, const int y_inc, void* ap) { auto device = get_device(); - auto context = Context(device); - auto queue = 
Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n * x_inc; const auto y_size = n * y_inc; const auto ap_size = ((n*(n+1)) / 2); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto ap_buffer = Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto ap_buffer = clblast::Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); auto queue_cl = queue(); - auto s = Hpr2(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - ap_buffer(), 0, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Hpr2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + ap_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); } @@ -3196,29 +3196,29 @@ void cblas_zhpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* y, const int y_inc, void* ap) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n * x_inc; const auto y_size = n * y_inc; const auto ap_size = ((n*(n+1)) / 2); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = 
Buffer(context, y_size); - auto ap_buffer = Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto ap_buffer = clblast::Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); auto queue_cl = queue(); - auto s = Hpr2(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - ap_buffer(), 0, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Hpr2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + ap_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); } @@ -3230,25 +3230,25 @@ void cblas_ssyr(const CLBlastLayout layout, const CLBlastTriangle triangle, const float* x, const int x_inc, float* a, const int a_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; const auto a_size = n * a_ld; - auto x_buffer = Buffer(context, x_size); - auto a_buffer = Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); auto queue_cl = queue(); - auto s = Syr(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw 
std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Syr(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } @@ -3258,25 +3258,25 @@ void cblas_dsyr(const CLBlastLayout layout, const CLBlastTriangle triangle, const double* x, const int x_inc, double* a, const int a_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; const auto a_size = n * a_ld; - auto x_buffer = Buffer(context, x_size); - auto a_buffer = Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); auto queue_cl = queue(); - auto s = Syr(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Syr(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } @@ -3288,25 +3288,25 @@ void cblas_sspr(const CLBlastLayout layout, const CLBlastTriangle triangle, const float* x, const int x_inc, float* ap) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, 
device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; const auto ap_size = ((n*(n+1)) / 2); - auto x_buffer = Buffer(context, x_size); - auto ap_buffer = Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto ap_buffer = clblast::Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); auto queue_cl = queue(); - auto s = Spr(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - ap_buffer(), 0, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Spr(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + ap_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); } @@ -3316,25 +3316,25 @@ void cblas_dspr(const CLBlastLayout layout, const CLBlastTriangle triangle, const double* x, const int x_inc, double* ap) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; const auto ap_size = ((n*(n+1)) / 2); - auto x_buffer = Buffer(context, x_size); - auto ap_buffer = Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto ap_buffer = clblast::Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); auto queue_cl = queue(); - auto s = Spr(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - ap_buffer(), 0, - 
&queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Spr(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + ap_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); } @@ -3347,29 +3347,29 @@ void cblas_ssyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const float* y, const int y_inc, float* a, const int a_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; const auto y_size = n * y_inc; const auto a_size = n * a_ld; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto a_buffer = Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); auto queue_cl = queue(); - auto s = Syr2(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Syr2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } 
a_buffer.Read(queue, a_size, reinterpret_cast(a)); } @@ -3380,29 +3380,29 @@ void cblas_dsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const double* y, const int y_inc, double* a, const int a_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; const auto y_size = n * y_inc; const auto a_size = n * a_ld; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto a_buffer = Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); auto queue_cl = queue(); - auto s = Syr2(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Syr2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } @@ -3415,29 +3415,29 @@ void cblas_sspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const float* y, const int y_inc, float* ap) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = 
alpha; const auto x_size = n * x_inc; const auto y_size = n * y_inc; const auto ap_size = ((n*(n+1)) / 2); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto ap_buffer = Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto ap_buffer = clblast::Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); auto queue_cl = queue(); - auto s = Spr2(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - ap_buffer(), 0, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Spr2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + ap_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); } @@ -3448,29 +3448,29 @@ void cblas_dspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const double* y, const int y_inc, double* ap) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; const auto y_size = n * y_inc; const auto ap_size = ((n*(n+1)) / 2); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto ap_buffer = Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto ap_buffer = clblast::Buffer(context, ap_size); x_buffer.Write(queue, 
x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); auto queue_cl = queue(); - auto s = Spr2(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - ap_buffer(), 0, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Spr2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + ap_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); } @@ -3488,32 +3488,32 @@ void cblas_sgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const float beta, float* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? m * a_ld : k * a_ld; const auto b_size = ((layout == CLBlastLayoutColMajor && b_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && b_transpose == CLBlastTransposeNo)) ? k * b_ld : n * b_ld; const auto c_size = (layout == CLBlastLayoutRowMajor) ? 
m * c_ld : n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Gemm(static_cast(layout), - static_cast(a_transpose), - static_cast(b_transpose), - m, n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Gemm(static_cast(layout), + static_cast(a_transpose), + static_cast(b_transpose), + m, n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -3525,32 +3525,32 @@ void cblas_dgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const double beta, double* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? m * a_ld : k * a_ld; const auto b_size = ((layout == CLBlastLayoutColMajor && b_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && b_transpose == CLBlastTransposeNo)) ? 
k * b_ld : n * b_ld; const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Gemm(static_cast(layout), - static_cast(a_transpose), - static_cast(b_transpose), - m, n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Gemm(static_cast(layout), + static_cast(a_transpose), + static_cast(b_transpose), + m, n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -3562,32 +3562,32 @@ void cblas_cgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const void* beta, void* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
m * a_ld : k * a_ld; const auto b_size = ((layout == CLBlastLayoutColMajor && b_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && b_transpose == CLBlastTransposeNo)) ? k * b_ld : n * b_ld; const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Gemm(static_cast(layout), - static_cast(a_transpose), - static_cast(b_transpose), - m, n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Gemm(static_cast(layout), + static_cast(a_transpose), + static_cast(b_transpose), + m, n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -3599,32 +3599,32 @@ void cblas_zgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const void* beta, void* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], 
reinterpret_cast(beta)[1]}; const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? m * a_ld : k * a_ld; const auto b_size = ((layout == CLBlastLayoutColMajor && b_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && b_transpose == CLBlastTransposeNo)) ? k * b_ld : n * b_ld; const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Gemm(static_cast(layout), - static_cast(a_transpose), - static_cast(b_transpose), - m, n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Gemm(static_cast(layout), + static_cast(a_transpose), + static_cast(b_transpose), + m, n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -3638,32 +3638,32 @@ void cblas_ssymm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const float beta, float* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = 
clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld; const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld; const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Symm(static_cast(layout), - static_cast(side), - static_cast(triangle), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Symm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -3675,32 +3675,32 @@ void cblas_dsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const double beta, double* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = (layout == 
CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld; const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld; const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Symm(static_cast(layout), - static_cast(side), - static_cast(triangle), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Symm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -3712,32 +3712,32 @@ void cblas_csymm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const void* beta, void* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = (layout == 
CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld; const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld; const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Symm(static_cast(layout), - static_cast(side), - static_cast(triangle), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Symm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -3749,32 +3749,32 @@ void cblas_zsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const void* beta, void* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = (layout == 
CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld; const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld; const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Symm(static_cast(layout), - static_cast(side), - static_cast(triangle), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Symm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -3788,32 +3788,32 @@ void cblas_chemm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const void* beta, void* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = (layout == 
CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld; const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld; const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Hemm(static_cast(layout), - static_cast(side), - static_cast(triangle), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Hemm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -3825,32 +3825,32 @@ void cblas_zhemm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const void* beta, void* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = (layout == 
CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld; const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld; const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Hemm(static_cast(layout), - static_cast(side), - static_cast(triangle), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Hemm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -3863,28 +3863,28 @@ void cblas_ssyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, con const float beta, float* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == 
CLBlastTransposeNo)) ? n * a_ld : k * a_ld; const auto c_size = n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Syrk(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Syrk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -3895,28 +3895,28 @@ void cblas_dsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, con const double beta, double* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * a_ld : k * a_ld; const auto c_size = n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Syrk(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Syrk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -3927,28 +3927,28 @@ void cblas_csyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, con const void* beta, void* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * a_ld : k * a_ld; const auto c_size = n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Syrk(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Syrk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -3959,28 +3959,28 @@ void cblas_zsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, con const void* beta, void* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * a_ld : k * a_ld; const auto c_size = n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Syrk(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Syrk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -3993,28 +3993,28 @@ void cblas_cherk(const CLBlastLayout layout, const CLBlastTriangle triangle, con const float beta, void* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * a_ld : k * a_ld; const auto c_size = n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Herk(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Herk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -4025,28 +4025,28 @@ void cblas_zherk(const CLBlastLayout layout, const CLBlastTriangle triangle, con const double beta, void* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * a_ld : k * a_ld; const auto c_size = n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Herk(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Herk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -4060,32 +4060,32 @@ void cblas_ssyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co const float beta, float* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld; const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? 
n * b_ld : k * b_ld; const auto c_size = n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Syr2k(static_cast(layout), - static_cast(triangle), - static_cast(ab_transpose), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Syr2k(static_cast(layout), + static_cast(triangle), + static_cast(ab_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -4097,32 +4097,32 @@ void cblas_dsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co const double beta, double* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? 
n * a_ld : k * a_ld; const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * b_ld : k * b_ld; const auto c_size = n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Syr2k(static_cast(layout), - static_cast(triangle), - static_cast(ab_transpose), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Syr2k(static_cast(layout), + static_cast(triangle), + static_cast(ab_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -4134,32 +4134,32 @@ void cblas_csyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co const void* beta, void* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = ((layout == 
CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld; const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * b_ld : k * b_ld; const auto c_size = n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Syr2k(static_cast(layout), - static_cast(triangle), - static_cast(ab_transpose), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Syr2k(static_cast(layout), + static_cast(triangle), + static_cast(ab_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -4171,32 +4171,32 @@ void cblas_zsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co const void* beta, void* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], 
reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld; const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * b_ld : k * b_ld; const auto c_size = n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Syr2k(static_cast(layout), - static_cast(triangle), - static_cast(ab_transpose), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Syr2k(static_cast(layout), + static_cast(triangle), + static_cast(ab_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -4210,32 +4210,32 @@ void cblas_cher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co const float beta, void* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = 
clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = beta; const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld; const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * b_ld : k * b_ld; const auto c_size = n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Her2k(static_cast(layout), - static_cast(triangle), - static_cast(ab_transpose), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Her2k(static_cast(layout), + static_cast(triangle), + static_cast(ab_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -4247,32 +4247,32 @@ void cblas_zher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co const double beta, void* c, const int c_ld) { auto device = get_device(); - auto context = 
Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = beta; const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld; const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * b_ld : k * b_ld; const auto c_size = n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Her2k(static_cast(layout), - static_cast(triangle), - static_cast(ab_transpose), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Her2k(static_cast(layout), + static_cast(triangle), + static_cast(ab_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -4284,28 +4284,28 @@ void cblas_strmm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const float* a, const int 
a_ld, float* b, const int b_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; const auto b_size = (layout == CLBlastLayoutRowMajor) ? m * b_ld : n * b_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); auto queue_cl = queue(); - auto s = Trmm(static_cast(layout), - static_cast(side), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Trmm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } @@ -4315,28 +4315,28 @@ void cblas_dtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const double* a, const int a_ld, double* b, const int b_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; const auto b_size = (layout == CLBlastLayoutRowMajor) ? 
m * b_ld : n * b_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); auto queue_cl = queue(); - auto s = Trmm(static_cast(layout), - static_cast(side), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Trmm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } @@ -4346,28 +4346,28 @@ void cblas_ctrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const void* a, const int a_ld, void* b, const int b_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; const auto b_size = (layout == CLBlastLayoutRowMajor) ? 
m * b_ld : n * b_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); auto queue_cl = queue(); - auto s = Trmm(static_cast(layout), - static_cast(side), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Trmm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } @@ -4377,28 +4377,28 @@ void cblas_ztrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const void* a, const int a_ld, void* b, const int b_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; const auto b_size = (layout == CLBlastLayoutRowMajor) ? 
m * b_ld : n * b_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); auto queue_cl = queue(); - auto s = Trmm(static_cast(layout), - static_cast(side), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Trmm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } @@ -4410,28 +4410,28 @@ void cblas_strsm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const float* a, const int a_ld, float* b, const int b_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; const auto b_size = (layout == CLBlastLayoutRowMajor) ? 
m * b_ld : n * b_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); auto queue_cl = queue(); - auto s = Trsm(static_cast(layout), - static_cast(side), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Trsm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } @@ -4441,28 +4441,28 @@ void cblas_dtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const double* a, const int a_ld, double* b, const int b_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; const auto b_size = (layout == CLBlastLayoutRowMajor) ? 
m * b_ld : n * b_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); auto queue_cl = queue(); - auto s = Trsm(static_cast(layout), - static_cast(side), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Trsm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } @@ -4472,28 +4472,28 @@ void cblas_ctrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const void* a, const int a_ld, void* b, const int b_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; const auto b_size = (layout == CLBlastLayoutRowMajor) ? 
m * b_ld : n * b_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); auto queue_cl = queue(); - auto s = Trsm(static_cast(layout), - static_cast(side), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Trsm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } @@ -4503,28 +4503,28 @@ void cblas_ztrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const void* a, const int a_ld, void* b, const int b_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; const auto b_size = (layout == CLBlastLayoutRowMajor) ? 
m * b_ld : n * b_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); auto queue_cl = queue(); - auto s = Trsm(static_cast(layout), - static_cast(side), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Trsm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } @@ -4540,25 +4540,25 @@ void cblas_somatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transp const float* a, const int a_ld, float* b, const int b_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; const auto b_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * b_ld : m * b_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); auto queue_cl = queue(); - auto s = Omatcopy(static_cast(layout), - static_cast(a_transpose), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Omatcopy(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } @@ -4568,25 +4568,25 @@ void cblas_domatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transp const double* a, const int a_ld, double* b, const int b_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; const auto b_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * b_ld : m * b_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); auto queue_cl = queue(); - auto s = Omatcopy(static_cast(layout), - static_cast(a_transpose), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Omatcopy(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } @@ -4596,25 +4596,25 @@ void cblas_comatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transp const void* a, const int a_ld, void* b, const int b_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; const auto b_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * b_ld : m * b_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); auto queue_cl = queue(); - auto s = Omatcopy(static_cast(layout), - static_cast(a_transpose), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Omatcopy(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } @@ -4624,28 +4624,27 @@ void cblas_zomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transp const void* a, const int a_ld, void* b, const int b_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; const auto b_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * b_ld : m * b_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); auto queue_cl = queue(); - auto s = Omatcopy(static_cast(layout), - static_cast(a_transpose), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Omatcopy(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } // ================================================================================================= -} // namespace clblast From 8ae8ab06a2b6f24faa0de5d390a5ae272aa94c23 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Tue, 25 Oct 2016 20:33:10 +0200 Subject: [PATCH 08/15] Renamed the include and source files of the Netlib CBLAS API --- CMakeLists.txt | 4 ++-- include/{clblast_blas.h => clblast_netlib_c.h} | 6 +++--- scripts/generator/generator.py | 12 ++++++------ scripts/generator/generator/cpp.py | 4 ++-- src/{clblast_blas.cpp => clblast_netlib_c.cpp} | 2 +- 5 files changed, 14 insertions(+), 14 deletions(-) rename include/{clblast_blas.h => clblast_netlib_c.h} (99%) rename src/{clblast_blas.cpp => clblast_netlib_c.cpp} (99%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1fff1a3a..aa1e287e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -170,7 +170,7 @@ set(SOURCES src/cache.cpp src/clblast.cpp src/clblast_c.cpp - src/clblast_blas.cpp + src/clblast_netlib_c.cpp src/routine.cpp ) foreach(ROUTINE ${LEVEL1_ROUTINES}) @@ 
-214,7 +214,7 @@ install(TARGETS clblast EXPORT CLBlast DESTINATION lib) install(FILES include/clblast.h DESTINATION include) install(FILES include/clblast_c.h DESTINATION include) install(FILES include/clblast_half.h DESTINATION include) -install(FILES include/clblast_blas.h DESTINATION include) +install(FILES include/clblast_netlib_c.h DESTINATION include) # Installs the config for find_package in dependent projects install(EXPORT CLBlast DESTINATION lib/cmake/CLBLast FILE CLBlastConfig.cmake) diff --git a/include/clblast_blas.h b/include/clblast_netlib_c.h similarity index 99% rename from include/clblast_blas.h rename to include/clblast_netlib_c.h index ff560712..c233646e 100644 --- a/include/clblast_blas.h +++ b/include/clblast_netlib_c.h @@ -13,8 +13,8 @@ // // ================================================================================================= -#ifndef CLBLAST_CLBLAST_BLAS_H_ -#define CLBLAST_CLBLAST_BLAS_H_ +#ifndef CLBLAST_CLBLAST_NETLIB_C_H_ +#define CLBLAST_CLBLAST_NETLIB_C_H_ // Exports library functions under Windows when building a DLL. 
See also: // https://msdn.microsoft.com/en-us/library/a90k134d.aspx @@ -924,5 +924,5 @@ void PUBLIC_API cblas_zomatcopy(const CLBlastLayout layout, const CLBlastTranspo } // extern "C" #endif -// CLBLAST_CLBLAST_BLAS_H_ +// CLBLAST_CLBLAST_NETLIB_C_H_ #endif diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 65d40877..1a467340 100755 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -12,8 +12,8 @@ # clblast.cpp # clblast_c.h # clblast_c.cpp -# clblast_blas.h -# clblast_blas.cpp +# clblast_netlib_c.h +# clblast_netlib_c.cpp # wrapper_clblas.h # wrapper_cblas.h # It also generates the main functions for the correctness and performance tests as found in @@ -38,8 +38,8 @@ FILES = [ "/src/clblast_c.cpp", "/test/wrapper_clblas.hpp", "/test/wrapper_cblas.hpp", - "/include/clblast_blas.h", - "/src/clblast_blas.cpp", + "/include/clblast_netlib_c.h", + "/src/clblast_netlib_c.cpp", ] HEADER_LINES = [117, 73, 118, 22, 29, 41, 47, 32] FOOTER_LINES = [17, 80, 19, 18, 6, 6, 9, 2] @@ -205,9 +205,9 @@ def main(argv): if i == 5: body += cpp.wrapper_cblas(routine) if i == 6: - body += cpp.clblast_blas_h(routine) + body += cpp.clblast_netlib_c_h(routine) if i == 7: - body += cpp.clblast_blas_cc(routine) + body += cpp.clblast_netlib_c_cc(routine) f.write("".join(file_header)) f.write(body) f.write("".join(file_footer)) diff --git a/scripts/generator/generator/cpp.py b/scripts/generator/generator/cpp.py index 60e29a07..9d4ef6c4 100644 --- a/scripts/generator/generator/cpp.py +++ b/scripts/generator/generator/cpp.py @@ -95,7 +95,7 @@ def clblast_c_cc(routine): return result -def clblast_blas_h(routine): +def clblast_netlib_c_h(routine): """The Netlib CBLAS API header (.h)""" result = NL + "// " + routine.description + ": " + routine.short_names() + NL for flavour in routine.flavours: @@ -104,7 +104,7 @@ def clblast_blas_h(routine): return result -def clblast_blas_cc(routine): +def clblast_netlib_c_cc(routine): """The Netlib CBLAS 
API implementation (.cpp)""" result = NL + "// " + routine.name.upper() + NL for flavour in routine.flavours: diff --git a/src/clblast_blas.cpp b/src/clblast_netlib_c.cpp similarity index 99% rename from src/clblast_blas.cpp rename to src/clblast_netlib_c.cpp index 6cc14583..203a3423 100644 --- a/src/clblast_blas.cpp +++ b/src/clblast_netlib_c.cpp @@ -15,7 +15,7 @@ #include -#include "clblast_blas.h" +#include "clblast_netlib_c.h" #include "clblast.h" #include "utilities/utilities.hpp" From bb14a5880efea3bb8a80a53bf45fc0c5378d5db6 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Tue, 25 Oct 2016 20:37:33 +0200 Subject: [PATCH 09/15] Added an example and documentation for the Netlib CBLAS API --- CHANGELOG | 1 + CMakeLists.txt | 2 +- README.md | 4 +++ samples/sgemm_netlib.c | 69 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 samples/sgemm_netlib.c diff --git a/CHANGELOG b/CHANGELOG index 48305f03..efe614cb 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,7 @@ Development version (next release) - Updated to version 8.0 of the CLCudaAPI C++11 OpenCL header - Changed the enums in the C API to avoid potential name clashes with external code +- Added a Netlib CBLAS compatible API (not recommended for full control over performance) - Greatly improved the way exceptions are handled in the library (thanks to 'intelfx') - Improved performance of GEMM kernels for small sizes by using a direct single-kernel implementation - Fixed a bug in the tests and samples related to waiting for an invalid event diff --git a/CMakeLists.txt b/CMakeLists.txt index aa1e287e..aaac87f2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -150,7 +150,7 @@ endif() set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger xgemm xgemm_direct xgemv) set(SAMPLE_PROGRAMS_CPP sgemm) -set(SAMPLE_PROGRAMS_C sasum dgemv sgemm haxpy cache) +set(SAMPLE_PROGRAMS_C sasum dgemv sgemm sgemm_netlib haxpy cache) 
set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax) set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2) diff --git a/README.md b/README.md index 9b289448..20a320d3 100644 --- a/README.md +++ b/README.md @@ -90,6 +90,10 @@ Or alternatively the plain C version: #include +There is also a Netlib CBLAS C API available. This is however not recommended for full control over performance, since at every call it will copy all buffers to and from the OpenCL device. Especially for level 1 and level 2 BLAS functions performance will be impacted severely. However, it can be useful if you don't want to touch OpenCL at all. You can set the default device and platform by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environment variables. This API can be used as follows: + + #include + Afterwards, any of CLBlast's routines can be called directly: there is no need to initialize the library. The available routines and the required arguments are described in the above mentioned include files and the included [API documentation](doc/clblast.md). Additionally, a couple of stand-alone example programs are included in the `samples` subfolder. They can optionally be compiled using the CMake infrastructure of CLBlast by providing the `-DSAMPLES=ON` flag, for example as follows: cmake -DSAMPLES=ON .. diff --git a/samples/sgemm_netlib.c b/samples/sgemm_netlib.c new file mode 100644 index 00000000..0c8f76e9 --- /dev/null +++ b/samples/sgemm_netlib.c @@ -0,0 +1,69 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line.
+// +// Author(s): +// Cedric Nugteren +// +// This file demonstrates the use of the Netlib CBLAS API of the CLBlast library. This API is not +// recommended if you want full control over performance: it will internally copy buffers from and +// to the OpenCL device. +// +// Note that this example is meant for illustration purposes only. CLBlast provides other programs +// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx'). +// +// ================================================================================================= + +#include +#include +#include + +// Includes the CLBlast library (Netlib CBLAS interface) +#include + +// ================================================================================================= + +// Example use of the single-precision routine SGEMM +int main(void) { + + // Example SGEMM arguments + const int m = 128; + const int n = 64; + const int k = 512; + const float alpha = 0.7f; + const float beta = 1.0f; + const int a_ld = k; + const int b_ld = n; + const int c_ld = n; + + // Populate host matrices with some example data + float* host_a = (float*)malloc(sizeof(float)*m*k); + float* host_b = (float*)malloc(sizeof(float)*n*k); + float* host_c = (float*)malloc(sizeof(float)*m*n); + for (int i=0; i Date: Sun, 20 Nov 2016 21:36:57 +0100 Subject: [PATCH 10/15] Made functions with scalar-buffers as output properly return values --- include/clblast_netlib_c.h | 162 ++++++------- scripts/generator/generator/cpp.py | 27 ++- scripts/generator/generator/routine.py | 18 +- src/clblast_netlib_c.cpp | 300 ++++++++++++------------- 4 files changed, 252 insertions(+), 255 deletions(-) diff --git a/include/clblast_netlib_c.h b/include/clblast_netlib_c.h index c233646e..0a38abb2 100644 --- a/include/clblast_netlib_c.h +++ b/include/clblast_netlib_c.h @@ -155,118 +155,88 @@ void PUBLIC_API cblas_zaxpy(const int n, void* y, const int y_inc); // Dot product of two vectors: SDOT/DDOT/HDOT -void PUBLIC_API 
cblas_sdot(const int n, - float* dot, - const float* x, const int x_inc, - const float* y, const int y_inc); -void PUBLIC_API cblas_ddot(const int n, - double* dot, - const double* x, const int x_inc, - const double* y, const int y_inc); +float PUBLIC_API cblas_sdot(const int n, + const float* x, const int x_inc, + const float* y, const int y_inc); +double PUBLIC_API cblas_ddot(const int n, + const double* x, const int x_inc, + const double* y, const int y_inc); // Dot product of two complex vectors: CDOTU/ZDOTU -void PUBLIC_API cblas_cdotu(const int n, - void* dot, - const void* x, const int x_inc, - const void* y, const int y_inc); -void PUBLIC_API cblas_zdotu(const int n, - void* dot, - const void* x, const int x_inc, - const void* y, const int y_inc); +float PUBLIC_API cblas_cdotu(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc); +double PUBLIC_API cblas_zdotu(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc); // Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC -void PUBLIC_API cblas_cdotc(const int n, - void* dot, - const void* x, const int x_inc, - const void* y, const int y_inc); -void PUBLIC_API cblas_zdotc(const int n, - void* dot, - const void* x, const int x_inc, - const void* y, const int y_inc); +float PUBLIC_API cblas_cdotc(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc); +double PUBLIC_API cblas_zdotc(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc); // Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 -void PUBLIC_API cblas_snrm2(const int n, - float* nrm2, - const float* x, const int x_inc); -void PUBLIC_API cblas_dnrm2(const int n, - double* nrm2, - const double* x, const int x_inc); -void PUBLIC_API cblas_scnrm2(const int n, - void* nrm2, - const void* x, const int x_inc); -void PUBLIC_API cblas_dznrm2(const int n, - void* nrm2, - const void* x, const int x_inc); +float PUBLIC_API 
cblas_snrm2(const int n, + const float* x, const int x_inc); +double PUBLIC_API cblas_dnrm2(const int n, + const double* x, const int x_inc); +float PUBLIC_API cblas_scnrm2(const int n, + const void* x, const int x_inc); +double PUBLIC_API cblas_dznrm2(const int n, + const void* x, const int x_inc); // Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM -void PUBLIC_API cblas_sasum(const int n, - float* asum, - const float* x, const int x_inc); -void PUBLIC_API cblas_dasum(const int n, - double* asum, - const double* x, const int x_inc); -void PUBLIC_API cblas_scasum(const int n, - void* asum, - const void* x, const int x_inc); -void PUBLIC_API cblas_dzasum(const int n, - void* asum, - const void* x, const int x_inc); +float PUBLIC_API cblas_sasum(const int n, + const float* x, const int x_inc); +double PUBLIC_API cblas_dasum(const int n, + const double* x, const int x_inc); +float PUBLIC_API cblas_scasum(const int n, + const void* x, const int x_inc); +double PUBLIC_API cblas_dzasum(const int n, + const void* x, const int x_inc); // Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM -void PUBLIC_API cblas_ssum(const int n, - float* sum, - const float* x, const int x_inc); -void PUBLIC_API cblas_dsum(const int n, - double* sum, - const double* x, const int x_inc); -void PUBLIC_API cblas_scsum(const int n, - void* sum, - const void* x, const int x_inc); -void PUBLIC_API cblas_dzsum(const int n, - void* sum, - const void* x, const int x_inc); +float PUBLIC_API cblas_ssum(const int n, + const float* x, const int x_inc); +double PUBLIC_API cblas_dsum(const int n, + const double* x, const int x_inc); +float PUBLIC_API cblas_scsum(const int n, + const void* x, const int x_inc); +double PUBLIC_API cblas_dzsum(const int n, + const void* x, const int x_inc); // Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX -void PUBLIC_API cblas_isamax(const int n, - float* imax, - const float* x, const int x_inc); 
-void PUBLIC_API cblas_idamax(const int n, - double* imax, - const double* x, const int x_inc); -void PUBLIC_API cblas_icamax(const int n, - void* imax, - const void* x, const int x_inc); -void PUBLIC_API cblas_izamax(const int n, - void* imax, - const void* x, const int x_inc); +int PUBLIC_API cblas_isamax(const int n, + const float* x, const int x_inc); +int PUBLIC_API cblas_idamax(const int n, + const double* x, const int x_inc); +int PUBLIC_API cblas_icamax(const int n, + const void* x, const int x_inc); +int PUBLIC_API cblas_izamax(const int n, + const void* x, const int x_inc); // Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX -void PUBLIC_API cblas_ismax(const int n, - float* imax, - const float* x, const int x_inc); -void PUBLIC_API cblas_idmax(const int n, - double* imax, - const double* x, const int x_inc); -void PUBLIC_API cblas_icmax(const int n, - void* imax, - const void* x, const int x_inc); -void PUBLIC_API cblas_izmax(const int n, - void* imax, - const void* x, const int x_inc); +int PUBLIC_API cblas_ismax(const int n, + const float* x, const int x_inc); +int PUBLIC_API cblas_idmax(const int n, + const double* x, const int x_inc); +int PUBLIC_API cblas_icmax(const int n, + const void* x, const int x_inc); +int PUBLIC_API cblas_izmax(const int n, + const void* x, const int x_inc); // Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN -void PUBLIC_API cblas_ismin(const int n, - float* imin, - const float* x, const int x_inc); -void PUBLIC_API cblas_idmin(const int n, - double* imin, - const double* x, const int x_inc); -void PUBLIC_API cblas_icmin(const int n, - void* imin, - const void* x, const int x_inc); -void PUBLIC_API cblas_izmin(const int n, - void* imin, - const void* x, const int x_inc); +int PUBLIC_API cblas_ismin(const int n, + const float* x, const int x_inc); +int PUBLIC_API cblas_idmin(const int n, + const double* x, const int x_inc); +int PUBLIC_API 
cblas_icmin(const int n, + const void* x, const int x_inc); +int PUBLIC_API cblas_izmin(const int n, + const void* x, const int x_inc); // ================================================================================================= // BLAS level-2 (matrix-vector) routines diff --git a/scripts/generator/generator/cpp.py b/scripts/generator/generator/cpp.py index 9d4ef6c4..7b7ece22 100644 --- a/scripts/generator/generator/cpp.py +++ b/scripts/generator/generator/cpp.py @@ -100,7 +100,7 @@ def clblast_netlib_c_h(routine): result = NL + "// " + routine.description + ": " + routine.short_names() + NL for flavour in routine.flavours: if flavour.precision_name in ["S", "D", "C", "Z"]: - result += routine.routine_header_netlib(flavour, 24, " PUBLIC_API") + ";" + NL + result += routine.routine_header_netlib(flavour, 20, " PUBLIC_API") + ";" + NL return result @@ -113,7 +113,7 @@ def clblast_netlib_c_cc(routine): if flavour.precision_name in ["S", "D", "C", "Z"]: template = "<" + flavour.template + ">" if routine.no_scalars() else "" indent = " " * (21 + routine.length() + len(template)) - result += routine.routine_header_netlib(flavour, 13, "") + " {" + NL + result += routine.routine_header_netlib(flavour, 9, "") + " {" + NL # Initialize OpenCL result += " auto device = get_device();" + NL @@ -127,10 +127,13 @@ def clblast_netlib_c_cc(routine): for i, name in enumerate(routine.inputs + routine.outputs): result += " " + routine.set_size(name, routine.buffer_sizes[i]) + NL for i, name in enumerate(routine.inputs + routine.outputs): - result += " " + routine.create_buffer(name, flavour.buffer_type) + NL + buffer_type = routine.get_buffer_type(name, flavour) + result += " " + routine.create_buffer(name, buffer_type) + NL for name in routine.inputs + routine.outputs: - prefix = "" if name in routine.outputs else "const " - result += " " + routine.write_buffer(name, prefix + flavour.buffer_type) + NL + if name not in routine.scalar_buffers_first(): + prefix = "" if name in 
routine.outputs else "const " + buffer_type = routine.get_buffer_type(name, flavour) + result += " " + routine.write_buffer(name, prefix + buffer_type) + NL # The function call result += " auto queue_cl = queue();" + NL @@ -145,7 +148,19 @@ def clblast_netlib_c_cc(routine): # Copy back and clean-up for name in routine.outputs: - result += " " + routine.read_buffer(name, flavour.buffer_type) + NL + if name in routine.scalar_buffers_first(): + buffer_type = routine.get_buffer_type(name, flavour) + result += " " + buffer_type + " " + name + "[" + name + "_size];" + NL + for name in routine.outputs: + buffer_type = routine.get_buffer_type(name, flavour) + result += " " + routine.read_buffer(name, buffer_type) + NL + for name in routine.outputs: + if name in routine.scalar_buffers_first(): + result += " return " + name + "[0]" + if flavour.buffer_type in ["float2", "double2"]: + if name not in routine.index_buffers(): + result += ".real()" + result += ";" + NL result += "}" + NL return result diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py index 097376ad..391cf3e0 100644 --- a/scripts/generator/generator/routine.py +++ b/scripts/generator/generator/routine.py @@ -109,6 +109,11 @@ class Routine: """List of buffers without 'inc' or 'ld'""" return self.scalar_buffers_first() + self.scalar_buffers_second() + ["ap"] + def get_buffer_type(self, name, flavour): + if name in self.index_buffers(): + return "int" + return flavour.buffer_type + def length(self): """Retrieves the number of characters in the routine's name""" return len(self.name) @@ -549,7 +554,6 @@ class Routine: def arguments_def_netlib(self, flavour): """As above, but for the Netlib CBLAS API""" return (self.options_def_c() + self.sizes_def_netlib() + - list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.scalar_buffers_first()])) + self.scalar_def_void("alpha", flavour) + list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.buffers_first()])) + 
self.scalar_def_void("beta", flavour) + @@ -645,8 +649,16 @@ class Routine: def routine_header_netlib(self, flavour, spaces, extra_qualifier): """As above, but now for the original Netlib CBLAS API""" - indent = " " * (spaces + self.length()) - result = "void" + extra_qualifier + " cblas_" + flavour.name.lower() + self.name + "(" + return_type = "void" + for output in self.outputs: + if output in self.index_buffers(): + return_type = "int" + break + if output in self.scalar_buffers_first(): + return_type = flavour.buffer_type.replace("2", "") + break + indent = " " * (spaces + len(return_type) + self.length()) + result = return_type + extra_qualifier + " cblas_" + flavour.name.lower() + self.name + "(" result += (",\n" + indent).join([a for a in self.arguments_def_netlib(flavour)]) + ")" return result diff --git a/src/clblast_netlib_c.cpp b/src/clblast_netlib_c.cpp index 203a3423..efff1712 100644 --- a/src/clblast_netlib_c.cpp +++ b/src/clblast_netlib_c.cpp @@ -666,10 +666,9 @@ void cblas_zaxpy(const int n, } // DOT -void cblas_sdot(const int n, - float* dot, - const float* x, const int x_inc, - const float* y, const int y_inc) { +float cblas_sdot(const int n, + const float* x, const int x_inc, + const float* y, const int y_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -681,7 +680,6 @@ void cblas_sdot(const int n, auto dot_buffer = clblast::Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); - dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); auto queue_cl = queue(); auto s = clblast::Dot(n, dot_buffer(), 0, @@ -691,12 +689,13 @@ void cblas_sdot(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } + float dot[dot_size]; dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); + return dot[0]; } -void 
cblas_ddot(const int n, - double* dot, - const double* x, const int x_inc, - const double* y, const int y_inc) { +double cblas_ddot(const int n, + const double* x, const int x_inc, + const double* y, const int y_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -708,7 +707,6 @@ void cblas_ddot(const int n, auto dot_buffer = clblast::Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); - dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); auto queue_cl = queue(); auto s = clblast::Dot(n, dot_buffer(), 0, @@ -718,14 +716,15 @@ void cblas_ddot(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } + double dot[dot_size]; dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); + return dot[0]; } // DOTU -void cblas_cdotu(const int n, - void* dot, - const void* x, const int x_inc, - const void* y, const int y_inc) { +float cblas_cdotu(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -737,7 +736,6 @@ void cblas_cdotu(const int n, auto dot_buffer = clblast::Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); - dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); auto queue_cl = queue(); auto s = clblast::Dotu(n, dot_buffer(), 0, @@ -747,12 +745,13 @@ void cblas_cdotu(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } + float2 dot[dot_size]; dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); + return dot[0].real(); } -void cblas_zdotu(const int n, - void* dot, - const void* x, const int x_inc, - const 
void* y, const int y_inc) { +double cblas_zdotu(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -764,7 +763,6 @@ void cblas_zdotu(const int n, auto dot_buffer = clblast::Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); - dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); auto queue_cl = queue(); auto s = clblast::Dotu(n, dot_buffer(), 0, @@ -774,14 +772,15 @@ void cblas_zdotu(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } + double2 dot[dot_size]; dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); + return dot[0].real(); } // DOTC -void cblas_cdotc(const int n, - void* dot, - const void* x, const int x_inc, - const void* y, const int y_inc) { +float cblas_cdotc(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -793,7 +792,6 @@ void cblas_cdotc(const int n, auto dot_buffer = clblast::Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); - dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); auto queue_cl = queue(); auto s = clblast::Dotc(n, dot_buffer(), 0, @@ -803,12 +801,13 @@ void cblas_cdotc(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } + float2 dot[dot_size]; dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); + return dot[0].real(); } -void cblas_zdotc(const int n, - void* dot, - const void* x, const int x_inc, - const void* y, const int y_inc) { +double cblas_zdotc(const int n, + const void* x, 
const int x_inc, + const void* y, const int y_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -820,7 +819,6 @@ void cblas_zdotc(const int n, auto dot_buffer = clblast::Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); - dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); auto queue_cl = queue(); auto s = clblast::Dotc(n, dot_buffer(), 0, @@ -830,13 +828,14 @@ void cblas_zdotc(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } + double2 dot[dot_size]; dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); + return dot[0].real(); } // NRM2 -void cblas_snrm2(const int n, - float* nrm2, - const float* x, const int x_inc) { +float cblas_snrm2(const int n, + const float* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -845,7 +844,6 @@ void cblas_snrm2(const int n, auto x_buffer = clblast::Buffer(context, x_size); auto nrm2_buffer = clblast::Buffer(context, nrm2_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); auto queue_cl = queue(); auto s = clblast::Nrm2(n, nrm2_buffer(), 0, @@ -854,11 +852,12 @@ void cblas_snrm2(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } + float nrm2[nrm2_size]; nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2)); + return nrm2[0]; } -void cblas_dnrm2(const int n, - double* nrm2, - const double* x, const int x_inc) { +double cblas_dnrm2(const int n, + const double* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -867,7 +866,6 @@ void 
cblas_dnrm2(const int n, auto x_buffer = clblast::Buffer(context, x_size); auto nrm2_buffer = clblast::Buffer(context, nrm2_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); auto queue_cl = queue(); auto s = clblast::Nrm2(n, nrm2_buffer(), 0, @@ -876,11 +874,12 @@ void cblas_dnrm2(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } + double nrm2[nrm2_size]; nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2)); + return nrm2[0]; } -void cblas_scnrm2(const int n, - void* nrm2, - const void* x, const int x_inc) { +float cblas_scnrm2(const int n, + const void* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -889,7 +888,6 @@ void cblas_scnrm2(const int n, auto x_buffer = clblast::Buffer(context, x_size); auto nrm2_buffer = clblast::Buffer(context, nrm2_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); auto queue_cl = queue(); auto s = clblast::Nrm2(n, nrm2_buffer(), 0, @@ -898,11 +896,12 @@ void cblas_scnrm2(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } + float2 nrm2[nrm2_size]; nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2)); + return nrm2[0].real(); } -void cblas_dznrm2(const int n, - void* nrm2, - const void* x, const int x_inc) { +double cblas_dznrm2(const int n, + const void* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -911,7 +910,6 @@ void cblas_dznrm2(const int n, auto x_buffer = clblast::Buffer(context, x_size); auto nrm2_buffer = clblast::Buffer(context, nrm2_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - 
nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); auto queue_cl = queue(); auto s = clblast::Nrm2(n, nrm2_buffer(), 0, @@ -920,13 +918,14 @@ void cblas_dznrm2(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } + double2 nrm2[nrm2_size]; nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2)); + return nrm2[0].real(); } // ASUM -void cblas_sasum(const int n, - float* asum, - const float* x, const int x_inc) { +float cblas_sasum(const int n, + const float* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -935,7 +934,6 @@ void cblas_sasum(const int n, auto x_buffer = clblast::Buffer(context, x_size); auto asum_buffer = clblast::Buffer(context, asum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); auto queue_cl = queue(); auto s = clblast::Asum(n, asum_buffer(), 0, @@ -944,11 +942,12 @@ void cblas_sasum(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } + float asum[asum_size]; asum_buffer.Read(queue, asum_size, reinterpret_cast(asum)); + return asum[0]; } -void cblas_dasum(const int n, - double* asum, - const double* x, const int x_inc) { +double cblas_dasum(const int n, + const double* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -957,7 +956,6 @@ void cblas_dasum(const int n, auto x_buffer = clblast::Buffer(context, x_size); auto asum_buffer = clblast::Buffer(context, asum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); auto queue_cl = queue(); auto s = clblast::Asum(n, asum_buffer(), 0, @@ -966,11 +964,12 @@ void cblas_dasum(const int n, if (s 
!= clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } + double asum[asum_size]; asum_buffer.Read(queue, asum_size, reinterpret_cast(asum)); + return asum[0]; } -void cblas_scasum(const int n, - void* asum, - const void* x, const int x_inc) { +float cblas_scasum(const int n, + const void* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -979,7 +978,6 @@ void cblas_scasum(const int n, auto x_buffer = clblast::Buffer(context, x_size); auto asum_buffer = clblast::Buffer(context, asum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); auto queue_cl = queue(); auto s = clblast::Asum(n, asum_buffer(), 0, @@ -988,11 +986,12 @@ void cblas_scasum(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } + float2 asum[asum_size]; asum_buffer.Read(queue, asum_size, reinterpret_cast(asum)); + return asum[0].real(); } -void cblas_dzasum(const int n, - void* asum, - const void* x, const int x_inc) { +double cblas_dzasum(const int n, + const void* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -1001,7 +1000,6 @@ void cblas_dzasum(const int n, auto x_buffer = clblast::Buffer(context, x_size); auto asum_buffer = clblast::Buffer(context, asum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); auto queue_cl = queue(); auto s = clblast::Asum(n, asum_buffer(), 0, @@ -1010,13 +1008,14 @@ void cblas_dzasum(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } + double2 asum[asum_size]; asum_buffer.Read(queue, asum_size, 
reinterpret_cast(asum)); + return asum[0].real(); } // SUM -void cblas_ssum(const int n, - float* sum, - const float* x, const int x_inc) { +float cblas_ssum(const int n, + const float* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -1025,7 +1024,6 @@ void cblas_ssum(const int n, auto x_buffer = clblast::Buffer(context, x_size); auto sum_buffer = clblast::Buffer(context, sum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); auto queue_cl = queue(); auto s = clblast::Sum(n, sum_buffer(), 0, @@ -1034,11 +1032,12 @@ void cblas_ssum(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } + float sum[sum_size]; sum_buffer.Read(queue, sum_size, reinterpret_cast(sum)); + return sum[0]; } -void cblas_dsum(const int n, - double* sum, - const double* x, const int x_inc) { +double cblas_dsum(const int n, + const double* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -1047,7 +1046,6 @@ void cblas_dsum(const int n, auto x_buffer = clblast::Buffer(context, x_size); auto sum_buffer = clblast::Buffer(context, sum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); auto queue_cl = queue(); auto s = clblast::Sum(n, sum_buffer(), 0, @@ -1056,11 +1054,12 @@ void cblas_dsum(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } + double sum[sum_size]; sum_buffer.Read(queue, sum_size, reinterpret_cast(sum)); + return sum[0]; } -void cblas_scsum(const int n, - void* sum, - const void* x, const int x_inc) { +float cblas_scsum(const int n, + const void* x, const int x_inc) { auto device = get_device(); 
auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -1069,7 +1068,6 @@ void cblas_scsum(const int n, auto x_buffer = clblast::Buffer(context, x_size); auto sum_buffer = clblast::Buffer(context, sum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); auto queue_cl = queue(); auto s = clblast::Sum(n, sum_buffer(), 0, @@ -1078,11 +1076,12 @@ void cblas_scsum(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } + float2 sum[sum_size]; sum_buffer.Read(queue, sum_size, reinterpret_cast(sum)); + return sum[0].real(); } -void cblas_dzsum(const int n, - void* sum, - const void* x, const int x_inc) { +double cblas_dzsum(const int n, + const void* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -1091,7 +1090,6 @@ void cblas_dzsum(const int n, auto x_buffer = clblast::Buffer(context, x_size); auto sum_buffer = clblast::Buffer(context, sum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); auto queue_cl = queue(); auto s = clblast::Sum(n, sum_buffer(), 0, @@ -1100,22 +1098,22 @@ void cblas_dzsum(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } + double2 sum[sum_size]; sum_buffer.Read(queue, sum_size, reinterpret_cast(sum)); + return sum[0].real(); } // AMAX -void cblas_isamax(const int n, - float* imax, - const float* x, const int x_inc) { +int cblas_isamax(const int n, + const float* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imax_size = 1; auto x_buffer = clblast::Buffer(context, x_size); - auto imax_buffer = 
clblast::Buffer(context, imax_size); + auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); auto queue_cl = queue(); auto s = clblast::Amax(n, imax_buffer(), 0, @@ -1124,20 +1122,20 @@ void cblas_isamax(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } - imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); + int imax[imax_size]; + imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); + return imax[0]; } -void cblas_idamax(const int n, - double* imax, - const double* x, const int x_inc) { +int cblas_idamax(const int n, + const double* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imax_size = 1; auto x_buffer = clblast::Buffer(context, x_size); - auto imax_buffer = clblast::Buffer(context, imax_size); + auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); auto queue_cl = queue(); auto s = clblast::Amax(n, imax_buffer(), 0, @@ -1146,20 +1144,20 @@ void cblas_idamax(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } - imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); + int imax[imax_size]; + imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); + return imax[0]; } -void cblas_icamax(const int n, - void* imax, - const void* x, const int x_inc) { +int cblas_icamax(const int n, + const void* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imax_size = 1; auto x_buffer = 
clblast::Buffer(context, x_size); - auto imax_buffer = clblast::Buffer(context, imax_size); + auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); auto queue_cl = queue(); auto s = clblast::Amax(n, imax_buffer(), 0, @@ -1168,20 +1166,20 @@ void cblas_icamax(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } - imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); + int imax[imax_size]; + imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); + return imax[0]; } -void cblas_izamax(const int n, - void* imax, - const void* x, const int x_inc) { +int cblas_izamax(const int n, + const void* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imax_size = 1; auto x_buffer = clblast::Buffer(context, x_size); - auto imax_buffer = clblast::Buffer(context, imax_size); + auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); auto queue_cl = queue(); auto s = clblast::Amax(n, imax_buffer(), 0, @@ -1190,22 +1188,22 @@ void cblas_izamax(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } - imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); + int imax[imax_size]; + imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); + return imax[0]; } // MAX -void cblas_ismax(const int n, - float* imax, - const float* x, const int x_inc) { +int cblas_ismax(const int n, + const float* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n; 
const auto imax_size = 1; auto x_buffer = clblast::Buffer(context, x_size); - auto imax_buffer = clblast::Buffer(context, imax_size); + auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); auto queue_cl = queue(); auto s = clblast::Max(n, imax_buffer(), 0, @@ -1214,20 +1212,20 @@ void cblas_ismax(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } - imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); + int imax[imax_size]; + imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); + return imax[0]; } -void cblas_idmax(const int n, - double* imax, - const double* x, const int x_inc) { +int cblas_idmax(const int n, + const double* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imax_size = 1; auto x_buffer = clblast::Buffer(context, x_size); - auto imax_buffer = clblast::Buffer(context, imax_size); + auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); auto queue_cl = queue(); auto s = clblast::Max(n, imax_buffer(), 0, @@ -1236,20 +1234,20 @@ void cblas_idmax(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } - imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); + int imax[imax_size]; + imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); + return imax[0]; } -void cblas_icmax(const int n, - void* imax, - const void* x, const int x_inc) { +int cblas_icmax(const int n, + const void* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, 
device); const auto x_size = n; const auto imax_size = 1; auto x_buffer = clblast::Buffer(context, x_size); - auto imax_buffer = clblast::Buffer(context, imax_size); + auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); auto queue_cl = queue(); auto s = clblast::Max(n, imax_buffer(), 0, @@ -1258,20 +1256,20 @@ void cblas_icmax(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } - imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); + int imax[imax_size]; + imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); + return imax[0]; } -void cblas_izmax(const int n, - void* imax, - const void* x, const int x_inc) { +int cblas_izmax(const int n, + const void* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imax_size = 1; auto x_buffer = clblast::Buffer(context, x_size); - auto imax_buffer = clblast::Buffer(context, imax_size); + auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); auto queue_cl = queue(); auto s = clblast::Max(n, imax_buffer(), 0, @@ -1280,22 +1278,22 @@ void cblas_izmax(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } - imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); + int imax[imax_size]; + imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); + return imax[0]; } // MIN -void cblas_ismin(const int n, - float* imin, - const float* x, const int x_inc) { +int cblas_ismin(const int n, + const float* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto 
queue = clblast::Queue(context, device); const auto x_size = n; const auto imin_size = 1; auto x_buffer = clblast::Buffer(context, x_size); - auto imin_buffer = clblast::Buffer(context, imin_size); + auto imin_buffer = clblast::Buffer(context, imin_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - imin_buffer.Write(queue, imin_size, reinterpret_cast(imin)); auto queue_cl = queue(); auto s = clblast::Min(n, imin_buffer(), 0, @@ -1304,20 +1302,20 @@ void cblas_ismin(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } - imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); + int imin[imin_size]; + imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); + return imin[0]; } -void cblas_idmin(const int n, - double* imin, - const double* x, const int x_inc) { +int cblas_idmin(const int n, + const double* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imin_size = 1; auto x_buffer = clblast::Buffer(context, x_size); - auto imin_buffer = clblast::Buffer(context, imin_size); + auto imin_buffer = clblast::Buffer(context, imin_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - imin_buffer.Write(queue, imin_size, reinterpret_cast(imin)); auto queue_cl = queue(); auto s = clblast::Min(n, imin_buffer(), 0, @@ -1326,20 +1324,20 @@ void cblas_idmin(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } - imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); + int imin[imin_size]; + imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); + return imin[0]; } -void cblas_icmin(const int n, - void* imin, - const void* x, const int x_inc) { +int cblas_icmin(const int n, + const void* x, const int x_inc) { auto device = get_device(); auto context = 
clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imin_size = 1; auto x_buffer = clblast::Buffer(context, x_size); - auto imin_buffer = clblast::Buffer(context, imin_size); + auto imin_buffer = clblast::Buffer(context, imin_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - imin_buffer.Write(queue, imin_size, reinterpret_cast(imin)); auto queue_cl = queue(); auto s = clblast::Min(n, imin_buffer(), 0, @@ -1348,20 +1346,20 @@ void cblas_icmin(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } - imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); + int imin[imin_size]; + imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); + return imin[0]; } -void cblas_izmin(const int n, - void* imin, - const void* x, const int x_inc) { +int cblas_izmin(const int n, + const void* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imin_size = 1; auto x_buffer = clblast::Buffer(context, x_size); - auto imin_buffer = clblast::Buffer(context, imin_size); + auto imin_buffer = clblast::Buffer(context, imin_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - imin_buffer.Write(queue, imin_size, reinterpret_cast(imin)); auto queue_cl = queue(); auto s = clblast::Min(n, imin_buffer(), 0, @@ -1370,7 +1368,9 @@ void cblas_izmin(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } - imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); + int imin[imin_size]; + imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); + return imin[0]; } // ================================================================================================= From 26ca07148092b5d4fcb0e25190e07bf6acae25a3 Mon Sep 17 00:00:00 
2001 From: Cedric Nugteren Date: Tue, 22 Nov 2016 08:41:52 +0100 Subject: [PATCH 11/15] Minor changes to ensure full compatibility with the Netlib CBLAS API --- include/clblast_netlib_c.h | 50 ++++++++++++++++++-------- scripts/generator/generator.py | 2 +- scripts/generator/generator/cpp.py | 7 ++-- scripts/generator/generator/routine.py | 33 +++++++++++++---- src/clblast_netlib_c.cpp | 46 ++++++++++++------------ 5 files changed, 90 insertions(+), 48 deletions(-) diff --git a/include/clblast_netlib_c.h b/include/clblast_netlib_c.h index 0a38abb2..b5577cfa 100644 --- a/include/clblast_netlib_c.h +++ b/include/clblast_netlib_c.h @@ -46,6 +46,24 @@ typedef enum CLBlastDiagonal_ { CLBlastDiagonalNonUnit = 131, CLBlastDiagonalUnit = 132 } CLBlastDiagonal; typedef enum CLBlastSide_ { CLBlastSideLeft = 141, CLBlastSideRight = 142 } CLBlastSide; +// For full compatibility with CBLAS +typedef CLBlastLayout CBLAS_ORDER; +typedef CLBlastTranspose CBLAS_TRANSPOSE; +typedef CLBlastTriangle CBLAS_UPLO; +typedef CLBlastDiagonal CBLAS_DIAG; +typedef CLBlastSide CBLAS_SIDE; +#define CblasRowMajor CLBlastLayoutRowMajor +#define CblasColMajor CLBlastLayoutColMajor +#define CblasNoTrans CLBlastTransposeNo +#define CblasTrans CLBlastTransposeYes +#define CblasConjTrans CLBlastTransposeConjugate +#define CblasUpper CLBlastTriangleUpper +#define CblasLower CLBlastTriangleLower +#define CblasNonUnit CLBlastDiagonalNonUnit +#define CblasUnit CLBlastDiagonalUnit +#define CblasLeft CLBlastSideLeft +#define CblasRight CLBlastSideRight + // ================================================================================================= // BLAS level-1 (vector-vector) routines // ================================================================================================= @@ -64,12 +82,12 @@ void PUBLIC_API cblas_drotg(double* sa, void PUBLIC_API cblas_srotmg(float* sd1, float* sd2, float* sx1, - const float* sy1, + const float sy1, float* sparam); void PUBLIC_API cblas_drotmg(double* 
sd1, double* sd2, double* sx1, - const double* sy1, + const double sy1, double* sparam); // Apply givens plane rotation: SROT/DROT @@ -163,20 +181,24 @@ double PUBLIC_API cblas_ddot(const int n, const double* y, const int y_inc); // Dot product of two complex vectors: CDOTU/ZDOTU -float PUBLIC_API cblas_cdotu(const int n, - const void* x, const int x_inc, - const void* y, const int y_inc); -double PUBLIC_API cblas_zdotu(const int n, - const void* x, const int x_inc, - const void* y, const int y_inc); +void PUBLIC_API cblas_cdotu_sub(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* dot); +void PUBLIC_API cblas_zdotu_sub(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* dot); // Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC -float PUBLIC_API cblas_cdotc(const int n, - const void* x, const int x_inc, - const void* y, const int y_inc); -double PUBLIC_API cblas_zdotc(const int n, - const void* x, const int x_inc, - const void* y, const int y_inc); +void PUBLIC_API cblas_cdotc_sub(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* dot); +void PUBLIC_API cblas_zdotc_sub(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* dot); // Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 float PUBLIC_API cblas_snrm2(const int n, diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 1a467340..5f0bb0d4 100755 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -41,7 +41,7 @@ FILES = [ "/include/clblast_netlib_c.h", "/src/clblast_netlib_c.cpp", ] -HEADER_LINES = [117, 73, 118, 22, 29, 41, 47, 32] +HEADER_LINES = [117, 73, 118, 22, 29, 41, 65, 32] FOOTER_LINES = [17, 80, 19, 18, 6, 6, 9, 2] # Different possibilities for requirements diff --git a/scripts/generator/generator/cpp.py b/scripts/generator/generator/cpp.py index 7b7ece22..6bb3080f 100644 --- 
a/scripts/generator/generator/cpp.py +++ b/scripts/generator/generator/cpp.py @@ -112,6 +112,7 @@ def clblast_netlib_c_cc(routine): # There is a version available in CBLAS if flavour.precision_name in ["S", "D", "C", "Z"]: template = "<" + flavour.template + ">" if routine.no_scalars() else "" + name_postfix = "_sub" if routine.name in routine.routines_scalar_no_return() else "" indent = " " * (21 + routine.length() + len(template)) result += routine.routine_header_netlib(flavour, 9, "") + " {" + NL @@ -129,6 +130,8 @@ def clblast_netlib_c_cc(routine): for i, name in enumerate(routine.inputs + routine.outputs): buffer_type = routine.get_buffer_type(name, flavour) result += " " + routine.create_buffer(name, buffer_type) + NL + if name in routine.scalar_buffers_second_non_pointer(): + result += " " + buffer_type + " " + name + "_vec[1]; " + name + "_vec[0] = " + name + ";" + NL for name in routine.inputs + routine.outputs: if name not in routine.scalar_buffers_first(): prefix = "" if name in routine.outputs else "const " @@ -148,14 +151,14 @@ def clblast_netlib_c_cc(routine): # Copy back and clean-up for name in routine.outputs: - if name in routine.scalar_buffers_first(): + if name in routine.scalar_buffers_first() and routine.name not in routine.routines_scalar_no_return(): buffer_type = routine.get_buffer_type(name, flavour) result += " " + buffer_type + " " + name + "[" + name + "_size];" + NL for name in routine.outputs: buffer_type = routine.get_buffer_type(name, flavour) result += " " + routine.read_buffer(name, buffer_type) + NL for name in routine.outputs: - if name in routine.scalar_buffers_first(): + if name in routine.scalar_buffers_first() and routine.name not in routine.routines_scalar_no_return(): result += " return " + name + "[0]" if flavour.buffer_type in ["float2", "double2"]: if name not in routine.index_buffers(): diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py index 391cf3e0..6fcce23b 100644 --- 
a/scripts/generator/generator/routine.py +++ b/scripts/generator/generator/routine.py @@ -42,6 +42,11 @@ class Routine: """List of scalar buffers""" return ["sa", "sb", "sc", "ss", "sd1", "sd2", "sx1", "sy1", "sparam"] + @staticmethod + def scalar_buffers_second_non_pointer(): + """As above, but these ones are not passed as pointers but as scalars instead""" + return ["sy1"] + @staticmethod def other_scalars(): """List of scalars other than alpha and beta""" @@ -67,6 +72,10 @@ class Routine: """Distinguish between vectors and matrices""" return ["a", "b", "c", "ap"] + @staticmethod + def routines_scalar_no_return(): + return ["dotu", "dotc"] + @staticmethod def set_size(name, size): """Sets the size of a buffer""" @@ -77,10 +86,12 @@ class Routine: """Creates a new CLCudaAPI buffer""" return "auto " + name + "_buffer = clblast::Buffer<" + template + ">(context, " + name + "_size);" - @staticmethod - def write_buffer(name, template): + def write_buffer(self, name, template): """Writes to a CLCudaAPI buffer""" - data_structure = "reinterpret_cast<" + template + "*>(" + name + ")" + postfix = "" + if name in self.scalar_buffers_second_non_pointer(): + postfix = "_vec" + data_structure = "reinterpret_cast<" + template + "*>(" + name + postfix + ")" return name + "_buffer.Write(queue, " + name + "_size, " + data_structure + ");" @staticmethod @@ -206,7 +217,8 @@ class Routine: prefix = "const " if name in self.inputs else "" if name in self.inputs or name in self.outputs: data_type = "void" if flavour.is_non_standard() else flavour.buffer_type - a = [prefix + data_type + "* " + name + ""] + pointer = "" if name in self.scalar_buffers_second_non_pointer() else "*" + a = [prefix + data_type + pointer + " " + name + ""] c = ["const int " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else [] return [", ".join(a + c)] return [] @@ -553,13 +565,16 @@ class Routine: def arguments_def_netlib(self, flavour): """As above, but for the Netlib CBLAS 
API""" - return (self.options_def_c() + self.sizes_def_netlib() + + result=(self.options_def_c() + self.sizes_def_netlib() + self.scalar_def_void("alpha", flavour) + list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.buffers_first()])) + self.scalar_def_void("beta", flavour) + list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.buffers_second()])) + list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.scalar_buffers_second()])) + list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()]))) + if self.name in self.routines_scalar_no_return(): + result += list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.scalar_buffers_first()])) + return result def arguments_def_c(self, flavour): """As above, but for the C API""" @@ -654,11 +669,15 @@ class Routine: if output in self.index_buffers(): return_type = "int" break - if output in self.scalar_buffers_first(): + if output in self.scalar_buffers_first() and self.name not in self.routines_scalar_no_return(): return_type = flavour.buffer_type.replace("2", "") break indent = " " * (spaces + len(return_type) + self.length()) - result = return_type + extra_qualifier + " cblas_" + flavour.name.lower() + self.name + "(" + routine_name = self.name + if self.name in self.routines_scalar_no_return(): + routine_name += "_sub" + indent += " " + result = return_type + extra_qualifier + " cblas_" + flavour.name.lower() + routine_name + "(" result += (",\n" + indent).join([a for a in self.arguments_def_netlib(flavour)]) + ")" return result diff --git a/src/clblast_netlib_c.cpp b/src/clblast_netlib_c.cpp index efff1712..66852e31 100644 --- a/src/clblast_netlib_c.cpp +++ b/src/clblast_netlib_c.cpp @@ -107,7 +107,7 @@ void cblas_drotg(double* sa, void cblas_srotmg(float* sd1, float* sd2, float* sx1, - const float* sy1, + const float sy1, float* sparam) { auto device = get_device(); auto context = clblast::Context(device); @@ -118,11 +118,12 @@ void cblas_srotmg(float* sd1, const auto 
sx1_size = 1; const auto sparam_size = 1; auto sy1_buffer = clblast::Buffer(context, sy1_size); + float sy1_vec[1]; sy1_vec[0] = sy1; auto sd1_buffer = clblast::Buffer(context, sd1_size); auto sd2_buffer = clblast::Buffer(context, sd2_size); auto sx1_buffer = clblast::Buffer(context, sx1_size); auto sparam_buffer = clblast::Buffer(context, sparam_size); - sy1_buffer.Write(queue, sy1_size, reinterpret_cast(sy1)); + sy1_buffer.Write(queue, sy1_size, reinterpret_cast(sy1_vec)); sd1_buffer.Write(queue, sd1_size, reinterpret_cast(sd1)); sd2_buffer.Write(queue, sd2_size, reinterpret_cast(sd2)); sx1_buffer.Write(queue, sx1_size, reinterpret_cast(sx1)); @@ -145,7 +146,7 @@ void cblas_srotmg(float* sd1, void cblas_drotmg(double* sd1, double* sd2, double* sx1, - const double* sy1, + const double sy1, double* sparam) { auto device = get_device(); auto context = clblast::Context(device); @@ -156,11 +157,12 @@ void cblas_drotmg(double* sd1, const auto sx1_size = 1; const auto sparam_size = 1; auto sy1_buffer = clblast::Buffer(context, sy1_size); + double sy1_vec[1]; sy1_vec[0] = sy1; auto sd1_buffer = clblast::Buffer(context, sd1_size); auto sd2_buffer = clblast::Buffer(context, sd2_size); auto sx1_buffer = clblast::Buffer(context, sx1_size); auto sparam_buffer = clblast::Buffer(context, sparam_size); - sy1_buffer.Write(queue, sy1_size, reinterpret_cast(sy1)); + sy1_buffer.Write(queue, sy1_size, reinterpret_cast(sy1_vec)); sd1_buffer.Write(queue, sd1_size, reinterpret_cast(sd1)); sd2_buffer.Write(queue, sd2_size, reinterpret_cast(sd2)); sx1_buffer.Write(queue, sx1_size, reinterpret_cast(sx1)); @@ -722,9 +724,10 @@ double cblas_ddot(const int n, } // DOTU -float cblas_cdotu(const int n, - const void* x, const int x_inc, - const void* y, const int y_inc) { +void cblas_cdotu_sub(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* dot) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, 
device); @@ -745,13 +748,12 @@ float cblas_cdotu(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } - float2 dot[dot_size]; dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); - return dot[0].real(); } -double cblas_zdotu(const int n, - const void* x, const int x_inc, - const void* y, const int y_inc) { +void cblas_zdotu_sub(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* dot) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -772,15 +774,14 @@ double cblas_zdotu(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } - double2 dot[dot_size]; dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); - return dot[0].real(); } // DOTC -float cblas_cdotc(const int n, - const void* x, const int x_inc, - const void* y, const int y_inc) { +void cblas_cdotc_sub(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* dot) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -801,13 +802,12 @@ float cblas_cdotc(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } - float2 dot[dot_size]; dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); - return dot[0].real(); } -double cblas_zdotc(const int n, - const void* x, const int x_inc, - const void* y, const int y_inc) { +void cblas_zdotc_sub(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* dot) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -828,9 +828,7 @@ double cblas_zdotc(const int n, if (s != clblast::StatusCode::kSuccess) { throw 
std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } - double2 dot[dot_size]; dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); - return dot[0].real(); } // NRM2 From 654b41bb2bd6c3c15a1a9013a19a14c9f11da95c Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 23 Nov 2016 21:29:16 +0100 Subject: [PATCH 12/15] Fixed a bug in the HSCAL routine --- src/routines/level1/xscal.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/routines/level1/xscal.cpp b/src/routines/level1/xscal.cpp index 17410f01..0521b1e5 100644 --- a/src/routines/level1/xscal.cpp +++ b/src/routines/level1/xscal.cpp @@ -55,12 +55,12 @@ void Xscal::DoScal(const size_t n, const T alpha, // Sets the kernel arguments if (use_fast_kernel) { kernel.SetArgument(0, static_cast(n)); - kernel.SetArgument(1, alpha); + kernel.SetArgument(1, GetRealArg(alpha)); kernel.SetArgument(2, x_buffer()); } else { kernel.SetArgument(0, static_cast(n)); - kernel.SetArgument(1, alpha); + kernel.SetArgument(1, GetRealArg(alpha)); kernel.SetArgument(2, x_buffer()); kernel.SetArgument(3, static_cast(x_offset)); kernel.SetArgument(4, static_cast(x_inc)); From fa42befcc1e180555e164f4f7c1cf2c63d658baa Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 23 Nov 2016 21:33:35 +0100 Subject: [PATCH 13/15] Made compilation of the Netlib CBLAS API conditional --- CMakeLists.txt | 9 +++++++-- README.md | 8 ++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index aaac87f2..422be4e7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,6 +27,7 @@ option(SAMPLES "Enable compilation of the examples" OFF) option(TUNERS "Enable compilation of the tuners" OFF) option(CLIENTS "Enable compilation of the clients to test and compare performance" OFF) option(TESTS "Enable compilation of the correctness tests" OFF) +option(NETLIB "Enable compilation of the CBLAS Netlib API" OFF) # Compile in verbose mode with additional 
diagnostic messages option(VERBOSE "Compile in verbose mode for additional diagnostic messages" OFF) @@ -170,9 +171,11 @@ set(SOURCES src/cache.cpp src/clblast.cpp src/clblast_c.cpp - src/clblast_netlib_c.cpp src/routine.cpp ) +if(NETLIB) + set(SOURCES ${SOURCES} src/clblast_netlib_c.cpp) +endif() foreach(ROUTINE ${LEVEL1_ROUTINES}) set(SOURCES ${SOURCES} src/routines/level1/${ROUTINE}.cpp) endforeach() @@ -214,7 +217,9 @@ install(TARGETS clblast EXPORT CLBlast DESTINATION lib) install(FILES include/clblast.h DESTINATION include) install(FILES include/clblast_c.h DESTINATION include) install(FILES include/clblast_half.h DESTINATION include) -install(FILES include/clblast_netlib_c.h DESTINATION include) +if(NETLIB) + install(FILES include/clblast_netlib_c.h DESTINATION include) +endif() # Installs the config for find_package in dependent projects install(EXPORT CLBlast DESTINATION lib/cmake/CLBLast FILE CLBlastConfig.cmake) diff --git a/README.md b/README.md index 20a320d3..15b65c06 100644 --- a/README.md +++ b/README.md @@ -90,16 +90,16 @@ Or alternatively the plain C version: #include -There is also a Netlib CBLAS C API available. This is however not recommended for full control over performance, since at every call it will copy all buffers to and from the OpenCL device. Especially for level 1 and level 2 BLAS functions performance will be impacted severly. However, it can be useful if you don't want to touch OpenCL at all. You can set the default device and platform by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables. This API can be used as follows: - - #include - Afterwards, any of CLBlast's routines can be called directly: there is no need to initialize the library. The available routines and the required arguments are described in the above mentioned include files and the included [API documentation](doc/clblast.md). Additionally, a couple of stand-alone example programs are included in the `samples` subfolder. 
They can optionally be compiled using the CMake infrastructure of CLBlast by providing the `-DSAMPLES=ON` flag, for example as follows: cmake -DSAMPLES=ON .. Furthermore, it is possible to optionally set an OS environmental variable `CLBLAST_BUILD_OPTIONS` to pass specific build options to the OpenCL compiler. +There is also a Netlib CBLAS C API available. This is however not recommended for full control over performance, since at every call it will copy all buffers to and from the OpenCL device. Especially for level 1 and level 2 BLAS functions performance will be impacted severly. However, it can be useful if you don't want to touch OpenCL at all. You can set the default device and platform by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables. This API can be used as follows after providing the `-DNETLIB=ON` flag to CMake: + + #include + Using the tuners (optional) ------------- From 792cc8359fe96dd6a53064579b18f76d9e913f98 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 23 Nov 2016 22:00:20 +0100 Subject: [PATCH 14/15] Fixed a vector-size related bug in the CLBlast Netlib API --- scripts/generator/generator.py | 30 +++---- src/clblast_netlib_c.cpp | 144 ++++++++++++++++----------------- 2 files changed, 87 insertions(+), 87 deletions(-) diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 5f0bb0d4..35d902b7 100755 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -101,21 +101,21 @@ ROUTINES = [ [ # Level 1: vector-vector Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], ["1","1","1","1"], [], "", "Generate givens plane rotation", "", []), Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], ["1","1","1","1","1"], [], "", "Generate modified givens plane rotation", "", []), - Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["n","n"], ["cos","sin"],"", "Apply givens plane rotation", "", 
[]), - Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], ["n","n","1"], [], "", "Apply modified givens plane rotation", "", []), - Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], ["n","n"], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []), - Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], ["n"], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []), - Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["n","n"], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []), - Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["n","n"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []), - Routine(True, True, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], ["n","n","1"], [], "n", "Dot product of two vectors", "Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. The sum is stored in the _dot_ buffer.", []), - Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], ["n","n","1"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []), - Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], ["n","n","1"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []), - Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], ["n","1"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. 
The resulting L2 norm is stored in the _nrm2_ buffer.", []), - Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], ["n","1"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []), - Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], ["n","1"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []), - Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], ["n","1"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []), - Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], ["n","1"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []), - Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], ["n","1"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. 
This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []), + Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], [xn,yn], ["cos","sin"],"", "Apply givens plane rotation", "", []), + Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [xn,yn,"1"], [], "", "Apply modified givens plane rotation", "", []), + Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], [xn,yn], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []), + Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], [xn], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []), + Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [xn,yn], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []), + Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [xn,yn], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []), + Routine(True, True, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two vectors", "Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. 
The sum is stored in the _dot_ buffer.", []), + Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []), + Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []), + Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], [xn,"1"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []), + Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], [xn,"1"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []), + Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], [xn,"1"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []), + Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [xn,"1"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []), + Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [xn,"1"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. 
This routine is the non-absolute version of the IxAMAX BLAS routine.", []), + Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [xn,"1"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []), ], [ # Level 2: matrix-vector Routine(True, True, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], [amn,xmn,ynm], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]), diff --git a/src/clblast_netlib_c.cpp b/src/clblast_netlib_c.cpp index 66852e31..3fbabd43 100644 --- a/src/clblast_netlib_c.cpp +++ b/src/clblast_netlib_c.cpp @@ -192,8 +192,8 @@ void cblas_srot(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -219,8 +219,8 @@ void cblas_drot(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -247,8 +247,8 @@ void cblas_srotm(const 
int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; const auto sparam_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); @@ -276,8 +276,8 @@ void cblas_drotm(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; const auto sparam_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); @@ -306,8 +306,8 @@ void cblas_sswap(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -329,8 +329,8 @@ void cblas_dswap(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -352,8 +352,8 @@ void cblas_cswap(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = 
clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -375,8 +375,8 @@ void cblas_zswap(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -401,7 +401,7 @@ void cblas_sscal(const int n, auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = clblast::Buffer(context, x_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); @@ -421,7 +421,7 @@ void cblas_dscal(const int n, auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = clblast::Buffer(context, x_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); @@ -441,7 +441,7 @@ void cblas_cscal(const int n, auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = clblast::Buffer(context, x_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); @@ -461,7 +461,7 @@ void cblas_zscal(const int n, auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = clblast::Buffer(context, x_size); x_buffer.Write(queue, x_size, 
reinterpret_cast(x)); auto queue_cl = queue(); @@ -482,8 +482,8 @@ void cblas_scopy(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -504,8 +504,8 @@ void cblas_dcopy(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -526,8 +526,8 @@ void cblas_ccopy(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -548,8 +548,8 @@ void cblas_zcopy(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -574,8 +574,8 @@ void cblas_saxpy(const int n, auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; - const auto y_size = n; + 
const auto x_size = n * x_inc; + const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -599,8 +599,8 @@ void cblas_daxpy(const int n, auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -624,8 +624,8 @@ void cblas_caxpy(const int n, auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -649,8 +649,8 @@ void cblas_zaxpy(const int n, auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -674,8 +674,8 @@ float cblas_sdot(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; const auto dot_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto 
y_buffer = clblast::Buffer(context, y_size); @@ -701,8 +701,8 @@ double cblas_ddot(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; const auto dot_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); @@ -731,8 +731,8 @@ void cblas_cdotu_sub(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; const auto dot_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); @@ -757,8 +757,8 @@ void cblas_zdotu_sub(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; const auto dot_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); @@ -785,8 +785,8 @@ void cblas_cdotc_sub(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; const auto dot_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); @@ -811,8 +811,8 @@ void cblas_zdotc_sub(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; const auto dot_size = 1; auto 
x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); @@ -837,7 +837,7 @@ float cblas_snrm2(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto nrm2_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto nrm2_buffer = clblast::Buffer(context, nrm2_size); @@ -859,7 +859,7 @@ double cblas_dnrm2(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto nrm2_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto nrm2_buffer = clblast::Buffer(context, nrm2_size); @@ -881,7 +881,7 @@ float cblas_scnrm2(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto nrm2_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto nrm2_buffer = clblast::Buffer(context, nrm2_size); @@ -903,7 +903,7 @@ double cblas_dznrm2(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto nrm2_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto nrm2_buffer = clblast::Buffer(context, nrm2_size); @@ -927,7 +927,7 @@ float cblas_sasum(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto asum_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto asum_buffer = clblast::Buffer(context, asum_size); @@ -949,7 +949,7 @@ double cblas_dasum(const int n, auto device = get_device(); auto context = clblast::Context(device); 
auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto asum_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto asum_buffer = clblast::Buffer(context, asum_size); @@ -971,7 +971,7 @@ float cblas_scasum(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto asum_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto asum_buffer = clblast::Buffer(context, asum_size); @@ -993,7 +993,7 @@ double cblas_dzasum(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto asum_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto asum_buffer = clblast::Buffer(context, asum_size); @@ -1017,7 +1017,7 @@ float cblas_ssum(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto sum_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto sum_buffer = clblast::Buffer(context, sum_size); @@ -1039,7 +1039,7 @@ double cblas_dsum(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto sum_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto sum_buffer = clblast::Buffer(context, sum_size); @@ -1061,7 +1061,7 @@ float cblas_scsum(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto sum_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto sum_buffer = 
clblast::Buffer(context, sum_size); @@ -1083,7 +1083,7 @@ double cblas_dzsum(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto sum_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto sum_buffer = clblast::Buffer(context, sum_size); @@ -1107,7 +1107,7 @@ int cblas_isamax(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto imax_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto imax_buffer = clblast::Buffer(context, imax_size); @@ -1129,7 +1129,7 @@ int cblas_idamax(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto imax_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto imax_buffer = clblast::Buffer(context, imax_size); @@ -1151,7 +1151,7 @@ int cblas_icamax(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto imax_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto imax_buffer = clblast::Buffer(context, imax_size); @@ -1173,7 +1173,7 @@ int cblas_izamax(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto imax_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto imax_buffer = clblast::Buffer(context, imax_size); @@ -1197,7 +1197,7 @@ int cblas_ismax(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto 
x_size = n; + const auto x_size = n * x_inc; const auto imax_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto imax_buffer = clblast::Buffer(context, imax_size); @@ -1219,7 +1219,7 @@ int cblas_idmax(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto imax_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto imax_buffer = clblast::Buffer(context, imax_size); @@ -1241,7 +1241,7 @@ int cblas_icmax(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto imax_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto imax_buffer = clblast::Buffer(context, imax_size); @@ -1263,7 +1263,7 @@ int cblas_izmax(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto imax_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto imax_buffer = clblast::Buffer(context, imax_size); @@ -1287,7 +1287,7 @@ int cblas_ismin(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto imin_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto imin_buffer = clblast::Buffer(context, imin_size); @@ -1309,7 +1309,7 @@ int cblas_idmin(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto imin_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto imin_buffer = clblast::Buffer(context, imin_size); @@ -1331,7 +1331,7 @@ int 
cblas_icmin(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto imin_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto imin_buffer = clblast::Buffer(context, imin_size); @@ -1353,7 +1353,7 @@ int cblas_izmin(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto imin_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto imin_buffer = clblast::Buffer(context, imin_size); From 2ff3f77392dc7395abf03d3864c42ff894918889 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 23 Nov 2016 22:07:11 +0100 Subject: [PATCH 15/15] Made the Netlib SGEMM example also optionally compiled --- CMakeLists.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 422be4e7..246d006c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -151,7 +151,10 @@ endif() set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger xgemm xgemm_direct xgemv) set(SAMPLE_PROGRAMS_CPP sgemm) -set(SAMPLE_PROGRAMS_C sasum dgemv sgemm sgemm_netlib haxpy cache) +set(SAMPLE_PROGRAMS_C sasum dgemv sgemm haxpy cache) +if(NETLIB) + set(SAMPLE_PROGRAMS_C ${SAMPLE_PROGRAMS_C} sgemm_netlib) +endif() set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax) set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)