diff --git a/include/clblast_cuda.h b/include/clblast_cuda.h index c125c302..e28f68e5 100644 --- a/include/clblast_cuda.h +++ b/include/clblast_cuda.h @@ -103,7 +103,7 @@ StatusCode Rotg(CUdeviceptr sa_buffer, const size_t sa_offset, CUdeviceptr sb_buffer, const size_t sb_offset, CUdeviceptr sc_buffer, const size_t sc_offset, CUdeviceptr ss_buffer, const size_t ss_offset, - CUstream* stream); + const CUcontext context, const CUdevice device); // Generate modified givens plane rotation: SROTMG/DROTMG template @@ -112,7 +112,7 @@ StatusCode Rotmg(CUdeviceptr sd1_buffer, const size_t sd1_offset, CUdeviceptr sx1_buffer, const size_t sx1_offset, const CUdeviceptr sy1_buffer, const size_t sy1_offset, CUdeviceptr sparam_buffer, const size_t sparam_offset, - CUstream* stream); + const CUcontext context, const CUdevice device); // Apply givens plane rotation: SROT/DROT template @@ -121,7 +121,7 @@ StatusCode Rot(const size_t n, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, const T cos, const T sin, - CUstream* stream); + const CUcontext context, const CUdevice device); // Apply modified givens plane rotation: SROTM/DROTM template @@ -129,28 +129,28 @@ StatusCode Rotm(const size_t n, CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, CUdeviceptr sparam_buffer, const size_t sparam_offset, - CUstream* stream); + const CUcontext context, const CUdevice device); // Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP template StatusCode Swap(const size_t n, CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL template StatusCode Scal(const size_t n, const T alpha, CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY template StatusCode Copy(const size_t n, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY template @@ -158,7 +158,7 @@ StatusCode Axpy(const size_t n, const T alpha, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Dot product of two vectors: SDOT/DDOT/HDOT template @@ -166,7 +166,7 @@ StatusCode Dot(const size_t n, CUdeviceptr dot_buffer, const size_t dot_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Dot product of two complex vectors: CDOTU/ZDOTU template @@ -174,7 +174,7 @@ StatusCode Dotu(const size_t n, CUdeviceptr dot_buffer, const size_t dot_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC template @@ -182,56 +182,56 @@ StatusCode Dotc(const size_t n, CUdeviceptr dot_buffer, const size_t dot_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 template StatusCode Nrm2(const size_t n, CUdeviceptr nrm2_buffer, const size_t nrm2_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM template StatusCode Asum(const size_t n, CUdeviceptr asum_buffer, const size_t asum_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM template StatusCode Sum(const size_t n, CUdeviceptr sum_buffer, const size_t sum_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX template StatusCode Amax(const size_t n, CUdeviceptr imax_buffer, const size_t imax_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Index of absolute minimum value in a vector (non-BLAS function): iSAMIN/iDAMIN/iCAMIN/iZAMIN/iHAMIN template StatusCode Amin(const size_t n, CUdeviceptr imin_buffer, const size_t imin_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX template StatusCode Max(const size_t n, CUdeviceptr imax_buffer, const size_t imax_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN template StatusCode Min(const size_t n, CUdeviceptr imin_buffer, const size_t imin_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // ================================================================================================= // BLAS level-2 (matrix-vector) routines @@ -246,7 +246,7 @@ StatusCode Gemv(const Layout layout, const Transpose a_transpose, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV template @@ -257,7 +257,7 @@ StatusCode Gbmv(const Layout layout, const Transpose a_transpose, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Hermitian matrix-vector multiplication: CHEMV/ZHEMV template @@ -268,7 +268,7 @@ StatusCode Hemv(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV template @@ -279,7 +279,7 @@ StatusCode Hbmv(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV template @@ -290,7 +290,7 @@ StatusCode Hpmv(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV template @@ -301,7 +301,7 @@ StatusCode Symv(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV template @@ -312,7 +312,7 @@ StatusCode Sbmv(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV template @@ -323,7 +323,7 @@ StatusCode Spmv(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV template @@ -331,7 +331,7 @@ StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_ const size_t n, const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV template @@ -339,7 +339,7 @@ StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_ const size_t n, const size_t k, const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV template @@ -347,7 +347,7 @@ StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_ const size_t n, const CUdeviceptr ap_buffer, const size_t ap_offset, CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV template @@ -355,7 +355,7 @@ StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_ const size_t n, const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV template @@ -363,7 +363,7 @@ StatusCode Tbsv(const Layout layout, const Triangle triangle, const Transpose a_ const size_t n, const size_t k, const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV template @@ -371,7 +371,7 @@ StatusCode Tpsv(const Layout layout, const Triangle triangle, const Transpose a_ const size_t n, const CUdeviceptr ap_buffer, const size_t ap_offset, CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream); + const CUcontext context, const CUdevice device); // General rank-1 matrix update: SGER/DGER/HGER template @@ -381,7 +381,7 @@ StatusCode Ger(const Layout layout, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, - CUstream* stream); + const CUcontext context, const CUdevice device); // General rank-1 complex matrix update: CGERU/ZGERU template @@ -391,7 +391,7 @@ StatusCode Geru(const Layout layout, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, - CUstream* stream); + const CUcontext context, const CUdevice device); // General rank-1 complex conjugated matrix update: CGERC/ZGERC template @@ -401,7 +401,7 @@ StatusCode Gerc(const Layout layout, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, - CUstream* stream); + const CUcontext context, const CUdevice device); // Hermitian rank-1 matrix update: CHER/ZHER template @@ -410,7 +410,7 @@ StatusCode Her(const Layout layout, const Triangle triangle, const T alpha, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, - CUstream* stream); + const CUcontext context, const CUdevice device); // Hermitian packed rank-1 matrix update: CHPR/ZHPR template @@ -419,7 +419,7 @@ StatusCode Hpr(const Layout layout, const Triangle triangle, const T alpha, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, CUdeviceptr ap_buffer, const size_t ap_offset, - CUstream* stream); + const CUcontext context, const CUdevice device); // Hermitian rank-2 matrix update: CHER2/ZHER2 template @@ -429,7 +429,7 @@ StatusCode Her2(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, - CUstream* stream); + const CUcontext context, const CUdevice device); // Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 template @@ -439,7 +439,7 @@ StatusCode Hpr2(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, CUdeviceptr ap_buffer, const size_t ap_offset, - CUstream* stream); + const CUcontext context, const CUdevice device); // Symmetric rank-1 matrix update: SSYR/DSYR/HSYR template @@ -448,7 +448,7 @@ StatusCode Syr(const Layout layout, const Triangle triangle, const T alpha, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, - CUstream* stream); + const CUcontext context, const CUdevice device); // Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR template @@ -457,7 +457,7 @@ StatusCode Spr(const Layout layout, const Triangle triangle, const T alpha, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, CUdeviceptr ap_buffer, const size_t ap_offset, - CUstream* stream); + const CUcontext context, const CUdevice device); // Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 template @@ -467,7 +467,7 @@ StatusCode Syr2(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, - CUstream* stream); + const CUcontext context, const CUdevice device); // Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 template @@ -477,7 +477,7 @@ StatusCode Spr2(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, CUdeviceptr ap_buffer, const size_t ap_offset, - CUstream* stream); + const CUcontext context, const CUdevice device); // ================================================================================================= // BLAS level-3 (matrix-matrix) routines @@ -492,7 +492,7 @@ StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpos const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, const T beta, CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, - CUstream* stream); + const CUcontext context, const CUdevice device); // Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM template @@ -503,7 +503,7 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, const T beta, CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, - CUstream* stream); + const CUcontext context, const CUdevice device); // Hermitian matrix-matrix multiplication: CHEMM/ZHEMM template @@ -514,7 +514,7 @@ StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle, const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, const T beta, CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, - CUstream* stream); + const CUcontext context, const CUdevice device); // Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK template @@ -524,7 +524,7 @@ StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_ const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, const T beta, CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, - CUstream* stream); + const CUcontext context, const CUdevice device); // Rank-K update of a hermitian matrix: CHERK/ZHERK template @@ -534,7 +534,7 @@ StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_ const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, const T beta, CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, - CUstream* stream); + const CUcontext context, const CUdevice device); // Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K template @@ -545,7 +545,7 @@ StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose a const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, const T beta, CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, - CUstream* stream); + const CUcontext context, const CUdevice device); // Rank-2K update of a hermitian matrix: CHER2K/ZHER2K template @@ -556,7 +556,7 @@ StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose a const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, const U beta, CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, - CUstream* stream); + const CUcontext context, const CUdevice device); // Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM template @@ -565,7 +565,7 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, c const T alpha, const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, - CUstream* stream); + const CUcontext context, const CUdevice device); // Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM template @@ -574,7 +574,7 @@ StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, c const T alpha, const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, - CUstream* stream); + const CUcontext context, const CUdevice device); // ================================================================================================= // Extra non-BLAS routines (level-X) @@ -587,14 +587,14 @@ StatusCode Omatcopy(const Layout layout, const Transpose a_transpose, const T alpha, const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, - CUstream* stream); + const CUcontext context, const CUdevice device); // Im2col function (non-BLAS function): SIM2COL/DIM2COL/CIM2COL/ZIM2COL/HIM2COL template StatusCode Im2col(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const CUdeviceptr im_buffer, const size_t im_offset, CUdeviceptr col_buffer, const size_t col_offset, - CUstream* stream); + const CUcontext context, const CUdevice device); // Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED template @@ -603,7 +603,7 @@ StatusCode AxpyBatched(const size_t n, const CUdeviceptr x_buffer, const size_t *x_offsets, const size_t x_inc, CUdeviceptr y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, - CUstream* stream); + const CUcontext context, const CUdevice device); // Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED template @@ -615,7 +615,7 @@ StatusCode GemmBatched(const Layout layout, const Transpose a_transpose, const T const T *betas, CUdeviceptr c_buffer, const size_t *c_offsets, const size_t c_ld, const size_t batch_count, - CUstream* stream); + const CUcontext context, const CUdevice device); // ================================================================================================= diff --git a/scripts/generator/generator/cpp.py b/scripts/generator/generator/cpp.py index f1ee1959..5413906a 100644 --- a/scripts/generator/generator/cpp.py +++ b/scripts/generator/generator/cpp.py @@ -50,7 +50,12 @@ def clblast_cc(routine, cuda=False): if routine.implemented: result += routine.routine_header_cpp(12, "", cuda) + " {" + NL result += " try {" + NL - result += " auto queue_cpp = Queue(*queue);" + NL + if cuda: + result += " const auto context_cpp = Context(context);" + NL + result += " const auto device_cpp = Device(device);" + NL + result += " auto queue_cpp = Queue(context_cpp, device_cpp);" + NL + else: + result += " auto queue_cpp = Queue(*queue);" + NL result += " auto routine = X" + routine.plain_name() + "<" + routine.template.template + ">(queue_cpp, event);" + NL if routine.batched: result += " " + (NL + " ").join(routine.batched_transform_to_cpp()) + NL @@ -72,7 +77,7 @@ def clblast_cc(routine, cuda=False): result += ("," + NL + indent2).join([a for a in arguments]) result += "," + NL + indent2 if cuda: - result += "CUstream*" + result += "const CUcontext, const CUdevice" else: result += "cl_command_queue*, cl_event*" result += ");" + NL diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py index c3c1f775..b6b55821 100644 --- a/scripts/generator/generator/routine.py +++ b/scripts/generator/generator/routine.py @@ -813,7 +813,7 @@ class Routine: result += (",\n" + indent).join([a for a in arguments]) result += ",\n" + indent if cuda: - result += "CUstream* stream" + result += "const CUcontext context, const CUdevice device" else: result += "cl_command_queue* queue, cl_event* event" + default_event result += ")" @@ -830,7 +830,7 @@ class Routine: result += (",\n" + indent).join([a for a in arguments]) result += ",\n" + indent if cuda: - result += "CUstream* stream" + result += "const CUcontext, const CUdevice" else: result += "cl_command_queue*, cl_event*" result += ")" diff --git a/src/clblast_cuda.cpp b/src/clblast_cuda.cpp index 5f30d023..f9a24236 100644 --- a/src/clblast_cuda.cpp +++ b/src/clblast_cuda.cpp @@ -30,19 +30,19 @@ StatusCode Rotg(CUdeviceptr, const size_t, CUdeviceptr, const size_t, CUdeviceptr, const size_t, CUdeviceptr, const size_t, - CUstream* stream) { + const CUcontext, const CUdevice) { return StatusCode::kNotImplemented; } template StatusCode PUBLIC_API Rotg(CUdeviceptr, const size_t, CUdeviceptr, const size_t, CUdeviceptr, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Rotg(CUdeviceptr, const size_t, CUdeviceptr, const size_t, CUdeviceptr, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Generate modified givens plane rotation: SROTMG/DROTMG template @@ -51,7 +51,7 @@ StatusCode Rotmg(CUdeviceptr, const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, - CUstream* stream) { + const CUcontext, const CUdevice) { return StatusCode::kNotImplemented; } template StatusCode PUBLIC_API Rotmg(CUdeviceptr, const size_t, @@ -59,13 +59,13 @@ template StatusCode PUBLIC_API Rotmg(CUdeviceptr, const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Rotmg(CUdeviceptr, const size_t, CUdeviceptr, const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Apply givens plane rotation: SROT/DROT template @@ -74,7 +74,7 @@ StatusCode Rot(const size_t, CUdeviceptr, const size_t, const size_t, const T, const T, - CUstream* stream) { + const CUcontext, const CUdevice) { return StatusCode::kNotImplemented; } template StatusCode PUBLIC_API Rot(const size_t, @@ -82,13 +82,13 @@ template StatusCode PUBLIC_API Rot(const size_t, CUdeviceptr, const size_t, const size_t, const float, const float, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Rot(const size_t, CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, const double, const double, - CUstream*); + const CUcontext, const CUdevice); // Apply modified givens plane rotation: SROTM/DROTM template @@ -96,28 +96,30 @@ StatusCode Rotm(const size_t, CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, - CUstream* stream) { + const CUcontext, const CUdevice) { return StatusCode::kNotImplemented; } template StatusCode PUBLIC_API Rotm(const size_t, CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Rotm(const size_t, CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP template StatusCode Swap(const size_t n, CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xswap(queue_cpp, event); routine.DoSwap(n, Buffer(x_buffer), x_offset, x_inc, @@ -128,32 +130,34 @@ StatusCode Swap(const size_t n, template StatusCode PUBLIC_API Swap(const size_t, CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Swap(const size_t, CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Swap(const size_t, CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Swap(const size_t, CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Swap(const size_t, CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL template StatusCode Scal(const size_t n, const T alpha, CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xscal(queue_cpp, event); routine.DoScal(n, alpha, @@ -164,32 +168,34 @@ StatusCode Scal(const size_t n, template StatusCode PUBLIC_API Scal(const size_t, const float, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Scal(const size_t, const double, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Scal(const size_t, const float2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Scal(const size_t, const double2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Scal(const size_t, const half, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY template StatusCode Copy(const size_t n, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xcopy(queue_cpp, event); routine.DoCopy(n, Buffer(x_buffer), x_offset, x_inc, @@ -200,23 +206,23 @@ StatusCode Copy(const size_t n, template StatusCode PUBLIC_API Copy(const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Copy(const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Copy(const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Copy(const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Copy(const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY template @@ -224,9 +230,11 @@ StatusCode Axpy(const size_t n, const T alpha, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xaxpy(queue_cpp, event); routine.DoAxpy(n, alpha, @@ -239,27 +247,27 @@ template StatusCode PUBLIC_API Axpy(const size_t, const float, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Axpy(const size_t, const double, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Axpy(const size_t, const float2, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Axpy(const size_t, const double2, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Axpy(const size_t, const half, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Dot product of two vectors: SDOT/DDOT/HDOT template @@ -267,9 +275,11 @@ StatusCode Dot(const size_t n, CUdeviceptr dot_buffer, const size_t dot_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xdot(queue_cpp, event); routine.DoDot(n, Buffer(dot_buffer), dot_offset, @@ -282,17 +292,17 @@ template StatusCode PUBLIC_API Dot(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Dot(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Dot(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Dot product of two complex vectors: CDOTU/ZDOTU template @@ -300,9 +310,11 @@ StatusCode Dotu(const size_t n, CUdeviceptr dot_buffer, const size_t dot_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xdotu(queue_cpp, event); routine.DoDotu(n, Buffer(dot_buffer), dot_offset, @@ -315,12 +327,12 @@ template StatusCode PUBLIC_API Dotu(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Dotu(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC template @@ -328,9 +340,11 @@ StatusCode Dotc(const size_t n, CUdeviceptr dot_buffer, const size_t dot_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xdotc(queue_cpp, event); routine.DoDotc(n, Buffer(dot_buffer), dot_offset, @@ -343,21 +357,23 @@ template StatusCode PUBLIC_API Dotc(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Dotc(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 template StatusCode Nrm2(const size_t n, CUdeviceptr nrm2_buffer, const size_t nrm2_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xnrm2(queue_cpp, event); routine.DoNrm2(n, Buffer(nrm2_buffer), nrm2_offset, @@ -368,32 +384,34 @@ StatusCode Nrm2(const size_t n, template StatusCode PUBLIC_API Nrm2(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Nrm2(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Nrm2(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Nrm2(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Nrm2(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM template StatusCode Asum(const size_t n, CUdeviceptr asum_buffer, const size_t asum_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xasum(queue_cpp, event); routine.DoAsum(n, Buffer(asum_buffer), asum_offset, @@ -404,32 +422,34 @@ StatusCode Asum(const size_t n, template StatusCode PUBLIC_API Asum(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Asum(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Asum(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Asum(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Asum(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM template StatusCode Sum(const size_t n, CUdeviceptr sum_buffer, const size_t sum_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xsum(queue_cpp, event); routine.DoSum(n, Buffer(sum_buffer), sum_offset, @@ -440,32 +460,34 @@ StatusCode Sum(const size_t n, template StatusCode PUBLIC_API Sum(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Sum(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Sum(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Sum(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Sum(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX template StatusCode Amax(const size_t n, CUdeviceptr imax_buffer, const size_t imax_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xamax(queue_cpp, event); routine.DoAmax(n, Buffer(imax_buffer), imax_offset, @@ -476,32 +498,34 @@ StatusCode Amax(const size_t n, template StatusCode PUBLIC_API Amax(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Amax(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Amax(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Amax(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Amax(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Index of absolute minimum value in a vector (non-BLAS function): iSAMIN/iDAMIN/iCAMIN/iZAMIN/iHAMIN template StatusCode Amin(const size_t n, CUdeviceptr imin_buffer, const size_t imin_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xamin(queue_cpp, event); routine.DoAmin(n, Buffer(imin_buffer), imin_offset, @@ -512,32 +536,34 @@ StatusCode Amin(const size_t n, template StatusCode PUBLIC_API Amin(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Amin(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Amin(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Amin(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Amin(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX template StatusCode Max(const size_t n, CUdeviceptr imax_buffer, const size_t imax_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xmax(queue_cpp, event); routine.DoMax(n, Buffer(imax_buffer), imax_offset, @@ -548,32 +574,34 @@ StatusCode Max(const size_t n, template StatusCode PUBLIC_API Max(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Max(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Max(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Max(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Max(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN template StatusCode Min(const size_t n, CUdeviceptr imin_buffer, const size_t imin_offset, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xmin(queue_cpp, event); routine.DoMin(n, Buffer(imin_buffer), imin_offset, @@ -584,23 +612,23 @@ StatusCode Min(const size_t n, template StatusCode PUBLIC_API Min(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Min(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Min(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Min(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Min(const size_t, CUdeviceptr, const size_t, const CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // ================================================================================================= // BLAS level-2 (matrix-vector) routines @@ -615,9 +643,11 @@ StatusCode Gemv(const Layout layout, const Transpose a_transpose, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xgemv(queue_cpp, event); routine.DoGemv(layout, a_transpose, m, n, @@ -636,7 +666,7 @@ template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, const CUdeviceptr, const size_t, const size_t, const float, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, const size_t, const size_t, const double, @@ -644,7 +674,7 @@ template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, const CUdeviceptr, const size_t, const size_t, const double, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, const size_t, const size_t, const float2, @@ -652,7 +682,7 @@ template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, const CUdeviceptr, const size_t, const size_t, const float2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, const size_t, const size_t, const double2, @@ -660,7 +690,7 @@ template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, const CUdeviceptr, const size_t, const size_t, const double2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, const size_t, const size_t, const half, @@ -668,7 +698,7 @@ template StatusCode PUBLIC_API Gemv(const Layout, const Transpose, const CUdeviceptr, const size_t, const size_t, const half, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV template @@ -679,9 +709,11 @@ StatusCode Gbmv(const Layout layout, const Transpose a_transpose, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xgbmv(queue_cpp, event); routine.DoGbmv(layout, a_transpose, m, n, kl, ku, @@ -700,7 +732,7 @@ template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, const CUdeviceptr, const size_t, const size_t, const float, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, const size_t, const size_t, const size_t, const size_t, const double, @@ -708,7 +740,7 @@ template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, const CUdeviceptr, const size_t, const size_t, const double, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, const size_t, const size_t, const size_t, const size_t, const float2, @@ -716,7 +748,7 @@ template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, const CUdeviceptr, const size_t, const size_t, const float2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, const size_t, const size_t, const size_t, const size_t, const double2, @@ -724,7 +756,7 @@ template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, const CUdeviceptr, const size_t, const size_t, const double2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, const size_t, const size_t, const size_t, const size_t, const half, @@ -732,7 +764,7 @@ template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose, const CUdeviceptr, const size_t, const size_t, const half, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Hermitian matrix-vector multiplication: CHEMV/ZHEMV template @@ -743,9 +775,11 @@ StatusCode Hemv(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xhemv(queue_cpp, event); routine.DoHemv(layout, triangle, n, @@ -764,7 +798,7 @@ template StatusCode PUBLIC_API Hemv(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const float2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Hemv(const Layout, const Triangle, const size_t, const double2, @@ -772,7 +806,7 @@ template StatusCode PUBLIC_API Hemv(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const double2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV template @@ -783,9 +817,11 @@ StatusCode Hbmv(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xhbmv(queue_cpp, event); routine.DoHbmv(layout, triangle, n, k, @@ -804,7 +840,7 @@ template StatusCode PUBLIC_API Hbmv(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const float2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Hbmv(const Layout, const Triangle, const size_t, const size_t, const double2, @@ -812,7 +848,7 @@ template StatusCode PUBLIC_API Hbmv(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const double2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV template @@ -823,9 +859,11 @@ StatusCode Hpmv(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xhpmv(queue_cpp, event); routine.DoHpmv(layout, triangle, n, @@ -844,7 +882,7 @@ template StatusCode PUBLIC_API Hpmv(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const float2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Hpmv(const Layout, const Triangle, const size_t, const double2, @@ -852,7 +890,7 @@ template StatusCode PUBLIC_API Hpmv(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const double2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV template @@ -863,9 +901,11 @@ StatusCode Symv(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xsymv(queue_cpp, event); routine.DoSymv(layout, triangle, n, @@ -884,7 +924,7 @@ template StatusCode PUBLIC_API Symv(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const float, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Symv(const Layout, const Triangle, const size_t, const double, @@ -892,7 +932,7 @@ template StatusCode PUBLIC_API Symv(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const double, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Symv(const Layout, const Triangle, const size_t, const half, @@ -900,7 +940,7 @@ template StatusCode PUBLIC_API Symv(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const half, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV template @@ -911,9 +951,11 @@ StatusCode Sbmv(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xsbmv(queue_cpp, event); routine.DoSbmv(layout, triangle, n, k, @@ -932,7 +974,7 @@ template StatusCode PUBLIC_API Sbmv(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const float, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Sbmv(const Layout, const Triangle, const size_t, const size_t, const double, @@ -940,7 +982,7 @@ template StatusCode PUBLIC_API Sbmv(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const double, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Sbmv(const Layout, const Triangle, const size_t, const size_t, const half, @@ -948,7 +990,7 @@ template StatusCode PUBLIC_API Sbmv(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const half, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV template @@ -959,9 +1001,11 @@ StatusCode Spmv(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const T beta, CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xspmv(queue_cpp, event); routine.DoSpmv(layout, triangle, n, @@ -980,7 +1024,7 @@ template StatusCode PUBLIC_API Spmv(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const float, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Spmv(const Layout, const Triangle, const size_t, const double, @@ -988,7 +1032,7 @@ template StatusCode PUBLIC_API Spmv(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const double, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Spmv(const Layout, const Triangle, const size_t, const half, @@ -996,7 +1040,7 @@ template StatusCode PUBLIC_API Spmv(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const half, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV template @@ -1004,9 +1048,11 @@ StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_ const size_t n, const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xtrmv(queue_cpp, event); routine.DoTrmv(layout, triangle, a_transpose, diagonal, n, @@ -1019,27 +1065,27 @@ template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const T const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV template @@ -1047,9 +1093,11 @@ StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_ const size_t n, const size_t k, const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xtbmv(queue_cpp, event); routine.DoTbmv(layout, triangle, a_transpose, diagonal, n, k, @@ -1062,27 +1110,27 @@ template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const T const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV template @@ -1090,9 +1138,11 @@ StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_ const size_t n, const CUdeviceptr ap_buffer, const size_t ap_offset, CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xtpmv(queue_cpp, event); routine.DoTpmv(layout, triangle, a_transpose, diagonal, n, @@ -1105,27 +1155,27 @@ template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const T const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV template @@ -1133,9 +1183,11 @@ StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_ const size_t n, const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xtrsv(queue_cpp, event); routine.DoTrsv(layout, triangle, a_transpose, diagonal, n, @@ -1148,22 +1200,22 @@ template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const T const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV template @@ -1171,29 +1223,29 @@ StatusCode Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream* stream) { + const CUcontext, const CUdevice) { return StatusCode::kNotImplemented; } template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV template @@ -1201,29 +1253,29 @@ StatusCode Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream* stream) { + const CUcontext, const CUdevice) { return StatusCode::kNotImplemented; } template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // General rank-1 matrix update: SGER/DGER/HGER template @@ -1233,9 +1285,11 @@ StatusCode Ger(const Layout layout, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xger(queue_cpp, event); routine.DoGer(layout, m, n, @@ -1252,21 +1306,21 @@ template StatusCode PUBLIC_API Ger(const Layout, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Ger(const Layout, const size_t, const size_t, const double, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Ger(const Layout, const size_t, const size_t, const half, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // General rank-1 complex matrix update: CGERU/ZGERU template @@ -1276,9 +1330,11 @@ StatusCode Geru(const Layout layout, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xgeru(queue_cpp, event); routine.DoGeru(layout, m, n, @@ -1295,14 +1351,14 @@ template StatusCode PUBLIC_API Geru(const Layout, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Geru(const Layout, const size_t, const size_t, const double2, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // General rank-1 complex conjugated matrix update: CGERC/ZGERC template @@ -1312,9 +1368,11 @@ StatusCode Gerc(const Layout layout, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xgerc(queue_cpp, event); routine.DoGerc(layout, m, n, @@ -1331,14 +1389,14 @@ template StatusCode PUBLIC_API Gerc(const Layout, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Gerc(const Layout, const size_t, const size_t, const double2, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Hermitian rank-1 matrix update: CHER/ZHER template @@ -1347,9 +1405,11 @@ StatusCode Her(const Layout layout, const Triangle triangle, const T alpha, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xher,T>(queue_cpp, event); routine.DoHer(layout, triangle, n, @@ -1364,13 +1424,13 @@ template StatusCode PUBLIC_API Her(const Layout, const Triangle, const float, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Her(const Layout, const Triangle, const size_t, const double, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Hermitian packed rank-1 matrix update: CHPR/ZHPR template @@ -1379,9 +1439,11 @@ StatusCode Hpr(const Layout layout, const Triangle triangle, const T alpha, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, CUdeviceptr ap_buffer, const size_t ap_offset, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xhpr,T>(queue_cpp, event); routine.DoHpr(layout, triangle, n, @@ -1396,13 +1458,13 @@ template StatusCode PUBLIC_API Hpr(const Layout, const Triangle, const float, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Hpr(const Layout, const Triangle, const size_t, const double, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Hermitian rank-2 matrix update: CHER2/ZHER2 template @@ -1412,9 +1474,11 @@ StatusCode Her2(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xher2(queue_cpp, event); routine.DoHer2(layout, triangle, n, @@ -1431,14 +1495,14 @@ template StatusCode PUBLIC_API Her2(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Her2(const Layout, const Triangle, const size_t, const double2, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 template @@ -1448,9 +1512,11 @@ StatusCode Hpr2(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, CUdeviceptr ap_buffer, const size_t ap_offset, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xhpr2(queue_cpp, event); routine.DoHpr2(layout, triangle, n, @@ -1467,14 +1533,14 @@ template StatusCode PUBLIC_API Hpr2(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Hpr2(const Layout, const Triangle, const size_t, const double2, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Symmetric rank-1 matrix update: SSYR/DSYR/HSYR template @@ -1483,9 +1549,11 @@ StatusCode Syr(const Layout layout, const Triangle triangle, const T alpha, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xsyr(queue_cpp, event); routine.DoSyr(layout, triangle, n, @@ -1500,19 +1568,19 @@ template StatusCode PUBLIC_API Syr(const Layout, const Triangle, const float, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Syr(const Layout, const Triangle, const size_t, const double, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Syr(const Layout, const Triangle, const size_t, const half, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR template @@ -1521,9 +1589,11 @@ StatusCode Spr(const Layout layout, const Triangle triangle, const T alpha, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, CUdeviceptr ap_buffer, const size_t ap_offset, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xspr(queue_cpp, event); routine.DoSpr(layout, triangle, n, @@ -1538,19 +1608,19 @@ template StatusCode PUBLIC_API Spr(const Layout, const Triangle, const float, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Spr(const Layout, const Triangle, const size_t, const double, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Spr(const Layout, const Triangle, const size_t, const half, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 template @@ -1560,9 +1630,11 @@ StatusCode Syr2(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xsyr2(queue_cpp, event); routine.DoSyr2(layout, triangle, n, @@ -1579,21 +1651,21 @@ template StatusCode PUBLIC_API Syr2(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Syr2(const Layout, const Triangle, const size_t, const double, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Syr2(const Layout, const Triangle, const size_t, const half, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 template @@ -1603,9 +1675,11 @@ StatusCode Spr2(const Layout layout, const Triangle triangle, const CUdeviceptr x_buffer, const size_t x_offset, const size_t x_inc, const CUdeviceptr y_buffer, const size_t y_offset, const size_t y_inc, CUdeviceptr ap_buffer, const size_t ap_offset, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xspr2(queue_cpp, event); routine.DoSpr2(layout, triangle, n, @@ -1622,21 +1696,21 @@ template StatusCode PUBLIC_API Spr2(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Spr2(const Layout, const Triangle, const size_t, const double, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Spr2(const Layout, const Triangle, const size_t, const half, const CUdeviceptr, const size_t, const size_t, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); // ================================================================================================= // BLAS level-3 (matrix-matrix) routines @@ -1651,9 +1725,11 @@ StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpos const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, const T beta, CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xgemm(queue_cpp, event); routine.DoGemm(layout, a_transpose, b_transpose, m, n, k, @@ -1672,7 +1748,7 @@ template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const const CUdeviceptr, const size_t, const size_t, const float, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const double, @@ -1680,7 +1756,7 @@ template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const const CUdeviceptr, const size_t, const size_t, const double, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const float2, @@ -1688,7 +1764,7 @@ template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const const CUdeviceptr, const size_t, const size_t, const float2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const double2, @@ -1696,7 +1772,7 @@ template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, cons const CUdeviceptr, const size_t, const size_t, const double2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const half, @@ -1704,7 +1780,7 @@ template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const T const CUdeviceptr, const size_t, const size_t, const half, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM template @@ -1715,9 +1791,11 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, const T beta, CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xsymm(queue_cpp, event); routine.DoSymm(layout, side, triangle, m, n, @@ -1736,7 +1814,7 @@ template StatusCode PUBLIC_API Symm(const Layout, const Side, const Trian const CUdeviceptr, const size_t, const size_t, const float, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, const size_t, const size_t, const double, @@ -1744,7 +1822,7 @@ template StatusCode PUBLIC_API Symm(const Layout, const Side, const Tria const CUdeviceptr, const size_t, const size_t, const double, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, const size_t, const size_t, const float2, @@ -1752,7 +1830,7 @@ template StatusCode PUBLIC_API Symm(const Layout, const Side, const Tria const CUdeviceptr, const size_t, const size_t, const float2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, const size_t, const size_t, const double2, @@ -1760,7 +1838,7 @@ template StatusCode PUBLIC_API Symm(const Layout, const Side, const Tri const CUdeviceptr, const size_t, const size_t, const double2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, const size_t, const size_t, const half, @@ -1768,7 +1846,7 @@ template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triang const CUdeviceptr, const size_t, const size_t, const half, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Hermitian matrix-matrix multiplication: CHEMM/ZHEMM template @@ -1779,9 +1857,11 @@ StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle, const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, const T beta, CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xhemm(queue_cpp, event); routine.DoHemm(layout, side, triangle, m, n, @@ -1800,7 +1880,7 @@ template StatusCode PUBLIC_API Hemm(const Layout, const Side, const Tria const CUdeviceptr, const size_t, const size_t, const float2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Hemm(const Layout, const Side, const Triangle, const size_t, const size_t, const double2, @@ -1808,7 +1888,7 @@ template StatusCode PUBLIC_API Hemm(const Layout, const Side, const Tri const CUdeviceptr, const size_t, const size_t, const double2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK template @@ -1818,9 +1898,11 @@ StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_ const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, const T beta, CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xsyrk(queue_cpp, event); routine.DoSyrk(layout, triangle, a_transpose, n, k, @@ -1837,35 +1919,35 @@ template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const T const CUdeviceptr, const size_t, const size_t, const float, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const Transpose, const size_t, const size_t, const double, const CUdeviceptr, const size_t, const size_t, const double, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const Transpose, const size_t, const size_t, const float2, const CUdeviceptr, const size_t, const size_t, const float2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const Transpose, const size_t, const size_t, const double2, const CUdeviceptr, const size_t, const size_t, const double2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const Transpose, const size_t, const size_t, const half, const CUdeviceptr, const size_t, const size_t, const half, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Rank-K update of a hermitian matrix: CHERK/ZHERK template @@ -1875,9 +1957,11 @@ StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_ const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, const T beta, CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xherk,T>(queue_cpp, event); routine.DoHerk(layout, triangle, a_transpose, n, k, @@ -1894,14 +1978,14 @@ template StatusCode PUBLIC_API Herk(const Layout, const Triangle, const T const CUdeviceptr, const size_t, const size_t, const float, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Herk(const Layout, const Triangle, const Transpose, const size_t, const size_t, const double, const CUdeviceptr, const size_t, const size_t, const double, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K template @@ -1912,9 +1996,11 @@ StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose a const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, const T beta, CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xsyr2k(queue_cpp, event); routine.DoSyr2k(layout, triangle, ab_transpose, n, k, @@ -1933,7 +2019,7 @@ template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const const CUdeviceptr, const size_t, const size_t, const float, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const Transpose, const size_t, const size_t, const double, @@ -1941,7 +2027,7 @@ template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const const CUdeviceptr, const size_t, const size_t, const double, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const Transpose, const size_t, const size_t, const float2, @@ -1949,7 +2035,7 @@ template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const const CUdeviceptr, const size_t, const size_t, const float2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const Transpose, const size_t, const size_t, const double2, @@ -1957,7 +2043,7 @@ template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, cons const CUdeviceptr, const size_t, const size_t, const double2, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const Transpose, const size_t, const size_t, const half, @@ -1965,7 +2051,7 @@ template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const T const CUdeviceptr, const size_t, const size_t, const half, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Rank-2K update of a hermitian matrix: CHER2K/ZHER2K template @@ -1976,9 +2062,11 @@ StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose a const CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, const U beta, CUdeviceptr c_buffer, const size_t c_offset, const size_t c_ld, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xher2k(queue_cpp, event); routine.DoHer2k(layout, triangle, ab_transpose, n, k, @@ -1997,7 +2085,7 @@ template StatusCode PUBLIC_API Her2k(const Layout, const Triangle, const CUdeviceptr, const size_t, const size_t, const float, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Her2k(const Layout, const Triangle, const Transpose, const size_t, const size_t, const double2, @@ -2005,7 +2093,7 @@ template StatusCode PUBLIC_API Her2k(const Layout, const Triangl const CUdeviceptr, const size_t, const size_t, const double, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM template @@ -2014,9 +2102,11 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, c const T alpha, const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xtrmm(queue_cpp, event); routine.DoTrmm(layout, side, triangle, a_transpose, diagonal, m, n, @@ -2031,31 +2121,31 @@ template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Trian const float, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const double, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const float2, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const double2, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const half, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM template @@ -2064,9 +2154,11 @@ StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, c const T alpha, const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xtrsm(queue_cpp, event); routine.DoTrsm(layout, side, triangle, a_transpose, diagonal, m, n, @@ -2081,25 +2173,25 @@ template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Trian const float, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const double, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const float2, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const double2, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // ================================================================================================= // Extra non-BLAS routines (level-X) @@ -2112,9 +2204,11 @@ StatusCode Omatcopy(const Layout layout, const Transpose a_transpose, const T alpha, const CUdeviceptr a_buffer, const size_t a_offset, const size_t a_ld, CUdeviceptr b_buffer, const size_t b_offset, const size_t b_ld, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xomatcopy(queue_cpp, event); routine.DoOmatcopy(layout, a_transpose, m, n, @@ -2129,40 +2223,42 @@ template StatusCode PUBLIC_API Omatcopy(const Layout, const Transpose, const float, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Omatcopy(const Layout, const Transpose, const size_t, const size_t, const double, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Omatcopy(const Layout, const Transpose, const size_t, const size_t, const float2, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Omatcopy(const Layout, const Transpose, const size_t, const size_t, const double2, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Omatcopy(const Layout, const Transpose, const size_t, const size_t, const half, const CUdeviceptr, const size_t, const size_t, CUdeviceptr, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Im2col function (non-BLAS function): SIM2COL/DIM2COL/CIM2COL/ZIM2COL/HIM2COL template StatusCode Im2col(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const CUdeviceptr im_buffer, const size_t im_offset, CUdeviceptr col_buffer, const size_t col_offset, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = Xim2col(queue_cpp, event); routine.DoIm2col(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, Buffer(im_buffer), im_offset, @@ -2173,23 +2269,23 @@ StatusCode Im2col(const size_t channels, const size_t height, const size_t width template StatusCode PUBLIC_API Im2col(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Im2col(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Im2col(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Im2col(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API Im2col(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const CUdeviceptr, const size_t, CUdeviceptr, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED template @@ -2198,9 +2294,11 @@ StatusCode AxpyBatched(const size_t n, const CUdeviceptr x_buffer, const size_t *x_offsets, const size_t x_inc, CUdeviceptr y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = XaxpyBatched(queue_cpp, event); auto alphas_cpp = std::vector(); auto x_offsets_cpp = std::vector(); @@ -2223,31 +2321,31 @@ template StatusCode PUBLIC_API AxpyBatched(const size_t, const CUdeviceptr, const size_t*, const size_t, CUdeviceptr, const size_t*, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API AxpyBatched(const size_t, const double*, const CUdeviceptr, const size_t*, const size_t, CUdeviceptr, const size_t*, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API AxpyBatched(const size_t, const float2*, const CUdeviceptr, const size_t*, const size_t, CUdeviceptr, const size_t*, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API AxpyBatched(const size_t, const double2*, const CUdeviceptr, const size_t*, const size_t, CUdeviceptr, const size_t*, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API AxpyBatched(const size_t, const half*, const CUdeviceptr, const size_t*, const size_t, CUdeviceptr, const size_t*, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED template @@ -2259,9 +2357,11 @@ StatusCode GemmBatched(const Layout layout, const Transpose a_transpose, const T const T *betas, CUdeviceptr c_buffer, const size_t *c_offsets, const size_t c_ld, const size_t batch_count, - CUstream* stream) { + const CUcontext context, const CUdevice device) { try { - auto queue_cpp = Queue(*queue); + const auto context_cpp = Context(context); + const auto device_cpp = Device(device); + auto queue_cpp = Queue(context_cpp, device_cpp); auto routine = XgemmBatched(queue_cpp, event); auto alphas_cpp = std::vector(); auto betas_cpp = std::vector(); @@ -2294,7 +2394,7 @@ template StatusCode PUBLIC_API GemmBatched(const Layout, const Transpose, const float*, CUdeviceptr, const size_t*, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API GemmBatched(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const double*, @@ -2303,7 +2403,7 @@ template StatusCode PUBLIC_API GemmBatched(const Layout, const Transpose const double*, CUdeviceptr, const size_t*, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API GemmBatched(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const float2*, @@ -2312,7 +2412,7 @@ template StatusCode PUBLIC_API GemmBatched(const Layout, const Transpose const float2*, CUdeviceptr, const size_t*, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API GemmBatched(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const double2*, @@ -2321,7 +2421,7 @@ template StatusCode PUBLIC_API GemmBatched(const Layout, const Transpos const double2*, CUdeviceptr, const size_t*, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); template StatusCode PUBLIC_API GemmBatched(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const half*, @@ -2330,7 +2430,7 @@ template StatusCode PUBLIC_API GemmBatched(const Layout, const Transpose, const half*, CUdeviceptr, const size_t*, const size_t, const size_t, - CUstream*); + const CUcontext, const CUdevice); // ================================================================================================= } // namespace clblast diff --git a/src/utilities/buffer_test.hpp b/src/utilities/buffer_test.hpp index a5b6be4b..fd071434 100644 --- a/src/utilities/buffer_test.hpp +++ b/src/utilities/buffer_test.hpp @@ -15,7 +15,7 @@ #ifndef CLBLAST_BUFFER_TEST_H_ #define CLBLAST_BUFFER_TEST_H_ -#include "utilities/utilities.hpp +#include "utilities/utilities.hpp" namespace clblast { // =================================================================================================