From 2776d761768295b01a8be7c333dbb337805d7f77 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren
Date: Sat, 5 May 2018 14:06:33 +0200
Subject: [PATCH] Added interface of batched convolution as GEMM

---
 doc/api.md                             |  8 +--
 include/clblast.h                      |  8 +++
 include/clblast_c.h                    | 27 +++++++++
 include/clblast_cuda.h                 |  8 +++
 scripts/generator/generator.py         | 12 +++-
 scripts/generator/generator/routine.py | 15 +++--
 src/clblast.cpp                        | 35 ++++++++++++
 src/clblast_c.cpp                      | 77 ++++++++++++++++++++++++++
 src/clblast_cuda.cpp                   | 35 ++++++++++++
 9 files changed, 213 insertions(+), 12 deletions(-)

diff --git a/doc/api.md b/doc/api.md
index a60e16ce..2f861ce0 100644
--- a/doc/api.md
+++ b/doc/api.md
@@ -3063,10 +3063,10 @@ Arguments to IM2COL:
 * `const size_t stride_w`: Integer size argument. This value must be positive.
 * `const size_t dilation_h`: Integer size argument. This value must be positive.
 * `const size_t dilation_w`: Integer size argument. This value must be positive.
-* `const cl_mem im_buffer`: OpenCL buffer to store the input im vector.
-* `const size_t im_offset`: The offset in elements from the start of the input im vector.
-* `cl_mem col_buffer`: OpenCL buffer to store the output col vector.
-* `const size_t col_offset`: The offset in elements from the start of the output col vector.
+* `const cl_mem im_buffer`: OpenCL buffer to store the input im tensor.
+* `const size_t im_offset`: The offset in elements from the start of the input im tensor.
+* `cl_mem col_buffer`: OpenCL buffer to store the output col tensor.
+* `const size_t col_offset`: The offset in elements from the start of the output col tensor.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
diff --git a/include/clblast.h b/include/clblast.h
index ce64b37a..3e65f52a 100644
--- a/include/clblast.h
+++ b/include/clblast.h
@@ -636,6 +636,14 @@ StatusCode Im2col(const size_t channels, const size_t height, const size_t width
                   cl_mem col_buffer, const size_t col_offset,
                   cl_command_queue* queue, cl_event* event = nullptr);
 
+// Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/CCONVGEMM/ZCONVGEMM/HCONVGEMM
+template <typename T>
+StatusCode Convgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
+                    const cl_mem im_buffer, const size_t im_offset,
+                    const cl_mem kernel_buffer, const size_t kernel_offset,
+                    cl_mem result_buffer, const size_t result_offset,
+                    cl_command_queue* queue, cl_event* event = nullptr);
+
 // Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED
 template <typename T>
 StatusCode AxpyBatched(const size_t n,
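For context, a minimal sketch of a call site for this new C++ entry point. The function name, OpenCL setup, and sizes are illustrative assumptions, not part of the patch; as added here the routine is interface-only and returns StatusCode::kNotImplemented.

// Hypothetical call site (not part of this patch). Assumes an existing OpenCL
// command queue and cl_mem buffers sized per the generator's size helpers
// below (im: height*width*channels*batch_count, kernel: kernel_h*kernel_w*num_kernels,
// result: height_out*width_out*num_kernels*batch_count).
#include <clblast.h>

clblast::StatusCode RunConvgemm(cl_mem im, cl_mem kernel, cl_mem result,
                                cl_command_queue queue) {
  const size_t channels = 3, height = 32, width = 32;  // input feature map
  const size_t kernel_h = 3, kernel_w = 3;             // filter size
  const size_t pad_h = 1, pad_w = 1;                   // zero-padding
  const size_t stride_h = 1, stride_w = 1;
  const size_t dilation_h = 1, dilation_w = 1;
  const size_t num_kernels = 8, batch_count = 4;       // filters and batch size
  return clblast::Convgemm<float>(channels, height, width, kernel_h, kernel_w,
                                  pad_h, pad_w, stride_h, stride_w,
                                  dilation_h, dilation_w, num_kernels, batch_count,
                                  im, 0, kernel, 0, result, 0,
                                  &queue, nullptr);  // kNotImplemented for now
}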
diff --git a/include/clblast_c.h b/include/clblast_c.h
index 23a3afcc..918c25f6 100644
--- a/include/clblast_c.h
+++ b/include/clblast_c.h
@@ -1410,6 +1410,33 @@ CLBlastStatusCode PUBLIC_API CLBlastHim2col(const size_t channels, const size_t
                                             cl_mem col_buffer, const size_t col_offset,
                                             cl_command_queue* queue, cl_event* event);
 
+// Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/CCONVGEMM/ZCONVGEMM/HCONVGEMM
+CLBlastStatusCode PUBLIC_API CLBlastSconvgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
+                                              const cl_mem im_buffer, const size_t im_offset,
+                                              const cl_mem kernel_buffer, const size_t kernel_offset,
+                                              cl_mem result_buffer, const size_t result_offset,
+                                              cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDconvgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
+                                              const cl_mem im_buffer, const size_t im_offset,
+                                              const cl_mem kernel_buffer, const size_t kernel_offset,
+                                              cl_mem result_buffer, const size_t result_offset,
+                                              cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastCconvgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
+                                              const cl_mem im_buffer, const size_t im_offset,
+                                              const cl_mem kernel_buffer, const size_t kernel_offset,
+                                              cl_mem result_buffer, const size_t result_offset,
+                                              cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastZconvgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
+                                              const cl_mem im_buffer, const size_t im_offset,
+                                              const cl_mem kernel_buffer, const size_t kernel_offset,
+                                              cl_mem result_buffer, const size_t result_offset,
+                                              cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastHconvgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
+                                              const cl_mem im_buffer, const size_t im_offset,
+                                              const cl_mem kernel_buffer, const size_t kernel_offset,
+                                              cl_mem result_buffer, const size_t result_offset,
+                                              cl_command_queue* queue, cl_event* event);
+
 // Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED
 CLBlastStatusCode PUBLIC_API CLBlastSaxpyBatched(const size_t n,
                                                  const float *alphas,
diff --git a/include/clblast_cuda.h b/include/clblast_cuda.h
index d82ee331..01044037 100644
--- a/include/clblast_cuda.h
+++ b/include/clblast_cuda.h
@@ -608,6 +608,14 @@ StatusCode Im2col(const size_t channels, const size_t height, const size_t width
                   CUdeviceptr col_buffer, const size_t col_offset,
                   const CUcontext context, const CUdevice device);
 
+// Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/CCONVGEMM/ZCONVGEMM/HCONVGEMM
+template <typename T>
+StatusCode Convgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
+                    const CUdeviceptr im_buffer, const size_t im_offset,
+                    const CUdeviceptr kernel_buffer, const size_t kernel_offset,
+                    CUdeviceptr result_buffer, const size_t result_offset,
+                    const CUcontext context, const CUdevice device);
+
 // Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED
 template <typename T>
 StatusCode AxpyBatched(const size_t n,
diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py
index 32420962..e2837dd5 100755
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@@ -50,7 +50,7 @@ FILES = [
     "/src/pyclblast/src/pyclblast.pyx"
 ]
 HEADER_LINES = [123, 21, 127, 24, 29, 41, 29, 65, 32, 95, 21, 290]
-FOOTER_LINES = [98, 56, 112, 275, 6, 6, 6, 9, 2, 41, 55, 37]
+FOOTER_LINES = [98, 57, 112, 275, 6, 6, 6, 9, 2, 41, 56, 37]
 HEADER_LINES_DOC = 0
 FOOTER_LINES_DOC = 232
 
@@ -106,11 +106,16 @@ ammn = size_helper("layout == CLBlastLayoutRowMajor", "m", "((side == CLBlastSid
m : n)", "n", "b_ld") im = "height * width * channels" col = "height * width * channels" +imb = "height * width * channels * batch_count" +kernel = "kernel_h * kernel_w * num_kernels" +result = "height_out * width_out * num_kernels * batch_count" + # ================================================================================================== # Populates a list of routines im2col_constants = ["channels", "height", "width", "kernel_h", "kernel_w", "pad_h", "pad_w", "stride_h", "stride_w", "dilation_h", "dilation_w"] +convgemm_constants = im2col_constants + ["num_kernels", "batch_count"] ROUTINES = [ [ # Level 1: vector-vector Routine(False, True, 0, False, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], ["1","1","1","1"], [], "", "Generate givens plane rotation", "", []), @@ -176,6 +181,7 @@ ROUTINES = [ Routine(True, True, 0, False, "x", "had", T, [S,D,C,Z,H], ["n"], [], ["x","y"], ["z"], [xn,yn,zn], ["alpha","beta"], "", "Element-wise vector product (Hadamard)", "Performs the Hadamard element-wise product _z = alpha * x * y + beta * z_, in which _x_, _y_, and _z_ are vectors and _alpha_ and _beta_ are scalar constants.", []), Routine(True, True, 0, False, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], [amn,bnma], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]), Routine(True, True, 0, False, "x", "im2col", T, [S,D,C,Z,H], im2col_constants, [], ["im"], ["col"], [im,col], [""], "", "Im2col function (non-BLAS function)", "Performs the im2col algorithm, in which _im_ is the input matrix and _col_ is the output matrix.", []), + Routine(False, True, 0, False, "x", "convgemm", T, [S,D,C,Z,H], convgemm_constants, [], ["im","kernel"], ["result"], [imb,kernel,result],[""], "", "Batched convolution as GEMM (non-BLAS function)", "Integrates im2col and GEMM for batched convolution, in which _im_ is the 4D input tensor, _kernel_ the 3D kernel weights tensor, and _result_ the 4D output tensor.", []), # Batched routines: Routine(True, True, 1, False, "x", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [xn,yn], ["alpha"], "", "Batched version of AXPY", "As AXPY, but multiple operations are batched together for better performance.", []), Routine(True, True, 1, False, "x", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], [amk,bkn,cmn], ["alpha","beta"], "", "Batched version of GEMM", "As GEMM, but multiple operations are batched together for better performance.", [ald_transa_m_k, bld_transb_k_n, cld_m]), @@ -230,10 +236,10 @@ def main(argv): if i == 6: body += cpp.wrapper_cublas(routine) if i == 7: - if routine.batched == 0: + if routine.batched == 0 and routine.name not in ["convgemm"]: body += cpp.clblast_netlib_c_h(routine) if i == 8: - if routine.batched == 0: + if routine.batched == 0 and routine.name not in ["convgemm"]: body += cpp.clblast_netlib_c_cc(routine) if i == 9: body += cpp.clblast_h(routine, cuda=True) diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py index 317c8e7b..7321349d 100644 --- a/scripts/generator/generator/routine.py +++ b/scripts/generator/generator/routine.py @@ -141,6 +141,11 @@ class Routine: 
"""Distinguish between vectors and matrices""" return ["a", "b", "c", "ap"] + @staticmethod + def buffers_tensor(): + """Distinguish between vectors and matrices and tensors""" + return ["im", "col", "kernel", "result"] + @staticmethod def routines_scalar_no_return(): return ["dotu", "dotc"] @@ -187,7 +192,7 @@ class Routine: def buffers_without_ld_inc(self): """List of buffers without 'inc' or 'ld'""" - return self.scalar_buffers_first() + self.scalar_buffers_second() + ["ap", "im", "col"] + return self.scalar_buffers_first() + self.scalar_buffers_second() + ["ap", "im", "col", "kernel", "result"] def get_buffer_type(self, name, flavour): if name in self.index_buffers(): @@ -200,7 +205,7 @@ class Routine: def no_scalars(self): """Determines whether or not this routine has scalar arguments (alpha/beta)""" - return self.scalars == [] or self.name == "im2col" + return self.scalars == [] or self.name in ["im2col", "convgemm"] def has_layout(self): """Determines whether the layout is an argument""" @@ -221,12 +226,12 @@ class Routine: """Determines which buffers go first (between alpha and beta) and which ones go after""" if self.level == "2b" or self.name == "had": return ["x", "y"] - return ["ap", "a", "b", "x", "im"] + return ["ap", "a", "b", "x", "im", "kernel"] def buffers_second(self): if self.level == "2b" or self.name == "had": return ["z", "ap", "a", "b", "c"] - return ["y", "c", "col"] + return ["y", "c", "col", "result"] def buffer(self, name): """Retrieves a variable name for a specific input/output vector/matrix (e.g. 'x')""" @@ -397,7 +402,7 @@ class Routine: prefix = "const " if (name in self.inputs) else "" inout = "input" if (name in self.inputs) else "output" if (name in self.inputs) or (name in self.outputs): - math_name = name.upper() + " matrix" if (name in self.buffers_matrix()) else name + " vector" + math_name = name.upper() + " matrix" if (name in self.buffers_matrix()) else name + " tensor" if (name in self.buffers_tensor()) else name + " vector" inc_ld_description = "Leading dimension " if (name in self.buffers_matrix()) else "Stride/increment " a = ["`" + prefix + "cl_mem " + name + "_buffer`: OpenCL buffer to store the " + inout + " " + math_name + "."] b = ["`const size_t " + self.b_star() + name + "_offset" + self.b_s() + "`: The offset" + self.b_s() + " in elements from the start of the " + inout + " " + math_name + "."] diff --git a/src/clblast.cpp b/src/clblast.cpp index 10bb8cba..026285bb 100644 --- a/src/clblast.cpp +++ b/src/clblast.cpp @@ -2252,6 +2252,41 @@ template StatusCode PUBLIC_API Im2col(const size_t, const size_t, const si cl_mem, const size_t, cl_command_queue*, cl_event*); +// Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/CCONVGEMM/ZCONVGEMM/HCONVGEMM +template +StatusCode Convgemm(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, + const cl_mem, const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Convgemm(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, + const cl_mem, const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Convgemm(const size_t, const size_t, const 
+template StatusCode PUBLIC_API Convgemm<double>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+                                                const cl_mem, const size_t,
+                                                const cl_mem, const size_t,
+                                                cl_mem, const size_t,
+                                                cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Convgemm<float2>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+                                                const cl_mem, const size_t,
+                                                const cl_mem, const size_t,
+                                                cl_mem, const size_t,
+                                                cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Convgemm<double2>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+                                                 const cl_mem, const size_t,
+                                                 const cl_mem, const size_t,
+                                                 cl_mem, const size_t,
+                                                 cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Convgemm<half>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+                                              const cl_mem, const size_t,
+                                              const cl_mem, const size_t,
+                                              cl_mem, const size_t,
+                                              cl_command_queue*, cl_event*);
+
 // Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED
 template <typename T>
 StatusCode AxpyBatched(const size_t n,
diff --git a/src/clblast_c.cpp b/src/clblast_c.cpp
index 06a5fc67..27f0c936 100644
--- a/src/clblast_c.cpp
+++ b/src/clblast_c.cpp
@@ -3679,6 +3679,83 @@ CLBlastStatusCode CLBlastHim2col(const size_t channels, const size_t height, con
 } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
 }
 
+// CONVGEMM
+CLBlastStatusCode CLBlastSconvgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
+                                   const cl_mem im_buffer, const size_t im_offset,
+                                   const cl_mem kernel_buffer, const size_t kernel_offset,
+                                   cl_mem result_buffer, const size_t result_offset,
+                                   cl_command_queue* queue, cl_event* event) {
+  try {
+    return static_cast<CLBlastStatusCode>(
+      clblast::Convgemm<float>(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, num_kernels, batch_count,
+                               im_buffer, im_offset,
+                               kernel_buffer, kernel_offset,
+                               result_buffer, result_offset,
+                               queue, event)
+    );
+  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDconvgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
+                                   const cl_mem im_buffer, const size_t im_offset,
+                                   const cl_mem kernel_buffer, const size_t kernel_offset,
+                                   cl_mem result_buffer, const size_t result_offset,
+                                   cl_command_queue* queue, cl_event* event) {
+  try {
+    return static_cast<CLBlastStatusCode>(
+      clblast::Convgemm<double>(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, num_kernels, batch_count,
+                                im_buffer, im_offset,
+                                kernel_buffer, kernel_offset,
+                                result_buffer, result_offset,
+                                queue, event)
+    );
+  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastCconvgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
+                                   const cl_mem im_buffer, const size_t im_offset,
+                                   const cl_mem kernel_buffer, const size_t kernel_offset,
+                                   cl_mem result_buffer, const size_t result_offset,
+                                   cl_command_queue* queue, cl_event* event) {
+  try {
+    return static_cast<CLBlastStatusCode>(
+      clblast::Convgemm<float2>(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, num_kernels, batch_count,
+                                im_buffer, im_offset,
+                                kernel_buffer, kernel_offset,
+                                result_buffer, result_offset,
+                                queue, event)
+    );
+  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastZconvgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
+                                   const cl_mem im_buffer, const size_t im_offset,
+                                   const cl_mem kernel_buffer, const size_t kernel_offset,
+                                   cl_mem result_buffer, const size_t result_offset,
+                                   cl_command_queue* queue, cl_event* event) {
+  try {
+    return static_cast<CLBlastStatusCode>(
+      clblast::Convgemm<double2>(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, num_kernels, batch_count,
+                                 im_buffer, im_offset,
+                                 kernel_buffer, kernel_offset,
+                                 result_buffer, result_offset,
+                                 queue, event)
+    );
+  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastHconvgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
+                                   const cl_mem im_buffer, const size_t im_offset,
+                                   const cl_mem kernel_buffer, const size_t kernel_offset,
+                                   cl_mem result_buffer, const size_t result_offset,
+                                   cl_command_queue* queue, cl_event* event) {
+  try {
+    return static_cast<CLBlastStatusCode>(
+      clblast::Convgemm<half>(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, num_kernels, batch_count,
+                              im_buffer, im_offset,
+                              kernel_buffer, kernel_offset,
+                              result_buffer, result_offset,
+                              queue, event)
+    );
+  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+
 
 // AXPY
 CLBlastStatusCode CLBlastSaxpyBatched(const size_t n,
                                       const float *alphas,
diff --git a/src/clblast_cuda.cpp b/src/clblast_cuda.cpp
index 8927014b..f89fb77d 100644
--- a/src/clblast_cuda.cpp
+++ b/src/clblast_cuda.cpp
@@ -2350,6 +2350,41 @@ template StatusCode PUBLIC_API Im2col<half>(const size_t, const size_t, const si
                                             CUdeviceptr, const size_t,
                                             const CUcontext, const CUdevice);
 
+// Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/CCONVGEMM/ZCONVGEMM/HCONVGEMM
+template <typename T>
+StatusCode Convgemm(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+                    const CUdeviceptr, const size_t,
+                    const CUdeviceptr, const size_t,
+                    CUdeviceptr, const size_t,
+                    const CUcontext, const CUdevice) {
+  return StatusCode::kNotImplemented;
+}
+template StatusCode PUBLIC_API Convgemm<float>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+                                               const CUdeviceptr, const size_t,
+                                               const CUdeviceptr, const size_t,
+                                               CUdeviceptr, const size_t,
+                                               const CUcontext, const CUdevice);
+template StatusCode PUBLIC_API Convgemm<double>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+                                                const CUdeviceptr, const size_t,
+                                                const CUdeviceptr, const size_t,
+                                                CUdeviceptr, const size_t,
+                                                const CUcontext, const CUdevice);
+template StatusCode PUBLIC_API Convgemm<float2>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+                                                const CUdeviceptr, const size_t,
+                                                const CUdeviceptr, const size_t,
+                                                CUdeviceptr, const size_t,
+                                                const CUcontext, const CUdevice);
+template StatusCode PUBLIC_API Convgemm<double2>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+                                                 const CUdeviceptr, const size_t,
+                                                 const CUdeviceptr, const size_t,
+                                                 CUdeviceptr, const size_t,
+                                                 const CUcontext, const CUdevice);
+template StatusCode PUBLIC_API Convgemm<half>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+                                              const CUdeviceptr, const size_t,
+                                              const CUdeviceptr, const size_t,
+                                              CUdeviceptr, const size_t,
+                                              const CUcontext, const CUdevice);
+
 // Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED
 template <typename T>
 StatusCode AxpyBatched(const size_t n,
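For completeness, a sketch of the corresponding C API from include/clblast_c.h. The function name and sizes are illustrative assumptions; every variant added by this patch is wired through to the C++ routine, which currently returns not-implemented.

// Hypothetical C-style call site (buffer creation and error handling omitted).
#include <clblast_c.h>

CLBlastStatusCode run_sconvgemm(cl_mem im, cl_mem kernel, cl_mem result,
                                cl_command_queue queue) {
  cl_event event = NULL;
  return CLBlastSconvgemm(3, 32, 32,   // channels, height, width
                          3, 3,        // kernel_h, kernel_w
                          1, 1,        // pad_h, pad_w
                          1, 1,        // stride_h, stride_w
                          1, 1,        // dilation_h, dilation_w
                          8, 4,        // num_kernels, batch_count
                          im, 0, kernel, 0, result, 0,
                          &queue, &event);
}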