From 2776d761768295b01a8be7c333dbb337805d7f77 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren
Date: Sat, 5 May 2018 14:06:33 +0200
Subject: [PATCH] Added interface of batched convolution as GEMM

---
 doc/api.md                             |  8 +--
 include/clblast.h                      |  8 +++
 include/clblast_c.h                    | 27 +++++++++
 include/clblast_cuda.h                 |  8 +++
 scripts/generator/generator.py         | 12 +++-
 scripts/generator/generator/routine.py | 15 +++--
 src/clblast.cpp                        | 35 ++++++++++++
 src/clblast_c.cpp                      | 77 ++++++++++++++++++++++++++
 src/clblast_cuda.cpp                   | 35 ++++++++++++
 9 files changed, 213 insertions(+), 12 deletions(-)

diff --git a/doc/api.md b/doc/api.md
index a60e16ce..2f861ce0 100644
--- a/doc/api.md
+++ b/doc/api.md
@@ -3063,10 +3063,10 @@ Arguments to IM2COL:
 * `const size_t stride_w`: Integer size argument. This value must be positive.
 * `const size_t dilation_h`: Integer size argument. This value must be positive.
 * `const size_t dilation_w`: Integer size argument. This value must be positive.
-* `const cl_mem im_buffer`: OpenCL buffer to store the input im vector.
-* `const size_t im_offset`: The offset in elements from the start of the input im vector.
-* `cl_mem col_buffer`: OpenCL buffer to store the output col vector.
-* `const size_t col_offset`: The offset in elements from the start of the output col vector.
+* `const cl_mem im_buffer`: OpenCL buffer to store the input im tensor.
+* `const size_t im_offset`: The offset in elements from the start of the input im tensor.
+* `cl_mem col_buffer`: OpenCL buffer to store the output col tensor.
+* `const size_t col_offset`: The offset in elements from the start of the output col tensor.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
diff --git a/include/clblast.h b/include/clblast.h
index ce64b37a..3e65f52a 100644
--- a/include/clblast.h
+++ b/include/clblast.h
@@ -636,6 +636,14 @@ StatusCode Im2col(const size_t channels, const size_t height, const size_t width
                   cl_mem col_buffer, const size_t col_offset,
                   cl_command_queue* queue, cl_event* event = nullptr);
 
+// Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/CCONVGEMM/ZCONVGEMM/HCONVGEMM
+template <typename T>
+StatusCode Convgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
+                    const cl_mem im_buffer, const size_t im_offset,
+                    const cl_mem kernel_buffer, const size_t kernel_offset,
+                    cl_mem result_buffer, const size_t result_offset,
+                    cl_command_queue* queue, cl_event* event = nullptr);
+
 // Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED
 template <typename T>
 StatusCode AxpyBatched(const size_t n,
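For context, a minimal sketch of a call site for this new C++ entry point. The function name, OpenCL setup, and sizes are illustrative assumptions, not part of the patch; as added here the routine is interface-only and returns StatusCode::kNotImplemented.

// Hypothetical call site (not part of this patch). Assumes an existing OpenCL
// command queue and cl_mem buffers sized per the generator's size helpers
// below (im: height*width*channels*batch_count, kernel: kernel_h*kernel_w*num_kernels,
// result: height_out*width_out*num_kernels*batch_count).
#include <clblast.h>

clblast::StatusCode RunConvgemm(cl_mem im, cl_mem kernel, cl_mem result,
                                cl_command_queue queue) {
  const size_t channels = 3, height = 32, width = 32;  // input feature map
  const size_t kernel_h = 3, kernel_w = 3;             // filter size
  const size_t pad_h = 1, pad_w = 1;                   // zero-padding
  const size_t stride_h = 1, stride_w = 1;
  const size_t dilation_h = 1, dilation_w = 1;
  const size_t num_kernels = 8, batch_count = 4;       // filters and batch size
  return clblast::Convgemm<float>(channels, height, width, kernel_h, kernel_w,
                                  pad_h, pad_w, stride_h, stride_w,
                                  dilation_h, dilation_w, num_kernels, batch_count,
                                  im, 0, kernel, 0, result, 0,
                                  &queue, nullptr);  // kNotImplemented for now
}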
diff --git a/include/clblast_c.h b/include/clblast_c.h
index 23a3afcc..918c25f6 100644
--- a/include/clblast_c.h
+++ b/include/clblast_c.h
@@ -1410,6 +1410,33 @@ CLBlastStatusCode PUBLIC_API CLBlastHim2col(const size_t channels, const size_t
                                             cl_mem col_buffer, const size_t col_offset,
                                             cl_command_queue* queue, cl_event* event);
 
+// Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/CCONVGEMM/ZCONVGEMM/HCONVGEMM
+CLBlastStatusCode PUBLIC_API CLBlastSconvgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
+                                              const cl_mem im_buffer, const size_t im_offset,
+                                              const cl_mem kernel_buffer, const size_t kernel_offset,
+                                              cl_mem result_buffer, const size_t result_offset,
+                                              cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDconvgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
+                                              const cl_mem im_buffer, const size_t im_offset,
+                                              const cl_mem kernel_buffer, const size_t kernel_offset,
+                                              cl_mem result_buffer, const size_t result_offset,
+                                              cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastCconvgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
+                                              const cl_mem im_buffer, const size_t im_offset,
+                                              const cl_mem kernel_buffer, const size_t kernel_offset,
+                                              cl_mem result_buffer, const size_t result_offset,
+                                              cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastZconvgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
+                                              const cl_mem im_buffer, const size_t im_offset,
+                                              const cl_mem kernel_buffer, const size_t kernel_offset,
+                                              cl_mem result_buffer, const size_t result_offset,
+                                              cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastHconvgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
+                                              const cl_mem im_buffer, const size_t im_offset,
+                                              const cl_mem kernel_buffer, const size_t kernel_offset,
+                                              cl_mem result_buffer, const size_t result_offset,
+                                              cl_command_queue* queue, cl_event* event);
+
 // Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED
 CLBlastStatusCode PUBLIC_API CLBlastSaxpyBatched(const size_t n,
                                                  const float *alphas,
diff --git a/include/clblast_cuda.h b/include/clblast_cuda.h
index d82ee331..01044037 100644
--- a/include/clblast_cuda.h
+++ b/include/clblast_cuda.h
@@ -608,6 +608,14 @@ StatusCode Im2col(const size_t channels, const size_t height, const size_t width
                   CUdeviceptr col_buffer, const size_t col_offset,
                   const CUcontext context, const CUdevice device);
 
+// Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/CCONVGEMM/ZCONVGEMM/HCONVGEMM
+template <typename T>
+StatusCode Convgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
+                    const CUdeviceptr im_buffer, const size_t im_offset,
+                    const CUdeviceptr kernel_buffer, const size_t kernel_offset,
+                    CUdeviceptr result_buffer, const size_t result_offset,
+                    const CUcontext context, const CUdevice device);
+
 // Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED
 template <typename T>
 StatusCode AxpyBatched(const size_t n,
diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py
index 32420962..e2837dd5 100755
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@@ -50,7 +50,7 @@ FILES = [
     "/src/pyclblast/src/pyclblast.pyx"
 ]
 HEADER_LINES = [123, 21, 127, 24, 29, 41, 29, 65, 32, 95, 21, 290]
-FOOTER_LINES = [98, 56, 112, 275, 6, 6, 6, 9, 2, 41, 55, 37]
+FOOTER_LINES = [98, 57, 112, 275, 6, 6, 6, 9, 2, 41, 56, 37]
 HEADER_LINES_DOC = 0
 FOOTER_LINES_DOC = 232
 
@@ -106,11 +106,16 @@ ammn = size_helper("layout == CLBlastLayoutRowMajor", "m", "((side == CLBlastSid
m : n)", "n", "b_ld") im = "height * width * channels" col = "height * width * channels" +imb = "height * width * channels * batch_count" +kernel = "kernel_h * kernel_w * num_kernels" +result = "height_out * width_out * num_kernels * batch_count" + # ================================================================================================== # Populates a list of routines im2col_constants = ["channels", "height", "width", "kernel_h", "kernel_w", "pad_h", "pad_w", "stride_h", "stride_w", "dilation_h", "dilation_w"] +convgemm_constants = im2col_constants + ["num_kernels", "batch_count"] ROUTINES = [ [ # Level 1: vector-vector Routine(False, True, 0, False, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], ["1","1","1","1"], [], "", "Generate givens plane rotation", "", []), @@ -176,6 +181,7 @@ ROUTINES = [ Routine(True, True, 0, False, "x", "had", T, [S,D,C,Z,H], ["n"], [], ["x","y"], ["z"], [xn,yn,zn], ["alpha","beta"], "", "Element-wise vector product (Hadamard)", "Performs the Hadamard element-wise product _z = alpha * x * y + beta * z_, in which _x_, _y_, and _z_ are vectors and _alpha_ and _beta_ are scalar constants.", []), Routine(True, True, 0, False, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], [amn,bnma], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]), Routine(True, True, 0, False, "x", "im2col", T, [S,D,C,Z,H], im2col_constants, [], ["im"], ["col"], [im,col], [""], "", "Im2col function (non-BLAS function)", "Performs the im2col algorithm, in which _im_ is the input matrix and _col_ is the output matrix.", []), + Routine(False, True, 0, False, "x", "convgemm", T, [S,D,C,Z,H], convgemm_constants, [], ["im","kernel"], ["result"], [imb,kernel,result],[""], "", "Batched convolution as GEMM (non-BLAS function)", "Integrates im2col and GEMM for batched convolution, in which _im_ is the 4D input tensor, _kernel_ the 3D kernel weights tensor, and _result_ the 4D output tensor.", []), # Batched routines: Routine(True, True, 1, False, "x", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [xn,yn], ["alpha"], "", "Batched version of AXPY", "As AXPY, but multiple operations are batched together for better performance.", []), Routine(True, True, 1, False, "x", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], [amk,bkn,cmn], ["alpha","beta"], "", "Batched version of GEMM", "As GEMM, but multiple operations are batched together for better performance.", [ald_transa_m_k, bld_transb_k_n, cld_m]), @@ -230,10 +236,10 @@ def main(argv): if i == 6: body += cpp.wrapper_cublas(routine) if i == 7: - if routine.batched == 0: + if routine.batched == 0 and routine.name not in ["convgemm"]: body += cpp.clblast_netlib_c_h(routine) if i == 8: - if routine.batched == 0: + if routine.batched == 0 and routine.name not in ["convgemm"]: body += cpp.clblast_netlib_c_cc(routine) if i == 9: body += cpp.clblast_h(routine, cuda=True) diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py index 317c8e7b..7321349d 100644 --- a/scripts/generator/generator/routine.py +++ b/scripts/generator/generator/routine.py @@ -141,6 +141,11 @@ class Routine: 
"""Distinguish between vectors and matrices""" return ["a", "b", "c", "ap"] + @staticmethod + def buffers_tensor(): + """Distinguish between vectors and matrices and tensors""" + return ["im", "col", "kernel", "result"] + @staticmethod def routines_scalar_no_return(): return ["dotu", "dotc"] @@ -187,7 +192,7 @@ class Routine: def buffers_without_ld_inc(self): """List of buffers without 'inc' or 'ld'""" - return self.scalar_buffers_first() + self.scalar_buffers_second() + ["ap", "im", "col"] + return self.scalar_buffers_first() + self.scalar_buffers_second() + ["ap", "im", "col", "kernel", "result"] def get_buffer_type(self, name, flavour): if name in self.index_buffers(): @@ -200,7 +205,7 @@ class Routine: def no_scalars(self): """Determines whether or not this routine has scalar arguments (alpha/beta)""" - return self.scalars == [] or self.name == "im2col" + return self.scalars == [] or self.name in ["im2col", "convgemm"] def has_layout(self): """Determines whether the layout is an argument""" @@ -221,12 +226,12 @@ class Routine: """Determines which buffers go first (between alpha and beta) and which ones go after""" if self.level == "2b" or self.name == "had": return ["x", "y"] - return ["ap", "a", "b", "x", "im"] + return ["ap", "a", "b", "x", "im", "kernel"] def buffers_second(self): if self.level == "2b" or self.name == "had": return ["z", "ap", "a", "b", "c"] - return ["y", "c", "col"] + return ["y", "c", "col", "result"] def buffer(self, name): """Retrieves a variable name for a specific input/output vector/matrix (e.g. 'x')""" @@ -397,7 +402,7 @@ class Routine: prefix = "const " if (name in self.inputs) else "" inout = "input" if (name in self.inputs) else "output" if (name in self.inputs) or (name in self.outputs): - math_name = name.upper() + " matrix" if (name in self.buffers_matrix()) else name + " vector" + math_name = name.upper() + " matrix" if (name in self.buffers_matrix()) else name + " tensor" if (name in self.buffers_tensor()) else name + " vector" inc_ld_description = "Leading dimension " if (name in self.buffers_matrix()) else "Stride/increment " a = ["`" + prefix + "cl_mem " + name + "_buffer`: OpenCL buffer to store the " + inout + " " + math_name + "."] b = ["`const size_t " + self.b_star() + name + "_offset" + self.b_s() + "`: The offset" + self.b_s() + " in elements from the start of the " + inout + " " + math_name + "."] diff --git a/src/clblast.cpp b/src/clblast.cpp index 10bb8cba..026285bb 100644 --- a/src/clblast.cpp +++ b/src/clblast.cpp @@ -2252,6 +2252,41 @@ template StatusCode PUBLIC_API Im2col(const size_t, const size_t, const si cl_mem, const size_t, cl_command_queue*, cl_event*); +// Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/CCONVGEMM/ZCONVGEMM/HCONVGEMM +template +StatusCode Convgemm(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, + const cl_mem, const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*) { + return StatusCode::kNotImplemented; +} +template StatusCode PUBLIC_API Convgemm(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, + const cl_mem, const size_t, + const cl_mem, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Convgemm(const size_t, const size_t, const 
+template StatusCode PUBLIC_API Convgemm<double>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+                                                const cl_mem, const size_t,
+                                                const cl_mem, const size_t,
+                                                cl_mem, const size_t,
+                                                cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Convgemm<float2>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+                                                const cl_mem, const size_t,
+                                                const cl_mem, const size_t,
+                                                cl_mem, const size_t,
+                                                cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Convgemm<double2>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+                                                 const cl_mem, const size_t,
+                                                 const cl_mem, const size_t,
+                                                 cl_mem, const size_t,
+                                                 cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Convgemm<half>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+                                              const cl_mem, const size_t,
+                                              const cl_mem, const size_t,
+                                              cl_mem, const size_t,
+                                              cl_command_queue*, cl_event*);
+
 // Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED
 template <typename T>
 StatusCode AxpyBatched(const size_t n,
diff --git a/src/clblast_c.cpp b/src/clblast_c.cpp
index 06a5fc67..27f0c936 100644
--- a/src/clblast_c.cpp
+++ b/src/clblast_c.cpp
@@ -3679,6 +3679,83 @@ CLBlastStatusCode CLBlastHim2col(const size_t channels, const size_t height, con
 } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
 }
 
+// CONVGEMM
+CLBlastStatusCode CLBlastSconvgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
+                                   const cl_mem im_buffer, const size_t im_offset,
+                                   const cl_mem kernel_buffer, const size_t kernel_offset,
+                                   cl_mem result_buffer, const size_t result_offset,
+                                   cl_command_queue* queue, cl_event* event) {
+  try {
+    return static_cast<CLBlastStatusCode>(
+      clblast::Convgemm<float>(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, num_kernels, batch_count,
+                               im_buffer, im_offset,
+                               kernel_buffer, kernel_offset,
+                               result_buffer, result_offset,
+                               queue, event)
+    );
+  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDconvgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
+                                   const cl_mem im_buffer, const size_t im_offset,
+                                   const cl_mem kernel_buffer, const size_t kernel_offset,
+                                   cl_mem result_buffer, const size_t result_offset,
+                                   cl_command_queue* queue, cl_event* event) {
+  try {
+    return static_cast<CLBlastStatusCode>(
+      clblast::Convgemm<double>(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, num_kernels, batch_count,
+                                im_buffer, im_offset,
+                                kernel_buffer, kernel_offset,
+                                result_buffer, result_offset,
+                                queue, event)
+    );
+  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastCconvgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
+                                   const cl_mem im_buffer, const size_t im_offset,
+                                   const cl_mem kernel_buffer, const size_t kernel_offset,
+                                   cl_mem result_buffer, const size_t result_offset,
+                                   cl_command_queue* queue, cl_event* event) {
+  try {
+    return static_cast<CLBlastStatusCode>(
+      clblast::Convgemm<float2>(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, num_kernels, batch_count,
+                                im_buffer, im_offset,
+                                kernel_buffer, kernel_offset,
+                                result_buffer, result_offset,
+                                queue, event)
+    );
+  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastZconvgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
+                                   const cl_mem im_buffer, const size_t im_offset,
+                                   const cl_mem kernel_buffer, const size_t kernel_offset,
+                                   cl_mem result_buffer, const size_t result_offset,
+                                   cl_command_queue* queue, cl_event* event) {
+  try {
+    return static_cast<CLBlastStatusCode>(
+      clblast::Convgemm<double2>(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, num_kernels, batch_count,
+                                 im_buffer, im_offset,
+                                 kernel_buffer, kernel_offset,
+                                 result_buffer, result_offset,
+                                 queue, event)
+    );
+  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastHconvgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
+                                   const cl_mem im_buffer, const size_t im_offset,
+                                   const cl_mem kernel_buffer, const size_t kernel_offset,
+                                   cl_mem result_buffer, const size_t result_offset,
+                                   cl_command_queue* queue, cl_event* event) {
+  try {
+    return static_cast<CLBlastStatusCode>(
+      clblast::Convgemm<half>(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, num_kernels, batch_count,
+                              im_buffer, im_offset,
+                              kernel_buffer, kernel_offset,
+                              result_buffer, result_offset,
+                              queue, event)
+    );
+  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+
 
 // AXPY
 CLBlastStatusCode CLBlastSaxpyBatched(const size_t n,
                                       const float *alphas,
diff --git a/src/clblast_cuda.cpp b/src/clblast_cuda.cpp
index 8927014b..f89fb77d 100644
--- a/src/clblast_cuda.cpp
+++ b/src/clblast_cuda.cpp
@@ -2350,6 +2350,41 @@ template StatusCode PUBLIC_API Im2col<half>(const size_t, const size_t, const si
                                             CUdeviceptr, const size_t,
                                             const CUcontext, const CUdevice);
 
+// Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/CCONVGEMM/ZCONVGEMM/HCONVGEMM
+template <typename T>
+StatusCode Convgemm(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+                    const CUdeviceptr, const size_t,
+                    const CUdeviceptr, const size_t,
+                    CUdeviceptr, const size_t,
+                    const CUcontext, const CUdevice) {
+  return StatusCode::kNotImplemented;
+}
+template StatusCode PUBLIC_API Convgemm<float>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+                                               const CUdeviceptr, const size_t,
+                                               const CUdeviceptr, const size_t,
+                                               CUdeviceptr, const size_t,
+                                               const CUcontext, const CUdevice);
+template StatusCode PUBLIC_API Convgemm<double>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+                                                const CUdeviceptr, const size_t,
+                                                const CUdeviceptr, const size_t,
+                                                CUdeviceptr, const size_t,
+                                                const CUcontext, const CUdevice);
+template StatusCode PUBLIC_API Convgemm<float2>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+                                                const CUdeviceptr, const size_t,
+                                                const CUdeviceptr, const size_t,
+                                                CUdeviceptr, const size_t,
+                                                const CUcontext, const CUdevice);
+template StatusCode PUBLIC_API Convgemm<double2>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+                                                 const CUdeviceptr, const size_t,
+                                                 const CUdeviceptr, const size_t,
+                                                 CUdeviceptr, const size_t,
+                                                 const CUcontext, const CUdevice);
+template StatusCode PUBLIC_API Convgemm<half>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+                                              const CUdeviceptr, const size_t,
+                                              const CUdeviceptr, const size_t,
+                                              CUdeviceptr, const size_t,
+                                              const CUcontext, const CUdevice);
+
 // Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED
 template <typename T>
 StatusCode AxpyBatched(const size_t n,
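For completeness, a sketch of the corresponding C API from include/clblast_c.h. The function name and sizes are illustrative assumptions; every variant added by this patch is wired through to the C++ routine, which currently returns not-implemented.

// Hypothetical C-style call site (buffer creation and error handling omitted).
#include <clblast_c.h>

CLBlastStatusCode run_sconvgemm(cl_mem im, cl_mem kernel, cl_mem result,
                                cl_command_queue queue) {
  cl_event event = NULL;
  return CLBlastSconvgemm(3, 32, 32,   // channels, height, width
                          3, 3,        // kernel_h, kernel_w
                          1, 1,        // pad_h, pad_w
                          1, 1,        // stride_h, stride_w
                          1, 1,        // dilation_h, dilation_w
                          8, 4,        // num_kernels, batch_count
                          im, 0, kernel, 0, result, 0,
                          &queue, &event);
}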