Added groundwork for col2im algorithm plus first non-working version of kernel and test

2018-10-23 20:52:25 +02:00 · 2018-10-23 20:52:25 +02:00 · d45911b61d
parent 44b630fc22
commit d45911b61d
19 changed files with 814 additions and 4 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -221,7 +221,7 @@ set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
 set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xtrsv
                    xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
 set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm xtrsm)
-set(LEVELX_ROUTINES xhad xomatcopy xim2col xconvgemm xaxpybatched xgemmbatched xgemmstridedbatched)
+set(LEVELX_ROUTINES xhad xomatcopy xim2col xcol2im xconvgemm xaxpybatched xgemmbatched xgemmstridedbatched)
 set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES} ${LEVELX_ROUTINES})
 set(PRECISIONS 32 64 3232 6464 16)

--- a/doc/api.md
+++ b/doc/api.md
@ -3072,6 +3072,66 @@ Arguments to IM2COL:



+xCOL2IM: Col2im function (non-BLAS function)
+-------------
+
+Performs the col2im algorithm, in which _col_ is the input matrix and _im_ is the output matrix.
+
+C++ API:
+```
+template <typename T>
+StatusCode Col2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
+                  const cl_mem col_buffer, const size_t col_offset,
+                  cl_mem im_buffer, const size_t im_offset,
+                  cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastScol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
+                                 const cl_mem col_buffer, const size_t col_offset,
+                                 cl_mem im_buffer, const size_t im_offset,
+                                 cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDcol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
+                                 const cl_mem col_buffer, const size_t col_offset,
+                                 cl_mem im_buffer, const size_t im_offset,
+                                 cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCcol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
+                                 const cl_mem col_buffer, const size_t col_offset,
+                                 cl_mem im_buffer, const size_t im_offset,
+                                 cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZcol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
+                                 const cl_mem col_buffer, const size_t col_offset,
+                                 cl_mem im_buffer, const size_t im_offset,
+                                 cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHcol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
+                                 const cl_mem col_buffer, const size_t col_offset,
+                                 cl_mem im_buffer, const size_t im_offset,
+                                 cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to COL2IM:
+
+* `const size_t channels`: Integer size argument. This value must be positive.
+* `const size_t height`: Integer size argument. This value must be positive.
+* `const size_t width`: Integer size argument. This value must be positive.
+* `const size_t kernel_h`: Integer size argument. This value must be positive.
+* `const size_t kernel_w`: Integer size argument. This value must be positive.
+* `const size_t pad_h`: Integer size argument. This value must be positive.
+* `const size_t pad_w`: Integer size argument. This value must be positive.
+* `const size_t stride_h`: Integer size argument. This value must be positive.
+* `const size_t stride_w`: Integer size argument. This value must be positive.
+* `const size_t dilation_h`: Integer size argument. This value must be positive.
+* `const size_t dilation_w`: Integer size argument. This value must be positive.
+* `const cl_mem col_buffer`: OpenCL buffer to store the input col tensor.
+* `const size_t col_offset`: The offset in elements from the start of the input col tensor.
+* `cl_mem im_buffer`: OpenCL buffer to store the output im tensor.
+* `const size_t im_offset`: The offset in elements from the start of the output im tensor.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+
+
 xCONVGEMM: Batched convolution as GEMM (non-BLAS function)
 -------------

--- a/include/clblast.h
+++ b/include/clblast.h
@ -636,6 +636,13 @@ StatusCode Im2col(const size_t channels, const size_t height, const size_t width
                  cl_mem col_buffer, const size_t col_offset,
                  cl_command_queue* queue, cl_event* event = nullptr);

+// Col2im function (non-BLAS function): SCOL2IM/DCOL2IM/CCOL2IM/ZCOL2IM/HCOL2IM
+template <typename T>
+StatusCode Col2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
+                  const cl_mem col_buffer, const size_t col_offset,
+                  cl_mem im_buffer, const size_t im_offset,
+                  cl_command_queue* queue, cl_event* event = nullptr);
+
 // Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/HCONVGEMM
 template <typename T>
 StatusCode Convgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
--- a/include/clblast_c.h
+++ b/include/clblast_c.h
@ -1410,6 +1410,28 @@ CLBlastStatusCode PUBLIC_API CLBlastHim2col(const size_t channels, const size_t
                                            cl_mem col_buffer, const size_t col_offset,
                                            cl_command_queue* queue, cl_event* event);

+// Col2im function (non-BLAS function): SCOL2IM/DCOL2IM/CCOL2IM/ZCOL2IM/HCOL2IM
+CLBlastStatusCode PUBLIC_API CLBlastScol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
+                                            const cl_mem col_buffer, const size_t col_offset,
+                                            cl_mem im_buffer, const size_t im_offset,
+                                            cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastDcol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
+                                            const cl_mem col_buffer, const size_t col_offset,
+                                            cl_mem im_buffer, const size_t im_offset,
+                                            cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastCcol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
+                                            const cl_mem col_buffer, const size_t col_offset,
+                                            cl_mem im_buffer, const size_t im_offset,
+                                            cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastZcol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
+                                            const cl_mem col_buffer, const size_t col_offset,
+                                            cl_mem im_buffer, const size_t im_offset,
+                                            cl_command_queue* queue, cl_event* event);
+CLBlastStatusCode PUBLIC_API CLBlastHcol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
+                                            const cl_mem col_buffer, const size_t col_offset,
+                                            cl_mem im_buffer, const size_t im_offset,
+                                            cl_command_queue* queue, cl_event* event);
+
 // Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/HCONVGEMM
 CLBlastStatusCode PUBLIC_API CLBlastSconvgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
                                              const cl_mem im_buffer, const size_t im_offset,
--- a/include/clblast_cuda.h
+++ b/include/clblast_cuda.h
@ -608,6 +608,13 @@ StatusCode Im2col(const size_t channels, const size_t height, const size_t width
                  CUdeviceptr col_buffer, const size_t col_offset,
                  const CUcontext context, const CUdevice device);

+// Col2im function (non-BLAS function): SCOL2IM/DCOL2IM/CCOL2IM/ZCOL2IM/HCOL2IM
+template <typename T>
+StatusCode Col2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
+                  const CUdeviceptr col_buffer, const size_t col_offset,
+                  CUdeviceptr im_buffer, const size_t im_offset,
+                  const CUcontext context, const CUdevice device);
+
 // Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/HCONVGEMM
 template <typename T>
 StatusCode Convgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
--- a/include/clblast_netlib_c.h
+++ b/include/clblast_netlib_c.h
@ -960,6 +960,20 @@ void PUBLIC_API cblas_zim2col(const int channels, const int height, const int wi
                              const void* im,
                              void* col);

+// Col2im function (non-BLAS function): SCOL2IM/DCOL2IM/CCOL2IM/ZCOL2IM/HCOL2IM
+void PUBLIC_API cblas_scol2im(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w,
+                              const float* col,
+                              float* im);
+void PUBLIC_API cblas_dcol2im(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w,
+                              const double* col,
+                              double* im);
+void PUBLIC_API cblas_ccol2im(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w,
+                              const void* col,
+                              void* im);
+void PUBLIC_API cblas_zcol2im(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w,
+                              const void* col,
+                              void* im);
+
 // =================================================================================================

 #ifdef __cplusplus
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@ -181,6 +181,7 @@ ROUTINES = [
  Routine(True,  True,  0, False, "x", "had",      T, [S,D,C,Z,H],   ["n"],                [],                                                    ["x","y"],  ["z"],                        [xn,yn,zn],      ["alpha","beta"], "",    "Element-wise vector product (Hadamard)", "Performs the Hadamard element-wise product _z = alpha * x * y + beta * z_, in which _x_, _y_, and _z_ are vectors and _alpha_ and _beta_ are scalar constants.", []),
  Routine(True,  True,  0, False, "x", "omatcopy", T, [S,D,C,Z,H],   ["m","n"],            ["layout","a_transpose"],                              ["a"],      ["b"],                        [amn,bnma],      ["alpha"],        "",    "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]),
  Routine(True,  True,  0, False, "x", "im2col",   T, [S,D,C,Z,H],   im2col_constants,     [],                                                    ["im"],     ["col"],                      [im,col],        [""],             "",    "Im2col function (non-BLAS function)", "Performs the im2col algorithm, in which _im_ is the input matrix and _col_ is the output matrix.", []),
+  Routine(True,  True,  0, False, "x", "col2im",   T, [S,D,C,Z,H],   im2col_constants,     [],                                                    ["col"],    ["im"],                       [col,im],        [""],             "",    "Col2im function (non-BLAS function)", "Performs the col2im algorithm, in which _col_ is the input matrix and _im_ is the output matrix.", []),
  Routine(True,  True,  0, False, "x", "convgemm", T, [S,D,H],       convgemm_constants,   [],                                                    ["im","kernel"], ["result"],              [imb,kernel,result],[""],          "",    "Batched convolution as GEMM (non-BLAS function)", "Integrates im2col and GEMM for batched 3D convolution, in which _im_ is the 4D input tensor (NCHW - batch-channelin-height-width), _kernel_ the 4D kernel weights tensor (KCHW - channelout-channelin-height-width), and _result_ the 4D output tensor (NCHW - batch-channelout-height-width).", []),
  # Batched routines:
  Routine(True,  True,  1, False, "x", "axpy",     T, [S,D,C,Z,H],   ["n"],                [],                                                    ["x"],      ["y"],                        [xn,yn],         ["alpha"],        "",    "Batched version of AXPY", "As AXPY, but multiple operations are batched together for better performance.", []),
--- a/scripts/generator/generator/routine.py
+++ b/scripts/generator/generator/routine.py
@ -205,7 +205,7 @@ class Routine:

    def no_scalars(self):
        """Determines whether or not this routine has scalar arguments (alpha/beta)"""
-        return self.scalars == [] or self.name in ["im2col", "convgemm"]
+        return self.scalars == [] or self.name in ["im2col", "col2im", "convgemm"]

    def has_layout(self):
        """Determines whether the layout is an argument"""
@ -226,12 +226,14 @@ class Routine:
        """Determines which buffers go first (between alpha and beta) and which ones go after"""
        if self.level == "2b" or self.name == "had":
            return ["x", "y"]
-        return ["ap", "a", "b", "x", "im", "kernel"]
+        extra_buffer = "col" if self.name == "col2im" else "im"
+        return ["ap", "a", "b", "x", extra_buffer, "kernel"]

    def buffers_second(self):
        if self.level == "2b" or self.name == "had":
            return ["z", "ap", "a", "b", "c"]
-        return ["y", "c", "col", "result"]
+        extra_buffer = "im" if self.name == "col2im" else "col"
+        return ["y", "c", extra_buffer, "result"]

    def buffer(self, name):
        """Retrieves a variable name for a specific input/output vector/matrix (e.g. 'x')"""
--- a/src/clblast.cpp
+++ b/src/clblast.cpp
@ -2252,6 +2252,42 @@ template StatusCode PUBLIC_API Im2col<half>(const size_t, const size_t, const si
                                            cl_mem, const size_t,
                                            cl_command_queue*, cl_event*);

+// Col2im function (non-BLAS function): SCOL2IM/DCOL2IM/CCOL2IM/ZCOL2IM/HCOL2IM
+template <typename T>
+StatusCode Col2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
+                  const cl_mem col_buffer, const size_t col_offset,
+                  cl_mem im_buffer, const size_t im_offset,
+                  cl_command_queue* queue, cl_event* event) {
+  try {
+    auto queue_cpp = Queue(*queue);
+    auto routine = Xcol2im<T>(queue_cpp, event);
+    routine.DoCol2im(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
+                     Buffer<T>(col_buffer), col_offset,
+                     Buffer<T>(im_buffer), im_offset);
+    return StatusCode::kSuccess;
+  } catch (...) { return DispatchException(); }
+}
+template StatusCode PUBLIC_API Col2im<float>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+                                             const cl_mem, const size_t,
+                                             cl_mem, const size_t,
+                                             cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Col2im<double>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+                                              const cl_mem, const size_t,
+                                              cl_mem, const size_t,
+                                              cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Col2im<float2>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+                                              const cl_mem, const size_t,
+                                              cl_mem, const size_t,
+                                              cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Col2im<double2>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+                                               const cl_mem, const size_t,
+                                               cl_mem, const size_t,
+                                               cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Col2im<half>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+                                            const cl_mem, const size_t,
+                                            cl_mem, const size_t,
+                                            cl_command_queue*, cl_event*);
+
 // Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/HCONVGEMM
 template <typename T>
 StatusCode Convgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
--- a/src/clblast_c.cpp
+++ b/src/clblast_c.cpp
@ -3679,6 +3679,73 @@ CLBlastStatusCode CLBlastHim2col(const size_t channels, const size_t height, con
  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
 }

+// COL2IM
+CLBlastStatusCode CLBlastScol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
+                                 const cl_mem col_buffer, const size_t col_offset,
+                                 cl_mem im_buffer, const size_t im_offset,
+                                 cl_command_queue* queue, cl_event* event) {
+  try {
+    return static_cast<CLBlastStatusCode>(
+      clblast::Col2im<float>(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
+                             col_buffer, col_offset,
+                             im_buffer, im_offset,
+                             queue, event)
+    );
+  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastDcol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
+                                 const cl_mem col_buffer, const size_t col_offset,
+                                 cl_mem im_buffer, const size_t im_offset,
+                                 cl_command_queue* queue, cl_event* event) {
+  try {
+    return static_cast<CLBlastStatusCode>(
+      clblast::Col2im<double>(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
+                              col_buffer, col_offset,
+                              im_buffer, im_offset,
+                              queue, event)
+    );
+  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastCcol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
+                                 const cl_mem col_buffer, const size_t col_offset,
+                                 cl_mem im_buffer, const size_t im_offset,
+                                 cl_command_queue* queue, cl_event* event) {
+  try {
+    return static_cast<CLBlastStatusCode>(
+      clblast::Col2im<float2>(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
+                              col_buffer, col_offset,
+                              im_buffer, im_offset,
+                              queue, event)
+    );
+  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastZcol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
+                                 const cl_mem col_buffer, const size_t col_offset,
+                                 cl_mem im_buffer, const size_t im_offset,
+                                 cl_command_queue* queue, cl_event* event) {
+  try {
+    return static_cast<CLBlastStatusCode>(
+      clblast::Col2im<double2>(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
+                               col_buffer, col_offset,
+                               im_buffer, im_offset,
+                               queue, event)
+    );
+  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+CLBlastStatusCode CLBlastHcol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
+                                 const cl_mem col_buffer, const size_t col_offset,
+                                 cl_mem im_buffer, const size_t im_offset,
+                                 cl_command_queue* queue, cl_event* event) {
+  try {
+    return static_cast<CLBlastStatusCode>(
+      clblast::Col2im<half>(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
+                            col_buffer, col_offset,
+                            im_buffer, im_offset,
+                            queue, event)
+    );
+  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
+}
+
 // CONVGEMM
 CLBlastStatusCode CLBlastSconvgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
                                   const cl_mem im_buffer, const size_t im_offset,
--- a/src/clblast_cuda.cpp
+++ b/src/clblast_cuda.cpp
@ -2350,6 +2350,44 @@ template StatusCode PUBLIC_API Im2col<half>(const size_t, const size_t, const si
                                            CUdeviceptr, const size_t,
                                            const CUcontext, const CUdevice);

+// Col2im function (non-BLAS function): SCOL2IM/DCOL2IM/CCOL2IM/ZCOL2IM/HCOL2IM
+template <typename T>
+StatusCode Col2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
+                  const CUdeviceptr col_buffer, const size_t col_offset,
+                  CUdeviceptr im_buffer, const size_t im_offset,
+                  const CUcontext context, const CUdevice device) {
+  try {
+    const auto context_cpp = Context(context);
+    const auto device_cpp = Device(device);
+    auto queue_cpp = Queue(context_cpp, device_cpp);
+    auto routine = Xcol2im<T>(queue_cpp, nullptr);
+    routine.DoCol2im(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
+                     Buffer<T>(col_buffer), col_offset,
+                     Buffer<T>(im_buffer), im_offset);
+    return StatusCode::kSuccess;
+  } catch (...) { return DispatchException(); }
+}
+template StatusCode PUBLIC_API Col2im<float>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+                                             const CUdeviceptr, const size_t,
+                                             CUdeviceptr, const size_t,
+                                             const CUcontext, const CUdevice);
+template StatusCode PUBLIC_API Col2im<double>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+                                              const CUdeviceptr, const size_t,
+                                              CUdeviceptr, const size_t,
+                                              const CUcontext, const CUdevice);
+template StatusCode PUBLIC_API Col2im<float2>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+                                              const CUdeviceptr, const size_t,
+                                              CUdeviceptr, const size_t,
+                                              const CUcontext, const CUdevice);
+template StatusCode PUBLIC_API Col2im<double2>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+                                               const CUdeviceptr, const size_t,
+                                               CUdeviceptr, const size_t,
+                                               const CUcontext, const CUdevice);
+template StatusCode PUBLIC_API Col2im<half>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+                                            const CUdeviceptr, const size_t,
+                                            CUdeviceptr, const size_t,
+                                            const CUcontext, const CUdevice);
+
 // Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/HCONVGEMM
 template <typename T>
 StatusCode Convgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
--- a/src/clblast_netlib_c.cpp
+++ b/src/clblast_netlib_c.cpp
@ -4967,4 +4967,94 @@ void cblas_zim2col(const int channels, const int height, const int width, const
  col_buffer.Read(queue, col_size, reinterpret_cast<double2*>(col));
 }

+// COL2IM
+void cblas_scol2im(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w,
+                   const float* col,
+                   float* im) {
+  OPTIONAL_STATIC auto device = get_device();
+  OPTIONAL_STATIC auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  const auto col_size = height * width * channels;
+  const auto im_size = height * width * channels;
+  auto col_buffer = clblast::Buffer<float>(context, col_size);
+  auto im_buffer = clblast::Buffer<float>(context, im_size);
+  col_buffer.Write(queue, col_size, reinterpret_cast<const float*>(col));
+  im_buffer.Write(queue, im_size, reinterpret_cast<float*>(im));
+  auto queue_cl = queue();
+  auto s = clblast::Col2im<float>(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
+                                  col_buffer(), 0,
+                                  im_buffer(), 0,
+                                  &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  im_buffer.Read(queue, im_size, reinterpret_cast<float*>(im));
+}
+void cblas_dcol2im(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w,
+                   const double* col,
+                   double* im) {
+  OPTIONAL_STATIC auto device = get_device();
+  OPTIONAL_STATIC auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  const auto col_size = height * width * channels;
+  const auto im_size = height * width * channels;
+  auto col_buffer = clblast::Buffer<double>(context, col_size);
+  auto im_buffer = clblast::Buffer<double>(context, im_size);
+  col_buffer.Write(queue, col_size, reinterpret_cast<const double*>(col));
+  im_buffer.Write(queue, im_size, reinterpret_cast<double*>(im));
+  auto queue_cl = queue();
+  auto s = clblast::Col2im<double>(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
+                                   col_buffer(), 0,
+                                   im_buffer(), 0,
+                                   &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  im_buffer.Read(queue, im_size, reinterpret_cast<double*>(im));
+}
+void cblas_ccol2im(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w,
+                   const void* col,
+                   void* im) {
+  OPTIONAL_STATIC auto device = get_device();
+  OPTIONAL_STATIC auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  const auto col_size = height * width * channels;
+  const auto im_size = height * width * channels;
+  auto col_buffer = clblast::Buffer<float2>(context, col_size);
+  auto im_buffer = clblast::Buffer<float2>(context, im_size);
+  col_buffer.Write(queue, col_size, reinterpret_cast<const float2*>(col));
+  im_buffer.Write(queue, im_size, reinterpret_cast<float2*>(im));
+  auto queue_cl = queue();
+  auto s = clblast::Col2im<float2>(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
+                                   col_buffer(), 0,
+                                   im_buffer(), 0,
+                                   &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  im_buffer.Read(queue, im_size, reinterpret_cast<float2*>(im));
+}
+void cblas_zcol2im(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w,
+                   const void* col,
+                   void* im) {
+  OPTIONAL_STATIC auto device = get_device();
+  OPTIONAL_STATIC auto context = clblast::Context(device);
+  auto queue = clblast::Queue(context, device);
+  const auto col_size = height * width * channels;
+  const auto im_size = height * width * channels;
+  auto col_buffer = clblast::Buffer<double2>(context, col_size);
+  auto im_buffer = clblast::Buffer<double2>(context, im_size);
+  col_buffer.Write(queue, col_size, reinterpret_cast<const double2*>(col));
+  im_buffer.Write(queue, im_size, reinterpret_cast<double2*>(im));
+  auto queue_cl = queue();
+  auto s = clblast::Col2im<double2>(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
+                                    col_buffer(), 0,
+                                    im_buffer(), 0,
+                                    &queue_cl);
+  if (s != clblast::StatusCode::kSuccess) {
+    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
+  }
+  im_buffer.Read(queue, im_size, reinterpret_cast<double2*>(im));
+}
+
 // =================================================================================================
--- a/src/kernels/levelx/col2im.opencl
+++ b/src/kernels/levelx/col2im.opencl
@ -0,0 +1,74 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// This file contains the col2im kernel, taken from:
+// https://gist.github.com/vbkaisetsu/a98299df827f9a5245635f646c1d94be
+// Credits go to https://github.com/vbkaisetsu
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// Work-group size parameters re-used from the 'copy' kernel
+#ifndef COPY_DIMX
+  #define COPY_DIMX 8      // Local workgroup size in the first dimension (w)
+#endif
+#ifndef COPY_DIMY
+  #define COPY_DIMY 8      // Local workgroup size in the second dimension (h)
+#endif
+
+// =================================================================================================
+
+__kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
+void col2im(const int input_h, const int input_w, const int channels,
+            const int output_h, const int output_w,
+            const int kernel_h, const int kernel_w,
+            const int pad_h, const int pad_w,
+            const int stride_h, const int stride_w,
+            const int dilation_h, const int dilation_w,
+            const __global real* restrict col_buffer, const int col_offset,
+            __global real *im_buffer, const int im_offset) {
+  const int x_x = get_global_id(0) + pad_w;
+  const int x_y = ((int) get_global_id(1)) % input_h + pad_h;
+  const int channel = ((int) get_global_id(1)) / input_h;
+  const int kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
+  const int kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
+  const int col_channel_shift = channel * kernel_w * kernel_h * output_h * output_w + col_offset;
+  const int x_channel_shift = channel * input_h * input_w + im_offset;
+  const int t_y_begin = (x_y < kernel_extent_h) ? 0 : (x_y - kernel_extent_h) / stride_h + 1;
+  const int t_y_end = min(x_y / stride_h + 1, output_h);
+  const int t_x_begin = (x_x < kernel_extent_w) ? 0 : (x_x - kernel_extent_w) / stride_w + 1;
+  const int t_x_end = min(x_x / stride_w + 1, output_w);
+
+  if (x_x < input_w + pad_w && channel < channels) {
+    real val;
+    SetToZero(val);
+    for (int t_y = t_y_begin; t_y < t_y_end; ++t_y) {
+      for (int t_x = t_x_begin; t_x < t_x_end; ++t_x) {
+        int w_y = x_y - t_y * stride_h;
+        int w_x = x_x - t_x * stride_w;
+        if (w_y % dilation_h == 0 && w_x % dilation_w == 0) {
+          w_y /= dilation_h;
+          w_x /= dilation_w;
+          val += col_buffer[col_channel_shift
+                            + (w_x + w_y * kernel_w) * output_h * output_w
+                            + t_y * output_w
+                            + t_x];
+        }
+      }
+    }
+    im_buffer[x_channel_shift + (x_y - pad_h) * input_w + x_x - pad_w] = val;
+  }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
--- a/src/routines/levelx/xcol2im.cpp
+++ b/src/routines/levelx/xcol2im.cpp
@ -0,0 +1,92 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xcol2im class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "routines/levelx/xcol2im.hpp"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xcol2im<T>::Xcol2im(Queue &queue, EventPointer event, const std::string &name):
+    Routine(queue, event, name, {"Copy"}, PrecisionValue<T>(), {}, {
+#include "../../kernels/levelx/col2im.opencl"
+    }) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+void Xcol2im<T>::DoCol2im(const size_t channels, const size_t height, const size_t width,
+                          const size_t kernel_h, const size_t kernel_w, const size_t pad_h,
+                          const size_t pad_w, const size_t stride_h, const size_t stride_w,
+                          const size_t dilation_h, const size_t dilation_w,
+                          const Buffer<T> &col_buffer, const size_t col_offset,
+                          const Buffer<T> &im_buffer, const size_t im_offset) {
+
+  // Makes sure all dimensions are larger than zero
+  if ((channels == 0) || (height == 0) || (width == 0)) { throw BLASError(StatusCode::kInvalidDimension); }
+
+  // Sets the output height and width
+  const auto size_h = height + 2 * pad_h;
+  const auto padding_h = dilation_h * (kernel_h - 1) + 1;
+  const auto col_h = (size_h >= padding_h) ? (size_h - padding_h) / stride_h + 1 : 1;
+  const auto size_w = width + 2 * pad_w;
+  const auto padding_w = dilation_w * (kernel_w - 1) + 1;
+  const auto col_w = (size_w >= padding_w) ? (size_w - padding_w) / stride_w + 1 : 1;
+
+  // Retrieves the kernel from the compiled binary
+  auto kernel = Kernel(program_, "col2im");
+
+  // Sets the kernel arguments
+  kernel.SetArgument(0, static_cast<int>(height));
+  kernel.SetArgument(1, static_cast<int>(width));
+  kernel.SetArgument(2, static_cast<int>(channels));
+  kernel.SetArgument(3, static_cast<int>(col_h));
+  kernel.SetArgument(4, static_cast<int>(col_w));
+  kernel.SetArgument(5, static_cast<int>(kernel_h));
+  kernel.SetArgument(6, static_cast<int>(kernel_w));
+  kernel.SetArgument(7, static_cast<int>(pad_h));
+  kernel.SetArgument(8, static_cast<int>(pad_w));
+  kernel.SetArgument(9, static_cast<int>(stride_h));
+  kernel.SetArgument(10, static_cast<int>(stride_w));
+  kernel.SetArgument(11, static_cast<int>(dilation_h));
+  kernel.SetArgument(12, static_cast<int>(dilation_w));
+  kernel.SetArgument(13, col_buffer());
+  kernel.SetArgument(14, static_cast<int>(col_offset));
+  kernel.SetArgument(15, im_buffer());
+  kernel.SetArgument(16, static_cast<int>(im_offset));
+
+  // Launches the kernel
+  const auto w_ceiled = Ceil(col_w, db_["COPY_DIMX"]);
+  const auto h_ceiled = Ceil(col_h, db_["COPY_DIMY"]);
+  const auto global = std::vector<size_t>{w_ceiled, h_ceiled * channels};
+  const auto local = std::vector<size_t>{db_["COPY_DIMX"], db_["COPY_DIMY"]};
+  RunKernel(kernel, queue_, device_, global, local, event_);
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xcol2im<half>;
+template class Xcol2im<float>;
+template class Xcol2im<double>;
+template class Xcol2im<float2>;
+template class Xcol2im<double2>;
+
+// =================================================================================================
+} // namespace clblast
--- a/src/routines/levelx/xcol2im.hpp
+++ b/src/routines/levelx/xcol2im.hpp
@ -0,0 +1,45 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xcol2im routine. The precision is implemented using a template argument.
+// Uses the tuning parameters from the regular copy kernel.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XCOL2IM_H_
+#define CLBLAST_ROUTINES_XCOL2IM_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xcol2im: public Routine {
+ public:
+
+  // Constructor
+  Xcol2im(Queue &queue, EventPointer event, const std::string &name = "COL2IM");
+
+  // Templated-precision implementation of the routine
+  void DoCol2im(const size_t channels, const size_t height, const size_t width,
+                const size_t kernel_h, const size_t kernel_w,
+                const size_t pad_h, const size_t pad_w,
+                const size_t stride_h, const size_t stride_w,
+                const size_t dilation_h, const size_t dilation_w,
+                const Buffer<T> &col_buffer, const size_t col_offset,
+                const Buffer<T> &im_buffer, const size_t im_offset);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XCOL2IM_H_
+#endif
--- a/src/routines/routines.hpp
+++ b/src/routines/routines.hpp
@ -70,6 +70,7 @@
 #include "routines/levelx/xhad.hpp"
 #include "routines/levelx/xomatcopy.hpp"
 #include "routines/levelx/xim2col.hpp"
+#include "routines/levelx/xcol2im.hpp"
 #include "routines/levelx/xconvgemm.hpp"
 #include "routines/levelx/xaxpybatched.hpp"
 #include "routines/levelx/xgemmbatched.hpp"
--- a/test/correctness/routines/levelx/xcol2im.cpp
+++ b/test/correctness/routines/levelx/xcol2im.cpp
@ -0,0 +1,26 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/levelx/xcol2im.hpp"
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+  auto errors = size_t{0};
+  errors += clblast::RunTests<clblast::TestXcol2im<float>, float, float>(argc, argv, false, "SCOL2IM");
+  errors += clblast::RunTests<clblast::TestXcol2im<double>, double, double>(argc, argv, true, "DCOL2IM");
+  errors += clblast::RunTests<clblast::TestXcol2im<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CCOL2IM");
+  errors += clblast::RunTests<clblast::TestXcol2im<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZCOL2IM");
+  errors += clblast::RunTests<clblast::TestXcol2im<clblast::half>, clblast::half, clblast::half>(argc, argv, true, "HCOL2IM");
+  if (errors > 0) { return 1; } else { return 0; }
+}
+
+// =================================================================================================
--- a/test/performance/routines/levelx/xcol2im.cpp
+++ b/test/performance/routines/levelx/xcol2im.cpp
@ -0,0 +1,33 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/performance/client.hpp"
+#include "test/routines/levelx/xcol2im.hpp"
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
+  switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
+    case clblast::Precision::kHalf:
+      clblast::RunClient<clblast::TestXcol2im<clblast::half>, clblast::half, clblast::half>(argc, argv); break;
+    case clblast::Precision::kSingle:
+      clblast::RunClient<clblast::TestXcol2im<float>, float, float>(argc, argv); break;
+    case clblast::Precision::kDouble:
+      clblast::RunClient<clblast::TestXcol2im<double>, double, double>(argc, argv); break;
+    case clblast::Precision::kComplexSingle:
+      clblast::RunClient<clblast::TestXcol2im<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
+    case clblast::Precision::kComplexDouble:
+      clblast::RunClient<clblast::TestXcol2im<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
+  }
+  return 0;
+}
+
+// =================================================================================================
--- a/test/routines/levelx/xcol2im.hpp
+++ b/test/routines/levelx/xcol2im.hpp
@ -0,0 +1,195 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xcol2im routine. Examples of
+// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XCOL2IM_H_
+#define CLBLAST_TEST_ROUTINES_XCOL2IM_H_
+
+#include "test/routines/common.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXcol2im {
+public:
+
+  // The BLAS level: 4 for the extra routines
+  static size_t BLASLevel() { return 4; }
+
+  // The list of arguments relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgChannels, kArgHeight, kArgWidth, kArgKernelH, kArgKernelW, kArgPadH, kArgPadW,
+            kArgStrideH, kArgStrideW, kArgDilationH, kArgDilationW,
+            kArgAOffset, kArgBOffset};
+  }
+  static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufMatB}; } // b = col
+  static std::vector<std::string> BuffersOut() { return {kBufMatA}; } // a = im
+
+  // Describes how to obtain the sizes of the buffers
+  static size_t ColHeight(const Arguments<T> &args) {
+    const auto size = args.height + 2 * args.pad_h;
+    const auto padding = args.dilation_h * (args.kernel_h - 1) + 1;
+    if (size >= padding) { return (size - padding) / args.stride_h + 1; }
+    return 1;
+  }
+  static size_t ColWidth(const Arguments<T> &args) {
+    const auto size = args.width + 2 * args.pad_w;
+    const auto padding = args.dilation_w * (args.kernel_w - 1) + 1;
+    if (size >= padding) { return (size - padding) / args.stride_w + 1; }
+    return 1;
+  }
+  static size_t NumPatches(const Arguments<T> &args) {
+    return ColHeight(args) * ColWidth(args) * args.channels;
+  }
+  static size_t GetSizeA(const Arguments<T> &args) {
+    return args.height * args.width * args.channels + args.a_offset;
+  }
+  static size_t GetSizeB(const Arguments<T> &args) {
+    return args.kernel_w * args.kernel_h * NumPatches(args) + args.b_offset;
+  }
+
+  // Describes how to set the sizes of all the buffers
+  static void SetSizes(Arguments<T> &args, Queue&) {
+    args.a_size = GetSizeA(args); // im
+    args.b_size = GetSizeB(args); // col
+  }
+
+  // Describes what the default values of the leading dimensions of the matrices are
+  static size_t DefaultLDA(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
+  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
+
+  // Describes which transpose options are relevant for this routine
+  using Transposes = std::vector<Transpose>;
+  static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine
+  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
+
+  // Describes how to prepare the input data
+  static void PrepareData(const Arguments<T>&, Queue&, const int, std::vector<T>&,
+                          std::vector<T>&, std::vector<T>&, std::vector<T>&, std::vector<T>&,
+                          std::vector<T>&, std::vector<T>&) {} // N/A for this routine
+
+  // Describes how to run the CLBlast routine
+  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    #ifdef OPENCL_API
+      auto queue_plain = queue();
+      auto event = cl_event{};
+      auto status = Col2im<T>(args.channels, args.height, args.width,
+                              args.kernel_h, args.kernel_w,
+                              args.pad_h, args.pad_w,
+                              args.stride_h, args.stride_w,
+                              args.dilation_h, args.dilation_w,
+                              buffers.b_mat(), args.b_offset, // col
+                              buffers.a_mat(), args.a_offset, // im
+                              &queue_plain, &event);
+      if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+    #elif CUDA_API
+      auto status = Col2im<T>(args.channels, args.height, args.width,
+                              args.kernel_h, args.kernel_w,
+                              args.pad_h, args.pad_w,
+                              args.stride_h, args.stride_w,
+                              args.dilation_h, args.dilation_w,
+                              buffers.b_mat(), args.b_offset, // col
+                              buffers.a_mat(), args.a_offset, // im
+                              queue.GetContext()(), queue.GetDevice()());
+      cuStreamSynchronize(queue());
+    #endif
+    return status;
+  }
+
+  // Describes how to run a naive version of the routine (for correctness/performance comparison).
+  // Note that a proper clBLAS or CPU BLAS comparison is not available for non-BLAS routines.
+  static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    auto buffers_host = BuffersHost<T>();
+    DeviceToHost(args, buffers, buffers_host, queue, BuffersIn());
+    const auto status = RunReference(args, buffers_host);
+    HostToDevice(args, buffers, buffers_host, queue, BuffersOut());
+    return status;
+  }
+
+  static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue&) {
+    return RunReference(args, buffers_host);
+  }
+  static StatusCode RunReference3(const Arguments<T> &, BuffersCUDA<T> &, Queue &) {
+    return StatusCode::kUnknownError;
+  }
+
+  // Describes how to download the results of the computation (more importantly: which buffer)
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    std::vector<T> result(args.a_size, static_cast<T>(0));
+    buffers.a_mat.Read(queue, args.a_size, result);
+    return result;
+  }
+
+  // Describes how to compute the indices of the result buffer
+  static size_t ResultID1(const Arguments<T> &args) { return args.height * args.width; }
+  static size_t ResultID2(const Arguments<T> &args) { return args.channels; }
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
+    return id1 + args.height * args.width * id2 + args.a_offset;
+  }
+
+  // Describes how to compute performance metrics
+  static size_t GetFlops(const Arguments<T> &) {
+    return 1;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    const auto im = args.channels * args.width * args.height; // possibly less with striding
+    const auto col = args.kernel_h * args.kernel_w * NumPatches(args);
+    return (im + col) * sizeof(T);
+  }
+};
+
+// =================================================================================================
+
+template <typename T>
+StatusCode RunReference(const Arguments<T> &args, BuffersHost<T> &buffers_host) {
+  // Reference taken from im2col but swapped the input/output
+  const auto col_h = TestXcol2im<T>::ColHeight(args);
+  const auto col_w = TestXcol2im<T>::ColWidth(args);
+  for (auto c_id = size_t{0}; c_id < args.channels; ++c_id) { // image channels
+    for (auto kh_id = size_t{0}; kh_id < args.kernel_h; ++kh_id) { // kernel height
+      for (auto kw_id = size_t{0}; kw_id < args.kernel_w; ++kw_id) { // kernel width
+        for (auto h_id = size_t{0}; h_id < col_h; ++h_id) { // image height
+          for (auto w_id = size_t{0}; w_id < col_w; ++w_id) { // image width
+
+            // Reads the input value
+            const auto kernel_index = kw_id + args.kernel_w * kh_id;
+            const auto patch_index = w_id + col_w * h_id;
+            const auto col_index = patch_index + kernel_index * col_w * col_h +
+                                   c_id * col_w * col_h * args.kernel_h * args.kernel_w;
+            const auto val = buffers_host.b_mat[col_index + args.b_offset];
+
+            // Sets the output value
+            const auto h_index = kh_id * args.dilation_h + args.stride_h * h_id - args.pad_h;
+            const auto w_index = kw_id * args.dilation_w + args.stride_w * w_id - args.pad_w;
+            if (h_index >= 0 && h_index < args.height &&
+                w_index >= 0 && w_index < args.width) {
+              const auto im_index = w_index + args.width * (h_index + args.height * c_id);
+              buffers_host.a_mat[im_index + args.a_offset] = val;
+            }
+          }
+        }
+      }
+    }
+  }
+  return StatusCode::kSuccess;
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XCOL2IM_H_
+#endif