Added groundwork for col2im algorithm plus first non-working version of kernel and test

2018-10-23 20:52:25 +02:00 · 2018-10-23 20:52:25 +02:00 · d45911b61d
parent 44b630fc22
commit d45911b61d
19 changed files with 814 additions and 4 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -221,7 +221,7 @@ set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
 set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xtrsv
                    xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
 set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm xtrsm)
-set(LEVELX_ROUTINES xhad xomatcopy xim2col xconvgemm xaxpybatched xgemmbatched xgemmstridedbatched)
+set(LEVELX_ROUTINES xhad xomatcopy xim2col xcol2im xconvgemm xaxpybatched xgemmbatched xgemmstridedbatched)
 set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES} ${LEVELX_ROUTINES})
 set(PRECISIONS 32 64 3232 6464 16)
--- a/doc/api.md
+++ b/doc/api.md
@ -3072,6 +3072,66 @@ Arguments to IM2COL:
 xCOL2IM: Col2im function (non-BLAS function)
 -------------
 Performs the col2im algorithm, in which _col_ is the input matrix and _im_ is the output matrix.
 C++ API:
 ```
 template <typename T>
 StatusCode Col2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
                  const cl_mem col_buffer, const size_t col_offset,
                  cl_mem im_buffer, const size_t im_offset,
                  cl_command_queue* queue, cl_event* event)
 ```
 C API:
 ```
 CLBlastStatusCode CLBlastScol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
                                 const cl_mem col_buffer, const size_t col_offset,
                                 cl_mem im_buffer, const size_t im_offset,
                                 cl_command_queue* queue, cl_event* event)
 CLBlastStatusCode CLBlastDcol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
                                 const cl_mem col_buffer, const size_t col_offset,
                                 cl_mem im_buffer, const size_t im_offset,
                                 cl_command_queue* queue, cl_event* event)
 CLBlastStatusCode CLBlastCcol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
                                 const cl_mem col_buffer, const size_t col_offset,
                                 cl_mem im_buffer, const size_t im_offset,
                                 cl_command_queue* queue, cl_event* event)
 CLBlastStatusCode CLBlastZcol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
                                 const cl_mem col_buffer, const size_t col_offset,
                                 cl_mem im_buffer, const size_t im_offset,
                                 cl_command_queue* queue, cl_event* event)
 CLBlastStatusCode CLBlastHcol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
                                 const cl_mem col_buffer, const size_t col_offset,
                                 cl_mem im_buffer, const size_t im_offset,
                                 cl_command_queue* queue, cl_event* event)
 ```
 Arguments to COL2IM:
 * `const size_t channels`: Integer size argument. This value must be positive.
 * `const size_t height`: Integer size argument. This value must be positive.
 * `const size_t width`: Integer size argument. This value must be positive.
 * `const size_t kernel_h`: Integer size argument. This value must be positive.
 * `const size_t kernel_w`: Integer size argument. This value must be positive.
 * `const size_t pad_h`: Integer size argument. This value may be zero (no padding).
 * `const size_t pad_w`: Integer size argument. This value may be zero (no padding).
 * `const size_t stride_h`: Integer size argument. This value must be positive.
 * `const size_t stride_w`: Integer size argument. This value must be positive.
 * `const size_t dilation_h`: Integer size argument. This value must be positive.
 * `const size_t dilation_w`: Integer size argument. This value must be positive.
 * `const cl_mem col_buffer`: OpenCL buffer to store the input col tensor.
 * `const size_t col_offset`: The offset in elements from the start of the input col tensor.
 * `cl_mem im_buffer`: OpenCL buffer to store the output im tensor.
 * `const size_t im_offset`: The offset in elements from the start of the output im tensor.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 xCONVGEMM: Batched convolution as GEMM (non-BLAS function)
 -------------
--- a/include/clblast.h
+++ b/include/clblast.h
@ -636,6 +636,13 @@ StatusCode Im2col(const size_t channels, const size_t height, const size_t width
                  cl_mem col_buffer, const size_t col_offset,
                  cl_command_queue* queue, cl_event* event = nullptr);
 // Col2im function (non-BLAS function): SCOL2IM/DCOL2IM/CCOL2IM/ZCOL2IM/HCOL2IM
 // Performs the col2im algorithm: 'col_buffer' is the input (read starting at 'col_offset', counted
 // in elements) and 'im_buffer' is the output (starting at 'im_offset', counted in elements).
 // The work is executed on '*queue'; 'event' is optional and defaults to nullptr.
 template <typename T>
 StatusCode Col2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
                   const cl_mem col_buffer, const size_t col_offset,
                   cl_mem im_buffer, const size_t im_offset,
                   cl_command_queue* queue, cl_event* event = nullptr);
 // Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/HCONVGEMM
 template <typename T>
 StatusCode Convgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
--- a/include/clblast_c.h
+++ b/include/clblast_c.h
@ -1410,6 +1410,28 @@ CLBlastStatusCode PUBLIC_API CLBlastHim2col(const size_t channels, const size_t
                                            cl_mem col_buffer, const size_t col_offset,
                                            cl_command_queue* queue, cl_event* event);
 // Col2im function (non-BLAS function): SCOL2IM/DCOL2IM/CCOL2IM/ZCOL2IM/HCOL2IM
 // C API, one entry point per precision prefix (S, D, C, Z, H). In every variant 'col_buffer' is
 // the input and 'im_buffer' is the output; both offsets are counted in elements.
 CLBlastStatusCode PUBLIC_API CLBlastScol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
                                             const cl_mem col_buffer, const size_t col_offset,
                                             cl_mem im_buffer, const size_t im_offset,
                                             cl_command_queue* queue, cl_event* event);
 CLBlastStatusCode PUBLIC_API CLBlastDcol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
                                             const cl_mem col_buffer, const size_t col_offset,
                                             cl_mem im_buffer, const size_t im_offset,
                                             cl_command_queue* queue, cl_event* event);
 CLBlastStatusCode PUBLIC_API CLBlastCcol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
                                             const cl_mem col_buffer, const size_t col_offset,
                                             cl_mem im_buffer, const size_t im_offset,
                                             cl_command_queue* queue, cl_event* event);
 CLBlastStatusCode PUBLIC_API CLBlastZcol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
                                             const cl_mem col_buffer, const size_t col_offset,
                                             cl_mem im_buffer, const size_t im_offset,
                                             cl_command_queue* queue, cl_event* event);
 CLBlastStatusCode PUBLIC_API CLBlastHcol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
                                             const cl_mem col_buffer, const size_t col_offset,
                                             cl_mem im_buffer, const size_t im_offset,
                                             cl_command_queue* queue, cl_event* event);
 // Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/HCONVGEMM
 CLBlastStatusCode PUBLIC_API CLBlastSconvgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
                                              const cl_mem im_buffer, const size_t im_offset,
--- a/include/clblast_cuda.h
+++ b/include/clblast_cuda.h
@ -608,6 +608,13 @@ StatusCode Im2col(const size_t channels, const size_t height, const size_t width
                  CUdeviceptr col_buffer, const size_t col_offset,
                  const CUcontext context, const CUdevice device);
 // Col2im function (non-BLAS function): SCOL2IM/DCOL2IM/CCOL2IM/ZCOL2IM/HCOL2IM
 // CUDA API: 'col_buffer' is the input and 'im_buffer' is the output, both given as CUdeviceptr
 // with offsets counted in elements. The routine is run for the given 'context' and 'device'.
 template <typename T>
 StatusCode Col2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
                   const CUdeviceptr col_buffer, const size_t col_offset,
                   CUdeviceptr im_buffer, const size_t im_offset,
                   const CUcontext context, const CUdevice device);
 // Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/HCONVGEMM
 template <typename T>
 StatusCode Convgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
--- a/include/clblast_netlib_c.h
+++ b/include/clblast_netlib_c.h
@ -960,6 +960,20 @@ void PUBLIC_API cblas_zim2col(const int channels, const int height, const int wi
                              const void* im,
                              void* col);
 // Col2im function (non-BLAS function): SCOL2IM/DCOL2IM/CCOL2IM/ZCOL2IM/HCOL2IM
 // Netlib-style API operating on host pointers: 'col' is the input and 'im' is the output.
 // The complex variants (c/z) take untyped 'void*' pointers.
 void PUBLIC_API cblas_scol2im(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w,
                               const float* col,
                               float* im);
 void PUBLIC_API cblas_dcol2im(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w,
                               const double* col,
                               double* im);
 void PUBLIC_API cblas_ccol2im(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w,
                               const void* col,
                               void* im);
 void PUBLIC_API cblas_zcol2im(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w,
                               const void* col,
                               void* im);
 // =================================================================================================
 #ifdef __cplusplus
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@ -181,6 +181,7 @@ ROUTINES = [
  Routine(True,  True,  0, False, "x", "had",      T, [S,D,C,Z,H],   ["n"],                [],                                                    ["x","y"],  ["z"],                        [xn,yn,zn],      ["alpha","beta"], "",    "Element-wise vector product (Hadamard)", "Performs the Hadamard element-wise product _z = alpha * x * y + beta * z_, in which _x_, _y_, and _z_ are vectors and _alpha_ and _beta_ are scalar constants.", []),
  Routine(True,  True,  0, False, "x", "omatcopy", T, [S,D,C,Z,H],   ["m","n"],            ["layout","a_transpose"],                              ["a"],      ["b"],                        [amn,bnma],      ["alpha"],        "",    "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]),
  Routine(True,  True,  0, False, "x", "im2col",   T, [S,D,C,Z,H],   im2col_constants,     [],                                                    ["im"],     ["col"],                      [im,col],        [""],             "",    "Im2col function (non-BLAS function)", "Performs the im2col algorithm, in which _im_ is the input matrix and _col_ is the output matrix.", []),
  Routine(True,  True,  0, False, "x", "col2im",   T, [S,D,C,Z,H],   im2col_constants,     [],                                                    ["col"],    ["im"],                       [col,im],        [""],             "",    "Col2im function (non-BLAS function)", "Performs the col2im algorithm, in which _col_ is the input matrix and _im_ is the output matrix.", []),
  Routine(True,  True,  0, False, "x", "convgemm", T, [S,D,H],       convgemm_constants,   [],                                                    ["im","kernel"], ["result"],              [imb,kernel,result],[""],          "",    "Batched convolution as GEMM (non-BLAS function)", "Integrates im2col and GEMM for batched 3D convolution, in which _im_ is the 4D input tensor (NCHW - batch-channelin-height-width), _kernel_ the 4D kernel weights tensor (KCHW - channelout-channelin-height-width), and _result_ the 4D output tensor (NCHW - batch-channelout-height-width).", []),
  # Batched routines:
  Routine(True,  True,  1, False, "x", "axpy",     T, [S,D,C,Z,H],   ["n"],                [],                                                    ["x"],      ["y"],                        [xn,yn],         ["alpha"],        "",    "Batched version of AXPY", "As AXPY, but multiple operations are batched together for better performance.", []),
--- a/scripts/generator/generator/routine.py
+++ b/scripts/generator/generator/routine.py
@ -205,7 +205,7 @@ class Routine:
    def no_scalars(self):
        """Determines whether or not this routine has scalar arguments (alpha/beta)"""
-        return self.scalars == [] or self.name in ["im2col", "convgemm"]
+        return self.scalars == [] or self.name in ["im2col", "col2im", "convgemm"]
    def has_layout(self):
        """Determines whether the layout is an argument"""
@ -226,12 +226,14 @@ class Routine:
        """Determines which buffers go first (between alpha and beta) and which ones go after"""
        if self.level == "2b" or self.name == "had":
            return ["x", "y"]
-        return ["ap", "a", "b", "x", "im", "kernel"]
+        extra_buffer = "col" if self.name == "col2im" else "im"
        return ["ap", "a", "b", "x", extra_buffer, "kernel"]
    def buffers_second(self):
        if self.level == "2b" or self.name == "had":
            return ["z", "ap", "a", "b", "c"]
-        return ["y", "c", "col", "result"]
+        extra_buffer = "im" if self.name == "col2im" else "col"
        return ["y", "c", extra_buffer, "result"]
    def buffer(self, name):
        """Retrieves a variable name for a specific input/output vector/matrix (e.g. 'x')"""
--- a/src/clblast.cpp
+++ b/src/clblast.cpp
@ -2252,6 +2252,42 @@ template StatusCode PUBLIC_API Im2col<half>(const size_t, const size_t, const si
                                            cl_mem, const size_t,
                                            cl_command_queue*, cl_event*);
 // Col2im function (non-BLAS function): SCOL2IM/DCOL2IM/CCOL2IM/ZCOL2IM/HCOL2IM
 // Wraps the raw OpenCL queue and buffers in their C++ helper types and runs the Xcol2im routine.
 // Any exception raised along the way is converted into a status code by DispatchException().
 template <typename T>
 StatusCode Col2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
                   const cl_mem col_buffer, const size_t col_offset,
                   cl_mem im_buffer, const size_t im_offset,
                   cl_command_queue* queue, cl_event* event) {
   try {
     auto command_queue = Queue(*queue);
     Xcol2im<T> col2im(command_queue, event);
     col2im.DoCol2im(channels, height, width,
                     kernel_h, kernel_w,
                     pad_h, pad_w,
                     stride_h, stride_w,
                     dilation_h, dilation_w,
                     Buffer<T>(col_buffer), col_offset,
                     Buffer<T>(im_buffer), im_offset);
     return StatusCode::kSuccess;
   }
   catch (...) {
     return DispatchException();
   }
 }
 // Explicit instantiations of Col2im for each element type: float, double, float2, double2, half
 template StatusCode PUBLIC_API Col2im<float>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
                                              const cl_mem, const size_t,
                                              cl_mem, const size_t,
                                              cl_command_queue*, cl_event*);
 template StatusCode PUBLIC_API Col2im<double>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
                                               const cl_mem, const size_t,
                                               cl_mem, const size_t,
                                               cl_command_queue*, cl_event*);
 template StatusCode PUBLIC_API Col2im<float2>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
                                               const cl_mem, const size_t,
                                               cl_mem, const size_t,
                                               cl_command_queue*, cl_event*);
 template StatusCode PUBLIC_API Col2im<double2>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
                                                const cl_mem, const size_t,
                                                cl_mem, const size_t,
                                                cl_command_queue*, cl_event*);
 template StatusCode PUBLIC_API Col2im<half>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
                                             const cl_mem, const size_t,
                                             cl_mem, const size_t,
                                             cl_command_queue*, cl_event*);
 // Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/HCONVGEMM
 template <typename T>
 StatusCode Convgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
--- a/src/clblast_c.cpp
+++ b/src/clblast_c.cpp
@ -3679,6 +3679,73 @@ CLBlastStatusCode CLBlastHim2col(const size_t channels, const size_t height, con
  } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
 }
 // COL2IM
 // Single-precision C entry point: forwards every argument to clblast::Col2im<float> and converts
 // the resulting status (or any caught exception) into a CLBlastStatusCode.
 CLBlastStatusCode CLBlastScol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
                                  const cl_mem col_buffer, const size_t col_offset,
                                  cl_mem im_buffer, const size_t im_offset,
                                  cl_command_queue* queue, cl_event* event) {
   try {
     const auto status = clblast::Col2im<float>(
         channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
         col_buffer, col_offset,
         im_buffer, im_offset,
         queue, event);
     return static_cast<CLBlastStatusCode>(status);
   }
   catch (...) {
     return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC());
   }
 }
 // Double-precision C entry point: forwards every argument to clblast::Col2im<double> and converts
 // the resulting status (or any caught exception) into a CLBlastStatusCode.
 CLBlastStatusCode CLBlastDcol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
                                  const cl_mem col_buffer, const size_t col_offset,
                                  cl_mem im_buffer, const size_t im_offset,
                                  cl_command_queue* queue, cl_event* event) {
   try {
     const auto status = clblast::Col2im<double>(
         channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
         col_buffer, col_offset,
         im_buffer, im_offset,
         queue, event);
     return static_cast<CLBlastStatusCode>(status);
   }
   catch (...) {
     return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC());
   }
 }
 // Single-precision complex C entry point: forwards every argument to clblast::Col2im<float2> and
 // converts the resulting status (or any caught exception) into a CLBlastStatusCode.
 CLBlastStatusCode CLBlastCcol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
                                  const cl_mem col_buffer, const size_t col_offset,
                                  cl_mem im_buffer, const size_t im_offset,
                                  cl_command_queue* queue, cl_event* event) {
   try {
     const auto status = clblast::Col2im<float2>(
         channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
         col_buffer, col_offset,
         im_buffer, im_offset,
         queue, event);
     return static_cast<CLBlastStatusCode>(status);
   }
   catch (...) {
     return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC());
   }
 }
 // Double-precision complex C entry point: forwards every argument to clblast::Col2im<double2> and
 // converts the resulting status (or any caught exception) into a CLBlastStatusCode.
 CLBlastStatusCode CLBlastZcol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
                                  const cl_mem col_buffer, const size_t col_offset,
                                  cl_mem im_buffer, const size_t im_offset,
                                  cl_command_queue* queue, cl_event* event) {
   try {
     const auto status = clblast::Col2im<double2>(
         channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
         col_buffer, col_offset,
         im_buffer, im_offset,
         queue, event);
     return static_cast<CLBlastStatusCode>(status);
   }
   catch (...) {
     return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC());
   }
 }
 // Half-precision C entry point: forwards every argument to clblast::Col2im<half> and converts
 // the resulting status (or any caught exception) into a CLBlastStatusCode.
 CLBlastStatusCode CLBlastHcol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
                                  const cl_mem col_buffer, const size_t col_offset,
                                  cl_mem im_buffer, const size_t im_offset,
                                  cl_command_queue* queue, cl_event* event) {
   try {
     const auto status = clblast::Col2im<half>(
         channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
         col_buffer, col_offset,
         im_buffer, im_offset,
         queue, event);
     return static_cast<CLBlastStatusCode>(status);
   }
   catch (...) {
     return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC());
   }
 }
 // CONVGEMM
 CLBlastStatusCode CLBlastSconvgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
                                   const cl_mem im_buffer, const size_t im_offset,
--- a/src/clblast_cuda.cpp
+++ b/src/clblast_cuda.cpp
@ -2350,6 +2350,44 @@ template StatusCode PUBLIC_API Im2col<half>(const size_t, const size_t, const si
                                            CUdeviceptr, const size_t,
                                            const CUcontext, const CUdevice);
 // Col2im function (non-BLAS function): SCOL2IM/DCOL2IM/CCOL2IM/ZCOL2IM/HCOL2IM
 // CUDA flavour: builds a queue from the given context and device, then runs the Xcol2im routine
 // without an event (nullptr). Exceptions are converted into a status code by DispatchException().
 template <typename T>
 StatusCode Col2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
                   const CUdeviceptr col_buffer, const size_t col_offset,
                   CUdeviceptr im_buffer, const size_t im_offset,
                   const CUcontext context, const CUdevice device) {
   try {
     const auto wrapped_context = Context(context);
     const auto wrapped_device = Device(device);
     auto command_queue = Queue(wrapped_context, wrapped_device);
     Xcol2im<T> col2im(command_queue, nullptr);
     col2im.DoCol2im(channels, height, width,
                     kernel_h, kernel_w,
                     pad_h, pad_w,
                     stride_h, stride_w,
                     dilation_h, dilation_w,
                     Buffer<T>(col_buffer), col_offset,
                     Buffer<T>(im_buffer), im_offset);
     return StatusCode::kSuccess;
   }
   catch (...) {
     return DispatchException();
   }
 }
 // Explicit instantiations of Col2im for each element type: float, double, float2, double2, half
 template StatusCode PUBLIC_API Col2im<float>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
                                              const CUdeviceptr, const size_t,
                                              CUdeviceptr, const size_t,
                                              const CUcontext, const CUdevice);
 template StatusCode PUBLIC_API Col2im<double>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
                                               const CUdeviceptr, const size_t,
                                               CUdeviceptr, const size_t,
                                               const CUcontext, const CUdevice);
 template StatusCode PUBLIC_API Col2im<float2>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
                                               const CUdeviceptr, const size_t,
                                               CUdeviceptr, const size_t,
                                               const CUcontext, const CUdevice);
 template StatusCode PUBLIC_API Col2im<double2>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
                                                const CUdeviceptr, const size_t,
                                                CUdeviceptr, const size_t,
                                                const CUcontext, const CUdevice);
 template StatusCode PUBLIC_API Col2im<half>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
                                             const CUdeviceptr, const size_t,
                                             CUdeviceptr, const size_t,
                                             const CUcontext, const CUdevice);
 // Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/HCONVGEMM
 template <typename T>
 StatusCode Convgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
--- a/src/clblast_netlib_c.cpp
+++ b/src/clblast_netlib_c.cpp
@ -4967,4 +4967,94 @@ void cblas_zim2col(const int channels, const int height, const int width, const
  col_buffer.Read(queue, col_size, reinterpret_cast<double2*>(col));
 }
 // COL2IM
 // Host-pointer wrapper for single-precision COL2IM: uploads 'col' and 'im' to temporary device
 // buffers, runs clblast::Col2im<float> and reads the resulting image back into 'im'.
 //   col: input, holds kernel_h * kernel_w * col_h * col_w * channels elements
 //   im:  input/output, holds height * width * channels elements
 // Throws std::runtime_error when CLBlast does not return kSuccess.
 void cblas_scol2im(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w,
                    const float* col,
                    float* im) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  // Number of patch positions along one dimension (same formula as used by Xcol2im::DoCol2im).
  // Computed in size_t to avoid overflowing 'int' for large problems.
  const auto num_patches = [](const int size, const int pad, const int kernel,
                              const int stride, const int dilation) -> size_t {
    const auto padded = static_cast<size_t>(size) + 2 * static_cast<size_t>(pad);
    const auto extent = static_cast<size_t>(dilation) * (static_cast<size_t>(kernel) - 1) + 1;
    return (padded >= extent) ? (padded - extent) / static_cast<size_t>(stride) + 1 : size_t{1};
  };
  const auto col_h = num_patches(height, pad_h, kernel_h, stride_h, dilation_h);
  const auto col_w = num_patches(width, pad_w, kernel_w, stride_w, dilation_w);
  // The col buffer stores one value per (channel, kernel element, patch position); it generally
  // differs in size from the image
  const auto col_size = static_cast<size_t>(kernel_h) * static_cast<size_t>(kernel_w) *
                        col_h * col_w * static_cast<size_t>(channels);
  const auto im_size = static_cast<size_t>(height) * static_cast<size_t>(width) *
                       static_cast<size_t>(channels);
  auto col_buffer = clblast::Buffer<float>(context, col_size);
  auto im_buffer = clblast::Buffer<float>(context, im_size);
  col_buffer.Write(queue, col_size, col);
  im_buffer.Write(queue, im_size, im);
  auto queue_cl = queue();
  auto s = clblast::Col2im<float>(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
                                  col_buffer(), 0,
                                  im_buffer(), 0,
                                  &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  im_buffer.Read(queue, im_size, im);
 }
 // Host-pointer wrapper for double-precision COL2IM: uploads 'col' and 'im' to temporary device
 // buffers, runs clblast::Col2im<double> and reads the resulting image back into 'im'.
 //   col: input, holds kernel_h * kernel_w * col_h * col_w * channels elements
 //   im:  input/output, holds height * width * channels elements
 // Throws std::runtime_error when CLBlast does not return kSuccess.
 void cblas_dcol2im(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w,
                    const double* col,
                    double* im) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  // Number of patch positions along one dimension (same formula as used by Xcol2im::DoCol2im).
  // Computed in size_t to avoid overflowing 'int' for large problems.
  const auto num_patches = [](const int size, const int pad, const int kernel,
                              const int stride, const int dilation) -> size_t {
    const auto padded = static_cast<size_t>(size) + 2 * static_cast<size_t>(pad);
    const auto extent = static_cast<size_t>(dilation) * (static_cast<size_t>(kernel) - 1) + 1;
    return (padded >= extent) ? (padded - extent) / static_cast<size_t>(stride) + 1 : size_t{1};
  };
  const auto col_h = num_patches(height, pad_h, kernel_h, stride_h, dilation_h);
  const auto col_w = num_patches(width, pad_w, kernel_w, stride_w, dilation_w);
  // The col buffer stores one value per (channel, kernel element, patch position); it generally
  // differs in size from the image
  const auto col_size = static_cast<size_t>(kernel_h) * static_cast<size_t>(kernel_w) *
                        col_h * col_w * static_cast<size_t>(channels);
  const auto im_size = static_cast<size_t>(height) * static_cast<size_t>(width) *
                       static_cast<size_t>(channels);
  auto col_buffer = clblast::Buffer<double>(context, col_size);
  auto im_buffer = clblast::Buffer<double>(context, im_size);
  col_buffer.Write(queue, col_size, col);
  im_buffer.Write(queue, im_size, im);
  auto queue_cl = queue();
  auto s = clblast::Col2im<double>(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
                                   col_buffer(), 0,
                                   im_buffer(), 0,
                                   &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  im_buffer.Read(queue, im_size, im);
 }
 // Host-pointer wrapper for single-precision complex COL2IM: uploads 'col' and 'im' (interpreted
 // as float2 arrays) to temporary device buffers, runs clblast::Col2im<float2> and reads the
 // resulting image back into 'im'.
 //   col: input, holds kernel_h * kernel_w * col_h * col_w * channels float2 elements
 //   im:  input/output, holds height * width * channels float2 elements
 // Throws std::runtime_error when CLBlast does not return kSuccess.
 void cblas_ccol2im(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w,
                    const void* col,
                    void* im) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  // Number of patch positions along one dimension (same formula as used by Xcol2im::DoCol2im).
  // Computed in size_t to avoid overflowing 'int' for large problems.
  const auto num_patches = [](const int size, const int pad, const int kernel,
                              const int stride, const int dilation) -> size_t {
    const auto padded = static_cast<size_t>(size) + 2 * static_cast<size_t>(pad);
    const auto extent = static_cast<size_t>(dilation) * (static_cast<size_t>(kernel) - 1) + 1;
    return (padded >= extent) ? (padded - extent) / static_cast<size_t>(stride) + 1 : size_t{1};
  };
  const auto col_h = num_patches(height, pad_h, kernel_h, stride_h, dilation_h);
  const auto col_w = num_patches(width, pad_w, kernel_w, stride_w, dilation_w);
  // The col buffer stores one value per (channel, kernel element, patch position); it generally
  // differs in size from the image
  const auto col_size = static_cast<size_t>(kernel_h) * static_cast<size_t>(kernel_w) *
                        col_h * col_w * static_cast<size_t>(channels);
  const auto im_size = static_cast<size_t>(height) * static_cast<size_t>(width) *
                       static_cast<size_t>(channels);
  auto col_buffer = clblast::Buffer<float2>(context, col_size);
  auto im_buffer = clblast::Buffer<float2>(context, im_size);
  col_buffer.Write(queue, col_size, reinterpret_cast<const float2*>(col));
  im_buffer.Write(queue, im_size, reinterpret_cast<float2*>(im));
  auto queue_cl = queue();
  auto s = clblast::Col2im<float2>(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
                                   col_buffer(), 0,
                                   im_buffer(), 0,
                                   &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  im_buffer.Read(queue, im_size, reinterpret_cast<float2*>(im));
 }
 // Host-pointer wrapper for double-precision complex COL2IM: uploads 'col' and 'im' (interpreted
 // as double2 arrays) to temporary device buffers, runs clblast::Col2im<double2> and reads the
 // resulting image back into 'im'.
 //   col: input, holds kernel_h * kernel_w * col_h * col_w * channels double2 elements
 //   im:  input/output, holds height * width * channels double2 elements
 // Throws std::runtime_error when CLBlast does not return kSuccess.
 void cblas_zcol2im(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w,
                    const void* col,
                    void* im) {
  OPTIONAL_STATIC auto device = get_device();
  OPTIONAL_STATIC auto context = clblast::Context(device);
  auto queue = clblast::Queue(context, device);
  // Number of patch positions along one dimension (same formula as used by Xcol2im::DoCol2im).
  // Computed in size_t to avoid overflowing 'int' for large problems.
  const auto num_patches = [](const int size, const int pad, const int kernel,
                              const int stride, const int dilation) -> size_t {
    const auto padded = static_cast<size_t>(size) + 2 * static_cast<size_t>(pad);
    const auto extent = static_cast<size_t>(dilation) * (static_cast<size_t>(kernel) - 1) + 1;
    return (padded >= extent) ? (padded - extent) / static_cast<size_t>(stride) + 1 : size_t{1};
  };
  const auto col_h = num_patches(height, pad_h, kernel_h, stride_h, dilation_h);
  const auto col_w = num_patches(width, pad_w, kernel_w, stride_w, dilation_w);
  // The col buffer stores one value per (channel, kernel element, patch position); it generally
  // differs in size from the image
  const auto col_size = static_cast<size_t>(kernel_h) * static_cast<size_t>(kernel_w) *
                        col_h * col_w * static_cast<size_t>(channels);
  const auto im_size = static_cast<size_t>(height) * static_cast<size_t>(width) *
                       static_cast<size_t>(channels);
  auto col_buffer = clblast::Buffer<double2>(context, col_size);
  auto im_buffer = clblast::Buffer<double2>(context, im_size);
  col_buffer.Write(queue, col_size, reinterpret_cast<const double2*>(col));
  im_buffer.Write(queue, im_size, reinterpret_cast<double2*>(im));
  auto queue_cl = queue();
  auto s = clblast::Col2im<double2>(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
                                    col_buffer(), 0,
                                    im_buffer(), 0,
                                    &queue_cl);
  if (s != clblast::StatusCode::kSuccess) {
    throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
  }
  im_buffer.Read(queue, im_size, reinterpret_cast<double2*>(im));
 }
 // =================================================================================================
--- a/src/kernels/levelx/col2im.opencl
+++ b/src/kernels/levelx/col2im.opencl
@ -0,0 +1,74 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // This file contains the col2im kernel, taken from:
 // https://gist.github.com/vbkaisetsu/a98299df827f9a5245635f646c1d94be
 // Credits go to https://github.com/vbkaisetsu
 //
 // =================================================================================================
 // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
 // literal). Comment-out this line for syntax-highlighting when developing.
 R"(
 // Work-group size parameters re-used from the 'copy' kernel
 #ifndef COPY_DIMX
  #define COPY_DIMX 8      // Local workgroup size in the first dimension (w)
 #endif
 #ifndef COPY_DIMY
  #define COPY_DIMY 8      // Local workgroup size in the second dimension (h)
 #endif
 // =================================================================================================
 // Computes one element of the image ('im') per work-item as the sum of the entries of the column
 // buffer ('col') selected by the loops below, and stores (overwrites) it in im_buffer.
 // Work-item mapping: global ID 0 is the image column; global ID 1 combines the image row
 // (id % input_h) and the channel (id / input_h). Work-items whose column or channel falls
 // outside the image do nothing.
 // Indexing as used below: col is addressed as
 //   [channel][kernel_y * kernel_w + kernel_x][output_y][output_x] (+ col_offset)
 // and im as [channel][y][x] (+ im_offset).
 __kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
 void col2im(const int input_h, const int input_w, const int channels,
            const int output_h, const int output_w,
            const int kernel_h, const int kernel_w,
            const int pad_h, const int pad_w,
            const int stride_h, const int stride_w,
            const int dilation_h, const int dilation_w,
            const __global real* restrict col_buffer, const int col_offset,
            __global real *im_buffer, const int im_offset) {
  // Image coordinates of this work-item, shifted by the padding
  const int x_x = get_global_id(0) + pad_w;
  const int x_y = ((int) get_global_id(1)) % input_h + pad_h;
  const int channel = ((int) get_global_id(1)) / input_h;
  // Extent of the dilated kernel window in each dimension
  const int kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
  const int kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
  // Start of this channel's data in the col and im buffers
  const int col_channel_shift = channel * kernel_w * kernel_h * output_h * output_w + col_offset;
  const int x_channel_shift = channel * input_h * input_w + im_offset;
  // Range [begin, end) of output positions t for which 0 <= x - t * stride < kernel_extent, i.e.
  // whose window covers this coordinate; the end is clamped to the number of output positions
  const int t_y_begin = (x_y < kernel_extent_h) ? 0 : (x_y - kernel_extent_h) / stride_h + 1;
  const int t_y_end = min(x_y / stride_h + 1, output_h);
  const int t_x_begin = (x_x < kernel_extent_w) ? 0 : (x_x - kernel_extent_w) / stride_w + 1;
  const int t_x_end = min(x_x / stride_w + 1, output_w);
  // Guards against work-items outside the image width or beyond the last channel
  if (x_x < input_w + pad_w && channel < channels) {
    real val;
    SetToZero(val);
    for (int t_y = t_y_begin; t_y < t_y_end; ++t_y) {
      for (int t_x = t_x_begin; t_x < t_x_end; ++t_x) {
        // Offset of this coordinate within the window of output position (t_y, t_x)
        int w_y = x_y - t_y * stride_h;
        int w_x = x_x - t_x * stride_w;
        // Only offsets that are a multiple of the dilation correspond to a kernel element
        if (w_y % dilation_h == 0 && w_x % dilation_w == 0) {
          w_y /= dilation_h;
          w_x /= dilation_w;
          val += col_buffer[col_channel_shift
                            + (w_x + w_y * kernel_w) * output_h * output_w
                            + t_y * output_w
                            + t_x];
        }
      }
    }
    // Stores the accumulated value at the un-padded image coordinate
    im_buffer[x_channel_shift + (x_y - pad_h) * input_w + x_x - pad_w] = val;
  }
 }
 // =================================================================================================
 // End of the C++11 raw string literal
 )"
 // =================================================================================================
--- a/src/routines/levelx/xcol2im.cpp
+++ b/src/routines/levelx/xcol2im.cpp
@ -0,0 +1,92 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the Xcol2im class (see the header for information about the class).
 //
 // =================================================================================================
 #include "routines/levelx/xcol2im.hpp"
 #include <string>
 #include <vector>
 namespace clblast {
 // =================================================================================================
 // Constructor: forwards the queue, event and routine name to the Routine base class, together
 // with the "Copy" kernel name, the precision of T and the col2im OpenCL source. The source is
 // pulled in through the #include below (the .opencl file wraps its contents in a C++11 raw
 // string literal).
 template <typename T>
 Xcol2im<T>::Xcol2im(Queue &queue, EventPointer event, const std::string &name):
    Routine(queue, event, name, {"Copy"}, PrecisionValue<T>(), {}, {
 #include "../../kernels/levelx/col2im.opencl"
    }) {
 }
 // =================================================================================================
 // The main routine: configures and launches the 'col2im' kernel, which computes every element of
 // the image buffer ('im') from the entries of the column buffer ('col').
 //   channels, height, width: dimensions of the image
 //   kernel_*, pad_*, stride_*, dilation_*: sliding-window parameters per dimension
 //   col_buffer/col_offset: input buffer and the offset of its first element
 //   im_buffer/im_offset: output buffer and the offset of its first element
 // Throws BLASError(StatusCode::kInvalidDimension) when an image dimension, kernel size, stride
 // or dilation is zero.
 template <typename T>
 void Xcol2im<T>::DoCol2im(const size_t channels, const size_t height, const size_t width,
                          const size_t kernel_h, const size_t kernel_w, const size_t pad_h,
                          const size_t pad_w, const size_t stride_h, const size_t stride_w,
                          const size_t dilation_h, const size_t dilation_w,
                          const Buffer<T> &col_buffer, const size_t col_offset,
                          const Buffer<T> &im_buffer, const size_t im_offset) {

  // Makes sure all dimensions are larger than zero
  if ((channels == 0) || (height == 0) || (width == 0)) { throw BLASError(StatusCode::kInvalidDimension); }

  // A zero kernel size would underflow the unsigned arithmetic below, a zero stride would divide
  // by zero below and a zero dilation would divide by zero inside the kernel
  if ((kernel_h == 0) || (kernel_w == 0) || (stride_h == 0) || (stride_w == 0) ||
      (dilation_h == 0) || (dilation_w == 0)) {
    throw BLASError(StatusCode::kInvalidDimension);
  }

  // Sets the height and width of the 'col' patch grid
  const auto size_h = height + 2 * pad_h;
  const auto padding_h = dilation_h * (kernel_h - 1) + 1;
  const auto col_h = (size_h >= padding_h) ? (size_h - padding_h) / stride_h + 1 : 1;
  const auto size_w = width + 2 * pad_w;
  const auto padding_w = dilation_w * (kernel_w - 1) + 1;
  const auto col_w = (size_w >= padding_w) ? (size_w - padding_w) / stride_w + 1 : 1;

  // Retrieves the kernel from the compiled binary
  auto kernel = Kernel(program_, "col2im");

  // Sets the kernel arguments
  kernel.SetArgument(0, static_cast<int>(height));
  kernel.SetArgument(1, static_cast<int>(width));
  kernel.SetArgument(2, static_cast<int>(channels));
  kernel.SetArgument(3, static_cast<int>(col_h));
  kernel.SetArgument(4, static_cast<int>(col_w));
  kernel.SetArgument(5, static_cast<int>(kernel_h));
  kernel.SetArgument(6, static_cast<int>(kernel_w));
  kernel.SetArgument(7, static_cast<int>(pad_h));
  kernel.SetArgument(8, static_cast<int>(pad_w));
  kernel.SetArgument(9, static_cast<int>(stride_h));
  kernel.SetArgument(10, static_cast<int>(stride_w));
  kernel.SetArgument(11, static_cast<int>(dilation_h));
  kernel.SetArgument(12, static_cast<int>(dilation_w));
  kernel.SetArgument(13, col_buffer());
  kernel.SetArgument(14, static_cast<int>(col_offset));
  kernel.SetArgument(15, im_buffer());
  kernel.SetArgument(16, static_cast<int>(im_offset));

  // Launches the kernel. Each work-item produces one image element: the kernel takes the image
  // column from the first global ID and the image row and channel from the second. The global
  // size therefore has to cover the image (width x height x channels), not the 'col' patch grid.
  const auto w_ceiled = Ceil(width, db_["COPY_DIMX"]);
  const auto h_ceiled = Ceil(height, db_["COPY_DIMY"]);
  const auto global = std::vector<size_t>{w_ceiled, h_ceiled * channels};
  const auto local = std::vector<size_t>{db_["COPY_DIMX"], db_["COPY_DIMY"]};
  RunKernel(kernel, queue_, device_, global, local, event_);
 }
 // =================================================================================================
 // Compiles the templated class: explicit instantiations for each of the five element types
 // (half, float, double, float2 and double2), so the member definitions in this file are emitted
 // for all of them
 template class Xcol2im<half>;
 template class Xcol2im<float>;
 template class Xcol2im<double>;
 template class Xcol2im<float2>;
 template class Xcol2im<double2>;
 // =================================================================================================
 } // namespace clblast
--- a/src/routines/levelx/xcol2im.hpp
+++ b/src/routines/levelx/xcol2im.hpp
@ -0,0 +1,45 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements the Xcol2im routine. The precision is implemented using a template argument.
 // Uses the tuning parameters from the regular copy kernel.
 //
 // =================================================================================================
 #ifndef CLBLAST_ROUTINES_XCOL2IM_H_
 #define CLBLAST_ROUTINES_XCOL2IM_H_
 #include "routine.hpp"
 namespace clblast {
 // =================================================================================================
 // Routine class for col2im, templated on the element type T. Derives from Routine and exposes a
 // single entry point, DoCol2im.
 template <typename T>
 class Xcol2im: public Routine {
 public:
  // Constructor: takes the queue and event to use; the routine name defaults to "COL2IM"
  Xcol2im(Queue &queue, EventPointer event, const std::string &name = "COL2IM");
  // Templated-precision implementation of the routine.
  //   channels, height, width: dimensions of the image
  //   kernel_h/w, pad_h/w, stride_h/w, dilation_h/w: sliding-window parameters per dimension
  //   col_buffer, col_offset: the 'col' buffer and the offset of its first element
  //   im_buffer, im_offset: the 'im' buffer and the offset of its first element
  void DoCol2im(const size_t channels, const size_t height, const size_t width,
                const size_t kernel_h, const size_t kernel_w,
                const size_t pad_h, const size_t pad_w,
                const size_t stride_h, const size_t stride_w,
                const size_t dilation_h, const size_t dilation_w,
                const Buffer<T> &col_buffer, const size_t col_offset,
                const Buffer<T> &im_buffer, const size_t im_offset);
 };
 // =================================================================================================
 } // namespace clblast
 // CLBLAST_ROUTINES_XCOL2IM_H_
 #endif
--- a/src/routines/routines.hpp
+++ b/src/routines/routines.hpp
@ -70,6 +70,7 @@
 #include "routines/levelx/xhad.hpp"
 #include "routines/levelx/xomatcopy.hpp"
 #include "routines/levelx/xim2col.hpp"
 #include "routines/levelx/xcol2im.hpp"
 #include "routines/levelx/xconvgemm.hpp"
 #include "routines/levelx/xaxpybatched.hpp"
 #include "routines/levelx/xgemmbatched.hpp"
--- a/test/correctness/routines/levelx/xcol2im.cpp
+++ b/test/correctness/routines/levelx/xcol2im.cpp
@ -0,0 +1,26 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // =================================================================================================
 #include "test/correctness/testblas.hpp"
 #include "test/routines/levelx/xcol2im.hpp"
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
  auto errors = size_t{0};
  errors += clblast::RunTests<clblast::TestXcol2im<float>, float, float>(argc, argv, false, "SCOL2IM");
  errors += clblast::RunTests<clblast::TestXcol2im<double>, double, double>(argc, argv, true, "DCOL2IM");
  errors += clblast::RunTests<clblast::TestXcol2im<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CCOL2IM");
  errors += clblast::RunTests<clblast::TestXcol2im<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZCOL2IM");
  errors += clblast::RunTests<clblast::TestXcol2im<clblast::half>, clblast::half, clblast::half>(argc, argv, true, "HCOL2IM");
  if (errors > 0) { return 1; } else { return 0; }
 }
 // =================================================================================================
--- a/test/performance/routines/levelx/xcol2im.cpp
+++ b/test/performance/routines/levelx/xcol2im.cpp
@ -0,0 +1,33 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // =================================================================================================
 #include "test/performance/client.hpp"
 #include "test/routines/levelx/xcol2im.hpp"
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
  // Determines the requested precision from the command line (single precision by default)
  const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
  const auto precision = clblast::GetPrecision(command_line_args, clblast::Precision::kSingle);

  // Runs the col2im performance client for the matching element type
  switch (precision) {
    case clblast::Precision::kHalf: {
      clblast::RunClient<clblast::TestXcol2im<clblast::half>, clblast::half, clblast::half>(argc, argv);
      break;
    }
    case clblast::Precision::kSingle: {
      clblast::RunClient<clblast::TestXcol2im<float>, float, float>(argc, argv);
      break;
    }
    case clblast::Precision::kDouble: {
      clblast::RunClient<clblast::TestXcol2im<double>, double, double>(argc, argv);
      break;
    }
    case clblast::Precision::kComplexSingle: {
      clblast::RunClient<clblast::TestXcol2im<clblast::float2>, clblast::float2, clblast::float2>(argc, argv);
      break;
    }
    case clblast::Precision::kComplexDouble: {
      clblast::RunClient<clblast::TestXcol2im<clblast::double2>, clblast::double2, clblast::double2>(argc, argv);
      break;
    }
  }
  return 0;
 }
 // =================================================================================================
--- a/test/routines/levelx/xcol2im.hpp
+++ b/test/routines/levelx/xcol2im.hpp
@ -0,0 +1,195 @@
 // =================================================================================================
 // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 // width of 100 characters per line.
 //
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file implements a class with static methods to describe the Xcol2im routine. Examples of
 // such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
 // static methods are used by the correctness tester and the performance tester.
 //
 // =================================================================================================
 #ifndef CLBLAST_TEST_ROUTINES_XCOL2IM_H_
 #define CLBLAST_TEST_ROUTINES_XCOL2IM_H_
 #include "test/routines/common.hpp"
 namespace clblast {
 // =================================================================================================
 // See comment at top of file for a description of the class
 template <typename T>
 class TestXcol2im {
 public:
  // The BLAS level: 4 for the extra routines
  static size_t BLASLevel() { return 4; }
  // The list of arguments relevant for this routine
  static std::vector<std::string> GetOptions() {
    return {kArgChannels, kArgHeight, kArgWidth, kArgKernelH, kArgKernelW, kArgPadH, kArgPadW,
            kArgStrideH, kArgStrideW, kArgDilationH, kArgDilationW,
            kArgAOffset, kArgBOffset};
  }
  static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufMatB}; } // b = col
  static std::vector<std::string> BuffersOut() { return {kBufMatA}; } // a = im
  // Describes how to obtain the sizes of the buffers
  static size_t ColHeight(const Arguments<T> &args) {
    const auto size = args.height + 2 * args.pad_h;
    const auto padding = args.dilation_h * (args.kernel_h - 1) + 1;
    if (size >= padding) { return (size - padding) / args.stride_h + 1; }
    return 1;
  }
  static size_t ColWidth(const Arguments<T> &args) {
    const auto size = args.width + 2 * args.pad_w;
    const auto padding = args.dilation_w * (args.kernel_w - 1) + 1;
    if (size >= padding) { return (size - padding) / args.stride_w + 1; }
    return 1;
  }
  static size_t NumPatches(const Arguments<T> &args) {
    return ColHeight(args) * ColWidth(args) * args.channels;
  }
  static size_t GetSizeA(const Arguments<T> &args) {
    return args.height * args.width * args.channels + args.a_offset;
  }
  static size_t GetSizeB(const Arguments<T> &args) {
    return args.kernel_w * args.kernel_h * NumPatches(args) + args.b_offset;
  }
  // Describes how to set the sizes of all the buffers
  static void SetSizes(Arguments<T> &args, Queue&) {
    args.a_size = GetSizeA(args); // im
    args.b_size = GetSizeB(args); // col
  }
  // Describes what the default values of the leading dimensions of the matrices are
  static size_t DefaultLDA(const Arguments<T> &) { return 1; } // N/A for this routine
  static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
  // Describes which transpose options are relevant for this routine
  using Transposes = std::vector<Transpose>;
  static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine
  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
  // Describes how to prepare the input data
  static void PrepareData(const Arguments<T>&, Queue&, const int, std::vector<T>&,
                          std::vector<T>&, std::vector<T>&, std::vector<T>&, std::vector<T>&,
                          std::vector<T>&, std::vector<T>&) {} // N/A for this routine
  // Describes how to run the CLBlast routine
  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
    #ifdef OPENCL_API
      auto queue_plain = queue();
      auto event = cl_event{};
      auto status = Col2im<T>(args.channels, args.height, args.width,
                              args.kernel_h, args.kernel_w,
                              args.pad_h, args.pad_w,
                              args.stride_h, args.stride_w,
                              args.dilation_h, args.dilation_w,
                              buffers.b_mat(), args.b_offset, // col
                              buffers.a_mat(), args.a_offset, // im
                              &queue_plain, &event);
      if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
    #elif CUDA_API
      auto status = Col2im<T>(args.channels, args.height, args.width,
                              args.kernel_h, args.kernel_w,
                              args.pad_h, args.pad_w,
                              args.stride_h, args.stride_w,
                              args.dilation_h, args.dilation_w,
                              buffers.b_mat(), args.b_offset, // col
                              buffers.a_mat(), args.a_offset, // im
                              queue.GetContext()(), queue.GetDevice()());
      cuStreamSynchronize(queue());
    #endif
    return status;
  }
  // Describes how to run a naive version of the routine (for correctness/performance comparison).
  // Note that a proper clBLAS or CPU BLAS comparison is not available for non-BLAS routines.
  static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
    auto buffers_host = BuffersHost<T>();
    DeviceToHost(args, buffers, buffers_host, queue, BuffersIn());
    const auto status = RunReference(args, buffers_host);
    HostToDevice(args, buffers, buffers_host, queue, BuffersOut());
    return status;
  }
  static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue&) {
    return RunReference(args, buffers_host);
  }
  static StatusCode RunReference3(const Arguments<T> &, BuffersCUDA<T> &, Queue &) {
    return StatusCode::kUnknownError;
  }
  // Describes how to download the results of the computation (more importantly: which buffer)
  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
    std::vector<T> result(args.a_size, static_cast<T>(0));
    buffers.a_mat.Read(queue, args.a_size, result);
    return result;
  }
  // Describes how to compute the indices of the result buffer
  static size_t ResultID1(const Arguments<T> &args) { return args.height * args.width; }
  static size_t ResultID2(const Arguments<T> &args) { return args.channels; }
  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
    return id1 + args.height * args.width * id2 + args.a_offset;
  }
  // Describes how to compute performance metrics
  static size_t GetFlops(const Arguments<T> &) {
    return 1;
  }
  static size_t GetBytes(const Arguments<T> &args) {
    const auto im = args.channels * args.width * args.height; // possibly less with striding
    const auto col = args.kernel_h * args.kernel_w * NumPatches(args);
    return (im + col) * sizeof(T);
  }
 };
 // =================================================================================================
 // Naive host reference for col2im: clears the image (matrix A) and then adds every entry of the
 // column buffer (matrix B) to the image element it was taken from. Entries whose position falls
 // into the padding are skipped. Image elements covered by several patches receive the sum of all
 // their contributions. Always returns StatusCode::kSuccess.
 // NOTE(review): confirm that operator+= on T is a numeric addition for the half-precision type.
 template <typename T>
 StatusCode RunReference(const Arguments<T> &args, BuffersHost<T> &buffers_host) {
  const auto col_h = TestXcol2im<T>::ColHeight(args);
  const auto col_w = TestXcol2im<T>::ColWidth(args);

  // Starts from an all-zero image: the result does not depend on the previous image contents
  const auto im_size = args.height * args.width * args.channels;
  for (auto im_id = size_t{0}; im_id < im_size; ++im_id) {
    buffers_host.a_mat[im_id + args.a_offset] = static_cast<T>(0);
  }

  for (auto c_id = size_t{0}; c_id < args.channels; ++c_id) { // image channels
    for (auto kh_id = size_t{0}; kh_id < args.kernel_h; ++kh_id) { // kernel height
      for (auto kw_id = size_t{0}; kw_id < args.kernel_w; ++kw_id) { // kernel width
        for (auto h_id = size_t{0}; h_id < col_h; ++h_id) { // patch row
          for (auto w_id = size_t{0}; w_id < col_w; ++w_id) { // patch column

            // Reads the input value
            const auto kernel_index = kw_id + args.kernel_w * kh_id;
            const auto patch_index = w_id + col_w * h_id;
            const auto col_index = patch_index + kernel_index * col_w * col_h +
                                   c_id * col_w * col_h * args.kernel_h * args.kernel_w;
            const auto val = buffers_host.b_mat[col_index + args.b_offset];

            // Accumulates into the output value. The indices are computed with the unsigned loop
            // counters, so a position in the leading padding wraps around and is rejected by the
            // upper-bound checks.
            const auto h_index = kh_id * args.dilation_h + args.stride_h * h_id - args.pad_h;
            const auto w_index = kw_id * args.dilation_w + args.stride_w * w_id - args.pad_w;
            if (h_index >= 0 && h_index < args.height &&
                w_index >= 0 && w_index < args.width) {
              const auto im_index = w_index + args.width * (h_index + args.height * c_id);
              buffers_host.a_mat[im_index + args.a_offset] += val;
            }
          }
        }
      }
    }
  }
  return StatusCode::kSuccess;
 }
 // =================================================================================================
 } // namespace clblast
 // CLBLAST_TEST_ROUTINES_XCOL2IM_H_
 #endif