Added groundwork for col2im algorithm plus first non-working version of kernel and test

pull/330/head
Cedric Nugteren 2018-10-23 20:52:25 +02:00
parent 44b630fc22
commit d45911b61d
19 changed files with 814 additions and 4 deletions

View File

@ -221,7 +221,7 @@ set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xtrsv set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xtrsv
xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2) xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm xtrsm) set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm xtrsm)
set(LEVELX_ROUTINES xhad xomatcopy xim2col xconvgemm xaxpybatched xgemmbatched xgemmstridedbatched) set(LEVELX_ROUTINES xhad xomatcopy xim2col xcol2im xconvgemm xaxpybatched xgemmbatched xgemmstridedbatched)
set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES} ${LEVELX_ROUTINES}) set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES} ${LEVELX_ROUTINES})
set(PRECISIONS 32 64 3232 6464 16) set(PRECISIONS 32 64 3232 6464 16)

View File

@ -3072,6 +3072,66 @@ Arguments to IM2COL:
xCOL2IM: Col2im function (non-BLAS function)
-------------
Performs the col2im algorithm, in which _col_ is the input matrix and _im_ is the output matrix.
C++ API:
```
template <typename T>
StatusCode Col2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
const cl_mem col_buffer, const size_t col_offset,
cl_mem im_buffer, const size_t im_offset,
cl_command_queue* queue, cl_event* event)
```
C API:
```
CLBlastStatusCode CLBlastScol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
const cl_mem col_buffer, const size_t col_offset,
cl_mem im_buffer, const size_t im_offset,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDcol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
const cl_mem col_buffer, const size_t col_offset,
cl_mem im_buffer, const size_t im_offset,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastCcol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
const cl_mem col_buffer, const size_t col_offset,
cl_mem im_buffer, const size_t im_offset,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZcol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
const cl_mem col_buffer, const size_t col_offset,
cl_mem im_buffer, const size_t im_offset,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastHcol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
const cl_mem col_buffer, const size_t col_offset,
cl_mem im_buffer, const size_t im_offset,
cl_command_queue* queue, cl_event* event)
```
Arguments to COL2IM:
* `const size_t channels`: Integer size argument. This value must be positive.
* `const size_t height`: Integer size argument. This value must be positive.
* `const size_t width`: Integer size argument. This value must be positive.
* `const size_t kernel_h`: Integer size argument. This value must be positive.
* `const size_t kernel_w`: Integer size argument. This value must be positive.
* `const size_t pad_h`: Integer size argument. This value must be positive.
* `const size_t pad_w`: Integer size argument. This value must be positive.
* `const size_t stride_h`: Integer size argument. This value must be positive.
* `const size_t stride_w`: Integer size argument. This value must be positive.
* `const size_t dilation_h`: Integer size argument. This value must be positive.
* `const size_t dilation_w`: Integer size argument. This value must be positive.
* `const cl_mem col_buffer`: OpenCL buffer to store the input col tensor.
* `const size_t col_offset`: The offset in elements from the start of the input col tensor.
* `cl_mem im_buffer`: OpenCL buffer to store the output im tensor.
* `const size_t im_offset`: The offset in elements from the start of the output im tensor.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
xCONVGEMM: Batched convolution as GEMM (non-BLAS function) xCONVGEMM: Batched convolution as GEMM (non-BLAS function)
------------- -------------

View File

@ -636,6 +636,13 @@ StatusCode Im2col(const size_t channels, const size_t height, const size_t width
cl_mem col_buffer, const size_t col_offset, cl_mem col_buffer, const size_t col_offset,
cl_command_queue* queue, cl_event* event = nullptr); cl_command_queue* queue, cl_event* event = nullptr);
// Col2im function (non-BLAS function): SCOL2IM/DCOL2IM/CCOL2IM/ZCOL2IM/HCOL2IM
template <typename T>
StatusCode Col2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
const cl_mem col_buffer, const size_t col_offset,
cl_mem im_buffer, const size_t im_offset,
cl_command_queue* queue, cl_event* event = nullptr);
// Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/HCONVGEMM // Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/HCONVGEMM
template <typename T> template <typename T>
StatusCode Convgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count, StatusCode Convgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,

View File

@ -1410,6 +1410,28 @@ CLBlastStatusCode PUBLIC_API CLBlastHim2col(const size_t channels, const size_t
cl_mem col_buffer, const size_t col_offset, cl_mem col_buffer, const size_t col_offset,
cl_command_queue* queue, cl_event* event); cl_command_queue* queue, cl_event* event);
// Col2im function (non-BLAS function): SCOL2IM/DCOL2IM/CCOL2IM/ZCOL2IM/HCOL2IM
CLBlastStatusCode PUBLIC_API CLBlastScol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
const cl_mem col_buffer, const size_t col_offset,
cl_mem im_buffer, const size_t im_offset,
cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDcol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
const cl_mem col_buffer, const size_t col_offset,
cl_mem im_buffer, const size_t im_offset,
cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastCcol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
const cl_mem col_buffer, const size_t col_offset,
cl_mem im_buffer, const size_t im_offset,
cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZcol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
const cl_mem col_buffer, const size_t col_offset,
cl_mem im_buffer, const size_t im_offset,
cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHcol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
const cl_mem col_buffer, const size_t col_offset,
cl_mem im_buffer, const size_t im_offset,
cl_command_queue* queue, cl_event* event);
// Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/HCONVGEMM // Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/HCONVGEMM
CLBlastStatusCode PUBLIC_API CLBlastSconvgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count, CLBlastStatusCode PUBLIC_API CLBlastSconvgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
const cl_mem im_buffer, const size_t im_offset, const cl_mem im_buffer, const size_t im_offset,

View File

@ -608,6 +608,13 @@ StatusCode Im2col(const size_t channels, const size_t height, const size_t width
CUdeviceptr col_buffer, const size_t col_offset, CUdeviceptr col_buffer, const size_t col_offset,
const CUcontext context, const CUdevice device); const CUcontext context, const CUdevice device);
// Col2im function (non-BLAS function): SCOL2IM/DCOL2IM/CCOL2IM/ZCOL2IM/HCOL2IM
template <typename T>
StatusCode Col2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
const CUdeviceptr col_buffer, const size_t col_offset,
CUdeviceptr im_buffer, const size_t im_offset,
const CUcontext context, const CUdevice device);
// Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/HCONVGEMM // Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/HCONVGEMM
template <typename T> template <typename T>
StatusCode Convgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count, StatusCode Convgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,

View File

@ -960,6 +960,20 @@ void PUBLIC_API cblas_zim2col(const int channels, const int height, const int wi
const void* im, const void* im,
void* col); void* col);
// Col2im function (non-BLAS function): SCOL2IM/DCOL2IM/CCOL2IM/ZCOL2IM/HCOL2IM
void PUBLIC_API cblas_scol2im(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w,
const float* col,
float* im);
void PUBLIC_API cblas_dcol2im(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w,
const double* col,
double* im);
void PUBLIC_API cblas_ccol2im(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w,
const void* col,
void* im);
void PUBLIC_API cblas_zcol2im(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w,
const void* col,
void* im);
// ================================================================================================= // =================================================================================================
#ifdef __cplusplus #ifdef __cplusplus

View File

@ -181,6 +181,7 @@ ROUTINES = [
Routine(True, True, 0, False, "x", "had", T, [S,D,C,Z,H], ["n"], [], ["x","y"], ["z"], [xn,yn,zn], ["alpha","beta"], "", "Element-wise vector product (Hadamard)", "Performs the Hadamard element-wise product _z = alpha * x * y + beta * z_, in which _x_, _y_, and _z_ are vectors and _alpha_ and _beta_ are scalar constants.", []), Routine(True, True, 0, False, "x", "had", T, [S,D,C,Z,H], ["n"], [], ["x","y"], ["z"], [xn,yn,zn], ["alpha","beta"], "", "Element-wise vector product (Hadamard)", "Performs the Hadamard element-wise product _z = alpha * x * y + beta * z_, in which _x_, _y_, and _z_ are vectors and _alpha_ and _beta_ are scalar constants.", []),
Routine(True, True, 0, False, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], [amn,bnma], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]), Routine(True, True, 0, False, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], [amn,bnma], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]),
Routine(True, True, 0, False, "x", "im2col", T, [S,D,C,Z,H], im2col_constants, [], ["im"], ["col"], [im,col], [""], "", "Im2col function (non-BLAS function)", "Performs the im2col algorithm, in which _im_ is the input matrix and _col_ is the output matrix.", []), Routine(True, True, 0, False, "x", "im2col", T, [S,D,C,Z,H], im2col_constants, [], ["im"], ["col"], [im,col], [""], "", "Im2col function (non-BLAS function)", "Performs the im2col algorithm, in which _im_ is the input matrix and _col_ is the output matrix.", []),
Routine(True, True, 0, False, "x", "col2im", T, [S,D,C,Z,H], im2col_constants, [], ["col"], ["im"], [col,im], [""], "", "Col2im function (non-BLAS function)", "Performs the col2im algorithm, in which _col_ is the input matrix and _im_ is the output matrix.", []),
Routine(True, True, 0, False, "x", "convgemm", T, [S,D,H], convgemm_constants, [], ["im","kernel"], ["result"], [imb,kernel,result],[""], "", "Batched convolution as GEMM (non-BLAS function)", "Integrates im2col and GEMM for batched 3D convolution, in which _im_ is the 4D input tensor (NCHW - batch-channelin-height-width), _kernel_ the 4D kernel weights tensor (KCHW - channelout-channelin-height-width), and _result_ the 4D output tensor (NCHW - batch-channelout-height-width).", []), Routine(True, True, 0, False, "x", "convgemm", T, [S,D,H], convgemm_constants, [], ["im","kernel"], ["result"], [imb,kernel,result],[""], "", "Batched convolution as GEMM (non-BLAS function)", "Integrates im2col and GEMM for batched 3D convolution, in which _im_ is the 4D input tensor (NCHW - batch-channelin-height-width), _kernel_ the 4D kernel weights tensor (KCHW - channelout-channelin-height-width), and _result_ the 4D output tensor (NCHW - batch-channelout-height-width).", []),
# Batched routines: # Batched routines:
Routine(True, True, 1, False, "x", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [xn,yn], ["alpha"], "", "Batched version of AXPY", "As AXPY, but multiple operations are batched together for better performance.", []), Routine(True, True, 1, False, "x", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [xn,yn], ["alpha"], "", "Batched version of AXPY", "As AXPY, but multiple operations are batched together for better performance.", []),

View File

@ -205,7 +205,7 @@ class Routine:
def no_scalars(self): def no_scalars(self):
"""Determines whether or not this routine has scalar arguments (alpha/beta)""" """Determines whether or not this routine has scalar arguments (alpha/beta)"""
return self.scalars == [] or self.name in ["im2col", "convgemm"] return self.scalars == [] or self.name in ["im2col", "col2im", "convgemm"]
def has_layout(self): def has_layout(self):
"""Determines whether the layout is an argument""" """Determines whether the layout is an argument"""
@ -226,12 +226,14 @@ class Routine:
"""Determines which buffers go first (between alpha and beta) and which ones go after""" """Determines which buffers go first (between alpha and beta) and which ones go after"""
if self.level == "2b" or self.name == "had": if self.level == "2b" or self.name == "had":
return ["x", "y"] return ["x", "y"]
return ["ap", "a", "b", "x", "im", "kernel"] extra_buffer = "col" if self.name == "col2im" else "im"
return ["ap", "a", "b", "x", extra_buffer, "kernel"]
def buffers_second(self): def buffers_second(self):
if self.level == "2b" or self.name == "had": if self.level == "2b" or self.name == "had":
return ["z", "ap", "a", "b", "c"] return ["z", "ap", "a", "b", "c"]
return ["y", "c", "col", "result"] extra_buffer = "im" if self.name == "col2im" else "col"
return ["y", "c", extra_buffer, "result"]
def buffer(self, name): def buffer(self, name):
"""Retrieves a variable name for a specific input/output vector/matrix (e.g. 'x')""" """Retrieves a variable name for a specific input/output vector/matrix (e.g. 'x')"""

View File

@ -2252,6 +2252,42 @@ template StatusCode PUBLIC_API Im2col<half>(const size_t, const size_t, const si
cl_mem, const size_t, cl_mem, const size_t,
cl_command_queue*, cl_event*); cl_command_queue*, cl_event*);
// Col2im function (non-BLAS function): SCOL2IM/DCOL2IM/CCOL2IM/ZCOL2IM/HCOL2IM
template <typename T>
StatusCode Col2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
const cl_mem col_buffer, const size_t col_offset,
cl_mem im_buffer, const size_t im_offset,
cl_command_queue* queue, cl_event* event) {
try {
auto queue_cpp = Queue(*queue);
auto routine = Xcol2im<T>(queue_cpp, event);
routine.DoCol2im(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
Buffer<T>(col_buffer), col_offset,
Buffer<T>(im_buffer), im_offset);
return StatusCode::kSuccess;
} catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Col2im<float>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
const cl_mem, const size_t,
cl_mem, const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Col2im<double>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
const cl_mem, const size_t,
cl_mem, const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Col2im<float2>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
const cl_mem, const size_t,
cl_mem, const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Col2im<double2>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
const cl_mem, const size_t,
cl_mem, const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Col2im<half>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
const cl_mem, const size_t,
cl_mem, const size_t,
cl_command_queue*, cl_event*);
// Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/HCONVGEMM // Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/HCONVGEMM
template <typename T> template <typename T>
StatusCode Convgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count, StatusCode Convgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,

View File

@ -3679,6 +3679,73 @@ CLBlastStatusCode CLBlastHim2col(const size_t channels, const size_t height, con
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); } } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
} }
// COL2IM
CLBlastStatusCode CLBlastScol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
const cl_mem col_buffer, const size_t col_offset,
cl_mem im_buffer, const size_t im_offset,
cl_command_queue* queue, cl_event* event) {
try {
return static_cast<CLBlastStatusCode>(
clblast::Col2im<float>(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
col_buffer, col_offset,
im_buffer, im_offset,
queue, event)
);
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDcol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
const cl_mem col_buffer, const size_t col_offset,
cl_mem im_buffer, const size_t im_offset,
cl_command_queue* queue, cl_event* event) {
try {
return static_cast<CLBlastStatusCode>(
clblast::Col2im<double>(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
col_buffer, col_offset,
im_buffer, im_offset,
queue, event)
);
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastCcol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
const cl_mem col_buffer, const size_t col_offset,
cl_mem im_buffer, const size_t im_offset,
cl_command_queue* queue, cl_event* event) {
try {
return static_cast<CLBlastStatusCode>(
clblast::Col2im<float2>(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
col_buffer, col_offset,
im_buffer, im_offset,
queue, event)
);
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZcol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
const cl_mem col_buffer, const size_t col_offset,
cl_mem im_buffer, const size_t im_offset,
cl_command_queue* queue, cl_event* event) {
try {
return static_cast<CLBlastStatusCode>(
clblast::Col2im<double2>(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
col_buffer, col_offset,
im_buffer, im_offset,
queue, event)
);
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHcol2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
const cl_mem col_buffer, const size_t col_offset,
cl_mem im_buffer, const size_t im_offset,
cl_command_queue* queue, cl_event* event) {
try {
return static_cast<CLBlastStatusCode>(
clblast::Col2im<half>(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
col_buffer, col_offset,
im_buffer, im_offset,
queue, event)
);
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// CONVGEMM // CONVGEMM
CLBlastStatusCode CLBlastSconvgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count, CLBlastStatusCode CLBlastSconvgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
const cl_mem im_buffer, const size_t im_offset, const cl_mem im_buffer, const size_t im_offset,

View File

@ -2350,6 +2350,44 @@ template StatusCode PUBLIC_API Im2col<half>(const size_t, const size_t, const si
CUdeviceptr, const size_t, CUdeviceptr, const size_t,
const CUcontext, const CUdevice); const CUcontext, const CUdevice);
// Col2im function (non-BLAS function): SCOL2IM/DCOL2IM/CCOL2IM/ZCOL2IM/HCOL2IM
template <typename T>
StatusCode Col2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
const CUdeviceptr col_buffer, const size_t col_offset,
CUdeviceptr im_buffer, const size_t im_offset,
const CUcontext context, const CUdevice device) {
try {
const auto context_cpp = Context(context);
const auto device_cpp = Device(device);
auto queue_cpp = Queue(context_cpp, device_cpp);
auto routine = Xcol2im<T>(queue_cpp, nullptr);
routine.DoCol2im(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
Buffer<T>(col_buffer), col_offset,
Buffer<T>(im_buffer), im_offset);
return StatusCode::kSuccess;
} catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Col2im<float>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
const CUdeviceptr, const size_t,
CUdeviceptr, const size_t,
const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Col2im<double>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
const CUdeviceptr, const size_t,
CUdeviceptr, const size_t,
const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Col2im<float2>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
const CUdeviceptr, const size_t,
CUdeviceptr, const size_t,
const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Col2im<double2>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
const CUdeviceptr, const size_t,
CUdeviceptr, const size_t,
const CUcontext, const CUdevice);
template StatusCode PUBLIC_API Col2im<half>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
const CUdeviceptr, const size_t,
CUdeviceptr, const size_t,
const CUcontext, const CUdevice);
// Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/HCONVGEMM // Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/HCONVGEMM
template <typename T> template <typename T>
StatusCode Convgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count, StatusCode Convgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,

View File

@ -4967,4 +4967,94 @@ void cblas_zim2col(const int channels, const int height, const int width, const
col_buffer.Read(queue, col_size, reinterpret_cast<double2*>(col)); col_buffer.Read(queue, col_size, reinterpret_cast<double2*>(col));
} }
// COL2IM
void cblas_scol2im(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w,
const float* col,
float* im) {
OPTIONAL_STATIC auto device = get_device();
OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto col_size = height * width * channels;
const auto im_size = height * width * channels;
auto col_buffer = clblast::Buffer<float>(context, col_size);
auto im_buffer = clblast::Buffer<float>(context, im_size);
col_buffer.Write(queue, col_size, reinterpret_cast<const float*>(col));
im_buffer.Write(queue, im_size, reinterpret_cast<float*>(im));
auto queue_cl = queue();
auto s = clblast::Col2im<float>(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
col_buffer(), 0,
im_buffer(), 0,
&queue_cl);
if (s != clblast::StatusCode::kSuccess) {
throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
}
im_buffer.Read(queue, im_size, reinterpret_cast<float*>(im));
}
void cblas_dcol2im(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w,
const double* col,
double* im) {
OPTIONAL_STATIC auto device = get_device();
OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto col_size = height * width * channels;
const auto im_size = height * width * channels;
auto col_buffer = clblast::Buffer<double>(context, col_size);
auto im_buffer = clblast::Buffer<double>(context, im_size);
col_buffer.Write(queue, col_size, reinterpret_cast<const double*>(col));
im_buffer.Write(queue, im_size, reinterpret_cast<double*>(im));
auto queue_cl = queue();
auto s = clblast::Col2im<double>(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
col_buffer(), 0,
im_buffer(), 0,
&queue_cl);
if (s != clblast::StatusCode::kSuccess) {
throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
}
im_buffer.Read(queue, im_size, reinterpret_cast<double*>(im));
}
void cblas_ccol2im(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w,
const void* col,
void* im) {
OPTIONAL_STATIC auto device = get_device();
OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto col_size = height * width * channels;
const auto im_size = height * width * channels;
auto col_buffer = clblast::Buffer<float2>(context, col_size);
auto im_buffer = clblast::Buffer<float2>(context, im_size);
col_buffer.Write(queue, col_size, reinterpret_cast<const float2*>(col));
im_buffer.Write(queue, im_size, reinterpret_cast<float2*>(im));
auto queue_cl = queue();
auto s = clblast::Col2im<float2>(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
col_buffer(), 0,
im_buffer(), 0,
&queue_cl);
if (s != clblast::StatusCode::kSuccess) {
throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
}
im_buffer.Read(queue, im_size, reinterpret_cast<float2*>(im));
}
void cblas_zcol2im(const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w,
const void* col,
void* im) {
OPTIONAL_STATIC auto device = get_device();
OPTIONAL_STATIC auto context = clblast::Context(device);
auto queue = clblast::Queue(context, device);
const auto col_size = height * width * channels;
const auto im_size = height * width * channels;
auto col_buffer = clblast::Buffer<double2>(context, col_size);
auto im_buffer = clblast::Buffer<double2>(context, im_size);
col_buffer.Write(queue, col_size, reinterpret_cast<const double2*>(col));
im_buffer.Write(queue, im_size, reinterpret_cast<double2*>(im));
auto queue_cl = queue();
auto s = clblast::Col2im<double2>(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
col_buffer(), 0,
im_buffer(), 0,
&queue_cl);
if (s != clblast::StatusCode::kSuccess) {
throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
}
im_buffer.Read(queue, im_size, reinterpret_cast<double2*>(im));
}
// ================================================================================================= // =================================================================================================

View File

@ -0,0 +1,74 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// This file contains the col2im kernel, taken from:
// https://gist.github.com/vbkaisetsu/a98299df827f9a5245635f646c1d94be
// Credits go to https://github.com/vbkaisetsu
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// Work-group size parameters re-used from the 'copy' kernel
#ifndef COPY_DIMX
#define COPY_DIMX 8 // Local workgroup size in the first dimension (w)
#endif
#ifndef COPY_DIMY
#define COPY_DIMY 8 // Local workgroup size in the second dimension (h)
#endif
// =================================================================================================
__kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
void col2im(const int input_h, const int input_w, const int channels,
const int output_h, const int output_w,
const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const __global real* restrict col_buffer, const int col_offset,
__global real *im_buffer, const int im_offset) {
const int x_x = get_global_id(0) + pad_w;
const int x_y = ((int) get_global_id(1)) % input_h + pad_h;
const int channel = ((int) get_global_id(1)) / input_h;
const int kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
const int kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
const int col_channel_shift = channel * kernel_w * kernel_h * output_h * output_w + col_offset;
const int x_channel_shift = channel * input_h * input_w + im_offset;
const int t_y_begin = (x_y < kernel_extent_h) ? 0 : (x_y - kernel_extent_h) / stride_h + 1;
const int t_y_end = min(x_y / stride_h + 1, output_h);
const int t_x_begin = (x_x < kernel_extent_w) ? 0 : (x_x - kernel_extent_w) / stride_w + 1;
const int t_x_end = min(x_x / stride_w + 1, output_w);
if (x_x < input_w + pad_w && channel < channels) {
real val;
SetToZero(val);
for (int t_y = t_y_begin; t_y < t_y_end; ++t_y) {
for (int t_x = t_x_begin; t_x < t_x_end; ++t_x) {
int w_y = x_y - t_y * stride_h;
int w_x = x_x - t_x * stride_w;
if (w_y % dilation_h == 0 && w_x % dilation_w == 0) {
w_y /= dilation_h;
w_x /= dilation_w;
val += col_buffer[col_channel_shift
+ (w_x + w_y * kernel_w) * output_h * output_w
+ t_y * output_w
+ t_x];
}
}
}
im_buffer[x_channel_shift + (x_y - pad_h) * input_w + x_x - pad_w] = val;
}
}
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View File

@ -0,0 +1,92 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xcol2im class (see the header for information about the class).
//
// =================================================================================================
#include "routines/levelx/xcol2im.hpp"
#include <string>
#include <vector>
namespace clblast {
// =================================================================================================
// Constructor: forwards to base class constructor
template <typename T>
Xcol2im<T>::Xcol2im(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Copy"}, PrecisionValue<T>(), {}, {
#include "../../kernels/levelx/col2im.opencl"
}) {
}
// =================================================================================================
// The main routine
template <typename T>
void Xcol2im<T>::DoCol2im(const size_t channels, const size_t height, const size_t width,
const size_t kernel_h, const size_t kernel_w, const size_t pad_h,
const size_t pad_w, const size_t stride_h, const size_t stride_w,
const size_t dilation_h, const size_t dilation_w,
const Buffer<T> &col_buffer, const size_t col_offset,
const Buffer<T> &im_buffer, const size_t im_offset) {
// Makes sure all dimensions are larger than zero
if ((channels == 0) || (height == 0) || (width == 0)) { throw BLASError(StatusCode::kInvalidDimension); }
// Sets the output height and width
const auto size_h = height + 2 * pad_h;
const auto padding_h = dilation_h * (kernel_h - 1) + 1;
const auto col_h = (size_h >= padding_h) ? (size_h - padding_h) / stride_h + 1 : 1;
const auto size_w = width + 2 * pad_w;
const auto padding_w = dilation_w * (kernel_w - 1) + 1;
const auto col_w = (size_w >= padding_w) ? (size_w - padding_w) / stride_w + 1 : 1;
// Retrieves the kernel from the compiled binary
auto kernel = Kernel(program_, "col2im");
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(height));
kernel.SetArgument(1, static_cast<int>(width));
kernel.SetArgument(2, static_cast<int>(channels));
kernel.SetArgument(3, static_cast<int>(col_h));
kernel.SetArgument(4, static_cast<int>(col_w));
kernel.SetArgument(5, static_cast<int>(kernel_h));
kernel.SetArgument(6, static_cast<int>(kernel_w));
kernel.SetArgument(7, static_cast<int>(pad_h));
kernel.SetArgument(8, static_cast<int>(pad_w));
kernel.SetArgument(9, static_cast<int>(stride_h));
kernel.SetArgument(10, static_cast<int>(stride_w));
kernel.SetArgument(11, static_cast<int>(dilation_h));
kernel.SetArgument(12, static_cast<int>(dilation_w));
kernel.SetArgument(13, col_buffer());
kernel.SetArgument(14, static_cast<int>(col_offset));
kernel.SetArgument(15, im_buffer());
kernel.SetArgument(16, static_cast<int>(im_offset));
// Launches the kernel
const auto w_ceiled = Ceil(col_w, db_["COPY_DIMX"]);
const auto h_ceiled = Ceil(col_h, db_["COPY_DIMY"]);
const auto global = std::vector<size_t>{w_ceiled, h_ceiled * channels};
const auto local = std::vector<size_t>{db_["COPY_DIMX"], db_["COPY_DIMY"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
// =================================================================================================
// Compiles the templated class
template class Xcol2im<half>;
template class Xcol2im<float>;
template class Xcol2im<double>;
template class Xcol2im<float2>;
template class Xcol2im<double2>;
// =================================================================================================
} // namespace clblast

View File

@ -0,0 +1,45 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xcol2im routine. The precision is implemented using a template argument.
// Uses the tuning parameters from the regular copy kernel.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XCOL2IM_H_
#define CLBLAST_ROUTINES_XCOL2IM_H_
#include "routine.hpp"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class Xcol2im: public Routine {
public:
// Constructor
Xcol2im(Queue &queue, EventPointer event, const std::string &name = "COL2IM");
// Templated-precision implementation of the routine
void DoCol2im(const size_t channels, const size_t height, const size_t width,
const size_t kernel_h, const size_t kernel_w,
const size_t pad_h, const size_t pad_w,
const size_t stride_h, const size_t stride_w,
const size_t dilation_h, const size_t dilation_w,
const Buffer<T> &col_buffer, const size_t col_offset,
const Buffer<T> &im_buffer, const size_t im_offset);
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XCOL2IM_H_
#endif

View File

@ -70,6 +70,7 @@
#include "routines/levelx/xhad.hpp" #include "routines/levelx/xhad.hpp"
#include "routines/levelx/xomatcopy.hpp" #include "routines/levelx/xomatcopy.hpp"
#include "routines/levelx/xim2col.hpp" #include "routines/levelx/xim2col.hpp"
#include "routines/levelx/xcol2im.hpp"
#include "routines/levelx/xconvgemm.hpp" #include "routines/levelx/xconvgemm.hpp"
#include "routines/levelx/xaxpybatched.hpp" #include "routines/levelx/xaxpybatched.hpp"
#include "routines/levelx/xgemmbatched.hpp" #include "routines/levelx/xgemmbatched.hpp"

View File

@ -0,0 +1,26 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================
#include "test/correctness/testblas.hpp"
#include "test/routines/levelx/xcol2im.hpp"
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
auto errors = size_t{0};
errors += clblast::RunTests<clblast::TestXcol2im<float>, float, float>(argc, argv, false, "SCOL2IM");
errors += clblast::RunTests<clblast::TestXcol2im<double>, double, double>(argc, argv, true, "DCOL2IM");
errors += clblast::RunTests<clblast::TestXcol2im<clblast::float2>, clblast::float2, clblast::float2>(argc, argv, true, "CCOL2IM");
errors += clblast::RunTests<clblast::TestXcol2im<clblast::double2>, clblast::double2, clblast::double2>(argc, argv, true, "ZCOL2IM");
errors += clblast::RunTests<clblast::TestXcol2im<clblast::half>, clblast::half, clblast::half>(argc, argv, true, "HCOL2IM");
if (errors > 0) { return 1; } else { return 0; }
}
// =================================================================================================

View File

@ -0,0 +1,33 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// =================================================================================================
#include "test/performance/client.hpp"
#include "test/routines/levelx/xcol2im.hpp"
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) {
case clblast::Precision::kHalf:
clblast::RunClient<clblast::TestXcol2im<clblast::half>, clblast::half, clblast::half>(argc, argv); break;
case clblast::Precision::kSingle:
clblast::RunClient<clblast::TestXcol2im<float>, float, float>(argc, argv); break;
case clblast::Precision::kDouble:
clblast::RunClient<clblast::TestXcol2im<double>, double, double>(argc, argv); break;
case clblast::Precision::kComplexSingle:
clblast::RunClient<clblast::TestXcol2im<clblast::float2>, clblast::float2, clblast::float2>(argc, argv); break;
case clblast::Precision::kComplexDouble:
clblast::RunClient<clblast::TestXcol2im<clblast::double2>, clblast::double2, clblast::double2>(argc, argv); break;
}
return 0;
}
// =================================================================================================

View File

@ -0,0 +1,195 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements a class with static methods to describe the Xcol2im routine. Examples of
// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
// static methods are used by the correctness tester and the performance tester.
//
// =================================================================================================
#ifndef CLBLAST_TEST_ROUTINES_XCOL2IM_H_
#define CLBLAST_TEST_ROUTINES_XCOL2IM_H_
#include "test/routines/common.hpp"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class TestXcol2im {
public:
// The BLAS level: 4 for the extra routines
static size_t BLASLevel() { return 4; }
// The list of arguments relevant for this routine
static std::vector<std::string> GetOptions() {
return {kArgChannels, kArgHeight, kArgWidth, kArgKernelH, kArgKernelW, kArgPadH, kArgPadW,
kArgStrideH, kArgStrideW, kArgDilationH, kArgDilationW,
kArgAOffset, kArgBOffset};
}
static std::vector<std::string> BuffersIn() { return {kBufMatA, kBufMatB}; } // b = col
static std::vector<std::string> BuffersOut() { return {kBufMatA}; } // a = im
// Describes how to obtain the sizes of the buffers
static size_t ColHeight(const Arguments<T> &args) {
const auto size = args.height + 2 * args.pad_h;
const auto padding = args.dilation_h * (args.kernel_h - 1) + 1;
if (size >= padding) { return (size - padding) / args.stride_h + 1; }
return 1;
}
static size_t ColWidth(const Arguments<T> &args) {
const auto size = args.width + 2 * args.pad_w;
const auto padding = args.dilation_w * (args.kernel_w - 1) + 1;
if (size >= padding) { return (size - padding) / args.stride_w + 1; }
return 1;
}
static size_t NumPatches(const Arguments<T> &args) {
return ColHeight(args) * ColWidth(args) * args.channels;
}
static size_t GetSizeA(const Arguments<T> &args) {
return args.height * args.width * args.channels + args.a_offset;
}
static size_t GetSizeB(const Arguments<T> &args) {
return args.kernel_w * args.kernel_h * NumPatches(args) + args.b_offset;
}
// Describes how to set the sizes of all the buffers
static void SetSizes(Arguments<T> &args, Queue&) {
args.a_size = GetSizeA(args); // im
args.b_size = GetSizeB(args); // col
}
// Describes what the default values of the leading dimensions of the matrices are
static size_t DefaultLDA(const Arguments<T> &) { return 1; } // N/A for this routine
static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine
static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
// Describes which transpose options are relevant for this routine
using Transposes = std::vector<Transpose>;
static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine
static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
// Describes how to prepare the input data
static void PrepareData(const Arguments<T>&, Queue&, const int, std::vector<T>&,
std::vector<T>&, std::vector<T>&, std::vector<T>&, std::vector<T>&,
std::vector<T>&, std::vector<T>&) {} // N/A for this routine
// Describes how to run the CLBlast routine
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
#ifdef OPENCL_API
auto queue_plain = queue();
auto event = cl_event{};
auto status = Col2im<T>(args.channels, args.height, args.width,
args.kernel_h, args.kernel_w,
args.pad_h, args.pad_w,
args.stride_h, args.stride_w,
args.dilation_h, args.dilation_w,
buffers.b_mat(), args.b_offset, // col
buffers.a_mat(), args.a_offset, // im
&queue_plain, &event);
if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
#elif CUDA_API
auto status = Col2im<T>(args.channels, args.height, args.width,
args.kernel_h, args.kernel_w,
args.pad_h, args.pad_w,
args.stride_h, args.stride_w,
args.dilation_h, args.dilation_w,
buffers.b_mat(), args.b_offset, // col
buffers.a_mat(), args.a_offset, // im
queue.GetContext()(), queue.GetDevice()());
cuStreamSynchronize(queue());
#endif
return status;
}
// Describes how to run a naive version of the routine (for correctness/performance comparison).
// Note that a proper clBLAS or CPU BLAS comparison is not available for non-BLAS routines.
static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto buffers_host = BuffersHost<T>();
DeviceToHost(args, buffers, buffers_host, queue, BuffersIn());
const auto status = RunReference(args, buffers_host);
HostToDevice(args, buffers, buffers_host, queue, BuffersOut());
return status;
}
static StatusCode RunReference2(const Arguments<T> &args, BuffersHost<T> &buffers_host, Queue&) {
return RunReference(args, buffers_host);
}
static StatusCode RunReference3(const Arguments<T> &, BuffersCUDA<T> &, Queue &) {
return StatusCode::kUnknownError;
}
// Describes how to download the results of the computation (more importantly: which buffer)
static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
std::vector<T> result(args.a_size, static_cast<T>(0));
buffers.a_mat.Read(queue, args.a_size, result);
return result;
}
// Describes how to compute the indices of the result buffer
static size_t ResultID1(const Arguments<T> &args) { return args.height * args.width; }
static size_t ResultID2(const Arguments<T> &args) { return args.channels; }
static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
return id1 + args.height * args.width * id2 + args.a_offset;
}
// Describes how to compute performance metrics
static size_t GetFlops(const Arguments<T> &) {
return 1;
}
static size_t GetBytes(const Arguments<T> &args) {
const auto im = args.channels * args.width * args.height; // possibly less with striding
const auto col = args.kernel_h * args.kernel_w * NumPatches(args);
return (im + col) * sizeof(T);
}
};
// =================================================================================================
template <typename T>
StatusCode RunReference(const Arguments<T> &args, BuffersHost<T> &buffers_host) {
// Reference taken from im2col but swapped the input/output
const auto col_h = TestXcol2im<T>::ColHeight(args);
const auto col_w = TestXcol2im<T>::ColWidth(args);
for (auto c_id = size_t{0}; c_id < args.channels; ++c_id) { // image channels
for (auto kh_id = size_t{0}; kh_id < args.kernel_h; ++kh_id) { // kernel height
for (auto kw_id = size_t{0}; kw_id < args.kernel_w; ++kw_id) { // kernel width
for (auto h_id = size_t{0}; h_id < col_h; ++h_id) { // image height
for (auto w_id = size_t{0}; w_id < col_w; ++w_id) { // image width
// Reads the input value
const auto kernel_index = kw_id + args.kernel_w * kh_id;
const auto patch_index = w_id + col_w * h_id;
const auto col_index = patch_index + kernel_index * col_w * col_h +
c_id * col_w * col_h * args.kernel_h * args.kernel_w;
const auto val = buffers_host.b_mat[col_index + args.b_offset];
// Sets the output value
const auto h_index = kh_id * args.dilation_h + args.stride_h * h_id - args.pad_h;
const auto w_index = kw_id * args.dilation_w + args.stride_w * w_id - args.pad_w;
if (h_index >= 0 && h_index < args.height &&
w_index >= 0 && w_index < args.width) {
const auto im_index = w_index + args.width * (h_index + args.height * c_id);
buffers_host.a_mat[im_index + args.a_offset] = val;
}
}
}
}
}
}
return StatusCode::kSuccess;
}
// =================================================================================================
} // namespace clblast
// CLBLAST_TEST_ROUTINES_XCOL2IM_H_
#endif