Added xCONVGEMM as im2col plus a batched GEMM kernel

2018-09-07 22:02:44 +02:00 · 2018-09-07 22:02:44 +02:00 · c788e040f7
parent 2dd539f911
commit c788e040f7
6 changed files with 11 additions and 2 deletions
--- a/2
+++ b/2
@ -3,6 +3,8 @@ Development (next version)
 - Added support for shuffle instructions for NVIDIA GPUs (thanks to 'tyler-utah')
 - Fixed an issue with AMD GPUs and the new GEMMK == 1 kernel
 - Various minor fixes and enhancements
+- Added non-BLAS routines:
+  * SCONVGEMM/DCONVGEMM/HCONVGEMM (convolution as im2col followed by batched GEMM)

 Version 1.4.1
 - Fixed an access violation under Windows upon releasing the OpenCL program when the driver is already unloaded
--- a/doc/routines.md
+++ b/doc/routines.md
@ -93,8 +93,9 @@ In addition, some extra non-BLAS routines are also supported by CLBlast, classif
 | xHAD       | ✔ | ✔ | ✔ | ✔ | ✔ | (Hadamard product)
 | xOMATCOPY  | ✔ | ✔ | ✔ | ✔ | ✔ | (Out-of-place copying/transposing/scaling of matrices)
 | xIM2COL    | ✔ | ✔ | ✔ | ✔ | ✔ | (Image to column transform as used to express convolution as GEMM)
+| xCONVGEMM  | ✔ | ✔ | - | - | ✔ | (Experimental, implemented as im2col followed by batched GEMM)

-Some less commonly used BLAS routines are not yet supported yet by CLBlast. They are xROTG, xROTMG, xROT, xROTM, xTBSV, and xTPSV.
+Some less commonly used BLAS routines are not yet supported by CLBlast. They are xROTG, xROTMG, xROT, xROTM, xTBSV, and xTPSV.


 Half precision (fp16)
--- a/src/kernels/levelx/xconvgemm_part1.opencl
+++ b/src/kernels/levelx/xconvgemm_part1.opencl
@ -11,6 +11,7 @@
 // uses parameters from the direct GEMM kernel. This is the part with the loads from memory (1/2).
 // This uses "CONVGEMM_WITH_IM2COL" as a switch to select between direct convgemm or first running
 // the im2col kernel to create a 'col' temporary matrix.
+// TODO: Currently only works with 'CONVGEMM_WITH_IM2COL' set; direct convgemm is not yet functional
 //
 // =================================================================================================

--- a/src/kernels/levelx/xconvgemm_part2.opencl
+++ b/src/kernels/levelx/xconvgemm_part2.opencl
@ -11,6 +11,7 @@
 // uses parameters from the direct GEMM kernel. This part contains the main kernel (2/2).
 // This uses "CONVGEMM_WITH_IM2COL" as a switch to select between direct convgemm or first running
 // the im2col kernel to create a 'col' temporary matrix.
+// TODO: Currently only works with 'CONVGEMM_WITH_IM2COL' set; direct convgemm is not yet functional
 //
 // =================================================================================================

--- a/src/routines/levelx/xconvgemm.cpp
+++ b/src/routines/levelx/xconvgemm.cpp
@ -13,6 +13,7 @@

 #include <string>
 #include <vector>
+#include <assert.h>

 #include "routines/levelx/xconvgemm.hpp"
 #include "routines/levelx/xim2col.hpp"
@ -51,6 +52,9 @@ void Xconvgemm<T>::DoConvgemm(const size_t channels, const size_t height, const
                              const Buffer<T> &kernel_buffer, const size_t kernel_offset,
                              const Buffer<T> &result_buffer, const size_t result_offset) {

+  // TODO: Implement single-kernel approach (until then only the im2col method is supported)
+  assert(method_ == ConvGemmMethod::kWithIm2Col);
+
  // Tests for a valid batch count
  if (batch_count == 0) {
    throw BLASError(StatusCode::kInvalidBatchCount);
--- a/src/routines/levelx/xconvgemm.hpp
+++ b/src/routines/levelx/xconvgemm.hpp
@ -29,7 +29,7 @@ class Xconvgemm: public Routine {
  // Constructor
  enum class ConvGemmMethod {kWithIm2Col, kSingleKernel};
  Xconvgemm(Queue &queue, EventPointer event, const std::string &name = "CONVGEMM",
-            const ConvGemmMethod method = ConvGemmMethod::kSingleKernel);
+            const ConvGemmMethod method = ConvGemmMethod::kWithIm2Col);

  // Templated-precision implementation of the routine
  void DoConvgemm(const size_t channels, const size_t height, const size_t width,