Added xCONVGEMM as im2col plus a batched GEMM kernel
parent
2dd539f911
commit
c788e040f7
|
@ -3,6 +3,8 @@ Development (next version)
|
|||
- Added support for shuffle instructions for NVIDIA GPUs (thanks to 'tyler-utah')
|
||||
- Fixed an issue with AMD GPUs and the new GEMMK == 1 kernel
|
||||
- Various minor fixes and enhancements
|
||||
- Added non-BLAS routines:
|
||||
* SCONVGEMM/DCONVGEMM/HCONVGEMM (convolution as im2col followed by batched GEMM)
|
||||
|
||||
Version 1.4.1
|
||||
- Fixed an access violation under Windows upon releasing the OpenCL program when the driver is already unloaded
|
||||
|
|
|
@ -93,8 +93,9 @@ In addition, some extra non-BLAS routines are also supported by CLBlast, classif
|
|||
| xHAD | ✔ | ✔ | ✔ | ✔ | ✔ | (Hadamard product)
|
||||
| xOMATCOPY | ✔ | ✔ | ✔ | ✔ | ✔ | (Out-of-place copying/transposing/scaling of matrices)
|
||||
| xIM2COL | ✔ | ✔ | ✔ | ✔ | ✔ | (Image to column transform as used to express convolution as GEMM)
|
||||
| xCONVGEMM | ✔ | ✔ | - | - | ✔ | (Experimental, implemented as im2col followed by batched GEMM)
|
||||
|
||||
Some less commonly used BLAS routines are not yet supported yet by CLBlast. They are xROTG, xROTMG, xROT, xROTM, xTBSV, and xTPSV.
|
||||
Some less commonly used BLAS routines are not yet supported by CLBlast. They are xROTG, xROTMG, xROT, xROTM, xTBSV, and xTPSV.
|
||||
|
||||
|
||||
Half precision (fp16)
|
||||
|
|
|
@ -11,6 +11,7 @@
|
|||
// uses parameters from the direct GEMM kernel. This is the part with the loads from memory (1/2).
|
||||
// This uses "CONVGEMM_WITH_IM2COL" as a switch to select between direct convgemm and first running
|
||||
// the im2col kernel to create a 'col' temporary matrix.
|
||||
// TODO: Currently only works with 'CONVGEMM_WITH_IM2COL' set
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
|
|
|
@ -11,6 +11,7 @@
|
|||
// uses parameters from the direct GEMM kernel. This part contains the main kernel (2/2).
|
||||
// This uses "CONVGEMM_WITH_IM2COL" as a switch to select between direct convgemm and first running
|
||||
// the im2col kernel to create a 'col' temporary matrix.
|
||||
// TODO: Currently only works with 'CONVGEMM_WITH_IM2COL' set
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
|
|
|
@ -13,6 +13,7 @@
|
|||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <assert.h>
|
||||
|
||||
#include "routines/levelx/xconvgemm.hpp"
|
||||
#include "routines/levelx/xim2col.hpp"
|
||||
|
@ -51,6 +52,9 @@ void Xconvgemm<T>::DoConvgemm(const size_t channels, const size_t height, const
|
|||
const Buffer<T> &kernel_buffer, const size_t kernel_offset,
|
||||
const Buffer<T> &result_buffer, const size_t result_offset) {
|
||||
|
||||
// TODO: Implement single-kernel approach
|
||||
assert(method_ == ConvGemmMethod::kWithIm2Col);
|
||||
|
||||
// Tests for a valid batch count
|
||||
if (batch_count == 0) {
|
||||
throw BLASError(StatusCode::kInvalidBatchCount);
|
||||
|
|
|
@ -29,7 +29,7 @@ class Xconvgemm: public Routine {
|
|||
// Constructor
|
||||
// Selects how the convolution is carried out by this routine
enum class ConvGemmMethod {
  kWithIm2Col,   // first run the im2col kernel to create a 'col' temporary matrix, then a batched GEMM
  kSingleKernel  // direct convgemm in a single kernel
};
|
||||
Xconvgemm(Queue &queue, EventPointer event, const std::string &name = "CONVGEMM",
|
||||
const ConvGemmMethod method = ConvGemmMethod::kSingleKernel);
|
||||
const ConvGemmMethod method = ConvGemmMethod::kWithIm2Col);
|
||||
|
||||
// Templated-precision implementation of the routine
|
||||
void DoConvgemm(const size_t channels, const size_t height, const size_t width,
|
||||
|
|
Loading…
Reference in New Issue