diff --git a/src/routines/levelx/xconvgemm.cpp b/src/routines/levelx/xconvgemm.cpp index 0e59b5be..d3b198a2 100644 --- a/src/routines/levelx/xconvgemm.cpp +++ b/src/routines/levelx/xconvgemm.cpp @@ -13,7 +13,7 @@ #include "routines/levelx/xconvgemm.hpp" #include "routines/levelx/xim2col.hpp" -#include "routines/level3/xgemm.hpp" +#include "routines/levelx/xgemmstridedbatched.hpp" #include #include @@ -54,45 +54,51 @@ void Xconvgemm::DoConvgemm(const size_t channels, const size_t height, const const auto padding_w = dilation_w * (kernel_w - 1) + 1; const auto output_w = (size_w >= padding_w) ? (size_w - padding_w) / stride_w + 1 : 1; - // Temporary col matrix + // Sets other useful variables const auto patch_size = kernel_h * kernel_w * channels; const auto num_patches = output_h * output_w; - const auto col_size = patch_size * num_patches; - auto col_buffer = Buffer(context_, col_size); // Approach: im2col + GEMM // result = GEMM(im2col(image), kernel) + + // Temporary col matrix + const auto col_size = patch_size * num_patches * batch_count; + auto col_buffer = Buffer(context_, col_size); + + // Loops over each batch for (auto batch_id = size_t{0}; batch_id < batch_count; ++batch_id) { // im2col const auto im_batch_offset = batch_id * channels * height * width + im_offset; + const auto col_batch_offset = batch_id * patch_size * num_patches; auto im2col_event = Event(); auto im2col = Xim2col(queue_, im2col_event.pointer()); im2col.DoIm2col(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, im_buffer, im_batch_offset, - col_buffer, 0); + col_buffer, col_batch_offset); im2col_event.WaitForCompletion(); - - // GEMM: C (result) = alpha (1) * A (col) * B (kernel) + beta (0) * C (result) - const auto m = num_patches; - const auto n = num_kernels; - const auto k = patch_size; - const auto col_gemm_offset = size_t{0}; // A - const auto kernel_gemm_offset = kernel_offset; // B - const auto result_gemm_offset = batch_id * num_kernels * output_h * output_w + result_offset; // C - const auto col_ld = m; - const auto kernel_ld = k; - const auto result_ld = m; - auto gemm_event = Event(); - auto gemm = Xgemm(queue_, gemm_event.pointer()); - gemm.DoGemm(Layout::kColMajor, Transpose::kNo, Transpose::kNo, - m, n, k, ConstantOne(), - col_buffer, col_gemm_offset, col_ld, - kernel_buffer, kernel_gemm_offset, kernel_ld, ConstantZero(), - result_buffer, result_gemm_offset, result_ld); - gemm_event.WaitForCompletion(); } + + // GEMM: C (result) = alpha (1) * A (col) * B (kernel) + beta (0) * C (result) + const auto m = num_patches; + const auto n = num_kernels; + const auto k = patch_size; + const auto col_ld = m; + const auto kernel_ld = k; + const auto result_ld = m; + const auto col_stride = patch_size * num_patches; + const auto kernel_stride = size_t{0}; // applies the same kernel to all + const auto result_stride = num_kernels * output_h * output_w; + auto gemm_event = Event(); + auto gemm = XgemmStridedBatched(queue_, gemm_event.pointer()); + gemm.DoGemmStridedBatched(Layout::kColMajor, Transpose::kNo, Transpose::kNo, + m, n, k, ConstantOne(), + col_buffer, 0, col_ld, col_stride, + kernel_buffer, kernel_offset, kernel_ld, kernel_stride, ConstantZero(), + result_buffer, result_offset, result_ld, result_stride, + batch_count); + gemm_event.WaitForCompletion(); } // =================================================================================================