Changed temporary convgemm implementation to use batched-strided GEMM

2018-05-09 20:38:39 +02:00 · 2018-05-09 20:38:39 +02:00 · 4e6d30088d
parent b608280361
commit 4e6d30088d
1 changed files with 30 additions and 24 deletions
--- a/src/routines/levelx/xconvgemm.cpp
+++ b/src/routines/levelx/xconvgemm.cpp
@@ -13,7 +13,7 @@

 #include "routines/levelx/xconvgemm.hpp"
 #include "routines/levelx/xim2col.hpp"
-#include "routines/level3/xgemm.hpp"
+#include "routines/levelx/xgemmstridedbatched.hpp"

 #include <string>
 #include <vector>
@@ -54,45 +54,51 @@ void Xconvgemm<T>::DoConvgemm(const size_t channels, const size_t height, const
  const auto padding_w = dilation_w * (kernel_w - 1) + 1;
  const auto output_w = (size_w >= padding_w) ? (size_w - padding_w) / stride_w + 1 : 1;

-  // Temporary col matrix
+  // Sets other useful variables
  const auto patch_size = kernel_h * kernel_w * channels;
  const auto num_patches = output_h * output_w;
-  const auto col_size = patch_size * num_patches;
-  auto col_buffer = Buffer<T>(context_, col_size);

  // Approach: im2col + GEMM
  //      result = GEMM(im2col(image), kernel)
+
+  // Temporary col matrix
+  const auto col_size = patch_size * num_patches * batch_count;
+  auto col_buffer = Buffer<T>(context_, col_size);
+
+  // Loops over each batch
  for (auto batch_id = size_t{0}; batch_id < batch_count; ++batch_id) {

    // im2col
    const auto im_batch_offset = batch_id * channels * height * width + im_offset;
+    const auto col_batch_offset = batch_id * patch_size * num_patches;
    auto im2col_event = Event();
    auto im2col = Xim2col<T>(queue_, im2col_event.pointer());
    im2col.DoIm2col(channels, height, width, kernel_h, kernel_w,
                    pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
                    im_buffer, im_batch_offset,
-                    col_buffer, 0);
+                    col_buffer, col_batch_offset);
    im2col_event.WaitForCompletion();
-
-    // GEMM: C (result) = alpha (1) * A (col) * B (kernel) + beta (0) * C (result)
-    const auto m = num_patches;
-    const auto n = num_kernels;
-    const auto k = patch_size;
-    const auto col_gemm_offset = size_t{0}; // A
-    const auto kernel_gemm_offset = kernel_offset; // B
-    const auto result_gemm_offset = batch_id * num_kernels * output_h * output_w + result_offset; // C
-    const auto col_ld = m;
-    const auto kernel_ld = k;
-    const auto result_ld = m;
-    auto gemm_event = Event();
-    auto gemm = Xgemm<T>(queue_, gemm_event.pointer());
-    gemm.DoGemm(Layout::kColMajor, Transpose::kNo, Transpose::kNo,
-                m, n, k, ConstantOne<T>(),
-                col_buffer, col_gemm_offset, col_ld,
-                kernel_buffer, kernel_gemm_offset, kernel_ld, ConstantZero<T>(),
-                result_buffer, result_gemm_offset, result_ld);
-    gemm_event.WaitForCompletion();
  }
+
+  // GEMM: C (result) = alpha (1) * A (col) * B (kernel) + beta (0) * C (result)
+  const auto m = num_patches;
+  const auto n = num_kernels;
+  const auto k = patch_size;
+  const auto col_ld = m;
+  const auto kernel_ld = k;
+  const auto result_ld = m;
+  const auto col_stride = patch_size * num_patches;
+  const auto kernel_stride = size_t{0}; // applies the same kernel to all
+  const auto result_stride = num_kernels * output_h * output_w;
+  auto gemm_event = Event();
+  auto gemm = XgemmStridedBatched<T>(queue_, gemm_event.pointer());
+  gemm.DoGemmStridedBatched(Layout::kColMajor, Transpose::kNo, Transpose::kNo,
+                            m, n, k, ConstantOne<T>(),
+                            col_buffer, 0, col_ld, col_stride,
+                            kernel_buffer, kernel_offset, kernel_ld, kernel_stride, ConstantZero<T>(),
+                            result_buffer, result_offset, result_ld, result_stride,
+                            batch_count);
+  gemm_event.WaitForCompletion();
 }

 // =================================================================================================