Added a condition to update only lower/upper triangular parts in the un-pad kernels

2015-06-23 08:09:07 +02:00 · 2015-06-23 08:09:07 +02:00 · 20eb3506d6
parent 4c2a166bc5
commit 20eb3506d6
5 changed files with 38 additions and 15 deletions
--- a/include/internal/routine.h
+++ b/include/internal/routine.h
@ -92,7 +92,8 @@ class Routine {
                                    const size_t dest_ld, const size_t dest_offset,
                                    const Buffer &dest,
                                    const bool do_transpose, const bool do_conjugate,
-                                    const bool pad, const Program &program);
+                                    const bool pad, const bool upper, const bool lower,
+                                    const Program &program);
  
  // Queries the cache and retrieve either a matching program or a boolean whether a match exists.
  // The first assumes that the program is available in the cache and will throw an exception
--- a/src/kernels/pad.opencl
+++ b/src/kernels/pad.opencl
@ -86,7 +86,8 @@ __kernel void UnPadMatrix(const int src_one, const int src_two,
                          __global const real* restrict src,
                          const int dest_one, const int dest_two,
                          const int dest_ld, const int dest_offset,
-                          __global real* dest) {
+                          __global real* dest,
+                          const int upper, const int lower) {

  // Loops over the work per thread in both dimensions
  #pragma unroll
@ -95,11 +96,18 @@ __kernel void UnPadMatrix(const int src_one, const int src_two,
    #pragma unroll
    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
-      if (id_two < dest_two && id_one < dest_one) {
+
+      // Masking in case of triangular matrices: updates only the upper or lower part
+      bool condition = true;
+      if (upper == 1) { condition = (id_two >= id_one); }
+      else if (lower == 1) { condition = (id_two <= id_one); }
+      if (condition) {

        // Copies the value into the destination matrix. This is always within bounds of the source
        // matrix, as we know that the destination matrix is smaller than the source.
-        dest[id_two*dest_ld + id_one + dest_offset] = src[id_two*src_ld + id_one + src_offset];
+        if (id_two < dest_two && id_one < dest_one) {
+          dest[id_two*dest_ld + id_one + dest_offset] = src[id_two*src_ld + id_one + src_offset];
+        }
      }
    }
  }
--- a/src/kernels/padtranspose.opencl
+++ b/src/kernels/padtranspose.opencl
@ -100,7 +100,8 @@ __kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
                                   __global const real* restrict src,
                                   const int dest_one, const int dest_two,
                                   const int dest_ld, const int dest_offset,
-                                   __global real* dest) {
+                                   __global real* dest,
+                                   const int upper, const int lower) {

  // Local memory to store a tile of the matrix (for coalescing)
  __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
@ -137,10 +138,17 @@ __kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
      const int id_dest_one = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(0);
      const int id_dest_two = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(1);

-      // Stores the transposed value in the destination matrix
-      if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
-        real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
-        dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
+      // Masking in case of triangular matrices: updates only the upper or lower part
+      bool condition = true;
+      if (upper == 1) { condition = (id_dest_one >= id_dest_two); }
+      else if (lower == 1) { condition = (id_dest_one <= id_dest_two); }
+      if (condition) {
+
+        // Stores the transposed value in the destination matrix
+        if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
+          real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
+          dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
+        }
      }
    }
  }
--- a/src/routine.cc
+++ b/src/routine.cc
@ -210,11 +210,13 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
                                           const size_t dest_ld, const size_t dest_offset,
                                           const Buffer &dest,
                                           const bool do_transpose, const bool do_conjugate,
-                                           const bool pad, const Program &program) {
+                                           const bool pad, const bool upper, const bool lower,
+                                           const Program &program) {

  // Determines whether or not the fast-version could potentially be used
  auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) &&
-                         (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld);
+                         (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld) &&
+                         (upper == false) && (lower == false);

  // Determines the right kernel
  auto kernel_name = std::string{};
@ -267,6 +269,10 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
      if (pad) {
        kernel.SetArgument(10, static_cast<int>(do_conjugate));
      }
+      else {
+        kernel.SetArgument(10, static_cast<int>(upper));
+        kernel.SetArgument(11, static_cast<int>(lower));
+      }
    }

    // Launches the kernel and returns the error code. Uses global and local thread sizes based on
--- a/src/routines/xgemm.cc
+++ b/src/routines/xgemm.cc
@ -108,18 +108,18 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
    // them up until they reach a certain multiple of size (kernel parameter dependent).
    status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
                                    m_ceiled, k_ceiled, m_ceiled, 0, temp_a,
-                                    a_do_transpose, a_conjugate, true, program);
+                                    a_do_transpose, a_conjugate, true, false, false, program);
    if (ErrorIn(status)) { return status; }
    status = PadCopyTransposeMatrix(b_one, b_two, b_ld, b_offset, b_buffer,
                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_b,
-                                    b_do_transpose, b_conjugate, true, program);
+                                    b_do_transpose, b_conjugate, true, false, false, program);
    if (ErrorIn(status)) { return status; }

    // Only necessary for matrix C if it used both as input and output
    if (beta != static_cast<T>(0)) {
      status = PadCopyTransposeMatrix(c_one, c_two, c_ld, c_offset, c_buffer,
                                      m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
-                                      c_do_transpose, false, true, program);
+                                      c_do_transpose, false, true, false, false, program);
      if (ErrorIn(status)) { return status; }
    }

@ -151,7 +151,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
      // Runs the post-processing kernel
      status = PadCopyTransposeMatrix(m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
                                      c_one, c_two, c_ld, c_offset, c_buffer,
-                                      c_do_transpose, false, false, program);
+                                      c_do_transpose, false, false, false, false, program);
      if (ErrorIn(status)) { return status; }

      // Successfully finished the computation