Added a condition to update only lower/upper triangular parts in the un-pad kernels

pull/8/head
CNugteren 2015-06-23 08:09:07 +02:00
parent 4c2a166bc5
commit 20eb3506d6
5 changed files with 38 additions and 15 deletions

View File

@ -92,7 +92,8 @@ class Routine {
const size_t dest_ld, const size_t dest_offset,
const Buffer &dest,
const bool do_transpose, const bool do_conjugate,
const bool pad, const Program &program);
const bool pad, const bool upper, const bool lower,
const Program &program);
// Queries the cache and retrieve either a matching program or a boolean whether a match exists.
// The first assumes that the program is available in the cache and will throw an exception

View File

@ -86,7 +86,8 @@ __kernel void UnPadMatrix(const int src_one, const int src_two,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest) {
__global real* dest,
const int upper, const int lower) {
// Loops over the work per thread in both dimensions
#pragma unroll
@ -95,11 +96,18 @@ __kernel void UnPadMatrix(const int src_one, const int src_two,
#pragma unroll
for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
if (id_two < dest_two && id_one < dest_one) {
// Masking in case of triangular matrices: updates only the upper or lower part
bool condition = true;
if (upper == 1) { condition = (id_two >= id_one); }
else if (lower == 1) { condition = (id_two <= id_one); }
if (condition) {
// Copies the value into the destination matrix. This is always within bounds of the source
// matrix, as we know that the destination matrix is smaller than the source.
dest[id_two*dest_ld + id_one + dest_offset] = src[id_two*src_ld + id_one + src_offset];
if (id_two < dest_two && id_one < dest_one) {
dest[id_two*dest_ld + id_one + dest_offset] = src[id_two*src_ld + id_one + src_offset];
}
}
}
}

View File

@ -100,7 +100,8 @@ __kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest) {
__global real* dest,
const int upper, const int lower) {
// Local memory to store a tile of the matrix (for coalescing)
__local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
@ -137,10 +138,17 @@ __kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
const int id_dest_one = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(0);
const int id_dest_two = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(1);
// Stores the transposed value in the destination matrix
if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
// Masking in case of triangular matrices: updates only the upper or lower part
bool condition = true;
if (upper == 1) { condition = (id_dest_one >= id_dest_two); }
else if (lower == 1) { condition = (id_dest_one <= id_dest_two); }
if (condition) {
// Stores the transposed value in the destination matrix
if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
}
}
}
}

View File

@ -210,11 +210,13 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
const size_t dest_ld, const size_t dest_offset,
const Buffer &dest,
const bool do_transpose, const bool do_conjugate,
const bool pad, const Program &program) {
const bool pad, const bool upper, const bool lower,
const Program &program) {
// Determines whether or not the fast-version could potentially be used
auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) &&
(src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld);
(src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld) &&
(upper == false) && (lower == false);
// Determines the right kernel
auto kernel_name = std::string{};
@ -267,6 +269,10 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
if (pad) {
kernel.SetArgument(10, static_cast<int>(do_conjugate));
}
else {
kernel.SetArgument(10, static_cast<int>(upper));
kernel.SetArgument(11, static_cast<int>(lower));
}
}
// Launches the kernel and returns the error code. Uses global and local thread sizes based on

View File

@ -108,18 +108,18 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
// them up until they reach a certain multiple of size (kernel parameter dependent).
status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
m_ceiled, k_ceiled, m_ceiled, 0, temp_a,
a_do_transpose, a_conjugate, true, program);
a_do_transpose, a_conjugate, true, false, false, program);
if (ErrorIn(status)) { return status; }
status = PadCopyTransposeMatrix(b_one, b_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, temp_b,
b_do_transpose, b_conjugate, true, program);
b_do_transpose, b_conjugate, true, false, false, program);
if (ErrorIn(status)) { return status; }
// Only necessary for matrix C if it used both as input and output
if (beta != static_cast<T>(0)) {
status = PadCopyTransposeMatrix(c_one, c_two, c_ld, c_offset, c_buffer,
m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
c_do_transpose, false, true, program);
c_do_transpose, false, true, false, false, program);
if (ErrorIn(status)) { return status; }
}
@ -151,7 +151,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
// Runs the post-processing kernel
status = PadCopyTransposeMatrix(m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
c_one, c_two, c_ld, c_offset, c_buffer,
c_do_transpose, false, false, program);
c_do_transpose, false, false, false, false, program);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation