Added a condition to update only lower/upper triangular parts in the un-pad kernels
parent
4c2a166bc5
commit
20eb3506d6
|
@ -92,7 +92,8 @@ class Routine {
|
|||
const size_t dest_ld, const size_t dest_offset,
|
||||
const Buffer &dest,
|
||||
const bool do_transpose, const bool do_conjugate,
|
||||
const bool pad, const Program &program);
|
||||
const bool pad, const bool upper, const bool lower,
|
||||
const Program &program);
|
||||
|
||||
// Queries the cache and retrieve either a matching program or a boolean whether a match exists.
|
||||
// The first assumes that the program is available in the cache and will throw an exception
|
||||
|
|
|
@ -86,7 +86,8 @@ __kernel void UnPadMatrix(const int src_one, const int src_two,
|
|||
__global const real* restrict src,
|
||||
const int dest_one, const int dest_two,
|
||||
const int dest_ld, const int dest_offset,
|
||||
__global real* dest) {
|
||||
__global real* dest,
|
||||
const int upper, const int lower) {
|
||||
|
||||
// Loops over the work per thread in both dimensions
|
||||
#pragma unroll
|
||||
|
@ -95,11 +96,18 @@ __kernel void UnPadMatrix(const int src_one, const int src_two,
|
|||
#pragma unroll
|
||||
for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
|
||||
const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
|
||||
if (id_two < dest_two && id_one < dest_one) {
|
||||
|
||||
// Masking in case of triangular matrices: updates only the upper or lower part
|
||||
bool condition = true;
|
||||
if (upper == 1) { condition = (id_two >= id_one); }
|
||||
else if (lower == 1) { condition = (id_two <= id_one); }
|
||||
if (condition) {
|
||||
|
||||
// Copies the value into the destination matrix. This is always within bounds of the source
|
||||
// matrix, as we know that the destination matrix is smaller than the source.
|
||||
dest[id_two*dest_ld + id_one + dest_offset] = src[id_two*src_ld + id_one + src_offset];
|
||||
if (id_two < dest_two && id_one < dest_one) {
|
||||
dest[id_two*dest_ld + id_one + dest_offset] = src[id_two*src_ld + id_one + src_offset];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -100,7 +100,8 @@ __kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
|
|||
__global const real* restrict src,
|
||||
const int dest_one, const int dest_two,
|
||||
const int dest_ld, const int dest_offset,
|
||||
__global real* dest) {
|
||||
__global real* dest,
|
||||
const int upper, const int lower) {
|
||||
|
||||
// Local memory to store a tile of the matrix (for coalescing)
|
||||
__local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
|
||||
|
@ -137,10 +138,17 @@ __kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
|
|||
const int id_dest_one = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(0);
|
||||
const int id_dest_two = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(1);
|
||||
|
||||
// Stores the transposed value in the destination matrix
|
||||
if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
|
||||
real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
|
||||
dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
|
||||
// Masking in case of triangular matrices: updates only the upper or lower part
|
||||
bool condition = true;
|
||||
if (upper == 1) { condition = (id_dest_one >= id_dest_two); }
|
||||
else if (lower == 1) { condition = (id_dest_one <= id_dest_two); }
|
||||
if (condition) {
|
||||
|
||||
// Stores the transposed value in the destination matrix
|
||||
if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
|
||||
real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
|
||||
dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -210,11 +210,13 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
|
|||
const size_t dest_ld, const size_t dest_offset,
|
||||
const Buffer &dest,
|
||||
const bool do_transpose, const bool do_conjugate,
|
||||
const bool pad, const Program &program) {
|
||||
const bool pad, const bool upper, const bool lower,
|
||||
const Program &program) {
|
||||
|
||||
// Determines whether or not the fast-version could potentially be used
|
||||
auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) &&
|
||||
(src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld);
|
||||
(src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld) &&
|
||||
(upper == false) && (lower == false);
|
||||
|
||||
// Determines the right kernel
|
||||
auto kernel_name = std::string{};
|
||||
|
@ -267,6 +269,10 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
|
|||
if (pad) {
|
||||
kernel.SetArgument(10, static_cast<int>(do_conjugate));
|
||||
}
|
||||
else {
|
||||
kernel.SetArgument(10, static_cast<int>(upper));
|
||||
kernel.SetArgument(11, static_cast<int>(lower));
|
||||
}
|
||||
}
|
||||
|
||||
// Launches the kernel and returns the error code. Uses global and local thread sizes based on
|
||||
|
|
|
@ -108,18 +108,18 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
|
|||
// them up until they reach a certain multiple of size (kernel parameter dependent).
|
||||
status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
|
||||
m_ceiled, k_ceiled, m_ceiled, 0, temp_a,
|
||||
a_do_transpose, a_conjugate, true, program);
|
||||
a_do_transpose, a_conjugate, true, false, false, program);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
status = PadCopyTransposeMatrix(b_one, b_two, b_ld, b_offset, b_buffer,
|
||||
n_ceiled, k_ceiled, n_ceiled, 0, temp_b,
|
||||
b_do_transpose, b_conjugate, true, program);
|
||||
b_do_transpose, b_conjugate, true, false, false, program);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Only necessary for matrix C if it used both as input and output
|
||||
if (beta != static_cast<T>(0)) {
|
||||
status = PadCopyTransposeMatrix(c_one, c_two, c_ld, c_offset, c_buffer,
|
||||
m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
|
||||
c_do_transpose, false, true, program);
|
||||
c_do_transpose, false, true, false, false, program);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
}
|
||||
|
||||
|
@ -151,7 +151,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
|
|||
// Runs the post-processing kernel
|
||||
status = PadCopyTransposeMatrix(m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
|
||||
c_one, c_two, c_ld, c_offset, c_buffer,
|
||||
c_do_transpose, false, false, program);
|
||||
c_do_transpose, false, false, false, false, program);
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Successfully finished the computation
|
||||
|
|
Loading…
Reference in New Issue