Merge pull request #309 from CNugteren/CLBlast-306-omatcopy-conjugate

Fixes bug in conjugate transpose not being executed
pull/310/head
Cedric Nugteren 2018-08-02 08:35:32 +02:00 committed by GitHub
commit 2bea758165
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 25 additions and 5 deletions

View File

@ -2,6 +2,7 @@
Development (next version)
- Added support for shuffle instructions for NVIDIA GPUs (thanks to 'tyler-utah')
- The tuners now check for invalid local thread sizes beforehand and skip those completely
- Fixed an issue with conjugate transpose not being executed in certain cases for, among others, XOMATCOPY
- Fixed an issue with AMD GPUs and the new GEMMK == 1 kernel
- Various minor fixes and enhancements

View File

@ -76,6 +76,7 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device,
// Determines the right kernel
auto kernel_name = std::string{};
auto pad_kernel = false;
if (do_transpose) {
if (use_fast_kernel &&
IsMultiple(src_ld, db["TRA_WPT"]) &&
@ -85,7 +86,8 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device,
}
else {
use_fast_kernel = false;
kernel_name = (do_pad) ? "TransposePadMatrix" : "TransposeMatrix";
pad_kernel = (do_pad || do_conjugate);
kernel_name = (pad_kernel) ? "TransposePadMatrix" : "TransposeMatrix";
}
}
else {
@ -97,7 +99,8 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device,
}
else {
use_fast_kernel = false;
kernel_name = (do_pad) ? "CopyPadMatrix" : "CopyMatrix";
pad_kernel = do_pad;
kernel_name = (pad_kernel) ? "CopyPadMatrix" : "CopyMatrix";
}
}
@ -123,7 +126,7 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device,
kernel.SetArgument(8, static_cast<int>(dest_offset));
kernel.SetArgument(9, dest());
kernel.SetArgument(10, GetRealArg(alpha));
if (do_pad) {
if (pad_kernel) {
kernel.SetArgument(11, static_cast<int>(do_conjugate));
}
else {

View File

@ -239,7 +239,7 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
}
// Tests the error count (should be zero)
TestErrorCount(errors, get_id1_(args)*get_id2_(args), args);
TestErrorCount(errors, get_id1_(args)*get_id2_(args) + kCanarySize, args);
}
TestEnd();
}

View File

@ -45,7 +45,9 @@ StatusCode RunReference(const Arguments<T> &args, BuffersHost<T> &buffers_host)
const auto b_two = (b_rotated) ? id1 : id2;
const auto a_index = a_two * args.a_ld + a_one + args.a_offset;
const auto b_index = b_two * args.b_ld + b_one + args.b_offset;
buffers_host.b_mat[b_index] = args.alpha * buffers_host.a_mat[a_index];
auto a_value = buffers_host.a_mat[a_index];
if (args.a_transpose == Transpose::kConjugate) { a_value = ComplexConjugate(a_value); }
buffers_host.b_mat[b_index] = args.alpha * a_value;
}
}
return StatusCode::kSuccess;

View File

@ -31,6 +31,16 @@ template <> bool IsCloseToZero(const double2 value) { return IsCloseToZero(value
// =================================================================================================
// Computes the complex conjugate of a value. The generic version hands its argument back as-is;
// only the complex specializations below change anything.
template <typename T>
T ComplexConjugate(const T value) {
  return value;
}

// Explicit instantiations of the generic (pass-through) version
template half ComplexConjugate<half>(const half);
template float ComplexConjugate<float>(const float);
template double ComplexConjugate<double>(const double);

// Complex single-precision: keeps the real part and negates the imaginary part
template <>
float2 ComplexConjugate<float2>(const float2 value) {
  const auto real_part = value.real();
  const auto imag_part = value.imag();
  return float2{real_part, -imag_part};
}

// Complex double-precision: keeps the real part and negates the imaginary part
template <>
double2 ComplexConjugate<double2>(const double2 value) {
  const auto real_part = value.real();
  const auto imag_part = value.imag();
  return double2{real_part, -imag_part};
}
// =================================================================================================
template <typename T, typename U>
void DeviceToHost(const Arguments<U> &args, Buffers<T> &buffers, BuffersHost<T> &buffers_host,
Queue &queue, const std::vector<std::string> &names) {

View File

@ -70,6 +70,10 @@ struct BuffersHost {
// =================================================================================================
// Computes the complex conjugate of 'value' and returns it as the same type T. This is a
// declaration only; the definition is provided elsewhere.
template <typename T> T ComplexConjugate(const T value);
// =================================================================================================
// Converts a value (e.g. an integer) to a string. This also covers special cases for CLBlast
// data-types such as the Layout and Transpose data-types.
template <typename T>