Merge pull request #309 from CNugteren/CLBlast-306-omatcopy-conjugate
Fixes bug in conjugate transpose not being executed (pull/310/head)
commit
2bea758165
|
@ -2,6 +2,7 @@
|
|||
Development (next version)
|
||||
- Added support for shuffle instructions for NVIDIA GPUs (thanks to 'tyler-utah')
|
||||
- The tuners now check beforehand on invalid local thread sizes and skip those completely
|
||||
- Fixed an issue with conjugate transpose not being executed in certain cases for a.o. XOMATCOPY
|
||||
- Fixed an issue with AMD GPUs and the new GEMMK == 1 kernel
|
||||
- Various minor fixes and enhancements
|
||||
|
||||
|
|
|
@ -76,6 +76,7 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device,
|
|||
|
||||
// Determines the right kernel
|
||||
auto kernel_name = std::string{};
|
||||
auto pad_kernel = false;
|
||||
if (do_transpose) {
|
||||
if (use_fast_kernel &&
|
||||
IsMultiple(src_ld, db["TRA_WPT"]) &&
|
||||
|
@ -85,7 +86,8 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device,
|
|||
}
|
||||
else {
|
||||
use_fast_kernel = false;
|
||||
kernel_name = (do_pad) ? "TransposePadMatrix" : "TransposeMatrix";
|
||||
pad_kernel = (do_pad || do_conjugate);
|
||||
kernel_name = (pad_kernel) ? "TransposePadMatrix" : "TransposeMatrix";
|
||||
}
|
||||
}
|
||||
else {
|
||||
|
@ -97,7 +99,8 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device,
|
|||
}
|
||||
else {
|
||||
use_fast_kernel = false;
|
||||
kernel_name = (do_pad) ? "CopyPadMatrix" : "CopyMatrix";
|
||||
pad_kernel = do_pad;
|
||||
kernel_name = (pad_kernel) ? "CopyPadMatrix" : "CopyMatrix";
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -123,7 +126,7 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device,
|
|||
kernel.SetArgument(8, static_cast<int>(dest_offset));
|
||||
kernel.SetArgument(9, dest());
|
||||
kernel.SetArgument(10, GetRealArg(alpha));
|
||||
if (do_pad) {
|
||||
if (pad_kernel) {
|
||||
kernel.SetArgument(11, static_cast<int>(do_conjugate));
|
||||
}
|
||||
else {
|
||||
|
|
|
@ -239,7 +239,7 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
|
|||
}
|
||||
|
||||
// Tests the error count (should be zero)
|
||||
TestErrorCount(errors, get_id1_(args)*get_id2_(args), args);
|
||||
TestErrorCount(errors, get_id1_(args)*get_id2_(args) + kCanarySize, args);
|
||||
}
|
||||
TestEnd();
|
||||
}
|
||||
|
|
|
@ -45,7 +45,9 @@ StatusCode RunReference(const Arguments<T> &args, BuffersHost<T> &buffers_host)
|
|||
const auto b_two = (b_rotated) ? id1 : id2;
|
||||
const auto a_index = a_two * args.a_ld + a_one + args.a_offset;
|
||||
const auto b_index = b_two * args.b_ld + b_one + args.b_offset;
|
||||
buffers_host.b_mat[b_index] = args.alpha * buffers_host.a_mat[a_index];
|
||||
auto a_value = buffers_host.a_mat[a_index];
|
||||
if (args.a_transpose == Transpose::kConjugate) { a_value = ComplexConjugate(a_value); }
|
||||
buffers_host.b_mat[b_index] = args.alpha * a_value;
|
||||
}
|
||||
}
|
||||
return StatusCode::kSuccess;
|
||||
|
|
|
@ -31,6 +31,16 @@ template <> bool IsCloseToZero(const double2 value) { return IsCloseToZero(value
|
|||
|
||||
// =================================================================================================
|
||||
|
||||
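// Returns the complex conjugate of a value: the generic version returns its input unchanged, while the float2/double2 specializations negate the imaginary part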
|
||||
template <typename T> T ComplexConjugate(const T value) { return value; }
|
||||
template half ComplexConjugate(const half);
|
||||
template float ComplexConjugate(const float);
|
||||
template double ComplexConjugate(const double);
|
||||
template <> float2 ComplexConjugate(const float2 value) { return float2{value.real(), -value.imag()}; }
|
||||
template <> double2 ComplexConjugate(const double2 value) { return double2{value.real(), -value.imag()}; }
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
template <typename T, typename U>
|
||||
void DeviceToHost(const Arguments<U> &args, Buffers<T> &buffers, BuffersHost<T> &buffers_host,
|
||||
Queue &queue, const std::vector<std::string> &names) {
|
||||
|
|
|
@ -70,6 +70,10 @@ struct BuffersHost {
|
|||
|
||||
// =================================================================================================
|
||||
|
||||
template <typename T> T ComplexConjugate(const T value);
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Converts a value (e.g. an integer) to a string. This also covers special cases for CLBlast
|
||||
// data-types such as the Layout and Transpose data-types.
|
||||
template <typename T>
|
||||
|
|
Loading…
Reference in New Issue