diff --git a/CHANGELOG b/CHANGELOG
index 4a17a47b..bcb76339 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -7,6 +7,7 @@ Development (next version)
 - Fixed an issue with conjugate transpose not being executed in certain cases for a.o. XOMATCOPY
 - Fixed an issue with AMD GPUs and the new GEMMK == 1 kernel
 - Fixed an issue with the preprocessor and the new GEMMK == 1 kernel
+- Fixed an issue for unequal MWG and NWG and the new GEMMK == 1 kernel
 - Fixed an issue for certain parameters for AXPY's 'XaxpyFaster' kernel
 - Various minor fixes and enhancements
 - Added non-BLAS routines:
diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp
index cb24460a..6daa0fcf 100644
--- a/src/routines/level3/xgemm.cpp
+++ b/src/routines/level3/xgemm.cpp
@@ -216,9 +216,11 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
   kernel.SetArgument(9, static_cast<int>(c_temp_offset / db_["VWM"]));
 
   // Computes the global and local thread sizes
+  const auto global_divider_one = c_want_rotated_(db_["GEMMK"]) ? db_["NWG"] : db_["MWG"];
+  const auto global_divider_two = c_want_rotated_(db_["GEMMK"]) ? db_["MWG"] : db_["NWG"];
   const auto global = std::vector<size_t>{
-    (c_one_i * db_["MDIMC"]) / db_["MWG"],
-    (c_two_i * db_["NDIMC"]) / db_["NWG"]
+    (c_one_i * db_["MDIMC"]) / global_divider_one,
+    (c_two_i * db_["NDIMC"]) / global_divider_two
   };
   const auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
 