Fixed an issue for unequal MWG and NWG and the new GEMMK == 1 kernel

2018-11-30 20:23:26 +01:00 · 2018-11-30 20:23:26 +01:00 · c0e41b87cb
parent bca1506e87
commit c0e41b87cb
2 changed files with 5 additions and 2 deletions
--- a/1
+++ b/1
@@ -7,6 +7,7 @@ Development (next version)
 - Fixed an issue with conjugate transpose not being executed in certain cases for a.o. XOMATCOPY
 - Fixed an issue with AMD GPUs and the new GEMMK == 1 kernel
 - Fixed an issue with the preprocessor and the new GEMMK == 1 kernel
+- Fixed an issue for unequal MWG and NWG and the new GEMMK == 1 kernel
 - Fixed an issue for certain parameters for AXPY's 'XaxpyFaster' kernel
 - Various minor fixes and enhancements
 - Added non-BLAS routines:
--- a/src/routines/level3/xgemm.cpp
+++ b/src/routines/level3/xgemm.cpp
@@ -216,9 +216,11 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
  kernel.SetArgument(9, static_cast<int>(c_temp_offset / db_["VWM"]));

  // Computes the global and local thread sizes
+  const auto global_divider_one = c_want_rotated_(db_["GEMMK"]) ? db_["NWG"] : db_["MWG"];
+  const auto global_divider_two = c_want_rotated_(db_["GEMMK"]) ? db_["MWG"] : db_["NWG"];
  const auto global = std::vector<size_t>{
-    (c_one_i * db_["MDIMC"]) / db_["MWG"],
-    (c_two_i * db_["NDIMC"]) / db_["NWG"]
+    (c_one_i * db_["MDIMC"]) / global_divider_one,
+    (c_two_i * db_["NDIMC"]) / global_divider_two
  };
  const auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};