Merge pull request #304 from CNugteren/CLBlast-300-fix-staggered-indices-AMD-GEMMK1
Fix staggered indices on AMD GPUs for GEMMK == 1 kernel (pull/305/head)
commit
dda1e567f8
|
@ -1,4 +1,9 @@
|
|||
|
||||
Development (next version)
|
||||
- Added support for shuffle instructions for NVIDIA GPUs (thanks to 'tyler-utah')
|
||||
- Fixed an issue with AMD GPUs and the new GEMMK == 1 kernel
|
||||
- Various minor fixes and enhancements
|
||||
|
||||
Version 1.4.1
|
||||
- Fixed an access violation under Windows upon releasing the OpenCL program when the driver is already unloaded
|
||||
- Fixed an issue with double cl_program release in the CLBlast caching system
|
||||
|
|
|
@ -260,7 +260,7 @@ R"(
|
|||
// Staggered/shuffled group indices to avoid partition camping (AMD GPUs). Formulas are taken from:
|
||||
// http://docs.nvidia.com/cuda/samples/6_Advanced/transpose/doc/MatrixTranspose.pdf
|
||||
// More details: https://github.com/CNugteren/CLBlast/issues/53
|
||||
#if USE_STAGGERED_INDICES == 1
|
||||
#if USE_STAGGERED_INDICES == 1 && GEMMK == 0
|
||||
// Returns a single flattened work-group index: the group ID in dimension 0 plus the group ID in
// dimension 1 multiplied by the number of work-groups in dimension 0.
INLINE_FUNC int GetGroupIDFlat() {
|
||||
return get_group_id(0) + get_num_groups(0) * get_group_id(1);
|
||||
}
|
||||
|
|
|
@ -91,8 +91,8 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
|
|||
#if GEMMK == 1
|
||||
const __global real* restrict a_ptr = (const __global real* restrict) &agm[0];
|
||||
const __global real* restrict b_ptr = (const __global real* restrict) &bgm[0];
|
||||
const int tid_x = get_global_id(0);
|
||||
const int tid_y = get_global_id(1);
|
||||
const int tid_x = get_local_id(0) + MDIMC * GetGroupID0();
|
||||
const int tid_y = get_local_id(1) + NDIMC * GetGroupID1();
|
||||
#endif
|
||||
|
||||
// Combined thread identifier (volatile to disable caching)
|
||||
|
|
|
@ -342,8 +342,17 @@ void Tuner(int argc, char* argv[], const int V,
|
|||
const auto best_time_ms = best_configuration->score;
|
||||
if (best_time_ms == 0.0) { return; }
|
||||
|
||||
// Also prints the performance of the best-case in terms of GB/s or GFLOPS
|
||||
// Computes and prints some other statistics
|
||||
auto average_ms = 0.0;
|
||||
for (const auto result : results) { average_ms += result.score; }
|
||||
average_ms /= results.size();
|
||||
printf("\n");
|
||||
printf("* Got average result of %.2lf ms", average_ms);
|
||||
printf(": %.1lf %s\n", settings.metric_amount / (average_ms * 1.0e6),
|
||||
settings.performance_unit.c_str());
|
||||
|
||||
|
||||
// Also prints the performance of the best-case in terms of GB/s or GFLOPS
|
||||
printf("* Found best result %.2lf ms", best_time_ms);
|
||||
printf(": %.1lf %s\n", settings.metric_amount / (best_time_ms * 1.0e6),
|
||||
settings.performance_unit.c_str());
|
||||
|
|
Loading…
Reference in New Issue