Merge pull request #304 from CNugteren/CLBlast-300-fix-staggered-indices-AMD-GEMMK1

Fix staggered indices on AMD GPUs for GEMMK == 1 kernel
pull/305/head
Cedric Nugteren 2018-07-28 15:29:16 +02:00 committed by GitHub
commit dda1e567f8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 18 additions and 4 deletions

View File

@ -1,4 +1,9 @@
Development (next version)
- Added support for shuffle instructions for NVIDIA GPUs (thanks to 'tyler-utah')
- Fixed an issue with AMD GPUs and the new GEMMK == 1 kernel
- Various minor fixes and enhancements
Version 1.4.1
- Fixed an access violation under Windows upon releasing the OpenCL program when the driver is already unloaded
- Fixed an issue with double cl_program release in the CLBlast caching system

View File

@ -260,7 +260,7 @@ R"(
// Staggered/shuffled group indices to avoid partition camping (AMD GPUs). Formula's are taken from:
// http://docs.nvidia.com/cuda/samples/6_Advanced/transpose/doc/MatrixTranspose.pdf
// More details: https://github.com/CNugteren/CLBlast/issues/53
#if USE_STAGGERED_INDICES == 1
#if USE_STAGGERED_INDICES == 1 && GEMMK == 0
INLINE_FUNC int GetGroupIDFlat() {
return get_group_id(0) + get_num_groups(0) * get_group_id(1);
}

View File

@ -91,8 +91,8 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
#if GEMMK == 1
const __global real* restrict a_ptr = (const __global real* restrict) &agm[0];
const __global real* restrict b_ptr = (const __global real* restrict) &bgm[0];
const int tid_x = get_global_id(0);
const int tid_y = get_global_id(1);
const int tid_x = get_local_id(0) + MDIMC * GetGroupID0();
const int tid_y = get_local_id(1) + NDIMC * GetGroupID1();
#endif
// Combined thread identifier (volatile to disable caching)

View File

@ -342,8 +342,17 @@ void Tuner(int argc, char* argv[], const int V,
const auto best_time_ms = best_configuration->score;
if (best_time_ms == 0.0) { return; }
// Also prints the performance of the best-case in terms of GB/s or GFLOPS
// Computes and prints some other statistics
auto average_ms = 0.0;
for (const auto result : results) { average_ms += result.score; }
average_ms /= results.size();
printf("\n");
printf("* Got average result of %.2lf ms", average_ms);
printf(": %.1lf %s\n", settings.metric_amount / (average_ms * 1.0e6),
settings.performance_unit.c_str());
// Also prints the performance of the best-case in terms of GB/s or GFLOPS
printf("* Found best result %.2lf ms", best_time_ms);
printf(": %.1lf %s\n", settings.metric_amount / (best_time_ms * 1.0e6),
settings.performance_unit.c_str());