diff --git a/CHANGELOG b/CHANGELOG index 621fa9b9..5f3ef371 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,6 +4,7 @@ Development (next version) - Added CLBlast to Ubuntu PPA and macOS Homebrew package managers - Added an API to run the tuners programmatically without any I/O - Improved the performance potential by adding a second tunable GEMM kernel with 2D register tiling +- Added support for Intel specific subgroup shuffling extensions for faster GEMM on Intel GPUs - Re-added a local memory size constraint to the tuners - Updated and reorganised the CLBlast documentation - Fixed an access violation when compiled with Visual Studio upon releasing the OpenCL program diff --git a/ROADMAP.md b/ROADMAP.md index 4cb003eb..3be62501 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -17,8 +17,8 @@ This file gives an overview of the main features planned for addition to CLBlast | [#233](https://github.com/CNugteren/CLBlast/issues/233) | Feb '18 | CNugteren | ✔ | Add CLBlast to common package managers | | [#223](https://github.com/CNugteren/CLBlast/issues/223) | Feb '18 | CNugteren | ✔ | Python OpenCL interface | | [#237](https://github.com/CNugteren/CLBlast/issues/237) | Mar '18 | CNugteren | ✔ | Making tuning possible from the CLBlast API | -| [#228](https://github.com/CNugteren/CLBlast/issues/228) | Mar-Apr '18 | CNugteren | | Improving performance for Qualcomm Adreno GPUs | -| [#270](https://github.com/CNugteren/CLBlast/issues/270) | Apr '18 | CNugteren | | Implement col2im | -| [#267](https://github.com/CNugteren/CLBlast/issues/267) | Apr-May '18 | CNugteren | | Merge im2col and GEMM into a direct kernel | -| [#136](https://github.com/CNugteren/CLBlast/issues/136) | May '18 | CNugteren | | Implement xAXPBY and xSET | +| [#228](https://github.com/CNugteren/CLBlast/issues/228) | Mar-Apr '18 | CNugteren | ✔ | Improving performance for Qualcomm Adreno GPUs | +| [#270](https://github.com/CNugteren/CLBlast/issues/270) | May '18 | CNugteren | | Implement col2im | +| [#267](https://github.com/CNugteren/CLBlast/issues/267) | May '18 | CNugteren | | Merge im2col and GEMM into a direct kernel | +| [#136](https://github.com/CNugteren/CLBlast/issues/136) | ?? | CNugteren | | Implement xAXPBY and xSET | | [#169](https://github.com/CNugteren/CLBlast/issues/169) | ?? | dividiti | | Problem-specific tuning parameter selection | diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl index d15dafc8..99d64c91 100644 --- a/src/kernels/level3/xgemm_part1.opencl +++ b/src/kernels/level3/xgemm_part1.opencl @@ -114,6 +114,18 @@ R"( #define GLOBAL_MEM_FENCE 0 // Global synchronisation barrier for potential better performance #endif +// Intel subgroups (https://www.khronos.org/registry/OpenCL/extensions/intel/cl_intel_subgroups.txt) +#ifndef USE_SUBGROUP_SHUFFLING + #define USE_SUBGROUP_SHUFFLING 0 // Optionally enables subgroup shuffling for Intel GPUs +#endif +#if USE_SUBGROUP_SHUFFLING == 1 + #define SUBGROUP_SIZE 8 // Assumes subgroup size is always 8 on Intel GPUs +#endif +#if NWI != SUBGROUP_SIZE || MDIMC < SUBGROUP_SIZE + #undef USE_SUBGROUP_SHUFFLING + #define USE_SUBGROUP_SHUFFLING 0 // Disables subgroups in case the assumptions don't hold +#endif + // ================================================================================================= // Data-widths in dimension M diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl index c25c3001..c3920cb5 100644 --- a/src/kernels/level3/xgemm_part3.opencl +++ b/src/kernels/level3/xgemm_part3.opencl @@ -37,8 +37,13 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, #pragma promote_to_registers realN bpm[NWI/VWN]; // 1 * NWI #elif GEMMK == 1 - #pragma promote_to_registers - realN apm[NWI*(KREG/VWN)]; // NWI * KREG + #if USE_SUBGROUP_SHUFFLING == 1 + #pragma promote_to_registers + realN apm[KREG/VWN]; // KREG (subgroup shuffling in NWI dimension) + #else + #pragma promote_to_registers + realN apm[NWI*(KREG/VWN)]; // NWI * KREG + #endif #pragma promote_to_registers realM bpm[KREG*(MWI/VWM)]; // KREG * MWI #endif @@ -123,14 +128,23 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, #endif } #elif GEMMK == 1 - // Loads data: 2D global --> 2D private (matrix A) - #pragma unroll - for (int _ni = 0; _ni < NWI; _ni += 1) { + // Loads data: 2D global --> 2D private (matrix A). Partly, shuffled later among subgroups + #if USE_SUBGROUP_SHUFFLING == 1 + const int _ni = get_sub_group_local_id(); #pragma unroll for (int _ki = 0; _ki < KREG/VWN; _ki += 1) { - apm[_ni * (KREG/VWN) + _ki] = GlobalToPrivateA2D(a_ptr, tid_y, _ni, kSizeK, idk, _ki); + apm[_ki] = GlobalToPrivateA2D(a_ptr, tid_y, _ni, kSizeK, idk, _ki); } - } + // Loads data: 2D global --> 2D private (matrix A) + #else + #pragma unroll + for (int _ni = 0; _ni < NWI; _ni += 1) { + #pragma unroll + for (int _ki = 0; _ki < KREG/VWN; _ki += 1) { + apm[_ni * (KREG/VWN) + _ki] = GlobalToPrivateA2D(a_ptr, tid_y, _ni, kSizeK, idk, _ki); + } + } + #endif #endif // Performs the accumulation (Cpm += Apm * Bpm) @@ -187,7 +201,11 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, #pragma unroll for (int _ki = 0; _ki < KREG/VWN; _ki += 1) { const int index = _ni * (MWI/VWM) + _mi; - const realN aval = apm[_ni * (KREG/VWN) + _ki]; + #if USE_SUBGROUP_SHUFFLING == 1 + const realN aval = intel_sub_group_shuffle(apm[_ki], _ni); + #else + const realN aval = apm[_ni * (KREG/VWN) + _ki]; + #endif #if VWN == 1 cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval); #elif VWN == 2 diff --git a/src/utilities/compile.cpp b/src/utilities/compile.cpp index c1d921a4..65131cca 100644 --- a/src/utilities/compile.cpp +++ b/src/utilities/compile.cpp @@ -57,6 +57,11 @@ Program CompileFromSource(const std::string &source_string, const Precision prec header_string += "#define GLOBAL_MEM_FENCE 1\n"; } + // For Intel GPUs with subgroup support, use subgroup shuffling. + if (device.IsGPU() && device.HasExtension(kKhronosIntelSubgroups)) { + header_string += "#define USE_SUBGROUP_SHUFFLING 1\n"; + } + // Optionally adds a translation header from OpenCL kernels to CUDA kernels #ifdef CUDA_API header_string += diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp index d382b331..0edf77fe 100644 --- a/src/utilities/utilities.hpp +++ b/src/utilities/utilities.hpp @@ -47,6 +47,7 @@ using double2 = std::complex; // Khronos OpenCL extensions const std::string kKhronosAttributesAMD = "cl_amd_device_attribute_query"; const std::string kKhronosAttributesNVIDIA = "cl_nv_device_attribute_query"; +const std::string kKhronosIntelSubgroups = "cl_intel_subgroups"; // Catched an unknown error constexpr auto kUnknownError = -999;