Merge pull request #277 from CNugteren/CLBlast-257-intel-subgroups

Intel subgroup shuffling
pull/282/head
Cedric Nugteren 2018-04-29 15:48:35 +02:00 committed by GitHub
commit b2248a17ae
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 49 additions and 12 deletions

View File

@ -4,6 +4,7 @@ Development (next version)
- Added CLBlast to Ubuntu PPA and macOS Homebrew package managers - Added CLBlast to Ubuntu PPA and macOS Homebrew package managers
- Added an API to run the tuners programmatically without any I/O - Added an API to run the tuners programmatically without any I/O
- Improved the performance potential by adding a second tunable GEMM kernel with 2D register tiling - Improved the performance potential by adding a second tunable GEMM kernel with 2D register tiling
- Added support for Intel-specific subgroup shuffling extensions for faster GEMM on Intel GPUs
- Re-added a local memory size constraint to the tuners - Re-added a local memory size constraint to the tuners
- Updated and reorganised the CLBlast documentation - Updated and reorganised the CLBlast documentation
- Fixed an access violation when compiled with Visual Studio upon releasing the OpenCL program - Fixed an access violation when compiled with Visual Studio upon releasing the OpenCL program

View File

@ -17,8 +17,8 @@ This file gives an overview of the main features planned for addition to CLBlast
| [#233](https://github.com/CNugteren/CLBlast/issues/233) | Feb '18 | CNugteren | ✔ | Add CLBlast to common package managers | | [#233](https://github.com/CNugteren/CLBlast/issues/233) | Feb '18 | CNugteren | ✔ | Add CLBlast to common package managers |
| [#223](https://github.com/CNugteren/CLBlast/issues/223) | Feb '18 | CNugteren | ✔ | Python OpenCL interface | | [#223](https://github.com/CNugteren/CLBlast/issues/223) | Feb '18 | CNugteren | ✔ | Python OpenCL interface |
| [#237](https://github.com/CNugteren/CLBlast/issues/237) | Mar '18 | CNugteren | ✔ | Making tuning possible from the CLBlast API | | [#237](https://github.com/CNugteren/CLBlast/issues/237) | Mar '18 | CNugteren | ✔ | Making tuning possible from the CLBlast API |
| [#228](https://github.com/CNugteren/CLBlast/issues/228) | Mar-Apr '18 | CNugteren | | Improving performance for Qualcomm Adreno GPUs | | [#228](https://github.com/CNugteren/CLBlast/issues/228) | Mar-Apr '18 | CNugteren | | Improving performance for Qualcomm Adreno GPUs |
| [#270](https://github.com/CNugteren/CLBlast/issues/270) | Apr '18 | CNugteren | | Implement col2im | | [#270](https://github.com/CNugteren/CLBlast/issues/270) | May '18 | CNugteren | | Implement col2im |
| [#267](https://github.com/CNugteren/CLBlast/issues/267) | Apr-May '18 | CNugteren | | Merge im2col and GEMM into a direct kernel | | [#267](https://github.com/CNugteren/CLBlast/issues/267) | May '18 | CNugteren | | Merge im2col and GEMM into a direct kernel |
| [#136](https://github.com/CNugteren/CLBlast/issues/136) | May '18 | CNugteren | | Implement xAXPBY and xSET | | [#136](https://github.com/CNugteren/CLBlast/issues/136) | ?? | CNugteren | | Implement xAXPBY and xSET |
| [#169](https://github.com/CNugteren/CLBlast/issues/169) | ?? | dividiti | | Problem-specific tuning parameter selection | | [#169](https://github.com/CNugteren/CLBlast/issues/169) | ?? | dividiti | | Problem-specific tuning parameter selection |

View File

@ -114,6 +114,18 @@ R"(
#define GLOBAL_MEM_FENCE 0 // Global synchronisation barrier for potential better performance #define GLOBAL_MEM_FENCE 0 // Global synchronisation barrier for potential better performance
#endif #endif
// Intel subgroup shuffling, based on the cl_intel_subgroups extension:
// https://www.khronos.org/registry/OpenCL/extensions/intel/cl_intel_subgroups.txt
// USE_SUBGROUP_SHUFFLING may be pre-defined before this point; otherwise it defaults to 0 (disabled).
#if !defined(USE_SUBGROUP_SHUFFLING)
  #define USE_SUBGROUP_SHUFFLING 0
#endif

// The subgroup size is hard-coded: it is assumed to always be 8 on Intel GPUs
#if USE_SUBGROUP_SHUFFLING == 1
  #define SUBGROUP_SIZE 8
#endif

// Switches subgroup shuffling off again when the assumptions on NWI and MDIMC don't hold. Note that
// an undefined SUBGROUP_SIZE evaluates as 0 inside '#if', in which case this check can only reset
// the macro to 0.
#if (MDIMC < SUBGROUP_SIZE) || (NWI != SUBGROUP_SIZE)
  #undef USE_SUBGROUP_SHUFFLING
  #define USE_SUBGROUP_SHUFFLING 0
#endif
// ================================================================================================= // =================================================================================================
// Data-widths in dimension M // Data-widths in dimension M

View File

@ -37,8 +37,13 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
#pragma promote_to_registers #pragma promote_to_registers
realN bpm[NWI/VWN]; // 1 * NWI realN bpm[NWI/VWN]; // 1 * NWI
#elif GEMMK == 1 #elif GEMMK == 1
#pragma promote_to_registers #if USE_SUBGROUP_SHUFFLING == 1
realN apm[NWI*(KREG/VWN)]; // NWI * KREG #pragma promote_to_registers
realN apm[KREG/VWN]; // KREG (subgroup shuffling in NWI dimension)
#else
#pragma promote_to_registers
realN apm[NWI*(KREG/VWN)]; // NWI * KREG
#endif
#pragma promote_to_registers #pragma promote_to_registers
realM bpm[KREG*(MWI/VWM)]; // KREG * MWI realM bpm[KREG*(MWI/VWM)]; // KREG * MWI
#endif #endif
@ -123,14 +128,23 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
#endif #endif
} }
#elif GEMMK == 1 #elif GEMMK == 1
// Loads data: 2D global --> 2D private (matrix A) // Loads data: 2D global --> 2D private (matrix A). Partly, shuffled later among subgroups
#pragma unroll #if USE_SUBGROUP_SHUFFLING == 1
for (int _ni = 0; _ni < NWI; _ni += 1) { const int _ni = get_sub_group_local_id();
#pragma unroll #pragma unroll
for (int _ki = 0; _ki < KREG/VWN; _ki += 1) { for (int _ki = 0; _ki < KREG/VWN; _ki += 1) {
apm[_ni * (KREG/VWN) + _ki] = GlobalToPrivateA2D(a_ptr, tid_y, _ni, kSizeK, idk, _ki); apm[_ki] = GlobalToPrivateA2D(a_ptr, tid_y, _ni, kSizeK, idk, _ki);
} }
} // Loads data: 2D global --> 2D private (matrix A)
#else
#pragma unroll
for (int _ni = 0; _ni < NWI; _ni += 1) {
#pragma unroll
for (int _ki = 0; _ki < KREG/VWN; _ki += 1) {
apm[_ni * (KREG/VWN) + _ki] = GlobalToPrivateA2D(a_ptr, tid_y, _ni, kSizeK, idk, _ki);
}
}
#endif
#endif #endif
// Performs the accumulation (Cpm += Apm * Bpm) // Performs the accumulation (Cpm += Apm * Bpm)
@ -187,7 +201,11 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
#pragma unroll #pragma unroll
for (int _ki = 0; _ki < KREG/VWN; _ki += 1) { for (int _ki = 0; _ki < KREG/VWN; _ki += 1) {
const int index = _ni * (MWI/VWM) + _mi; const int index = _ni * (MWI/VWM) + _mi;
const realN aval = apm[_ni * (KREG/VWN) + _ki]; #if USE_SUBGROUP_SHUFFLING == 1
const realN aval = intel_sub_group_shuffle(apm[_ki], _ni);
#else
const realN aval = apm[_ni * (KREG/VWN) + _ki];
#endif
#if VWN == 1 #if VWN == 1
cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval); cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval);
#elif VWN == 2 #elif VWN == 2

View File

@ -57,6 +57,11 @@ Program CompileFromSource(const std::string &source_string, const Precision prec
header_string += "#define GLOBAL_MEM_FENCE 1\n"; header_string += "#define GLOBAL_MEM_FENCE 1\n";
} }
// For Intel GPUs with subgroup support, use subgroup shuffling.
if (device.IsGPU() && device.HasExtension(kKhronosIntelSubgroups)) {
header_string += "#define USE_SUBGROUP_SHUFFLING 1\n";
}
// Optionally adds a translation header from OpenCL kernels to CUDA kernels // Optionally adds a translation header from OpenCL kernels to CUDA kernels
#ifdef CUDA_API #ifdef CUDA_API
header_string += header_string +=

View File

@ -47,6 +47,7 @@ using double2 = std::complex<double>;
// Khronos OpenCL extensions // Khronos OpenCL extensions
const std::string kKhronosAttributesAMD = "cl_amd_device_attribute_query"; const std::string kKhronosAttributesAMD = "cl_amd_device_attribute_query";
const std::string kKhronosAttributesNVIDIA = "cl_nv_device_attribute_query"; const std::string kKhronosAttributesNVIDIA = "cl_nv_device_attribute_query";
// Name of the Intel subgroups extension, as matched against a device's extensions (HasExtension)
const std::string kKhronosIntelSubgroups = "cl_intel_subgroups";
// Caught an unknown error // Caught an unknown error
constexpr auto kUnknownError = -999; constexpr auto kUnknownError = -999;