Merge pull request #277 from CNugteren/CLBlast-257-intel-subgroups

Intel subgroup shuffling
2018-04-29 15:48:35 +02:00 · 2018-04-29 15:48:35 +02:00 · b2248a17ae
parent 7b416c8686 9f22bc232b
commit b2248a17ae
6 changed files with 49 additions and 12 deletions
--- a/1
+++ b/1
@ -4,6 +4,7 @@ Development (next version)
 - Added CLBlast to Ubuntu PPA and macOS Homebrew package managers
 - Added an API to run the tuners programmatically without any I/O
 - Improved the performance potential by adding a second tunable GEMM kernel with 2D register tiling
+- Added support for Intel-specific subgroup shuffling extensions for faster GEMM on Intel GPUs
 - Re-added a local memory size constraint to the tuners
 - Updated and reorganised the CLBlast documentation
 - Fixed an access violation when compiled with Visual Studio upon releasing the OpenCL program
--- a/ROADMAP.md
+++ b/ROADMAP.md
@ -17,8 +17,8 @@ This file gives an overview of the main features planned for addition to CLBlast
 | [#233](https://github.com/CNugteren/CLBlast/issues/233)        | Feb '18     | CNugteren | ✔      | Add CLBlast to common package managers |
 | [#223](https://github.com/CNugteren/CLBlast/issues/223)        | Feb '18     | CNugteren | ✔      | Python OpenCL interface |
 | [#237](https://github.com/CNugteren/CLBlast/issues/237)        | Mar '18     | CNugteren | ✔      | Making tuning possible from the CLBlast API |
-| [#228](https://github.com/CNugteren/CLBlast/issues/228)        | Mar-Apr '18 | CNugteren |        | Improving performance for Qualcomm Adreno GPUs |
-| [#270](https://github.com/CNugteren/CLBlast/issues/270)        | Apr '18     | CNugteren |        | Implement col2im |
-| [#267](https://github.com/CNugteren/CLBlast/issues/267)        | Apr-May '18 | CNugteren |        | Merge im2col and GEMM into a direct kernel |
-| [#136](https://github.com/CNugteren/CLBlast/issues/136)        | May '18     | CNugteren |        | Implement xAXPBY and xSET |
+| [#228](https://github.com/CNugteren/CLBlast/issues/228)        | Mar-Apr '18 | CNugteren | ✔      | Improving performance for Qualcomm Adreno GPUs |
+| [#270](https://github.com/CNugteren/CLBlast/issues/270)        | May '18     | CNugteren |        | Implement col2im |
+| [#267](https://github.com/CNugteren/CLBlast/issues/267)        | May '18     | CNugteren |        | Merge im2col and GEMM into a direct kernel |
+| [#136](https://github.com/CNugteren/CLBlast/issues/136)        | ??          | CNugteren |        | Implement xAXPBY and xSET |
 | [#169](https://github.com/CNugteren/CLBlast/issues/169)        | ??          | dividiti  |        | Problem-specific tuning parameter selection |
--- a/src/kernels/level3/xgemm_part1.opencl
+++ b/src/kernels/level3/xgemm_part1.opencl
@ -114,6 +114,18 @@ R"(
  #define GLOBAL_MEM_FENCE 0    // Global synchronisation barrier for potential better performance
 #endif

+// Intel subgroups (https://www.khronos.org/registry/OpenCL/extensions/intel/cl_intel_subgroups.txt)
+#ifndef USE_SUBGROUP_SHUFFLING
+  #define USE_SUBGROUP_SHUFFLING 0     // Optionally enables subgroup shuffling for Intel GPUs
+#endif
+#if USE_SUBGROUP_SHUFFLING == 1
+  #define SUBGROUP_SIZE 8              // Assumes subgroup size is always 8 on Intel GPUs
+#endif
+#if NWI != SUBGROUP_SIZE || MDIMC < SUBGROUP_SIZE
+  #undef USE_SUBGROUP_SHUFFLING
+  #define USE_SUBGROUP_SHUFFLING 0     // Disables subgroups in case the assumptions don't hold
+#endif
+
 // =================================================================================================

 // Data-widths in dimension M
--- a/src/kernels/level3/xgemm_part3.opencl
+++ b/src/kernels/level3/xgemm_part3.opencl
@ -37,8 +37,13 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
    #pragma promote_to_registers
    realN bpm[NWI/VWN]; // 1 * NWI
  #elif GEMMK == 1
-    #pragma promote_to_registers
-    realN apm[NWI*(KREG/VWN)]; // NWI * KREG
+    #if USE_SUBGROUP_SHUFFLING == 1
+      #pragma promote_to_registers
+      realN apm[KREG/VWN]; // KREG (subgroup shuffling in NWI dimension)
+    #else
+      #pragma promote_to_registers
+      realN apm[NWI*(KREG/VWN)]; // NWI * KREG
+    #endif
    #pragma promote_to_registers
    realM bpm[KREG*(MWI/VWM)]; // KREG * MWI
  #endif
@ -123,14 +128,23 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
            #endif
          }
        #elif GEMMK == 1
-          // Loads data: 2D global --> 2D private (matrix A)
-          #pragma unroll
-          for (int _ni = 0; _ni < NWI; _ni += 1) {
+          // Loads data: 2D global --> 2D private (matrix A). Partial load, shuffled within the subgroup later
+          #if USE_SUBGROUP_SHUFFLING == 1
+            const int _ni = get_sub_group_local_id();
            #pragma unroll
            for (int _ki = 0; _ki < KREG/VWN; _ki += 1) {
-              apm[_ni * (KREG/VWN) + _ki] = GlobalToPrivateA2D(a_ptr, tid_y, _ni, kSizeK, idk, _ki);
+              apm[_ki] = GlobalToPrivateA2D(a_ptr, tid_y, _ni, kSizeK, idk, _ki);
            }
-          }
+          // Loads data: 2D global --> 2D private (matrix A)
+          #else
+            #pragma unroll
+            for (int _ni = 0; _ni < NWI; _ni += 1) {
+              #pragma unroll
+              for (int _ki = 0; _ki < KREG/VWN; _ki += 1) {
+                apm[_ni * (KREG/VWN) + _ki] = GlobalToPrivateA2D(a_ptr, tid_y, _ni, kSizeK, idk, _ki);
+              }
+            }
+          #endif
        #endif

        // Performs the accumulation (Cpm += Apm * Bpm)
@ -187,7 +201,11 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
              #pragma unroll
              for (int _ki = 0; _ki < KREG/VWN; _ki += 1) {
                const int index =  _ni * (MWI/VWM) + _mi;
-                const realN aval = apm[_ni * (KREG/VWN) + _ki];
+                #if USE_SUBGROUP_SHUFFLING == 1
+                  const realN aval = intel_sub_group_shuffle(apm[_ki], _ni);
+                #else
+                  const realN aval = apm[_ni * (KREG/VWN) + _ki];
+                #endif
                #if VWN == 1
                  cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval);
                #elif VWN == 2
--- a/src/utilities/compile.cpp
+++ b/src/utilities/compile.cpp
@ -57,6 +57,11 @@ Program CompileFromSource(const std::string &source_string, const Precision prec
    header_string += "#define GLOBAL_MEM_FENCE 1\n";
  }

+  // For Intel GPUs with subgroup support, use subgroup shuffling.
+  if (device.IsGPU() && device.HasExtension(kKhronosIntelSubgroups)) {
+    header_string += "#define USE_SUBGROUP_SHUFFLING 1\n";
+  }
+
  // Optionally adds a translation header from OpenCL kernels to CUDA kernels
  #ifdef CUDA_API
    header_string +=
--- a/src/utilities/utilities.hpp
+++ b/src/utilities/utilities.hpp
@ -47,6 +47,7 @@ using double2 = std::complex<double>;
 // Khronos OpenCL extensions
 const std::string kKhronosAttributesAMD = "cl_amd_device_attribute_query";
 const std::string kKhronosAttributesNVIDIA = "cl_nv_device_attribute_query";
+const std::string kKhronosIntelSubgroups = "cl_intel_subgroups";

 // Catched an unknown error
 constexpr auto kUnknownError = -999;