Merge pull request #308 from CNugteren/CLBlast-301-weird-AMD-Hainan-bug

Added workaround for AMD Southern Islands GPU issue
2018-07-31 21:49:53 +02:00 · 2018-07-31 21:49:53 +02:00 · bed10d2731
parent d749c4af72 2b76bfee97
commit bed10d2731
2 changed files with 35 additions and 3 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -35,6 +35,12 @@ option(TESTS "Enable compilation of the correctness tests" OFF)
 option(NETLIB "Enable compilation of the CBLAS Netlib API" OFF)
 option(CUBLAS "Enables performance comparison against cuBLAS on NVIDIA GPUs" OFF)

+# Workarounds for bugs
+option(AMD_SI_EMPTY_KERNEL_WORKAROUND "Enables workaround for bug in AMD Southern Island GPUs" OFF)
+if(AMD_SI_EMPTY_KERNEL_WORKAROUND)
+  add_definitions(-DAMD_SI_EMPTY_KERNEL_WORKAROUND)
+endif()
+
 # Select between an OpenCL API (default) or a CUDA API (beta)
 option(OPENCL "Build CLBlast with an OpenCL API (default)" ON)
 option(CUDA "Build CLBlast with a CUDA API (beta)" OFF)
--- a/src/clpp11.hpp
+++ b/src/clpp11.hpp
@ -447,8 +447,14 @@ class Program {

  // Source-based constructor with memory management
  explicit Program(const Context &context, const std::string &source) {
-    const char *source_ptr = &source[0];
-    const auto length = source.length();
+    #ifdef AMD_SI_EMPTY_KERNEL_WORKAROUND
+      const std::string source_null_kernel = source + "\n__kernel void null_kernel() {}\n";
+      const char *source_ptr = &source_null_kernel[0];
+      const auto length = source_null_kernel.length();
+    #else
+      const char *source_ptr = &source[0];
+      const auto length = source.length();
+    #endif
    auto status = CL_SUCCESS;
    program_ = clCreateProgramWithSource(context(), 1, &source_ptr, &length, &status);
    CLCudaAPIError::Check(status, "clCreateProgramWithSource");
@ -764,10 +770,21 @@ class Kernel {
      kernel_(new cl_kernel, [](cl_kernel* k) {
        if (*k) { CheckErrorDtor(clReleaseKernel(*k)); }
        delete k;
-      }) {
+      })
+    #ifdef AMD_SI_EMPTY_KERNEL_WORKAROUND
+      , null_kernel_(new cl_kernel, [](cl_kernel* k) {
+        if (*k) { CheckErrorDtor(clReleaseKernel(*k)); }
+        delete k;
+      })
+    #endif
+  {
    auto status = CL_SUCCESS;
    *kernel_ = clCreateKernel(program->operator()(), name.c_str(), &status);
    CLCudaAPIError::Check(status, "clCreateKernel");
+    #ifdef AMD_SI_EMPTY_KERNEL_WORKAROUND
+      *null_kernel_ = clCreateKernel(program->operator()(), "null_kernel", &status);
+      CLCudaAPIError::Check(status, "clCreateKernel");
+    #endif
  }

  // Sets a kernel argument at the indicated position
@ -831,12 +848,21 @@ class Kernel {
                                      static_cast<cl_uint>(waitForEventsPlain.size()),
                                      !waitForEventsPlain.empty() ? waitForEventsPlain.data() : nullptr,
                                      event));
+    #ifdef AMD_SI_EMPTY_KERNEL_WORKAROUND
+      const std::vector<size_t> nullRange = {1};
+      CheckError(clEnqueueNDRangeKernel(queue(), *null_kernel_, static_cast<cl_uint>(nullRange.size()),
+                                        nullptr, nullRange.data(), nullptr,
+                                        0, nullptr, nullptr));
+    #endif
  }

  // Accessor to the private data-member
  const cl_kernel& operator()() const { return *kernel_; }
 private:
  std::shared_ptr<cl_kernel> kernel_;
+  #ifdef AMD_SI_EMPTY_KERNEL_WORKAROUND
+    std::shared_ptr<cl_kernel> null_kernel_;
+  #endif

  // Internal implementation for the recursive SetArguments function.
  template <typename T>