Made the pre-processor run by default for ARM and Qualcomm GPUs

2017-12-09 15:16:53 +01:00 · 2017-12-09 15:16:53 +01:00 · ca5dbcd2bd
parent 02c0d64037
commit ca5dbcd2bd
8 changed files with 20 additions and 10 deletions
--- a/2
+++ b/2
@ -2,6 +2,8 @@
 Development (next version)
 - Re-designed and integrated the auto-tuner, no more dependency on CLTune
 - Made it possible to override the tuning parameters in the clients straight from JSON tuning files
+- Added OpenCL pre-processor to unroll loops and perform array-to-register promotions for compilers
+  which don't do this themselves (ARM, Qualcomm) - greatly improves performance on these platforms
 - Added tuned parameters for various devices (see README)

 Version 1.2.0
--- a/src/clpp11.hpp
+++ b/src/clpp11.hpp
@ -333,6 +333,7 @@ class Device {
                                Vendor() == "GenuineIntel" ||
                                Vendor() == "Intel(R) Corporation"; }
  bool IsARM() const { return Vendor() == "ARM"; }
+  bool IsQualcomm() const { return Vendor() == "QUALCOMM"; }

  // Platform specific extensions
  std::string AMDBoardName() const { // check for 'cl_amd_device_attribute_query' first
--- a/src/cupp11.hpp
+++ b/src/cupp11.hpp
@ -321,6 +321,7 @@ public:
  bool IsNVIDIA() const { return true; }
  bool IsIntel() const { return false; }
  bool IsARM() const { return false; }
+  bool IsQualcomm() const { return false; }

  // Platform specific extensions
  std::string AMDBoardName() const { return ""; }
--- a/src/routine.cpp
+++ b/src/routine.cpp
@ -148,7 +148,7 @@ void Routine::InitProgram(std::initializer_list<const char *> source) {

  // Completes the source and compiles the kernel
  program_ = CompileFromSource(source_string, precision_, routine_name_,
-                               device_, context_, options, false);
+                               device_, context_, options, 0);


  // Store the compiled binary and program in the cache
--- a/src/tuning/tuning.hpp
+++ b/src/tuning/tuning.hpp
@ -227,7 +227,7 @@ void Tuner(int argc, char* argv[]) {
    // Compiles the kernel
    auto compiler_options = std::vector<std::string>();
    const auto program = CompileFromSource(settings.sources, args.precision, settings.kernel_name,
-                                           device, context, compiler_options, false);
+                                           device, context, compiler_options, 0);
    auto kernel = Kernel(program, settings.kernel_name);
    C::SetArguments(kernel, args, device_buffers);
    printf("             %sOK%s |", kPrintSuccess.c_str(), kPrintEnd.c_str());
@ -286,7 +286,7 @@ void Tuner(int argc, char* argv[]) {
      const auto start_time = std::chrono::steady_clock::now();
      auto compiler_options = std::vector<std::string>();
      const auto program = CompileFromSource(kernel_source, args.precision, settings.kernel_name,
-                                             device, context, compiler_options, false, true);
+                                             device, context, compiler_options, 0, true);
      auto kernel = Kernel(program, settings.kernel_name);
      const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
      const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
--- a/src/utilities/compile.cpp
+++ b/src/utilities/compile.cpp
@ -25,7 +25,8 @@ Program CompileFromSource(const std::string &source_string, const Precision prec
                          const std::string &routine_name,
                          const Device& device, const Context& context,
                          std::vector<std::string>& options,
-                          const bool run_preprocessor, const bool silent) {
+                          const size_t run_preprocessor, // 0: platform dependent, 1: always, 2: never
+                          const bool silent) {
  auto header_string = std::string{""};

  header_string += "#define PRECISION " + ToString(static_cast<int>(precision)) + "\n";
@ -75,9 +76,14 @@ Program CompileFromSource(const std::string &source_string, const Precision prec
    const auto start_time = std::chrono::steady_clock::now();
  #endif

-  // Runs a pre-processor to unroll loops and perform array-to-register promotion
+  // Runs a pre-processor to unroll loops and perform array-to-register promotion. Most OpenCL
+  // compilers do this, but some don't.
+  auto do_run_preprocessor = false;
+  if (run_preprocessor == 0) { do_run_preprocessor = (device.IsARM() && device.IsGPU()) ||
+                                                     (device.IsQualcomm() && device.IsGPU()); }
+  if (run_preprocessor == 1) { do_run_preprocessor = true; }
  auto kernel_string = header_string + source_string;
-  if (run_preprocessor) {
+  if (do_run_preprocessor) {
    log_debug("Running built-in pre-processor");
    kernel_string = PreprocessKernelSource(kernel_string);
  }
--- a/src/utilities/compile.hpp
+++ b/src/utilities/compile.hpp
@ -28,7 +28,7 @@ Program CompileFromSource(const std::string &source_string, const Precision prec
                          const std::string &routine_name,
                          const Device& device, const Context& context,
                          std::vector<std::string>& options,
-                          const bool run_preprocessor,
+                          const size_t run_preprocessor, // 0: platform dependent, 1: always, 2: never
                          const bool silent = false);

 // =================================================================================================
--- a/test/correctness/misc/preprocessor.cpp
+++ b/test/correctness/misc/preprocessor.cpp
@ -120,13 +120,13 @@ bool TestKernel(const Device& device, const Context& context,
  // Verifies that the current kernel compiles properly (assumes so, otherwise throws an error)
  auto compiler_options_ref = std::vector<std::string>();
  const auto program_ref = CompileFromSource(kernel_source, precision, kernel_name,
-                                             device, context, compiler_options_ref, false);
+                                             device, context, compiler_options_ref, 2);

  // Compiles the same kernel, but now with the pre-processor enabled
  try {
    auto compiler_options = std::vector<std::string>();
    const auto program = CompileFromSource(kernel_source, precision, kernel_name,
-                                           device, context, compiler_options, true);
+                                           device, context, compiler_options, 1);
    return true;
  } catch (const CLCudaAPIBuildError &e) {
    fprintf(stdout, "* ERROR: Compilation warnings/errors with pre-processed kernel, status %d\n",
@ -219,7 +219,7 @@ size_t RunPreprocessor(int argc, char *argv[], const bool silent, const Precisio
    #include "../src/kernels/level3/level3.opencl"
    #include "../src/kernels/level3/transpose_pad.opencl"
  ;
-  //if (TestKernel(device, context, "TransposePadMatrix", transpose_pad_sources, precision)) { passed++; } else { errors++; }
+  if (TestKernel(device, context, "TransposePadMatrix", transpose_pad_sources, precision)) { passed++; } else { errors++; }

  // GEMM (in-direct)
  const auto gemm_sources =