Merge branch 'master' into CLBlast-267-convgemm

2018-05-19 17:54:27 +02:00 · 2018-05-19 17:54:27 +02:00 · cbcd4ff7e8
parent e057a9186a 507d7bc729
commit cbcd4ff7e8
25 changed files with 169 additions and 93 deletions
--- a/3
+++ b/3
@ -6,8 +6,11 @@ Development (next version)
 - Improved the performance potential by adding a second tunable GEMM kernel with 2D register tiling
 - Added support for Intel specific subgroup shuffling extensions for faster GEMM on Intel GPUs
 - Re-added a local memory size constraint to the tuners
+- The routine tuners now automatically pick up tuning results from disk from the kernel tuners
 - Updated and reorganised the CLBlast documentation
+- Added a 'canary' region to check for overflows in the tuner and tests (inspired by clARMOR)
 - Fixed an access violation when compiled with Visual Studio upon releasing the OpenCL program
+- Fixed incorrect releasing of the OpenCL program resulting in segfaults / access violations
 - Various minor fixes and enhancements
 - Added tuned parameters for various devices (see doc/tuning.md)
 - Added non-BLAS level-1 routines:
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -424,9 +424,9 @@ if(TUNERS)
    install(TARGETS clblast_tuner_${KERNEL} DESTINATION bin)
  endforeach()
  foreach(ROUTINE_TUNER ${ROUTINE_TUNERS})
-    add_executable(clblast_tuner_routine_${ROUTINE_TUNER} ${TUNERS_COMMON} src/tuning/routines/${ROUTINE_TUNER}.cpp)
+    add_executable(clblast_tuner_routine_${ROUTINE_TUNER} ${TUNERS_COMMON} src/tuning/routines/${ROUTINE_TUNER}.cpp test/test_utilities.cpp)
    target_link_libraries(clblast_tuner_routine_${ROUTINE_TUNER} clblast)
-    target_include_directories(clblast_tuner_routine_${ROUTINE_TUNER} PUBLIC $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES> ${API_INCLUDE_DIRS})
+    target_include_directories(clblast_tuner_routine_${ROUTINE_TUNER} PUBLIC $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES> ${API_INCLUDE_DIRS} ${clblast_SOURCE_DIR})
    install(TARGETS clblast_tuner_routine_${ROUTINE_TUNER} DESTINATION bin)
  endforeach()

@ -439,6 +439,12 @@ if(TUNERS)
    endforeach()
    set(ALLTUNERSDEPENDS clblast_tuner_${KERNEL})
  endforeach()
+  foreach(ROUTINE_TUNER ${ROUTINE_TUNERS})  # register every routine tuner with the 'alltuners' target
+    foreach(PRECISION ${PRECISIONS})
+      list(APPEND ALLTUNERS COMMAND clblast_tuner_routine_${ROUTINE_TUNER} -precision ${PRECISION})
+    endforeach()
+    list(APPEND ALLTUNERSDEPENDS clblast_tuner_routine_${ROUTINE_TUNER})  # append (not overwrite) so each routine tuner is built before it is run
+  endforeach()
  add_custom_target(alltuners ${ALLTUNERS} DEPENDS ${ALLTUNERSDEPENDS})

 endif()
--- a/README.md
+++ b/README.md
@ -4,9 +4,9 @@ CLBlast: The tuned OpenCL BLAS library

 | | Build status | Tests on Intel CPU | Tests on NVIDIA GPU | Tests on Intel GPU |
 |-----|-----|-----|-----|-----|
-| Windows | [![Build Status](https://ci.appveyor.com/api/projects/status/github/cnugteren/clblast?branch=master&svg=true)](https://ci.appveyor.com/project/CNugteren/clblast) | [![Build Status](http://67.207.87.39:8010/badges/clblast-windows-intel-i7-4790k.svg)](http://67.207.87.39:8010/#/builders/106) | [![Build Status](http://67.207.87.39:8010/badges/clblast-windows-nvidia-k5000.svg)](http://67.207.87.39:8010/#/builders/105) | [![Build Status](http://67.207.87.39:8010/badges/clblast-windows-intel-HD4600.svg)](http://67.207.87.39:8010/#/builders/107) |
-| Linux | [![Build Status](https://travis-ci.org/CNugteren/CLBlast.svg?branch=master)](https://travis-ci.org/CNugteren/CLBlast/branches) | [![Build Status](http://67.207.87.39:8010/badges/clblast-linux-intel-e5-2620-v4.svg)](http://67.207.87.39:8010/#/builders/97) | [![Build Status](http://67.207.87.39:8010/badges/clblast-linux-nvidia-k80.svg)](http://67.207.87.39:8010/#/builders/98) | N/A |
-| OS X | [![Build Status](https://travis-ci.org/CNugteren/CLBlast.svg?branch=master)](https://travis-ci.org/CNugteren/CLBlast/branches) |  [![Build Status](http://67.207.87.39:8010/badges/clblast-osx-intel-i5-4278U.svg)](http://67.207.87.39:8010/#/builders/110) | N/A | N/A |
+| Windows | [![Build Status](https://ci.appveyor.com/api/projects/status/github/cnugteren/clblast?branch=master&svg=true)](https://ci.appveyor.com/project/CNugteren/clblast) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-windows-intel-i7-4790k.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-windows-intel-i7-4790k) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-windows-nvidia-k5000.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-windows-nvidia-k5000) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-windows-intel-HD4600.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-windows-intel-HD4600) |
+| Linux | [![Build Status](https://travis-ci.org/CNugteren/CLBlast.svg?branch=master)](https://travis-ci.org/CNugteren/CLBlast/branches) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-linux-intel-e5-2620-v4.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-linux-intel-e5-2620-v4) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-linux-nvidia-k80.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-linux-nvidia-k80) | N/A |
+| OS X | [![Build Status](https://travis-ci.org/CNugteren/CLBlast.svg?branch=master)](https://travis-ci.org/CNugteren/CLBlast/branches) |  [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-osx-intel-i5-4278U.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-osx-intel-i5-4278U) | N/A | N/A |

 CLBlast is a modern, lightweight, performant and tunable OpenCL BLAS library written in C++11. It is designed to leverage the full performance potential of a wide variety of OpenCL devices from different vendors, including desktop and laptop GPUs, embedded GPUs, and other accelerators. CLBlast implements BLAS routines: basic linear algebra subprograms operating on vectors and matrices. See [the CLBlast website](https://cnugteren.github.io/clblast) for performance reports on various devices as well as the latest CLBlast news.

@ -78,6 +78,7 @@ More detailed documentation is available in separate files:
 * [Tuning for better performance](doc/tuning.md)
 * [Testing the library for correctness](doc/testing.md)
 * [Bindings / wrappers for other languages](doc/bindings.md)
+* [More details on the GEMM kernel](doc/details_gemm.md)
 * [Glossary with some terms explained](doc/glossary.md)


@ -133,6 +134,7 @@ Tuning and testing on a variety of OpenCL devices was made possible by:

 Hardware/software for this project was contributed by:

+* [HPC research group at the University of Bristol](http://uob-hpc.github.io/zoo/) for access to their GPU zoo
 * [ArrayFire](http://arrayfire.org) for settings up and supporting Buildbot correctness tests on multiple platforms
 * [JetBrains](https://www.jetbrains.com/clion/) for supply a free CLion IDE license for CLBlast developers
 * [Travis CI](https://travis-ci.org/CNugteren/CLBlast/branches) and [AppVeyor](https://ci.appveyor.com/project/CNugteren/clblast) for free automated build tests for open-source projects
@ -143,8 +145,8 @@ More information

 Further information on CLBlast is available through the following links:

-* A 20-minute presentation of CLBlast was given at the GPU Technology Conference in May 2017. A recording is available on the [GTC on-demand website](http://on-demand.gputechconf.com/gtc/2017/video/s7280-nugteren-clblast.mp4) (poor audio quality however) and a full slide-set is also available [as PDF](http://on-demand.gputechconf.com/gtc/2017/presentation/s7280-cedric-nugteren-clblast.pdf).
-* More in-depth information and experimental results are also available in a scientific paper titled [CLBlast: A Tuned OpenCL BLAS Library](https://arxiv.org/abs/1705.05249) (May 2017, updated April 2018). For CLTune, the inspiration for the included auto-tuner, see also the [CLTune: A Generic Auto-Tuner for OpenCL Kernels](https://arxiv.org/abs/1703.06503) paper.
+* A 20-minute presentation of CLBlast was given at the GPU Technology Conference in May 2017. A recording is available on the [GTC on-demand website](http://on-demand.gputechconf.com/gtc/2017/video/s7280-nugteren-clblast.mp4) (poor audio quality however) and a full slide-set is also available [as PDF](http://on-demand.gputechconf.com/gtc/2017/presentation/s7280-cedric-nugteren-clblast.pdf). An updated version was also presented at IWOCL in May 2018. The slide set can be found [here as PDF](https://cnugteren.github.io/downloads/CLBlastIWOCL18.pdf).
+* More in-depth information and experimental results are also available in a scientific paper titled [CLBlast: A Tuned OpenCL BLAS Library](https://arxiv.org/abs/1705.05249) (v1 May 2017, updated to v2 in April 2018). For CLTune, the inspiration for the included auto-tuner, see also the [CLTune: A Generic Auto-Tuner for OpenCL Kernels](https://arxiv.org/abs/1703.06503) paper.

 How to cite this work:

--- a/ROADMAP.md
+++ b/ROADMAP.md
@ -18,7 +18,8 @@ This file gives an overview of the main features planned for addition to CLBlast
 | [#223](https://github.com/CNugteren/CLBlast/issues/223)        | Feb '18     | CNugteren | ✔      | Python OpenCL interface |
 | [#237](https://github.com/CNugteren/CLBlast/issues/237)        | Mar '18     | CNugteren | ✔      | Making tuning possible from the CLBlast API |
 | [#228](https://github.com/CNugteren/CLBlast/issues/228)        | Mar-Apr '18 | CNugteren | ✔      | Improving performance for Qualcomm Adreno GPUs |
-| [#270](https://github.com/CNugteren/CLBlast/issues/270)        | May '18     | CNugteren |        | Implement col2im |
 | [#267](https://github.com/CNugteren/CLBlast/issues/267)        | May '18     | CNugteren |        | Merge im2col and GEMM into a direct kernel |
+| [#270](https://github.com/CNugteren/CLBlast/issues/270)        | July '18    | CNugteren |        | Implement col2im |
+| -                                                              | July '18    | CNugteren |        | Add a SYCL interface to the library |
 | [#136](https://github.com/CNugteren/CLBlast/issues/136)        | ??          | CNugteren |        | Implement xAXPBY and xSET |
 | [#169](https://github.com/CNugteren/CLBlast/issues/169)        | ??          | dividiti  |        | Problem-specific tuning parameter selection |
--- a/doc/details_gemm.md
+++ b/doc/details_gemm.md
@ -0,0 +1,27 @@
+CLBlast: Details on the GEMM routine and kernel
+================
+
+This document gives a bit more detail on how the GEMM routine is organised and implemented. For other information about CLBlast, see the [main README](../README.md).
+
+
+GEMM: Two approaches
+-------------
+
+CLBlast implements two approaches to GEMM, direct and indirect:
+
+* Direct GEMM: Computing GEMM using a single generic kernel which handles all cases (e.g. all kinds of matrix sizes).
+* Indirect GEMM: Computing GEMM using multiple kernels: the main GEMM kernel and a few pre-processing and post-processing kernels. The main kernel makes several assumptions (e.g. sizes need to be multiples of 32), which the other kernels make sure are satisfied. The main kernel is often faster than the generic kernel of the direct approach, but the cost of pre-processing and post-processing kernels can sometimes be high for small sizes or particular devices.
+
+
+GEMM: In-direct approach
+-------------
+
+Similar to the work by Matsumoto et al. ("Performance Tuning of Matrix Multiplication in OpenCL on Different GPUs and CPUs"), the main GEMM kernel makes many assumptions on the input arguments, which are handled by pre-processing and post-processing kernels. These assumptions are e.g. matrix sizes are a multiple of the work-group sizes, offsets are zero, and matrix B is transposed. This is a good solution for larger problem sizes since O(n^2) data movement is typically cheaper than O(n^3) computation, but the hidden constant starts to play a role for smaller n. Therefore, there is also a single-kernel direct version available for those cases, but it shares most of the design and parameters as discussed below.
+
+The main kernel has 14 different parameters, of which some are illustrated in figure 1 in the [CLBlast paper](https://arxiv.org/pdf/1705.05249). The parameters define among others the work-group sizes in 2 dimensions (MWG, NWG), the 2D register tiling configuration (MWI, NWI), the vector widths of both input matrices (VWM, VWN), loop unroll factors (KWI), and whether or not and how to use the local memory.
+
+
+GEMM: Direct approach
+-------------
+
+This is a single-kernel approach that shares many of its parameters with the in-direct kernel. One of the differences is that within the kernel there are checks for incomplete tiles in the m/n/k dimensions, influenced by the tuning parameters and the matrix sizes. These incomplete tiles will run a different part of the code, as they for example cannot benefit from vectorisation. Another difference is that there are dedicated kernels for each a/b transpose requirement: NN, NT, TN, TT for non-transposed and transposed.
--- a/doc/tuning.md
+++ b/doc/tuning.md
@ -82,7 +82,7 @@ Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clbla

 The kernels `gemm` and `gemm_direct` have too many parameters to explore. Therefore, they will run in two stages: a first stage with a fixed limited number of parameter combinations, and a second stage with a random selection from a much larger search space. The random fraction is determined by the `fraction` argument on the command-line.

-There are also several routine-level tuners. They tune inter-kernel parameters and should only be run after the kernels are tuned. An example is the GEMM routine tuner, which determines when to use the direct or the in-direct GEMM kernel.
+There are also several routine-level tuners. They tune inter-kernel parameters and should only be run after the kernels are tuned. However, they do automatically pick up kernel tuning results from the current folder if there are any. An example is the GEMM routine tuner, which determines when to use the direct or the in-direct GEMM kernel.


 Using the tuning results
@ -100,8 +100,6 @@ In summary, tuning the entire library for your device can be done as follows (st
    python ../scripts/database/database.py . ..
    make

-After the kernels are tuned, you can run the `clblast_tuner_routine_xgemm` tuner to optimize the high-level GEMM routine, i.e. selecting which method to use: the direct kernel or the in-direct kernel.
-

 Tuning using the API (advanced users only)
 -------------
--- a/src/cache.cpp
+++ b/src/cache.cpp
@ -117,8 +117,8 @@ template std::string BinaryCache::Get(const BinaryKeyRef &, bool *) const;

 // =================================================================================================

-template class Cache<ProgramKey, Program>;
-template Program ProgramCache::Get(const ProgramKeyRef &, bool *) const;
+template class Cache<ProgramKey, std::shared_ptr<Program>>;
+template std::shared_ptr<Program> ProgramCache::Get(const ProgramKeyRef &, bool *) const;
 template void ProgramCache::RemoveBySubset<1, 2>(const ProgramKey &); // precision and routine name

 // =================================================================================================
--- a/src/cache.hpp
+++ b/src/cache.hpp
@ -83,10 +83,10 @@ extern template std::string BinaryCache::Get(const BinaryKeyRef &, bool *) const
 typedef std::tuple<RawContext, RawDeviceID, Precision, std::string> ProgramKey;
 typedef std::tuple<const RawContext &, const RawDeviceID &, const Precision &, const std::string &> ProgramKeyRef;

-typedef Cache<ProgramKey, Program> ProgramCache;
+typedef Cache<ProgramKey, std::shared_ptr<Program>> ProgramCache;

-extern template class Cache<ProgramKey, Program>;
-extern template Program ProgramCache::Get(const ProgramKeyRef &, bool *) const;
+extern template class Cache<ProgramKey, std::shared_ptr<Program>>;
+extern template std::shared_ptr<Program> ProgramCache::Get(const ProgramKeyRef &, bool *) const;

 // =================================================================================================

--- a/src/clpp11.hpp
+++ b/src/clpp11.hpp
@ -437,47 +437,41 @@ using ContextPointer = cl_context*;
 // C++11 version of 'cl_program'.
 class Program {
 public:
-  Program() = default;

  // Source-based constructor with memory management
-  explicit Program(const Context &context, const std::string &source):
-      program_(new cl_program, [](cl_program* p) {
-        #ifndef _MSC_VER // 'clReleaseProgram' caused an access violation with Visual Studio
-          if (*p) { CheckErrorDtor(clReleaseProgram(*p)); }
-        #endif
-        delete p;
-      }) {
+  explicit Program(const Context &context, const std::string &source) {
    const char *source_ptr = &source[0];
    const auto length = source.length();
    auto status = CL_SUCCESS;
-    *program_ = clCreateProgramWithSource(context(), 1, &source_ptr, &length, &status);
+    program_ = clCreateProgramWithSource(context(), 1, &source_ptr, &length, &status);
    CLCudaAPIError::Check(status, "clCreateProgramWithSource");
  }

  // Binary-based constructor with memory management
-  explicit Program(const Device &device, const Context &context, const std::string &binary):
-      program_(new cl_program, [](cl_program* p) {
-        if (*p) { CheckErrorDtor(clReleaseProgram(*p)); }
-        delete p;
-      }) {
+  explicit Program(const Device &device, const Context &context, const std::string &binary) {
    const char *binary_ptr = &binary[0];
    const auto length = binary.length();
    auto status1 = CL_SUCCESS;
    auto status2 = CL_SUCCESS;
    const auto dev = device();
-    *program_ = clCreateProgramWithBinary(context(), 1, &dev, &length,
+    program_ = clCreateProgramWithBinary(context(), 1, &dev, &length,
                                          reinterpret_cast<const unsigned char**>(&binary_ptr),
                                          &status1, &status2);
    CLCudaAPIError::Check(status1, "clCreateProgramWithBinary (binary status)");
    CLCudaAPIError::Check(status2, "clCreateProgramWithBinary");
  }

+  // Clean-up
+  ~Program() {
+    if (program_) { CheckErrorDtor(clReleaseProgram(program_)); }
+  }
+
  // Compiles the device program and checks whether or not there are any warnings/errors
  void Build(const Device &device, std::vector<std::string> &options) {
    options.push_back("-cl-std=CL1.1");
    auto options_string = std::accumulate(options.begin(), options.end(), std::string{" "});
    const cl_device_id dev = device();
-    CheckError(clBuildProgram(*program_, 1, &dev, options_string.c_str(), nullptr, nullptr));
+    CheckError(clBuildProgram(program_, 1, &dev, options_string.c_str(), nullptr, nullptr));
  }

  // Confirms whether a certain status code is an actual compilation error or warning
@ -489,28 +483,28 @@ class Program {
  std::string GetBuildInfo(const Device &device) const {
    auto bytes = size_t{0};
    auto query = cl_program_build_info{CL_PROGRAM_BUILD_LOG};
-    CheckError(clGetProgramBuildInfo(*program_, device(), query, 0, nullptr, &bytes));
+    CheckError(clGetProgramBuildInfo(program_, device(), query, 0, nullptr, &bytes));
    auto result = std::string{};
    result.resize(bytes);
-    CheckError(clGetProgramBuildInfo(*program_, device(), query, bytes, &result[0], nullptr));
+    CheckError(clGetProgramBuildInfo(program_, device(), query, bytes, &result[0], nullptr));
    return result;
  }

  // Retrieves a binary or an intermediate representation of the compiled program
  std::string GetIR() const {
    auto bytes = size_t{0};
-    CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bytes, nullptr));
+    CheckError(clGetProgramInfo(program_, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bytes, nullptr));
    auto result = std::string{};
    result.resize(bytes);
    auto result_ptr = result.data();
-    CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARIES, sizeof(char*), &result_ptr, nullptr));
+    CheckError(clGetProgramInfo(program_, CL_PROGRAM_BINARIES, sizeof(char*), &result_ptr, nullptr));
    return result;
  }

  // Accessor to the private data-member
-  const cl_program& operator()() const { return *program_; }
+  const cl_program& operator()() const { return program_; }
 private:
-  std::shared_ptr<cl_program> program_;
+  cl_program program_ = nullptr;
 };

 // =================================================================================================
@ -757,13 +751,13 @@ class Kernel {
  }

  // Regular constructor with memory management
-  explicit Kernel(const Program &program, const std::string &name):
+  explicit Kernel(const std::shared_ptr<Program> program, const std::string &name):
      kernel_(new cl_kernel, [](cl_kernel* k) {
        if (*k) { CheckErrorDtor(clReleaseKernel(*k)); }
        delete k;
      }) {
    auto status = CL_SUCCESS;
-    *kernel_ = clCreateKernel(program(), name.c_str(), &status);
+    *kernel_ = clCreateKernel(program->operator()(), name.c_str(), &status);
    CLCudaAPIError::Check(status, "clCreateKernel");
  }

--- a/src/routine.cpp
+++ b/src/routine.cpp
@ -96,10 +96,10 @@ void Routine::InitProgram(std::initializer_list<const char *> source) {
  auto binary = BinaryCache::Instance().Get(BinaryKeyRef{platform_id,  precision_, routine_info, device_name },
                                            &has_binary);
  if (has_binary) {
-    program_ = Program(device_, context_, binary);
-    program_.Build(device_, options);
+    program_ = std::make_shared<Program>(device_, context_, binary);  // construct in place: copying a temporary Program would let its destructor release the shared cl_program handle
+    program_->Build(device_, options);
    ProgramCache::Instance().Store(ProgramKey{ context_(), device_(), precision_, routine_info },
-                                   Program{ program_ });
+                                    std::shared_ptr<Program>{program_});
    return;
  }

@ -135,10 +135,10 @@ void Routine::InitProgram(std::initializer_list<const char *> source) {

  // Store the compiled binary and program in the cache
  BinaryCache::Instance().Store(BinaryKey{platform_id, precision_, routine_info, device_name},
-                                program_.GetIR());
+                                program_->GetIR());

  ProgramCache::Instance().Store(ProgramKey{context_(), device_(), precision_, routine_info},
-                                 Program{ program_ });
+                                 std::shared_ptr<Program>{program_});
 }

 // =================================================================================================
--- a/src/routine.hpp
+++ b/src/routine.hpp
@ -33,6 +33,7 @@ namespace clblast {
 class Routine {
 public:

+  // Initializes db_, fetching cached database or building one
  static void InitDatabase(const Device &device, const std::vector<std::string> &kernel_names,
                           const Precision precision, const std::vector<database::DatabaseEntry> &userDatabase,
                           Databases &db) {
@ -78,9 +79,6 @@ class Routine {
  // Initializes program_, fetching cached program or building one
  void InitProgram(std::initializer_list<const char *> source);

-  // Initializes db_, fetching cached database or building one
-  void InitDatabase(const std::vector<database::DatabaseEntry> &userDatabase);
-
 protected:

  // Non-static variable for the precision
@ -97,7 +95,7 @@ class Routine {
  const Device device_;

  // Compiled program (either retrieved from cache or compiled in slow path)
-  Program program_;
+  std::shared_ptr<Program> program_;

  // Connection to the database for all the device-specific parameters
  Databases db_;
--- a/src/routines/common.cpp
+++ b/src/routines/common.cpp
@ -77,7 +77,7 @@ void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
 // Sets all elements of a matrix to a constant value
 template <typename T>
 void FillMatrix(Queue &queue, const Device &device,
-                const Program &program, const Databases &,
+                const std::shared_ptr<Program> program, const Databases &,
                EventPointer event, const std::vector<Event> &waitForEvents,
                const size_t m, const size_t n, const size_t ld, const size_t offset,
                const Buffer<T> &dest,
@ -95,26 +95,26 @@ void FillMatrix(Queue &queue, const Device &device,
 }

 // Compiles the above function
-template void FillMatrix<half>(Queue&, const Device&, const Program&, const Databases&,
+template void FillMatrix<half>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
                               EventPointer, const std::vector<Event>&, const size_t, const size_t,
                               const size_t, const size_t, const Buffer<half>&, const half);
-template void FillMatrix<float>(Queue&, const Device&, const Program&, const Databases&,
+template void FillMatrix<float>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
                                EventPointer, const std::vector<Event>&, const size_t, const size_t,
                                const size_t, const size_t, const Buffer<float>&, const float);
-template void FillMatrix<double>(Queue&, const Device&, const Program&, const Databases&,
+template void FillMatrix<double>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
                                 EventPointer, const std::vector<Event>&, const size_t, const size_t,
                                 const size_t, const size_t, const Buffer<double>&, const double);
-template void FillMatrix<float2>(Queue&, const Device&, const Program&, const Databases&,
+template void FillMatrix<float2>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
                                 EventPointer, const std::vector<Event>&, const size_t, const size_t,
                                 const size_t, const size_t, const Buffer<float2>&, const float2);
-template void FillMatrix<double2>(Queue&, const Device&, const Program&, const Databases&,
+template void FillMatrix<double2>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
                                  EventPointer, const std::vector<Event>&, const size_t, const size_t,
                                  const size_t, const size_t, const Buffer<double2>&, const double2);

 // Sets all elements of a vector to a constant value
 template <typename T>
 void FillVector(Queue &queue, const Device &device,
-                const Program &program, const Databases &,
+                const std::shared_ptr<Program> program, const Databases &,
                EventPointer event, const std::vector<Event> &waitForEvents,
                const size_t n, const size_t inc, const size_t offset,
                const Buffer<T> &dest,
@ -131,19 +131,19 @@ void FillVector(Queue &queue, const Device &device,
 }

 // Compiles the above function
-template void FillVector<half>(Queue&, const Device&, const Program&, const Databases&,
+template void FillVector<half>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
                               EventPointer, const std::vector<Event>&, const size_t, const size_t,
                               const size_t, const Buffer<half>&, const half);
-template void FillVector<float>(Queue&, const Device&, const Program&, const Databases&,
+template void FillVector<float>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
                                EventPointer, const std::vector<Event>&, const size_t, const size_t,
                                const size_t, const Buffer<float>&, const float);
-template void FillVector<double>(Queue&, const Device&, const Program&, const Databases&,
+template void FillVector<double>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
                                 EventPointer, const std::vector<Event>&, const size_t, const size_t,
                                 const size_t, const Buffer<double>&, const double);
-template void FillVector<float2>(Queue&, const Device&, const Program&, const Databases&,
+template void FillVector<float2>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
                                 EventPointer, const std::vector<Event>&, const size_t, const size_t,
                                 const size_t, const Buffer<float2>&, const float2);
-template void FillVector<double2>(Queue&, const Device&, const Program&, const Databases&,
+template void FillVector<double2>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
                                  EventPointer, const std::vector<Event>&, const size_t, const size_t,
                                  const size_t, const Buffer<double2>&, const double2);

--- a/src/routines/common.hpp
+++ b/src/routines/common.hpp
@ -36,7 +36,7 @@ void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
 // Sets all elements of a matrix to a constant value
 template <typename T>
 void FillMatrix(Queue &queue, const Device &device,
-                const Program &program, const Databases &,
+                const std::shared_ptr<Program> program, const Databases &,
                EventPointer event, const std::vector<Event> &waitForEvents,
                const size_t m, const size_t n, const size_t ld, const size_t offset,
                const Buffer<T> &dest,
@ -45,7 +45,7 @@ void FillMatrix(Queue &queue, const Device &device,
 // Sets all elements of a vector to a constant value
 template <typename T>
 void FillVector(Queue &queue, const Device &device,
-                const Program &program, const Databases &,
+                const std::shared_ptr<Program> program, const Databases &,
                EventPointer event, const std::vector<Event> &waitForEvents,
                const size_t n, const size_t inc, const size_t offset,
                const Buffer<T> &dest,
@ -66,7 +66,7 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device,
                            const size_t dest_ld, const size_t dest_offset,
                            const Buffer<T> &dest,
                            const T alpha,
-                            const Program &program, const bool do_pad,
+                            const std::shared_ptr<Program> program, const bool do_pad,
                            const bool do_transpose, const bool do_conjugate,
                            const bool upper = false, const bool lower = false,
                            const bool diagonal_imag_zero = false) {
@ -186,7 +186,7 @@ void PadCopyTransposeMatrixBatched(Queue &queue, const Device &device,
                                   const size_t dest_one, const size_t dest_two,
                                   const size_t dest_ld, const Buffer<int> &dest_offsets,
                                   const Buffer<T> &dest,
-                                   const Program &program, const bool do_pad,
+                                   const std::shared_ptr<Program> program, const bool do_pad,
                                   const bool do_transpose, const bool do_conjugate,
                                   const size_t batch_count) {

@ -250,7 +250,7 @@ void PadCopyTransposeMatrixStridedBatched(Queue &queue, const Device &device,
                                          const size_t dest_one, const size_t dest_two,
                                          const size_t dest_ld, const size_t dest_offset,
                                          const size_t dest_stride, const Buffer<T> &dest,
-                                          const Program &program, const bool do_pad,
+                                          const std::shared_ptr<Program> program, const bool do_pad,
                                          const bool do_transpose, const bool do_conjugate,
                                          const size_t batch_count) {

--- a/src/tuning/routines/xgemm.cpp
+++ b/src/tuning/routines/xgemm.cpp
@ -15,8 +15,11 @@
 #include <exception>
 #include <string>
 #include <vector>
+#include <fstream>  // std::ifstream, used to check whether a tuning-result file exists on disk
+#include <iostream>

 #include "utilities/utilities.hpp"
+#include "test/test_utilities.hpp"
 #include "tuning/routines/routine_tuner.hpp"

 namespace clblast {
@ -101,6 +103,22 @@ void TuneXgemm(int argc, char* argv[]) {
  const auto context = Context(device);
  auto queue = Queue(context, device);

+  // Pre-load GEMM kernel tuning results if they exist
+  printf("* The GEMM routine tuner requires already tuned kernels\n");
+  printf("  Applying tuning results from disk if they exist...\n\n");
+  const auto kernel_names = {"xgemm_1", "xgemm_direct_1", "copy", "pad", "transpose", "padtranspose"};
+  for (const auto& kernel_name : kernel_names) {
+    const auto tuner_file_name = "clblast_" + std::string{kernel_name} + "_" +
+                                 ToString(static_cast<int>(precision)) + ".json";
+    printf("* Looking for tuning results in the current folder: '%s'\n", tuner_file_name.c_str());
+    if (std::ifstream(tuner_file_name)) { // Checks if the file exists on disk
+      OverrideParametersFromJSONFiles({tuner_file_name}, device(), precision);
+    }
+    else {
+      printf("  Not found: assuming the kernel '%s' is already tuned\n\n", kernel_name);
+    }
+  }
+
  // Run the tuners for the XGEMM routines
  TuneKernelSelection<T>(platform, device, context, queue, precision, RunGemmRoutine<T>,
                         64, 2048, 64, 1, num_runs,
--- a/src/tuning/tuning.cpp
+++ b/src/tuning/tuning.cpp
@ -150,11 +150,11 @@ void Tuner(int argc, char* argv[], const int V,
  const auto device_architecture = GetDeviceArchitecture(device);
  const auto device_name = GetDeviceName(device);

-  // Creates input buffers with random data
+  // Creates input buffers with random data. Adds a 'canary' region to detect buffer overflows.
  const auto buffer_sizes = std::vector<size_t>{
-      settings.size_x, settings.size_y,
-      settings.size_a, settings.size_b, settings.size_c,
-      settings.size_temp
+      settings.size_x + kCanarySize, settings.size_y + kCanarySize,
+      settings.size_a + kCanarySize, settings.size_b + kCanarySize, settings.size_c + kCanarySize,
+      settings.size_temp + kCanarySize
  };
  std::mt19937 mt(kSeed);
  std::uniform_real_distribution<double> dist(kTestDataLowerLimit, kTestDataUpperLimit);
--- a/src/tuning/tuning_api.cpp
+++ b/src/tuning/tuning_api.cpp
@ -241,11 +241,11 @@ StatusCode TunerAPI(Queue &queue, const Arguments<T> &args, const int V,
  const auto device_architecture = GetDeviceArchitecture(device);
  const auto device_name = GetDeviceName(device);

-  // Creates input buffers with random data
+  // Creates input buffers with random data. Adds a 'canary' region to detect buffer overflows.
  const auto buffer_sizes = std::vector<size_t>{
-      settings.size_x, settings.size_y,
-      settings.size_a, settings.size_b, settings.size_c,
-      settings.size_temp
+      settings.size_x + kCanarySize, settings.size_y + kCanarySize,
+      settings.size_a + kCanarySize, settings.size_b + kCanarySize, settings.size_c + kCanarySize,
+      settings.size_temp + kCanarySize
  };
  const auto seed = static_cast<unsigned long>(time(nullptr));
  std::mt19937 mt(seed);
--- a/src/utilities/compile.cpp
+++ b/src/utilities/compile.cpp
@ -21,7 +21,8 @@ namespace clblast {
 // =================================================================================================

 // Compiles a program from source code
-Program CompileFromSource(const std::string &source_string, const Precision precision,
+std::shared_ptr<Program> CompileFromSource(
+                          const std::string &source_string, const Precision precision,
                          const std::string &routine_name,
                          const Device& device, const Context& context,
                          std::vector<std::string>& options,
@ -93,13 +94,13 @@ Program CompileFromSource(const std::string &source_string, const Precision prec
  }

  // Compiles the kernel
-  auto program = Program(context, kernel_string);
+  auto program = std::make_shared<Program>(context, kernel_string);
  try {
-    program.Build(device, options);
+    program->Build(device, options);
  } catch (const CLCudaAPIBuildError &e) {
-    if (program.StatusIsCompilationWarningOrError(e.status()) && !silent) {
+    if (program->StatusIsCompilationWarningOrError(e.status()) && !silent) {
      fprintf(stdout, "OpenCL compiler error/warning:\n%s\n",
-              program.GetBuildInfo(device).c_str());
+              program->GetBuildInfo(device).c_str());
    }
    throw;
  }
--- a/src/utilities/compile.hpp
+++ b/src/utilities/compile.hpp
@ -24,7 +24,8 @@ namespace clblast {
 // =================================================================================================

 // Compiles a program from source code
-Program CompileFromSource(const std::string &source_string, const Precision precision,
+std::shared_ptr<Program> CompileFromSource(
+                          const std::string &source_string, const Precision precision,
                          const std::string &routine_name,
                          const Device& device, const Context& context,
                          std::vector<std::string>& options,
--- a/src/utilities/utilities.hpp
+++ b/src/utilities/utilities.hpp
@ -52,6 +52,9 @@ const std::string kKhronosIntelSubgroups = "cl_intel_subgroups";
 // Catched an unknown error
 constexpr auto kUnknownError = -999;

+// Canary size to add to buffers to check for buffer overflows
+constexpr auto kCanarySize = 127;
+
 // =================================================================================================

 // The routine-specific arguments in string form
--- a/test/correctness/testblas.cpp
+++ b/test/correctness/testblas.cpp
@ -66,14 +66,14 @@ TestBlas<T,U>::TestBlas(const std::vector<std::string> &arguments, const bool si
  const auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
  const auto max_batch_count = *std::max_element(kBatchCounts.begin(), kBatchCounts.end());

-  // Creates test input data
-  x_source_.resize(max_batch_count * std::max(max_vec, max_matvec)*max_inc + max_offset);
-  y_source_.resize(max_batch_count * std::max(max_vec, max_matvec)*max_inc + max_offset);
-  a_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
-  b_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
-  c_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
-  ap_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_mat, max_matvec) + max_offset);
-  scalar_source_.resize(max_batch_count * std::max(max_mat, max_matvec) + max_offset);
+  // Creates test input data. Adds a 'canary' region to detect buffer overflows
+  x_source_.resize(max_batch_count * std::max(max_vec, max_matvec)*max_inc + max_offset + kCanarySize);
+  y_source_.resize(max_batch_count * std::max(max_vec, max_matvec)*max_inc + max_offset + kCanarySize);
+  a_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset + kCanarySize);
+  b_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset + kCanarySize);
+  c_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset + kCanarySize);
+  ap_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_mat, max_matvec) + max_offset + kCanarySize);
+  scalar_source_.resize(max_batch_count * std::max(max_mat, max_matvec) + max_offset + kCanarySize);
  std::mt19937 mt(kSeed);
  std::uniform_real_distribution<double> dist(kTestDataLowerLimit, kTestDataUpperLimit);
  PopulateVector(x_source_, mt, dist);
@ -94,7 +94,16 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
  TestStart("regular behaviour", name);

  // Iterates over all the to-be-tested combinations of arguments
-  for (const auto &args: test_vector) {
+  for (auto &args: test_vector) {
+
+    // Adds a 'canary' region to detect buffer overflows
+    args.x_size += kCanarySize;
+    args.y_size += kCanarySize;
+    args.a_size += kCanarySize;
+    args.b_size += kCanarySize;
+    args.c_size += kCanarySize;
+    args.ap_size += kCanarySize;
+    args.scalar_size += kCanarySize;

    // Prints the current test configuration
    if (verbose_) {
@ -209,6 +218,20 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
        }
      }
    }
+    // Checks the 'canary' region (starts one past the last result index) for buffer overflows
+    for (auto canary_id=size_t{0}; canary_id<kCanarySize; ++canary_id) {
+      auto index = get_index_(args, get_id1_(args) - 1, get_id2_(args) - 1) + 1 + canary_id;
+      if (!TestSimilarity(result1[index], result2[index])) {
+        errors++;
+        if (verbose_) {
+          if (get_id2_(args) == 1) { std::cout << std::endl << "   Buffer overflow index " << index << ": "; }
+          else { std::cout << std::endl << "   Buffer overflow " << index << ": "; }
+          std::cout << " " << ToString(result1[index]) << " (reference) versus ";
+          std::cout << " " << ToString(result2[index]) << " (CLBlast)";
+        }
+      }
+    }
+

    // Report the results
    if (verbose_ && errors > 0) {
--- a/test/routines/level2/xhpr.hpp
+++ b/test/routines/level2/xhpr.hpp
@ -139,7 +139,7 @@ class TestXhpr {
  }

  // Describes how to compute the indices of the result buffer
-  static size_t ResultID1(const Arguments<U> &args) { return args.ap_size - args.ap_offset; }
+  static size_t ResultID1(const Arguments<U> &args) { return GetSizeAP(args) - args.ap_offset; }
  static size_t ResultID2(const Arguments<U> &) { return 1; } // N/A for this routine
  static size_t GetResultIndex(const Arguments<U> &args, const size_t id1, const size_t) {
    return id1 + args.ap_offset;
--- a/test/routines/level2/xhpr2.hpp
+++ b/test/routines/level2/xhpr2.hpp
@ -148,7 +148,7 @@ class TestXhpr2 {
  }

  // Describes how to compute the indices of the result buffer
-  static size_t ResultID1(const Arguments<T> &args) { return args.ap_size - args.ap_offset; }
+  static size_t ResultID1(const Arguments<T> &args) { return GetSizeAP(args) - args.ap_offset; }
  static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
    return id1 + args.ap_offset;
--- a/test/routines/level2/xspr.hpp
+++ b/test/routines/level2/xspr.hpp
@ -139,7 +139,7 @@ class TestXspr {
  }

  // Describes how to compute the indices of the result buffer
-  static size_t ResultID1(const Arguments<T> &args) { return args.ap_size - args.ap_offset; }
+  static size_t ResultID1(const Arguments<T> &args) { return GetSizeAP(args) - args.ap_offset; }
  static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
    return id1 + args.ap_offset;
--- a/test/routines/level2/xspr2.hpp
+++ b/test/routines/level2/xspr2.hpp
@ -148,7 +148,7 @@ class TestXspr2 {
  }

  // Describes how to compute the indices of the result buffer
-  static size_t ResultID1(const Arguments<T> &args) { return args.ap_size - args.ap_offset; }
+  static size_t ResultID1(const Arguments<T> &args) { return GetSizeAP(args) - args.ap_offset; }
  static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
    return id1 + args.ap_offset;
--- a/test/test_utilities.cpp
+++ b/test/test_utilities.cpp
@ -171,6 +171,7 @@ void GetBestParametersFromJSONFile(const std::string& file_name,
      kernel_family.erase(std::remove(kernel_family.begin(), kernel_family.end(), '1'), kernel_family.end());
      kernel_family.erase(std::remove(kernel_family.begin(), kernel_family.end(), '2'), kernel_family.end());
      kernel_family.erase(std::remove(kernel_family.begin(), kernel_family.end(), '3'), kernel_family.end());
+      if (kernel_family == "Xgemmdirect") { kernel_family = "XgemmDirect"; } // more kinds of mismatches
    }

    // Retrieves the best-parameters and sets the override