Merge branch 'master' into CLBlast-267-convgemm

pull/319/head
Cedric Nugteren 2018-05-19 17:54:27 +02:00
commit cbcd4ff7e8
25 changed files with 169 additions and 93 deletions

View File

@ -6,8 +6,11 @@ Development (next version)
- Improved the performance potential by adding a second tunable GEMM kernel with 2D register tiling
- Added support for Intel specific subgroup shuffling extensions for faster GEMM on Intel GPUs
- Re-added a local memory size constraint to the tuners
- The routine tuners now automatically pick up tuning results from disk from the kernel tuners
- Updated and reorganised the CLBlast documentation
- Added a 'canary' region to check for overflows in the tuner and tests (insipred by clARMOR)
- Fixed an access violation when compiled with Visual Studio upon releasing the OpenCL program
- Fixed incorrect releasing of the OpenCL program resulting in segfaults / access violations
- Various minor fixes and enhancements
- Added tuned parameters for various devices (see doc/tuning.md)
- Added non-BLAS level-1 routines:

View File

@ -424,9 +424,9 @@ if(TUNERS)
install(TARGETS clblast_tuner_${KERNEL} DESTINATION bin)
endforeach()
foreach(ROUTINE_TUNER ${ROUTINE_TUNERS})
add_executable(clblast_tuner_routine_${ROUTINE_TUNER} ${TUNERS_COMMON} src/tuning/routines/${ROUTINE_TUNER}.cpp)
add_executable(clblast_tuner_routine_${ROUTINE_TUNER} ${TUNERS_COMMON} src/tuning/routines/${ROUTINE_TUNER}.cpp test/test_utilities.cpp)
target_link_libraries(clblast_tuner_routine_${ROUTINE_TUNER} clblast)
target_include_directories(clblast_tuner_routine_${ROUTINE_TUNER} PUBLIC $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES> ${API_INCLUDE_DIRS})
target_include_directories(clblast_tuner_routine_${ROUTINE_TUNER} PUBLIC $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES> ${API_INCLUDE_DIRS} ${clblast_SOURCE_DIR})
install(TARGETS clblast_tuner_routine_${ROUTINE_TUNER} DESTINATION bin)
endforeach()
@ -439,6 +439,12 @@ if(TUNERS)
endforeach()
set(ALLTUNERSDEPENDS clblast_tuner_${KERNEL})
endforeach()
foreach(ROUTINE_TUNER ${ROUTINE_TUNERS})
foreach(PRECISION ${PRECISIONS})
set(ALLTUNERS ${ALLTUNERS} COMMAND clblast_tuner_routine_${ROUTINE_TUNER} -precision ${PRECISION})
endforeach()
set(ALLTUNERSDEPENDS clblast_tuner_routine_${ROUTINE_TUNER})
endforeach()
add_custom_target(alltuners ${ALLTUNERS} DEPENDS ${ALLTUNERSDEPENDS})
endif()

View File

@ -4,9 +4,9 @@ CLBlast: The tuned OpenCL BLAS library
| | Build status | Tests on Intel CPU | Tests on NVIDIA GPU | Tests on Intel GPU |
|-----|-----|-----|-----|-----|
| Windows | [![Build Status](https://ci.appveyor.com/api/projects/status/github/cnugteren/clblast?branch=master&svg=true)](https://ci.appveyor.com/project/CNugteren/clblast) | [![Build Status](http://67.207.87.39:8010/badges/clblast-windows-intel-i7-4790k.svg)](http://67.207.87.39:8010/#/builders/106) | [![Build Status](http://67.207.87.39:8010/badges/clblast-windows-nvidia-k5000.svg)](http://67.207.87.39:8010/#/builders/105) | [![Build Status](http://67.207.87.39:8010/badges/clblast-windows-intel-HD4600.svg)](http://67.207.87.39:8010/#/builders/107) |
| Linux | [![Build Status](https://travis-ci.org/CNugteren/CLBlast.svg?branch=master)](https://travis-ci.org/CNugteren/CLBlast/branches) | [![Build Status](http://67.207.87.39:8010/badges/clblast-linux-intel-e5-2620-v4.svg)](http://67.207.87.39:8010/#/builders/97) | [![Build Status](http://67.207.87.39:8010/badges/clblast-linux-nvidia-k80.svg)](http://67.207.87.39:8010/#/builders/98) | N/A |
| OS X | [![Build Status](https://travis-ci.org/CNugteren/CLBlast.svg?branch=master)](https://travis-ci.org/CNugteren/CLBlast/branches) | [![Build Status](http://67.207.87.39:8010/badges/clblast-osx-intel-i5-4278U.svg)](http://67.207.87.39:8010/#/builders/110) | N/A | N/A |
| Windows | [![Build Status](https://ci.appveyor.com/api/projects/status/github/cnugteren/clblast?branch=master&svg=true)](https://ci.appveyor.com/project/CNugteren/clblast) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-windows-intel-i7-4790k.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-windows-intel-i7-4790k) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-windows-nvidia-k5000.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-windows-nvidia-k5000) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-windows-intel-HD4600.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-windows-intel-HD4600) |
| Linux | [![Build Status](https://travis-ci.org/CNugteren/CLBlast.svg?branch=master)](https://travis-ci.org/CNugteren/CLBlast/branches) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-linux-intel-e5-2620-v4.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-linux-intel-e5-2620-v4) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-linux-nvidia-k80.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-linux-nvidia-k80) | N/A |
| OS X | [![Build Status](https://travis-ci.org/CNugteren/CLBlast.svg?branch=master)](https://travis-ci.org/CNugteren/CLBlast/branches) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-osx-intel-i5-4278U.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-osx-intel-i5-4278U) | N/A | N/A |
CLBlast is a modern, lightweight, performant and tunable OpenCL BLAS library written in C++11. It is designed to leverage the full performance potential of a wide variety of OpenCL devices from different vendors, including desktop and laptop GPUs, embedded GPUs, and other accelerators. CLBlast implements BLAS routines: basic linear algebra subprograms operating on vectors and matrices. See [the CLBlast website](https://cnugteren.github.io/clblast) for performance reports on various devices as well as the latest CLBlast news.
@ -78,6 +78,7 @@ More detailed documentation is available in separate files:
* [Tuning for better performance](doc/tuning.md)
* [Testing the library for correctness](doc/testing.md)
* [Bindings / wrappers for other languages](doc/bindings.md)
* [More details on the GEMM kernel](doc/details_gemm.md)
* [Glossary with some terms explained](doc/glossary.md)
@ -133,6 +134,7 @@ Tuning and testing on a variety of OpenCL devices was made possible by:
Hardware/software for this project was contributed by:
* [HPC research group at the University of Bristol](http://uob-hpc.github.io/zoo/) for access to their GPU zoo
* [ArrayFire](http://arrayfire.org) for settings up and supporting Buildbot correctness tests on multiple platforms
* [JetBrains](https://www.jetbrains.com/clion/) for supply a free CLion IDE license for CLBlast developers
* [Travis CI](https://travis-ci.org/CNugteren/CLBlast/branches) and [AppVeyor](https://ci.appveyor.com/project/CNugteren/clblast) for free automated build tests for open-source projects
@ -143,8 +145,8 @@ More information
Further information on CLBlast is available through the following links:
* A 20-minute presentation of CLBlast was given at the GPU Technology Conference in May 2017. A recording is available on the [GTC on-demand website](http://on-demand.gputechconf.com/gtc/2017/video/s7280-nugteren-clblast.mp4) (poor audio quality however) and a full slide-set is also available [as PDF](http://on-demand.gputechconf.com/gtc/2017/presentation/s7280-cedric-nugteren-clblast.pdf).
* More in-depth information and experimental results are also available in a scientific paper titled [CLBlast: A Tuned OpenCL BLAS Library](https://arxiv.org/abs/1705.05249) (May 2017, updated April 2018). For CLTune, the inspiration for the included auto-tuner, see also the [CLTune: A Generic Auto-Tuner for OpenCL Kernels](https://arxiv.org/abs/1703.06503) paper.
* A 20-minute presentation of CLBlast was given at the GPU Technology Conference in May 2017. A recording is available on the [GTC on-demand website](http://on-demand.gputechconf.com/gtc/2017/video/s7280-nugteren-clblast.mp4) (poor audio quality however) and a full slide-set is also available [as PDF](http://on-demand.gputechconf.com/gtc/2017/presentation/s7280-cedric-nugteren-clblast.pdf). An updated version was also presented at IWOCL in May 2018. The slide set can be found [here as PDF](https://cnugteren.github.io/downloads/CLBlastIWOCL18.pdf).
* More in-depth information and experimental results are also available in a scientific paper titled [CLBlast: A Tuned OpenCL BLAS Library](https://arxiv.org/abs/1705.05249) (v1 May 2017, updated to v2 in April 2018). For CLTune, the inspiration for the included auto-tuner, see also the [CLTune: A Generic Auto-Tuner for OpenCL Kernels](https://arxiv.org/abs/1703.06503) paper.
How to cite this work:

View File

@ -18,7 +18,8 @@ This file gives an overview of the main features planned for addition to CLBlast
| [#223](https://github.com/CNugteren/CLBlast/issues/223) | Feb '18 | CNugteren | ✔ | Python OpenCL interface |
| [#237](https://github.com/CNugteren/CLBlast/issues/237) | Mar '18 | CNugteren | ✔ | Making tuning possible from the CLBlast API |
| [#228](https://github.com/CNugteren/CLBlast/issues/228) | Mar-Apr '18 | CNugteren | ✔ | Improving performance for Qualcomm Adreno GPUs |
| [#270](https://github.com/CNugteren/CLBlast/issues/270) | May '18 | CNugteren | | Implement col2im |
| [#267](https://github.com/CNugteren/CLBlast/issues/267) | May '18 | CNugteren | | Merge im2col and GEMM into a direct kernel |
| [#270](https://github.com/CNugteren/CLBlast/issues/270) | July '18 | CNugteren | | Implement col2im |
| - | July '18 | CNugteren | | Add a SYCL interface to the library |
| [#136](https://github.com/CNugteren/CLBlast/issues/136) | ?? | CNugteren | | Implement xAXPBY and xSET |
| [#169](https://github.com/CNugteren/CLBlast/issues/169) | ?? | dividiti | | Problem-specific tuning parameter selection |

View File

@ -0,0 +1,27 @@
CLBlast: Details on the GEMM routine and kernel
================
This document gives a bit more detail on how the GEMM routine is organised and implemented. For other information about CLBlast, see the [main README](../README.md).
GEMM: Two approaches
-------------
CLBlast implements two approaches to GEMM: direct and indirect:
* Direct GEMM: Computing GEMM using a single generic kernel which handles all cases (e.g. all kinds of matrix sizes).
* Indirect GEMM: Computing GEMM using multiple kernels: the main GEMM kernel and a few pre-processing and post-processing kernels. The main kernel makes several assumptions (e.g. sizes need to be multiples of 32), which the other kernels make sure are satisfied. The main kernel is often faster than the generic kernel of the direct approach, but the cost of pre-processing and post-processing kernels can sometimes be high for small sizes or particular devices.
GEMM: In-direct approach
-------------
Similar to the work by Matsumoto et al. ("Performance Tuning of Matrix Multiplication in OpenCL on Different GPUs and CPUs"), the main GEMM kernel makes many assumptions on the input arguments, which are handled by pre-processing and post-processing kernels. These assumptions are e.g. matrix sizes are a multiple of the work-group sizes, offsets are zero, and matrix B is transposed. This is a good solution for larger problem sizes since O(n^2) data movement is typically cheaper than O(n^3) computation, but the hidden constant starts to play a role for smaller n. Therefore, there is also a single-kernel direct version available for those cases, but it shares most of the design and parameters as discussed below.
The main kernel has 14 different parameters, of which some are illustrated in figure 1 in the [CLBlast paper](https://arxiv.org/pdf/1705.05249). The parameters define among others the work-group sizes in 2 dimensions (MWG, NWG), the 2D register tiling configuration (MWI, NWI), the vector widths of both input matrices (VWM, VWN), loop unroll factors (KWI), and whether or not and how to use the local memory.
GEMM: Direct approach
-------------
This is a single-kernel approach that shared many of the parameters for the in-direct kernel. One of the differences is that within the kernel there are checks for incomplete tiles in the m/n/k dimensions, influenced by the tuning parameters and the matrix sizes. These incomplete tiles will run a different part of the code, as they for example cannot benefit from vectorisation. Another difference is that there are dedicated kernels for each a/b transpose requirement: NN, NT, TN, TT for non-transposed and transposed.

View File

@ -82,7 +82,7 @@ Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clbla
The kernels `gemm` and `gemm_direct` have too many parameters to explore. Therefore, they will run in two stages: a first stage with a fixed limited number of parameter combinations, and a second stage with a random selection from a much larger search space. The random fraction is determined by the `fraction` argument on the command-line.
There are also several routine-level tuners. They tune inter-kernel parameters and should only be run after the kernels are tuned. An example is the GEMM routine tuner, which determines when to use the direct or the in-direct GEMM kernel.
There are also several routine-level tuners. They tune inter-kernel parameters and should only be run after the kernels are tuned. However, they do automatically pick up kernel tuning results from the current folder if there are any. An example is the GEMM routine tuner, which determines when to use the direct or the in-direct GEMM kernel.
Using the tuning results
@ -100,8 +100,6 @@ In summary, tuning the entire library for your device can be done as follows (st
python ../scripts/database/database.py . ..
make
After the kernels are tuned, you can run the `clblast_tuner_routine_xgemm` tuner to optimize the high-level GEMM routine, i.e. selecting which method to use: the direct kernel or the in-direct kernel.
Tuning using the API (advanced users only)
-------------

View File

@ -117,8 +117,8 @@ template std::string BinaryCache::Get(const BinaryKeyRef &, bool *) const;
// =================================================================================================
template class Cache<ProgramKey, Program>;
template Program ProgramCache::Get(const ProgramKeyRef &, bool *) const;
template class Cache<ProgramKey, std::shared_ptr<Program>>;
template std::shared_ptr<Program> ProgramCache::Get(const ProgramKeyRef &, bool *) const;
template void ProgramCache::RemoveBySubset<1, 2>(const ProgramKey &); // precision and routine name
// =================================================================================================

View File

@ -83,10 +83,10 @@ extern template std::string BinaryCache::Get(const BinaryKeyRef &, bool *) const
typedef std::tuple<RawContext, RawDeviceID, Precision, std::string> ProgramKey;
typedef std::tuple<const RawContext &, const RawDeviceID &, const Precision &, const std::string &> ProgramKeyRef;
typedef Cache<ProgramKey, Program> ProgramCache;
typedef Cache<ProgramKey, std::shared_ptr<Program>> ProgramCache;
extern template class Cache<ProgramKey, Program>;
extern template Program ProgramCache::Get(const ProgramKeyRef &, bool *) const;
extern template class Cache<ProgramKey, std::shared_ptr<Program>>;
extern template std::shared_ptr<Program> ProgramCache::Get(const ProgramKeyRef &, bool *) const;
// =================================================================================================

View File

@ -437,47 +437,41 @@ using ContextPointer = cl_context*;
// C++11 version of 'cl_program'.
class Program {
public:
Program() = default;
// Source-based constructor with memory management
explicit Program(const Context &context, const std::string &source):
program_(new cl_program, [](cl_program* p) {
#ifndef _MSC_VER // 'clReleaseProgram' caused an access violation with Visual Studio
if (*p) { CheckErrorDtor(clReleaseProgram(*p)); }
#endif
delete p;
}) {
explicit Program(const Context &context, const std::string &source) {
const char *source_ptr = &source[0];
const auto length = source.length();
auto status = CL_SUCCESS;
*program_ = clCreateProgramWithSource(context(), 1, &source_ptr, &length, &status);
program_ = clCreateProgramWithSource(context(), 1, &source_ptr, &length, &status);
CLCudaAPIError::Check(status, "clCreateProgramWithSource");
}
// Binary-based constructor with memory management
explicit Program(const Device &device, const Context &context, const std::string &binary):
program_(new cl_program, [](cl_program* p) {
if (*p) { CheckErrorDtor(clReleaseProgram(*p)); }
delete p;
}) {
explicit Program(const Device &device, const Context &context, const std::string &binary) {
const char *binary_ptr = &binary[0];
const auto length = binary.length();
auto status1 = CL_SUCCESS;
auto status2 = CL_SUCCESS;
const auto dev = device();
*program_ = clCreateProgramWithBinary(context(), 1, &dev, &length,
program_ = clCreateProgramWithBinary(context(), 1, &dev, &length,
reinterpret_cast<const unsigned char**>(&binary_ptr),
&status1, &status2);
CLCudaAPIError::Check(status1, "clCreateProgramWithBinary (binary status)");
CLCudaAPIError::Check(status2, "clCreateProgramWithBinary");
}
// Clean-up
~Program() {
if (program_) { CheckErrorDtor(clReleaseProgram(program_)); }
}
// Compiles the device program and checks whether or not there are any warnings/errors
void Build(const Device &device, std::vector<std::string> &options) {
options.push_back("-cl-std=CL1.1");
auto options_string = std::accumulate(options.begin(), options.end(), std::string{" "});
const cl_device_id dev = device();
CheckError(clBuildProgram(*program_, 1, &dev, options_string.c_str(), nullptr, nullptr));
CheckError(clBuildProgram(program_, 1, &dev, options_string.c_str(), nullptr, nullptr));
}
// Confirms whether a certain status code is an actual compilation error or warning
@ -489,28 +483,28 @@ class Program {
std::string GetBuildInfo(const Device &device) const {
auto bytes = size_t{0};
auto query = cl_program_build_info{CL_PROGRAM_BUILD_LOG};
CheckError(clGetProgramBuildInfo(*program_, device(), query, 0, nullptr, &bytes));
CheckError(clGetProgramBuildInfo(program_, device(), query, 0, nullptr, &bytes));
auto result = std::string{};
result.resize(bytes);
CheckError(clGetProgramBuildInfo(*program_, device(), query, bytes, &result[0], nullptr));
CheckError(clGetProgramBuildInfo(program_, device(), query, bytes, &result[0], nullptr));
return result;
}
// Retrieves a binary or an intermediate representation of the compiled program
std::string GetIR() const {
auto bytes = size_t{0};
CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bytes, nullptr));
CheckError(clGetProgramInfo(program_, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bytes, nullptr));
auto result = std::string{};
result.resize(bytes);
auto result_ptr = result.data();
CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARIES, sizeof(char*), &result_ptr, nullptr));
CheckError(clGetProgramInfo(program_, CL_PROGRAM_BINARIES, sizeof(char*), &result_ptr, nullptr));
return result;
}
// Accessor to the private data-member
const cl_program& operator()() const { return *program_; }
const cl_program& operator()() const { return program_; }
private:
std::shared_ptr<cl_program> program_;
cl_program program_ = nullptr;
};
// =================================================================================================
@ -757,13 +751,13 @@ class Kernel {
}
// Regular constructor with memory management
explicit Kernel(const Program &program, const std::string &name):
explicit Kernel(const std::shared_ptr<Program> program, const std::string &name):
kernel_(new cl_kernel, [](cl_kernel* k) {
if (*k) { CheckErrorDtor(clReleaseKernel(*k)); }
delete k;
}) {
auto status = CL_SUCCESS;
*kernel_ = clCreateKernel(program(), name.c_str(), &status);
*kernel_ = clCreateKernel(program->operator()(), name.c_str(), &status);
CLCudaAPIError::Check(status, "clCreateKernel");
}

View File

@ -96,10 +96,10 @@ void Routine::InitProgram(std::initializer_list<const char *> source) {
auto binary = BinaryCache::Instance().Get(BinaryKeyRef{platform_id, precision_, routine_info, device_name },
&has_binary);
if (has_binary) {
program_ = Program(device_, context_, binary);
program_.Build(device_, options);
program_ = std::make_shared<Program>(Program(device_, context_, binary));
program_->Build(device_, options);
ProgramCache::Instance().Store(ProgramKey{ context_(), device_(), precision_, routine_info },
Program{ program_ });
std::shared_ptr<Program>{program_});
return;
}
@ -135,10 +135,10 @@ void Routine::InitProgram(std::initializer_list<const char *> source) {
// Store the compiled binary and program in the cache
BinaryCache::Instance().Store(BinaryKey{platform_id, precision_, routine_info, device_name},
program_.GetIR());
program_->GetIR());
ProgramCache::Instance().Store(ProgramKey{context_(), device_(), precision_, routine_info},
Program{ program_ });
std::shared_ptr<Program>{program_});
}
// =================================================================================================

View File

@ -33,6 +33,7 @@ namespace clblast {
class Routine {
public:
// Initializes db_, fetching cached database or building one
static void InitDatabase(const Device &device, const std::vector<std::string> &kernel_names,
const Precision precision, const std::vector<database::DatabaseEntry> &userDatabase,
Databases &db) {
@ -78,9 +79,6 @@ class Routine {
// Initializes program_, fetching cached program or building one
void InitProgram(std::initializer_list<const char *> source);
// Initializes db_, fetching cached database or building one
void InitDatabase(const std::vector<database::DatabaseEntry> &userDatabase);
protected:
// Non-static variable for the precision
@ -97,7 +95,7 @@ class Routine {
const Device device_;
// Compiled program (either retrieved from cache or compiled in slow path)
Program program_;
std::shared_ptr<Program> program_;
// Connection to the database for all the device-specific parameters
Databases db_;

View File

@ -77,7 +77,7 @@ void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
// Sets all elements of a matrix to a constant value
template <typename T>
void FillMatrix(Queue &queue, const Device &device,
const Program &program, const Databases &,
const std::shared_ptr<Program> program, const Databases &,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t m, const size_t n, const size_t ld, const size_t offset,
const Buffer<T> &dest,
@ -95,26 +95,26 @@ void FillMatrix(Queue &queue, const Device &device,
}
// Compiles the above function
template void FillMatrix<half>(Queue&, const Device&, const Program&, const Databases&,
template void FillMatrix<half>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const size_t, const Buffer<half>&, const half);
template void FillMatrix<float>(Queue&, const Device&, const Program&, const Databases&,
template void FillMatrix<float>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const size_t, const Buffer<float>&, const float);
template void FillMatrix<double>(Queue&, const Device&, const Program&, const Databases&,
template void FillMatrix<double>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const size_t, const Buffer<double>&, const double);
template void FillMatrix<float2>(Queue&, const Device&, const Program&, const Databases&,
template void FillMatrix<float2>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const size_t, const Buffer<float2>&, const float2);
template void FillMatrix<double2>(Queue&, const Device&, const Program&, const Databases&,
template void FillMatrix<double2>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const size_t, const Buffer<double2>&, const double2);
// Sets all elements of a vector to a constant value
template <typename T>
void FillVector(Queue &queue, const Device &device,
const Program &program, const Databases &,
const std::shared_ptr<Program> program, const Databases &,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t n, const size_t inc, const size_t offset,
const Buffer<T> &dest,
@ -131,19 +131,19 @@ void FillVector(Queue &queue, const Device &device,
}
// Compiles the above function
template void FillVector<half>(Queue&, const Device&, const Program&, const Databases&,
template void FillVector<half>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const Buffer<half>&, const half);
template void FillVector<float>(Queue&, const Device&, const Program&, const Databases&,
template void FillVector<float>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const Buffer<float>&, const float);
template void FillVector<double>(Queue&, const Device&, const Program&, const Databases&,
template void FillVector<double>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const Buffer<double>&, const double);
template void FillVector<float2>(Queue&, const Device&, const Program&, const Databases&,
template void FillVector<float2>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const Buffer<float2>&, const float2);
template void FillVector<double2>(Queue&, const Device&, const Program&, const Databases&,
template void FillVector<double2>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const Buffer<double2>&, const double2);

View File

@ -36,7 +36,7 @@ void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
// Sets all elements of a matrix to a constant value
template <typename T>
void FillMatrix(Queue &queue, const Device &device,
const Program &program, const Databases &,
const std::shared_ptr<Program> program, const Databases &,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t m, const size_t n, const size_t ld, const size_t offset,
const Buffer<T> &dest,
@ -45,7 +45,7 @@ void FillMatrix(Queue &queue, const Device &device,
// Sets all elements of a vector to a constant value
template <typename T>
void FillVector(Queue &queue, const Device &device,
const Program &program, const Databases &,
const std::shared_ptr<Program> program, const Databases &,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t n, const size_t inc, const size_t offset,
const Buffer<T> &dest,
@ -66,7 +66,7 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device,
const size_t dest_ld, const size_t dest_offset,
const Buffer<T> &dest,
const T alpha,
const Program &program, const bool do_pad,
const std::shared_ptr<Program> program, const bool do_pad,
const bool do_transpose, const bool do_conjugate,
const bool upper = false, const bool lower = false,
const bool diagonal_imag_zero = false) {
@ -186,7 +186,7 @@ void PadCopyTransposeMatrixBatched(Queue &queue, const Device &device,
const size_t dest_one, const size_t dest_two,
const size_t dest_ld, const Buffer<int> &dest_offsets,
const Buffer<T> &dest,
const Program &program, const bool do_pad,
const std::shared_ptr<Program> program, const bool do_pad,
const bool do_transpose, const bool do_conjugate,
const size_t batch_count) {
@ -250,7 +250,7 @@ void PadCopyTransposeMatrixStridedBatched(Queue &queue, const Device &device,
const size_t dest_one, const size_t dest_two,
const size_t dest_ld, const size_t dest_offset,
const size_t dest_stride, const Buffer<T> &dest,
const Program &program, const bool do_pad,
const std::shared_ptr<Program> program, const bool do_pad,
const bool do_transpose, const bool do_conjugate,
const size_t batch_count) {

View File

@ -15,8 +15,10 @@
#include <exception>
#include <string>
#include <vector>
#include <iostream>
#include "utilities/utilities.hpp"
#include "test/test_utilities.hpp"
#include "tuning/routines/routine_tuner.hpp"
namespace clblast {
@ -101,6 +103,22 @@ void TuneXgemm(int argc, char* argv[]) {
const auto context = Context(device);
auto queue = Queue(context, device);
// Pre-load GEMM kernel tuning results if they exist
printf("* The GEMM routine tuner requires already tuned kernels\n");
printf(" Applying tuning results from disk if they exist...\n\n");
const auto kernel_names = {"xgemm_1", "xgemm_direct_1", "copy", "pad", "transpose", "padtranspose"};
for (const auto& kernel_name : kernel_names) {
const auto tuner_file_name = "clblast_" + std::string{kernel_name} + "_" +
ToString(static_cast<int>(precision)) + ".json";
printf("* Looking for tuning results in the current folder: '%s'\n", tuner_file_name.c_str());
if (std::ifstream(tuner_file_name)) { // Checks if the file exists on disk
OverrideParametersFromJSONFiles({tuner_file_name}, device(), precision);
}
else {
printf(" Not found: assuming the kernel '%s' is already tuned\n\n", kernel_name);
}
}
// Run the tuners for the XGEMM routines
TuneKernelSelection<T>(platform, device, context, queue, precision, RunGemmRoutine<T>,
64, 2048, 64, 1, num_runs,

View File

@ -150,11 +150,11 @@ void Tuner(int argc, char* argv[], const int V,
const auto device_architecture = GetDeviceArchitecture(device);
const auto device_name = GetDeviceName(device);
// Creates input buffers with random data
// Creates input buffers with random data. Adds a 'canary' region to detect buffer overflows.
const auto buffer_sizes = std::vector<size_t>{
settings.size_x, settings.size_y,
settings.size_a, settings.size_b, settings.size_c,
settings.size_temp
settings.size_x + kCanarySize, settings.size_y + kCanarySize,
settings.size_a + kCanarySize, settings.size_b + kCanarySize, settings.size_c + kCanarySize,
settings.size_temp + kCanarySize
};
std::mt19937 mt(kSeed);
std::uniform_real_distribution<double> dist(kTestDataLowerLimit, kTestDataUpperLimit);

View File

@ -241,11 +241,11 @@ StatusCode TunerAPI(Queue &queue, const Arguments<T> &args, const int V,
const auto device_architecture = GetDeviceArchitecture(device);
const auto device_name = GetDeviceName(device);
// Creates input buffers with random data
// Creates input buffers with random data. Adds a 'canary' region to detect buffer overflows.
const auto buffer_sizes = std::vector<size_t>{
settings.size_x, settings.size_y,
settings.size_a, settings.size_b, settings.size_c,
settings.size_temp
settings.size_x + kCanarySize, settings.size_y + kCanarySize,
settings.size_a + kCanarySize, settings.size_b + kCanarySize, settings.size_c + kCanarySize,
settings.size_temp + kCanarySize
};
const auto seed = static_cast<unsigned long>(time(nullptr));
std::mt19937 mt(seed);

View File

@ -21,7 +21,8 @@ namespace clblast {
// =================================================================================================
// Compiles a program from source code
Program CompileFromSource(const std::string &source_string, const Precision precision,
std::shared_ptr<Program> CompileFromSource(
const std::string &source_string, const Precision precision,
const std::string &routine_name,
const Device& device, const Context& context,
std::vector<std::string>& options,
@ -93,13 +94,13 @@ Program CompileFromSource(const std::string &source_string, const Precision prec
}
// Compiles the kernel
auto program = Program(context, kernel_string);
auto program = std::make_shared<Program>(context, kernel_string);
try {
program.Build(device, options);
program->Build(device, options);
} catch (const CLCudaAPIBuildError &e) {
if (program.StatusIsCompilationWarningOrError(e.status()) && !silent) {
if (program->StatusIsCompilationWarningOrError(e.status()) && !silent) {
fprintf(stdout, "OpenCL compiler error/warning:\n%s\n",
program.GetBuildInfo(device).c_str());
program->GetBuildInfo(device).c_str());
}
throw;
}

View File

@ -24,7 +24,8 @@ namespace clblast {
// =================================================================================================
// Compiles a program from source code
Program CompileFromSource(const std::string &source_string, const Precision precision,
std::shared_ptr<Program> CompileFromSource(
const std::string &source_string, const Precision precision,
const std::string &routine_name,
const Device& device, const Context& context,
std::vector<std::string>& options,

View File

@ -52,6 +52,9 @@ const std::string kKhronosIntelSubgroups = "cl_intel_subgroups";
// Catched an unknown error
constexpr auto kUnknownError = -999;
// Canary size to add to buffers to check for buffer overflows
constexpr auto kCanarySize = 127;
// =================================================================================================
// The routine-specific arguments in string form

View File

@ -66,14 +66,14 @@ TestBlas<T,U>::TestBlas(const std::vector<std::string> &arguments, const bool si
const auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
const auto max_batch_count = *std::max_element(kBatchCounts.begin(), kBatchCounts.end());
// Creates test input data
x_source_.resize(max_batch_count * std::max(max_vec, max_matvec)*max_inc + max_offset);
y_source_.resize(max_batch_count * std::max(max_vec, max_matvec)*max_inc + max_offset);
a_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
b_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
c_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
ap_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_mat, max_matvec) + max_offset);
scalar_source_.resize(max_batch_count * std::max(max_mat, max_matvec) + max_offset);
// Creates test input data. Adds a 'canary' region to detect buffer overflows
x_source_.resize(max_batch_count * std::max(max_vec, max_matvec)*max_inc + max_offset + kCanarySize);
y_source_.resize(max_batch_count * std::max(max_vec, max_matvec)*max_inc + max_offset + kCanarySize);
a_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset + kCanarySize);
b_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset + kCanarySize);
c_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset + kCanarySize);
ap_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_mat, max_matvec) + max_offset + kCanarySize);
scalar_source_.resize(max_batch_count * std::max(max_mat, max_matvec) + max_offset + kCanarySize);
std::mt19937 mt(kSeed);
std::uniform_real_distribution<double> dist(kTestDataLowerLimit, kTestDataUpperLimit);
PopulateVector(x_source_, mt, dist);
@ -94,7 +94,16 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
TestStart("regular behaviour", name);
// Iterates over all the to-be-tested combinations of arguments
for (const auto &args: test_vector) {
for (auto &args: test_vector) {
// Adds a 'canary' region to detect buffer overflows
args.x_size += kCanarySize;
args.y_size += kCanarySize;
args.a_size += kCanarySize;
args.b_size += kCanarySize;
args.c_size += kCanarySize;
args.ap_size += kCanarySize;
args.scalar_size += kCanarySize;
// Prints the current test configuration
if (verbose_) {
@ -209,6 +218,20 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
}
}
}
// Checks for differences in the 'canary' region to detect buffer overflows
for (auto canary_id=size_t{0}; canary_id<kCanarySize; ++canary_id) {
auto index = get_index_(args, get_id1_(args) - 1, get_id2_(args) - 1) + canary_id;
if (!TestSimilarity(result1[index], result2[index])) {
errors++;
if (verbose_) {
if (get_id2_(args) == 1) { std::cout << std::endl << " Buffer overflow index " << index << ": "; }
else { std::cout << std::endl << " Buffer overflow " << index << ": "; }
std::cout << " " << ToString(result1[index]) << " (reference) versus ";
std::cout << " " << ToString(result2[index]) << " (CLBlast)";
}
}
}
// Report the results
if (verbose_ && errors > 0) {

View File

@ -139,7 +139,7 @@ class TestXhpr {
}
// Describes how to compute the indices of the result buffer
static size_t ResultID1(const Arguments<U> &args) { return args.ap_size - args.ap_offset; }
static size_t ResultID1(const Arguments<U> &args) { return GetSizeAP(args) - args.ap_offset; }
static size_t ResultID2(const Arguments<U> &) { return 1; } // N/A for this routine
static size_t GetResultIndex(const Arguments<U> &args, const size_t id1, const size_t) {
return id1 + args.ap_offset;

View File

@ -148,7 +148,7 @@ class TestXhpr2 {
}
// Describes how to compute the indices of the result buffer
static size_t ResultID1(const Arguments<T> &args) { return args.ap_size - args.ap_offset; }
static size_t ResultID1(const Arguments<T> &args) { return GetSizeAP(args) - args.ap_offset; }
static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
return id1 + args.ap_offset;

View File

@ -139,7 +139,7 @@ class TestXspr {
}
// Describes how to compute the indices of the result buffer
static size_t ResultID1(const Arguments<T> &args) { return args.ap_size - args.ap_offset; }
static size_t ResultID1(const Arguments<T> &args) { return GetSizeAP(args) - args.ap_offset; }
static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
return id1 + args.ap_offset;

View File

@ -148,7 +148,7 @@ class TestXspr2 {
}
// Describes how to compute the indices of the result buffer
static size_t ResultID1(const Arguments<T> &args) { return args.ap_size - args.ap_offset; }
static size_t ResultID1(const Arguments<T> &args) { return GetSizeAP(args) - args.ap_offset; }
static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
return id1 + args.ap_offset;

View File

@ -171,6 +171,7 @@ void GetBestParametersFromJSONFile(const std::string& file_name,
kernel_family.erase(std::remove(kernel_family.begin(), kernel_family.end(), '1'), kernel_family.end());
kernel_family.erase(std::remove(kernel_family.begin(), kernel_family.end(), '2'), kernel_family.end());
kernel_family.erase(std::remove(kernel_family.begin(), kernel_family.end(), '3'), kernel_family.end());
if (kernel_family == "Xgemmdirect") { kernel_family = "XgemmDirect"; } // more kinds of mismatches
}
// Retrieves the best-parameters and sets the override