Now stores a shared_ptr to the Program class in the cache

pull/282/head
Cedric Nugteren 2018-05-01 20:34:48 +02:00
parent b2248a17ae
commit 8258321a74
10 changed files with 56 additions and 62 deletions

View File

@ -7,7 +7,7 @@ Development (next version)
- Added support for Intel specific subgroup shuffling extensions for faster GEMM on Intel GPUs
- Re-added a local memory size constraint to the tuners
- Updated and reorganised the CLBlast documentation
- Fixed an access violation when compiled with Visual Studio upon releasing the OpenCL program
- Fixed incorrect releasing of the OpenCL program resulting in segfaults / access violations
- Various minor fixes and enhancements
- Added tuned parameters for various devices (see doc/tuning.md)
- Added non-BLAS level-1 routines:

View File

@ -117,8 +117,8 @@ template std::string BinaryCache::Get(const BinaryKeyRef &, bool *) const;
// =================================================================================================
template class Cache<ProgramKey, Program>;
template Program ProgramCache::Get(const ProgramKeyRef &, bool *) const;
template class Cache<ProgramKey, std::shared_ptr<Program>>;
template std::shared_ptr<Program> ProgramCache::Get(const ProgramKeyRef &, bool *) const;
template void ProgramCache::RemoveBySubset<1, 2>(const ProgramKey &); // precision and routine name
// =================================================================================================

View File

@ -83,10 +83,10 @@ extern template std::string BinaryCache::Get(const BinaryKeyRef &, bool *) const
typedef std::tuple<RawContext, RawDeviceID, Precision, std::string> ProgramKey;
typedef std::tuple<const RawContext &, const RawDeviceID &, const Precision &, const std::string &> ProgramKeyRef;
typedef Cache<ProgramKey, Program> ProgramCache;
typedef Cache<ProgramKey, std::shared_ptr<Program>> ProgramCache;
extern template class Cache<ProgramKey, Program>;
extern template Program ProgramCache::Get(const ProgramKeyRef &, bool *) const;
extern template class Cache<ProgramKey, std::shared_ptr<Program>>;
extern template std::shared_ptr<Program> ProgramCache::Get(const ProgramKeyRef &, bool *) const;
// =================================================================================================

View File

@ -437,47 +437,41 @@ using ContextPointer = cl_context*;
// C++11 version of 'cl_program'.
class Program {
public:
Program() = default;
// Source-based constructor with memory management
explicit Program(const Context &context, const std::string &source):
program_(new cl_program, [](cl_program* p) {
#ifndef _MSC_VER // 'clReleaseProgram' caused an access violation with Visual Studio
if (*p) { CheckErrorDtor(clReleaseProgram(*p)); }
#endif
delete p;
}) {
explicit Program(const Context &context, const std::string &source) {
const char *source_ptr = &source[0];
const auto length = source.length();
auto status = CL_SUCCESS;
*program_ = clCreateProgramWithSource(context(), 1, &source_ptr, &length, &status);
program_ = clCreateProgramWithSource(context(), 1, &source_ptr, &length, &status);
CLCudaAPIError::Check(status, "clCreateProgramWithSource");
}
// Binary-based constructor with memory management
explicit Program(const Device &device, const Context &context, const std::string &binary):
program_(new cl_program, [](cl_program* p) {
if (*p) { CheckErrorDtor(clReleaseProgram(*p)); }
delete p;
}) {
explicit Program(const Device &device, const Context &context, const std::string &binary) {
const char *binary_ptr = &binary[0];
const auto length = binary.length();
auto status1 = CL_SUCCESS;
auto status2 = CL_SUCCESS;
const auto dev = device();
*program_ = clCreateProgramWithBinary(context(), 1, &dev, &length,
program_ = clCreateProgramWithBinary(context(), 1, &dev, &length,
reinterpret_cast<const unsigned char**>(&binary_ptr),
&status1, &status2);
CLCudaAPIError::Check(status1, "clCreateProgramWithBinary (binary status)");
CLCudaAPIError::Check(status2, "clCreateProgramWithBinary");
}
// Clean-up
~Program() {
if (program_) { CheckErrorDtor(clReleaseProgram(program_)); }
}
// Compiles the device program and checks whether or not there are any warnings/errors
void Build(const Device &device, std::vector<std::string> &options) {
options.push_back("-cl-std=CL1.1");
auto options_string = std::accumulate(options.begin(), options.end(), std::string{" "});
const cl_device_id dev = device();
CheckError(clBuildProgram(*program_, 1, &dev, options_string.c_str(), nullptr, nullptr));
CheckError(clBuildProgram(program_, 1, &dev, options_string.c_str(), nullptr, nullptr));
}
// Confirms whether a certain status code is an actual compilation error or warning
@ -489,28 +483,28 @@ class Program {
std::string GetBuildInfo(const Device &device) const {
auto bytes = size_t{0};
auto query = cl_program_build_info{CL_PROGRAM_BUILD_LOG};
CheckError(clGetProgramBuildInfo(*program_, device(), query, 0, nullptr, &bytes));
CheckError(clGetProgramBuildInfo(program_, device(), query, 0, nullptr, &bytes));
auto result = std::string{};
result.resize(bytes);
CheckError(clGetProgramBuildInfo(*program_, device(), query, bytes, &result[0], nullptr));
CheckError(clGetProgramBuildInfo(program_, device(), query, bytes, &result[0], nullptr));
return result;
}
// Retrieves a binary or an intermediate representation of the compiled program
std::string GetIR() const {
auto bytes = size_t{0};
CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bytes, nullptr));
CheckError(clGetProgramInfo(program_, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bytes, nullptr));
auto result = std::string{};
result.resize(bytes);
auto result_ptr = result.data();
CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARIES, sizeof(char*), &result_ptr, nullptr));
CheckError(clGetProgramInfo(program_, CL_PROGRAM_BINARIES, sizeof(char*), &result_ptr, nullptr));
return result;
}
// Accessor to the private data-member
const cl_program& operator()() const { return *program_; }
const cl_program& operator()() const { return program_; }
private:
std::shared_ptr<cl_program> program_;
cl_program program_ = nullptr;
};
// =================================================================================================
@ -757,13 +751,13 @@ class Kernel {
}
// Regular constructor with memory management
explicit Kernel(const Program &program, const std::string &name):
explicit Kernel(const std::shared_ptr<Program> program, const std::string &name):
kernel_(new cl_kernel, [](cl_kernel* k) {
if (*k) { CheckErrorDtor(clReleaseKernel(*k)); }
delete k;
}) {
auto status = CL_SUCCESS;
*kernel_ = clCreateKernel(program(), name.c_str(), &status);
*kernel_ = clCreateKernel(program->operator()(), name.c_str(), &status);
CLCudaAPIError::Check(status, "clCreateKernel");
}

View File

@ -96,10 +96,10 @@ void Routine::InitProgram(std::initializer_list<const char *> source) {
auto binary = BinaryCache::Instance().Get(BinaryKeyRef{platform_id, precision_, routine_info, device_name },
&has_binary);
if (has_binary) {
program_ = Program(device_, context_, binary);
program_.Build(device_, options);
program_ = std::make_shared<Program>(Program(device_, context_, binary));
program_->Build(device_, options);
ProgramCache::Instance().Store(ProgramKey{ context_(), device_(), precision_, routine_info },
Program{ program_ });
std::shared_ptr<Program>{program_});
return;
}
@ -135,10 +135,10 @@ void Routine::InitProgram(std::initializer_list<const char *> source) {
// Store the compiled binary and program in the cache
BinaryCache::Instance().Store(BinaryKey{platform_id, precision_, routine_info, device_name},
program_.GetIR());
program_->GetIR());
ProgramCache::Instance().Store(ProgramKey{context_(), device_(), precision_, routine_info},
Program{ program_ });
std::shared_ptr<Program>{program_});
}
// =================================================================================================

View File

@ -33,6 +33,7 @@ namespace clblast {
class Routine {
public:
// Initializes db_, fetching cached database or building one
static void InitDatabase(const Device &device, const std::vector<std::string> &kernel_names,
const Precision precision, const std::vector<database::DatabaseEntry> &userDatabase,
Databases &db) {
@ -78,9 +79,6 @@ class Routine {
// Initializes program_, fetching cached program or building one
void InitProgram(std::initializer_list<const char *> source);
// Initializes db_, fetching cached database or building one
void InitDatabase(const std::vector<database::DatabaseEntry> &userDatabase);
protected:
// Non-static variable for the precision
@ -97,7 +95,7 @@ class Routine {
const Device device_;
// Compiled program (either retrieved from cache or compiled in slow path)
Program program_;
std::shared_ptr<Program> program_;
// Connection to the database for all the device-specific parameters
Databases db_;

View File

@ -77,7 +77,7 @@ void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
// Sets all elements of a matrix to a constant value
template <typename T>
void FillMatrix(Queue &queue, const Device &device,
const Program &program, const Databases &,
const std::shared_ptr<Program> program, const Databases &,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t m, const size_t n, const size_t ld, const size_t offset,
const Buffer<T> &dest,
@ -95,26 +95,26 @@ void FillMatrix(Queue &queue, const Device &device,
}
// Compiles the above function
template void FillMatrix<half>(Queue&, const Device&, const Program&, const Databases&,
template void FillMatrix<half>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const size_t, const Buffer<half>&, const half);
template void FillMatrix<float>(Queue&, const Device&, const Program&, const Databases&,
template void FillMatrix<float>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const size_t, const Buffer<float>&, const float);
template void FillMatrix<double>(Queue&, const Device&, const Program&, const Databases&,
template void FillMatrix<double>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const size_t, const Buffer<double>&, const double);
template void FillMatrix<float2>(Queue&, const Device&, const Program&, const Databases&,
template void FillMatrix<float2>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const size_t, const Buffer<float2>&, const float2);
template void FillMatrix<double2>(Queue&, const Device&, const Program&, const Databases&,
template void FillMatrix<double2>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const size_t, const Buffer<double2>&, const double2);
// Sets all elements of a vector to a constant value
template <typename T>
void FillVector(Queue &queue, const Device &device,
const Program &program, const Databases &,
const std::shared_ptr<Program> program, const Databases &,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t n, const size_t inc, const size_t offset,
const Buffer<T> &dest,
@ -131,19 +131,19 @@ void FillVector(Queue &queue, const Device &device,
}
// Compiles the above function
template void FillVector<half>(Queue&, const Device&, const Program&, const Databases&,
template void FillVector<half>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const Buffer<half>&, const half);
template void FillVector<float>(Queue&, const Device&, const Program&, const Databases&,
template void FillVector<float>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const Buffer<float>&, const float);
template void FillVector<double>(Queue&, const Device&, const Program&, const Databases&,
template void FillVector<double>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const Buffer<double>&, const double);
template void FillVector<float2>(Queue&, const Device&, const Program&, const Databases&,
template void FillVector<float2>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const Buffer<float2>&, const float2);
template void FillVector<double2>(Queue&, const Device&, const Program&, const Databases&,
template void FillVector<double2>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const Buffer<double2>&, const double2);

View File

@ -36,7 +36,7 @@ void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
// Sets all elements of a matrix to a constant value
template <typename T>
void FillMatrix(Queue &queue, const Device &device,
const Program &program, const Databases &,
const std::shared_ptr<Program> program, const Databases &,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t m, const size_t n, const size_t ld, const size_t offset,
const Buffer<T> &dest,
@ -45,7 +45,7 @@ void FillMatrix(Queue &queue, const Device &device,
// Sets all elements of a vector to a constant value
template <typename T>
void FillVector(Queue &queue, const Device &device,
const Program &program, const Databases &,
const std::shared_ptr<Program> program, const Databases &,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t n, const size_t inc, const size_t offset,
const Buffer<T> &dest,
@ -66,7 +66,7 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device,
const size_t dest_ld, const size_t dest_offset,
const Buffer<T> &dest,
const T alpha,
const Program &program, const bool do_pad,
const std::shared_ptr<Program> program, const bool do_pad,
const bool do_transpose, const bool do_conjugate,
const bool upper = false, const bool lower = false,
const bool diagonal_imag_zero = false) {
@ -186,7 +186,7 @@ void PadCopyTransposeMatrixBatched(Queue &queue, const Device &device,
const size_t dest_one, const size_t dest_two,
const size_t dest_ld, const Buffer<int> &dest_offsets,
const Buffer<T> &dest,
const Program &program, const bool do_pad,
const std::shared_ptr<Program> program, const bool do_pad,
const bool do_transpose, const bool do_conjugate,
const size_t batch_count) {
@ -250,7 +250,7 @@ void PadCopyTransposeMatrixStridedBatched(Queue &queue, const Device &device,
const size_t dest_one, const size_t dest_two,
const size_t dest_ld, const size_t dest_offset,
const size_t dest_stride, const Buffer<T> &dest,
const Program &program, const bool do_pad,
const std::shared_ptr<Program> program, const bool do_pad,
const bool do_transpose, const bool do_conjugate,
const size_t batch_count) {

View File

@ -21,7 +21,8 @@ namespace clblast {
// =================================================================================================
// Compiles a program from source code
Program CompileFromSource(const std::string &source_string, const Precision precision,
std::shared_ptr<Program> CompileFromSource(
const std::string &source_string, const Precision precision,
const std::string &routine_name,
const Device& device, const Context& context,
std::vector<std::string>& options,
@ -93,13 +94,13 @@ Program CompileFromSource(const std::string &source_string, const Precision prec
}
// Compiles the kernel
auto program = Program(context, kernel_string);
auto program = std::make_shared<Program>(context, kernel_string);
try {
program.Build(device, options);
program->Build(device, options);
} catch (const CLCudaAPIBuildError &e) {
if (program.StatusIsCompilationWarningOrError(e.status()) && !silent) {
if (program->StatusIsCompilationWarningOrError(e.status()) && !silent) {
fprintf(stdout, "OpenCL compiler error/warning:\n%s\n",
program.GetBuildInfo(device).c_str());
program->GetBuildInfo(device).c_str());
}
throw;
}

View File

@ -24,7 +24,8 @@ namespace clblast {
// =================================================================================================
// Compiles a program from source code
Program CompileFromSource(const std::string &source_string, const Precision precision,
std::shared_ptr<Program> CompileFromSource(
const std::string &source_string, const Precision precision,
const std::string &routine_name,
const Device& device, const Context& context,
std::vector<std::string>& options,