diff --git a/CHANGELOG b/CHANGELOG index 5f3ef371..4ccd09ec 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -7,7 +7,7 @@ Development (next version) - Added support for Intel specific subgroup shuffling extensions for faster GEMM on Intel GPUs - Re-added a local memory size constraint to the tuners - Updated and reorganised the CLBlast documentation -- Fixed an access violation when compiled with Visual Studio upon releasing the OpenCL program +- Fixed incorrect releasing of the OpenCL program resulting in segfaults / access violations - Various minor fixes and enhancements - Added tuned parameters for various devices (see doc/tuning.md) - Added non-BLAS level-1 routines: diff --git a/src/cache.cpp b/src/cache.cpp index 4b74b0a1..e15a72a5 100644 --- a/src/cache.cpp +++ b/src/cache.cpp @@ -117,8 +117,8 @@ template std::string BinaryCache::Get(const BinaryKeyRef &, bool *) const; // ================================================================================================= -template class Cache; -template Program ProgramCache::Get(const ProgramKeyRef &, bool *) const; +template class Cache>; +template std::shared_ptr ProgramCache::Get(const ProgramKeyRef &, bool *) const; template void ProgramCache::RemoveBySubset<1, 2>(const ProgramKey &); // precision and routine name // ================================================================================================= diff --git a/src/cache.hpp b/src/cache.hpp index 228fbccb..89973f61 100644 --- a/src/cache.hpp +++ b/src/cache.hpp @@ -83,10 +83,10 @@ extern template std::string BinaryCache::Get(const BinaryKeyRef &, bool *) const typedef std::tuple ProgramKey; typedef std::tuple ProgramKeyRef; -typedef Cache ProgramCache; +typedef Cache> ProgramCache; -extern template class Cache; -extern template Program ProgramCache::Get(const ProgramKeyRef &, bool *) const; +extern template class Cache>; +extern template std::shared_ptr ProgramCache::Get(const ProgramKeyRef &, bool *) const; // ================================================================================================= diff --git a/src/clpp11.hpp b/src/clpp11.hpp index c4b721b9..ce6f39cb 100644 --- a/src/clpp11.hpp +++ b/src/clpp11.hpp @@ -437,47 +437,41 @@ using ContextPointer = cl_context*; // C++11 version of 'cl_program'. class Program { public: - Program() = default; // Source-based constructor with memory management - explicit Program(const Context &context, const std::string &source): - program_(new cl_program, [](cl_program* p) { - #ifndef _MSC_VER // 'clReleaseProgram' caused an access violation with Visual Studio - if (*p) { CheckErrorDtor(clReleaseProgram(*p)); } - #endif - delete p; - }) { + explicit Program(const Context &context, const std::string &source) { const char *source_ptr = &source[0]; const auto length = source.length(); auto status = CL_SUCCESS; - *program_ = clCreateProgramWithSource(context(), 1, &source_ptr, &length, &status); + program_ = clCreateProgramWithSource(context(), 1, &source_ptr, &length, &status); CLCudaAPIError::Check(status, "clCreateProgramWithSource"); } // Binary-based constructor with memory management - explicit Program(const Device &device, const Context &context, const std::string &binary): - program_(new cl_program, [](cl_program* p) { - if (*p) { CheckErrorDtor(clReleaseProgram(*p)); } - delete p; - }) { + explicit Program(const Device &device, const Context &context, const std::string &binary) { const char *binary_ptr = &binary[0]; const auto length = binary.length(); auto status1 = CL_SUCCESS; auto status2 = CL_SUCCESS; const auto dev = device(); - *program_ = clCreateProgramWithBinary(context(), 1, &dev, &length, + program_ = clCreateProgramWithBinary(context(), 1, &dev, &length, reinterpret_cast(&binary_ptr), &status1, &status2); CLCudaAPIError::Check(status1, "clCreateProgramWithBinary (binary status)"); CLCudaAPIError::Check(status2, "clCreateProgramWithBinary"); } + // Clean-up + ~Program() { + if (program_) { CheckErrorDtor(clReleaseProgram(program_)); } + } + // Compiles the device program and checks whether or not there are any warnings/errors void Build(const Device &device, std::vector &options) { options.push_back("-cl-std=CL1.1"); auto options_string = std::accumulate(options.begin(), options.end(), std::string{" "}); const cl_device_id dev = device(); - CheckError(clBuildProgram(*program_, 1, &dev, options_string.c_str(), nullptr, nullptr)); + CheckError(clBuildProgram(program_, 1, &dev, options_string.c_str(), nullptr, nullptr)); } // Confirms whether a certain status code is an actual compilation error or warning @@ -489,28 +483,28 @@ class Program { std::string GetBuildInfo(const Device &device) const { auto bytes = size_t{0}; auto query = cl_program_build_info{CL_PROGRAM_BUILD_LOG}; - CheckError(clGetProgramBuildInfo(*program_, device(), query, 0, nullptr, &bytes)); + CheckError(clGetProgramBuildInfo(program_, device(), query, 0, nullptr, &bytes)); auto result = std::string{}; result.resize(bytes); - CheckError(clGetProgramBuildInfo(*program_, device(), query, bytes, &result[0], nullptr)); + CheckError(clGetProgramBuildInfo(program_, device(), query, bytes, &result[0], nullptr)); return result; } // Retrieves a binary or an intermediate representation of the compiled program std::string GetIR() const { auto bytes = size_t{0}; - CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bytes, nullptr)); + CheckError(clGetProgramInfo(program_, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bytes, nullptr)); auto result = std::string{}; result.resize(bytes); auto result_ptr = result.data(); - CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARIES, sizeof(char*), &result_ptr, nullptr)); + CheckError(clGetProgramInfo(program_, CL_PROGRAM_BINARIES, sizeof(char*), &result_ptr, nullptr)); return result; } // Accessor to the private data-member - const cl_program& operator()() const { return *program_; } + const cl_program& operator()() const { return program_; } private: - std::shared_ptr program_; + cl_program program_ = nullptr; }; // ================================================================================================= @@ -757,13 +751,13 @@ class Kernel { } // Regular constructor with memory management - explicit Kernel(const Program &program, const std::string &name): + explicit Kernel(const std::shared_ptr program, const std::string &name): kernel_(new cl_kernel, [](cl_kernel* k) { if (*k) { CheckErrorDtor(clReleaseKernel(*k)); } delete k; }) { auto status = CL_SUCCESS; - *kernel_ = clCreateKernel(program(), name.c_str(), &status); + *kernel_ = clCreateKernel(program->operator()(), name.c_str(), &status); CLCudaAPIError::Check(status, "clCreateKernel"); } diff --git a/src/routine.cpp b/src/routine.cpp index fa5934f6..4caa4d7b 100644 --- a/src/routine.cpp +++ b/src/routine.cpp @@ -96,10 +96,10 @@ void Routine::InitProgram(std::initializer_list source) { auto binary = BinaryCache::Instance().Get(BinaryKeyRef{platform_id, precision_, routine_info, device_name }, &has_binary); if (has_binary) { - program_ = Program(device_, context_, binary); - program_.Build(device_, options); + program_ = std::make_shared(Program(device_, context_, binary)); + program_->Build(device_, options); ProgramCache::Instance().Store(ProgramKey{ context_(), device_(), precision_, routine_info }, - Program{ program_ }); + std::shared_ptr{program_}); return; } @@ -135,10 +135,10 @@ void Routine::InitProgram(std::initializer_list source) { // Store the compiled binary and program in the cache BinaryCache::Instance().Store(BinaryKey{platform_id, precision_, routine_info, device_name}, - program_.GetIR()); + program_->GetIR()); ProgramCache::Instance().Store(ProgramKey{context_(), device_(), precision_, routine_info}, - Program{ program_ }); + std::shared_ptr{program_}); } // ================================================================================================= diff --git a/src/routine.hpp b/src/routine.hpp index 00f7b5cc..8db5e5a9 100644 --- a/src/routine.hpp +++ b/src/routine.hpp @@ -33,6 +33,7 @@ namespace clblast { class Routine { public: + // Initializes db_, fetching cached database or building one static void InitDatabase(const Device &device, const std::vector &kernel_names, const Precision precision, const std::vector &userDatabase, Databases &db) { @@ -78,9 +79,6 @@ class Routine { // Initializes program_, fetching cached program or building one void InitProgram(std::initializer_list source); - // Initializes db_, fetching cached database or building one - void InitDatabase(const std::vector &userDatabase); - protected: // Non-static variable for the precision @@ -97,7 +95,7 @@ class Routine { const Device device_; // Compiled program (either retrieved from cache or compiled in slow path) - Program program_; + std::shared_ptr program_; // Connection to the database for all the device-specific parameters Databases db_; diff --git a/src/routines/common.cpp b/src/routines/common.cpp index a4d1f577..5b80e3f2 100644 --- a/src/routines/common.cpp +++ b/src/routines/common.cpp @@ -77,7 +77,7 @@ void RunKernel(Kernel &kernel, Queue &queue, const Device &device, // Sets all elements of a matrix to a constant value template void FillMatrix(Queue &queue, const Device &device, - const Program &program, const Databases &, + const std::shared_ptr program, const Databases &, EventPointer event, const std::vector &waitForEvents, const size_t m, const size_t n, const size_t ld, const size_t offset, const Buffer &dest, @@ -95,26 +95,26 @@ void FillMatrix(Queue &queue, const Device &device, } // Compiles the above function -template void FillMatrix(Queue&, const Device&, const Program&, const Databases&, +template void FillMatrix(Queue&, const Device&, const std::shared_ptr, const Databases&, EventPointer, const std::vector&, const size_t, const size_t, const size_t, const size_t, const Buffer&, const half); -template void FillMatrix(Queue&, const Device&, const Program&, const Databases&, +template void FillMatrix(Queue&, const Device&, const std::shared_ptr, const Databases&, EventPointer, const std::vector&, const size_t, const size_t, const size_t, const size_t, const Buffer&, const float); -template void FillMatrix(Queue&, const Device&, const Program&, const Databases&, +template void FillMatrix(Queue&, const Device&, const std::shared_ptr, const Databases&, EventPointer, const std::vector&, const size_t, const size_t, const size_t, const size_t, const Buffer&, const double); -template void FillMatrix(Queue&, const Device&, const Program&, const Databases&, +template void FillMatrix(Queue&, const Device&, const std::shared_ptr, const Databases&, EventPointer, const std::vector&, const size_t, const size_t, const size_t, const size_t, const Buffer&, const float2); -template void FillMatrix(Queue&, const Device&, const Program&, const Databases&, +template void FillMatrix(Queue&, const Device&, const std::shared_ptr, const Databases&, EventPointer, const std::vector&, const size_t, const size_t, const size_t, const size_t, const Buffer&, const double2); // Sets all elements of a vector to a constant value template void FillVector(Queue &queue, const Device &device, - const Program &program, const Databases &, + const std::shared_ptr program, const Databases &, EventPointer event, const std::vector &waitForEvents, const size_t n, const size_t inc, const size_t offset, const Buffer &dest, @@ -131,19 +131,19 @@ void FillVector(Queue &queue, const Device &device, } // Compiles the above function -template void FillVector(Queue&, const Device&, const Program&, const Databases&, +template void FillVector(Queue&, const Device&, const std::shared_ptr, const Databases&, EventPointer, const std::vector&, const size_t, const size_t, const size_t, const Buffer&, const half); -template void FillVector(Queue&, const Device&, const Program&, const Databases&, +template void FillVector(Queue&, const Device&, const std::shared_ptr, const Databases&, EventPointer, const std::vector&, const size_t, const size_t, const size_t, const Buffer&, const float); -template void FillVector(Queue&, const Device&, const Program&, const Databases&, +template void FillVector(Queue&, const Device&, const std::shared_ptr, const Databases&, EventPointer, const std::vector&, const size_t, const size_t, const size_t, const Buffer&, const double); -template void FillVector(Queue&, const Device&, const Program&, const Databases&, +template void FillVector(Queue&, const Device&, const std::shared_ptr, const Databases&, EventPointer, const std::vector&, const size_t, const size_t, const size_t, const Buffer&, const float2); -template void FillVector(Queue&, const Device&, const Program&, const Databases&, +template void FillVector(Queue&, const Device&, const std::shared_ptr, const Databases&, EventPointer, const std::vector&, const size_t, const size_t, const size_t, const Buffer&, const double2); diff --git a/src/routines/common.hpp b/src/routines/common.hpp index 6cbe1e1b..b909243d 100644 --- a/src/routines/common.hpp +++ b/src/routines/common.hpp @@ -36,7 +36,7 @@ void RunKernel(Kernel &kernel, Queue &queue, const Device &device, // Sets all elements of a matrix to a constant value template void FillMatrix(Queue &queue, const Device &device, - const Program &program, const Databases &, + const std::shared_ptr program, const Databases &, EventPointer event, const std::vector &waitForEvents, const size_t m, const size_t n, const size_t ld, const size_t offset, const Buffer &dest, @@ -45,7 +45,7 @@ void FillMatrix(Queue &queue, const Device &device, // Sets all elements of a vector to a constant value template void FillVector(Queue &queue, const Device &device, - const Program &program, const Databases &, + const std::shared_ptr program, const Databases &, EventPointer event, const std::vector &waitForEvents, const size_t n, const size_t inc, const size_t offset, const Buffer &dest, @@ -66,7 +66,7 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device, const size_t dest_ld, const size_t dest_offset, const Buffer &dest, const T alpha, - const Program &program, const bool do_pad, + const std::shared_ptr program, const bool do_pad, const bool do_transpose, const bool do_conjugate, const bool upper = false, const bool lower = false, const bool diagonal_imag_zero = false) { @@ -186,7 +186,7 @@ void PadCopyTransposeMatrixBatched(Queue &queue, const Device &device, const size_t dest_one, const size_t dest_two, const size_t dest_ld, const Buffer &dest_offsets, const Buffer &dest, - const Program &program, const bool do_pad, + const std::shared_ptr program, const bool do_pad, const bool do_transpose, const bool do_conjugate, const size_t batch_count) { @@ -250,7 +250,7 @@ void PadCopyTransposeMatrixStridedBatched(Queue &queue, const Device &device, const size_t dest_one, const size_t dest_two, const size_t dest_ld, const size_t dest_offset, const size_t dest_stride, const Buffer &dest, - const Program &program, const bool do_pad, + const std::shared_ptr program, const bool do_pad, const bool do_transpose, const bool do_conjugate, const size_t batch_count) { diff --git a/src/utilities/compile.cpp b/src/utilities/compile.cpp index 65131cca..05c29944 100644 --- a/src/utilities/compile.cpp +++ b/src/utilities/compile.cpp @@ -21,7 +21,8 @@ namespace clblast { // ================================================================================================= // Compiles a program from source code -Program CompileFromSource(const std::string &source_string, const Precision precision, +std::shared_ptr CompileFromSource( + const std::string &source_string, const Precision precision, const std::string &routine_name, const Device& device, const Context& context, std::vector& options, @@ -93,13 +94,13 @@ Program CompileFromSource(const std::string &source_string, const Precision prec } // Compiles the kernel - auto program = Program(context, kernel_string); + auto program = std::make_shared(context, kernel_string); try { - program.Build(device, options); + program->Build(device, options); } catch (const CLCudaAPIBuildError &e) { - if (program.StatusIsCompilationWarningOrError(e.status()) && !silent) { + if (program->StatusIsCompilationWarningOrError(e.status()) && !silent) { fprintf(stdout, "OpenCL compiler error/warning:\n%s\n", - program.GetBuildInfo(device).c_str()); + program->GetBuildInfo(device).c_str()); } throw; } diff --git a/src/utilities/compile.hpp b/src/utilities/compile.hpp index 1b4f4a7a..13e8c363 100644 --- a/src/utilities/compile.hpp +++ b/src/utilities/compile.hpp @@ -24,7 +24,8 @@ namespace clblast { // ================================================================================================= // Compiles a program from source code -Program CompileFromSource(const std::string &source_string, const Precision precision, +std::shared_ptr CompileFromSource( + const std::string &source_string, const Precision precision, const std::string &routine_name, const Device& device, const Context& context, std::vector& options,