mirror of
https://github.com/CNugteren/CLBlast.git
synced 2024-08-21 04:22:27 +02:00
Merge pull request #4 from CNugteren/host_code_perf
Host-code performance
This commit is contained in:
commit
d7097a063a
|
@ -1,6 +1,7 @@
|
||||||
|
|
||||||
Development version (next release)
|
Development version (next release)
|
||||||
- Added support for complex conjugate transpose
|
- Added support for complex conjugate transpose
|
||||||
|
- Some host-code performance improvements
|
||||||
- Added level-2 routines:
|
- Added level-2 routines:
|
||||||
SGEMV/DGEMV/CGEMV/ZGEMV
|
SGEMV/DGEMV/CGEMV/ZGEMV
|
||||||
- Added level-3 routines:
|
- Added level-3 routines:
|
||||||
|
|
|
@ -134,8 +134,7 @@ class Platform: public Object {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Accessors to the private data-member
|
// Accessors to the private data-member
|
||||||
cl_platform_id operator()() const { return platform_; }
|
const cl_platform_id& operator()() const { return platform_; }
|
||||||
cl_platform_id& operator()() { return platform_; }
|
|
||||||
private:
|
private:
|
||||||
cl_platform_id platform_;
|
cl_platform_id platform_;
|
||||||
};
|
};
|
||||||
|
@ -193,8 +192,7 @@ class Device: public Object {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Accessors to the private data-member
|
// Accessors to the private data-member
|
||||||
cl_device_id operator()() const { return device_; }
|
const cl_device_id& operator()() const { return device_; }
|
||||||
cl_device_id& operator()() { return device_; }
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
// Helper functions
|
// Helper functions
|
||||||
|
@ -259,8 +257,7 @@ class Context: public ObjectWithState {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Accessors to the private data-member
|
// Accessors to the private data-member
|
||||||
cl_context operator()() const { return context_; }
|
const cl_context& operator()() const { return context_; }
|
||||||
cl_context& operator()() { return context_; }
|
|
||||||
private:
|
private:
|
||||||
cl_context context_;
|
cl_context context_;
|
||||||
};
|
};
|
||||||
|
@ -296,16 +293,6 @@ class Program: public ObjectWithState {
|
||||||
swap(*this, other);
|
swap(*this, other);
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
/*
|
|
||||||
TODO: Implement move construction/assignment?
|
|
||||||
Program(Program &&other) {
|
|
||||||
clRetainProgram(program_);
|
|
||||||
swap(*this, other);
|
|
||||||
}
|
|
||||||
Program& operator=(Program &&other) {
|
|
||||||
swap(*this, other);
|
|
||||||
return *this;
|
|
||||||
}*/
|
|
||||||
friend void swap(Program &first, Program &second) {
|
friend void swap(Program &first, Program &second) {
|
||||||
std::swap(first.length_, second.length_);
|
std::swap(first.length_, second.length_);
|
||||||
std::swap(first.source_, second.source_);
|
std::swap(first.source_, second.source_);
|
||||||
|
@ -327,8 +314,7 @@ class Program: public ObjectWithState {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Accessors to the private data-member
|
// Accessors to the private data-member
|
||||||
cl_program operator()() const { return program_; }
|
const cl_program& operator()() const { return program_; }
|
||||||
cl_program& operator()() { return program_; }
|
|
||||||
private:
|
private:
|
||||||
size_t length_;
|
size_t length_;
|
||||||
std::vector<char> source_;
|
std::vector<char> source_;
|
||||||
|
@ -382,8 +368,7 @@ class Kernel: public ObjectWithState {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Accessors to the private data-member
|
// Accessors to the private data-member
|
||||||
cl_kernel operator()() const { return kernel_; }
|
const cl_kernel& operator()() const { return kernel_; }
|
||||||
cl_kernel& operator()() { return kernel_; }
|
|
||||||
private:
|
private:
|
||||||
cl_kernel kernel_;
|
cl_kernel kernel_;
|
||||||
};
|
};
|
||||||
|
@ -445,8 +430,7 @@ class CommandQueue: public ObjectWithState {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Accessors to the private data-member
|
// Accessors to the private data-member
|
||||||
cl_command_queue operator()() const { return queue_; }
|
const cl_command_queue& operator()() const { return queue_; }
|
||||||
cl_command_queue& operator()() { return queue_; }
|
|
||||||
private:
|
private:
|
||||||
cl_command_queue queue_;
|
cl_command_queue queue_;
|
||||||
};
|
};
|
||||||
|
@ -511,8 +495,7 @@ class Buffer: public ObjectWithState {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Accessors to the private data-member
|
// Accessors to the private data-member
|
||||||
cl_mem operator()() const { return buffer_; }
|
const cl_mem& operator()() const { return buffer_; }
|
||||||
cl_mem& operator()() { return buffer_; }
|
|
||||||
private:
|
private:
|
||||||
cl_mem buffer_;
|
cl_mem buffer_;
|
||||||
};
|
};
|
||||||
|
|
|
@ -33,7 +33,7 @@ class Routine {
|
||||||
const std::string kKhronosHalfPrecision = "cl_khr_fp16";
|
const std::string kKhronosHalfPrecision = "cl_khr_fp16";
|
||||||
const std::string kKhronosDoublePrecision = "cl_khr_fp64";
|
const std::string kKhronosDoublePrecision = "cl_khr_fp64";
|
||||||
|
|
||||||
// New data-type:tThe cache of compiled OpenCL programs, along with some meta-data
|
// The cache of compiled OpenCL programs, along with some meta-data
|
||||||
struct ProgramCache {
|
struct ProgramCache {
|
||||||
Program program;
|
Program program;
|
||||||
std::string device_name;
|
std::string device_name;
|
||||||
|
@ -101,7 +101,7 @@ class Routine {
|
||||||
// Queries the cache and retrieve either a matching program or a boolean whether a match exists.
|
// Queries the cache and retrieve either a matching program or a boolean whether a match exists.
|
||||||
// The first assumes that the program is available in the cache and will throw an exception
|
// The first assumes that the program is available in the cache and will throw an exception
|
||||||
// otherwise.
|
// otherwise.
|
||||||
Program GetProgramFromCache() const;
|
const Program& GetProgramFromCache() const;
|
||||||
bool ProgramIsInCache() const;
|
bool ProgramIsInCache() const;
|
||||||
|
|
||||||
// Non-static variable for the precision. Note that the same variable (but static) might exist in
|
// Non-static variable for the precision. Note that the same variable (but static) might exist in
|
||||||
|
|
|
@ -308,7 +308,7 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
|
||||||
|
|
||||||
// Queries the cache and retrieves a matching program. Assumes that the match is available, throws
|
// Queries the cache and retrieves a matching program. Assumes that the match is available, throws
|
||||||
// otherwise.
|
// otherwise.
|
||||||
Program Routine::GetProgramFromCache() const {
|
const Program& Routine::GetProgramFromCache() const {
|
||||||
for (auto &cached_program: program_cache_) {
|
for (auto &cached_program: program_cache_) {
|
||||||
if (cached_program.MatchInCache(device_name_, precision_, routines_)) {
|
if (cached_program.MatchInCache(device_name_, precision_, routines_)) {
|
||||||
return cached_program.program;
|
return cached_program.program;
|
||||||
|
|
|
@ -60,7 +60,7 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
|
||||||
|
|
||||||
// Retrieves the Xaxpy kernel from the compiled binary
|
// Retrieves the Xaxpy kernel from the compiled binary
|
||||||
try {
|
try {
|
||||||
auto program = GetProgramFromCache();
|
auto& program = GetProgramFromCache();
|
||||||
auto kernel = Kernel(program, kernel_name);
|
auto kernel = Kernel(program, kernel_name);
|
||||||
|
|
||||||
// Sets the kernel arguments
|
// Sets the kernel arguments
|
||||||
|
|
|
@ -102,7 +102,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
|
||||||
auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, m_ceiled*n_ceiled*sizeof(T));
|
auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, m_ceiled*n_ceiled*sizeof(T));
|
||||||
|
|
||||||
// Loads the program from the database
|
// Loads the program from the database
|
||||||
auto program = GetProgramFromCache();
|
auto& program = GetProgramFromCache();
|
||||||
|
|
||||||
// Runs the pre-processing kernels. This transposes the matrices, but also pads zeros to fill
|
// Runs the pre-processing kernels. This transposes the matrices, but also pads zeros to fill
|
||||||
// them up until they reach a certain multiple of size (kernel parameter dependent).
|
// them up until they reach a certain multiple of size (kernel parameter dependent).
|
||||||
|
|
|
@ -100,7 +100,7 @@ StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
|
||||||
|
|
||||||
// Retrieves the Xgemv kernel from the compiled binary
|
// Retrieves the Xgemv kernel from the compiled binary
|
||||||
try {
|
try {
|
||||||
auto program = GetProgramFromCache();
|
auto& program = GetProgramFromCache();
|
||||||
auto kernel = Kernel(program, kernel_name);
|
auto kernel = Kernel(program, kernel_name);
|
||||||
|
|
||||||
// Sets the kernel arguments
|
// Sets the kernel arguments
|
||||||
|
|
|
@ -61,7 +61,7 @@ StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle
|
||||||
// Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm
|
// Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm
|
||||||
// routine afterwards
|
// routine afterwards
|
||||||
try {
|
try {
|
||||||
auto program = GetProgramFromCache();
|
auto& program = GetProgramFromCache();
|
||||||
auto kernel = Kernel(program, kernel_name);
|
auto kernel = Kernel(program, kernel_name);
|
||||||
|
|
||||||
// Sets the arguments for the symmetric-to-squared kernel
|
// Sets the arguments for the symmetric-to-squared kernel
|
||||||
|
|
Loading…
Reference in a new issue