First version of the tuning API, added interface for copy-kernel, added sample

View File

@ -210,7 +210,7 @@ set(PRECISIONS 32 64 3232 6464 16)
# Sample programs
set(SAMPLE_PROGRAMS_CPP sgemm sgemm_batched dtrsm)
set(SAMPLE_PROGRAMS_CPP sgemm sgemm_batched dtrsm tuning_api)
set(SAMPLE_PROGRAMS_C sasum dgemv sgemm haxpy cache)
@ -235,6 +235,8 @@ set(SOURCES
src/routines/levelx/xinvert.cpp # only source, don't include it as a test
set(HEADERS # such that they can be discovered by IDEs such as CLion and Visual Studio
@ -258,6 +260,9 @@ set(HEADERS # such that they can be discovered by IDEs such as CLion and Visual
set(SOURCES ${SOURCES} src/clblast.cpp src/clblast_c.cpp)
@ -295,6 +300,9 @@ foreach(DATABASE ${DATABASES})
set(HEADERS ${HEADERS} src/database/kernels/${DATABASE}/${DATABASE}_3232.hpp)
set(HEADERS ${HEADERS} src/database/kernels/${DATABASE}/${DATABASE}_6464.hpp)
foreach(KERNEL ${KERNELS})
set(HEADERS ${HEADERS} src/tuning/kernels/${KERNEL}.hpp)
# Creates and links the library

View File

@ -705,6 +705,12 @@ StatusCode PUBLIC_API OverrideParameters(const cl_device_id device, const std::s
// =================================================================================================
// Tunes the fast matrix-copy kernel at run time for data-type T, using the device and context of
// the supplied OpenCL command queue and an m-by-n problem size. The 'fraction' argument controls
// how much of the search space is explored (the sample program documents it as a value between
// 0.0 and 1.0). On success the best-found parameter names and values are stored in 'parameters',
// which can afterwards be passed to OverrideParameters. Returns a CLBlast status code.
template <typename T>
StatusCode PUBLIC_API TuneCopyMatrixFast(cl_command_queue* queue, const size_t m, const size_t n,
const double fraction, std::unordered_map<std::string,size_t> &parameters);
// =================================================================================================
} // namespace clblast

View File

@ -0,0 +1,77 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
// Author(s):
// Cedric Nugteren <>
// This file demonstrates the use of the runtime tuning API. It is a stand-alone example, but it
// does require the Khronos C++ OpenCL API header file (downloaded by CMake).
// =================================================================================================
#include <cstdio>
#include <chrono>
#include <vector>
#define CL_USE_DEPRECATED_OPENCL_1_1_APIS // to disable deprecation warnings
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings
// Includes the C++ OpenCL API: the Khronos "cl.hpp" header (downloaded by CMake)
#include "cl.hpp"
// Includes the CLBlast library
#include <clblast.h>
// =================================================================================================
// Demonstrates the run-time tuning API: tunes the fast copy kernel in single precision on the
// selected OpenCL device, prints the parameters that were found, and installs them through
// OverrideParameters. Returns 0 when the example ran to completion (the CLBlast status codes are
// printed, not returned) and 1 when the requested platform or device is not available.
int main() {

  // OpenCL platform/device settings (unsigned, since they are compared against container sizes)
  const auto platform_id = size_t{0};
  const auto device_id = size_t{0};

  // Example arguments
  const size_t m = 128;
  const size_t n = 64;
  const auto fraction = 1.0; // between 0.0 and 1.0

  // Initializes the OpenCL platform. The list has to be queried first: without this call the
  // vector stays empty and the example would always bail out below.
  auto platforms = std::vector<cl::Platform>();
  cl::Platform::get(&platforms);
  if (platforms.size() == 0 || platform_id >= platforms.size()) { return 1; }
  auto platform = platforms[platform_id];

  // Initializes the OpenCL device
  auto devices = std::vector<cl::Device>();
  platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
  if (devices.size() == 0 || device_id >= devices.size()) { return 1; }
  auto device = devices[device_id];

  // Creates the OpenCL context and queue
  auto device_as_vector = std::vector<cl::Device>{device};
  auto context = cl::Context(device_as_vector);
  auto queue = cl::CommandQueue(context, device);

  // Performs the tuning. The API takes a pointer to a plain OpenCL queue, so the raw handle is
  // first extracted from the C++ wrapper.
  printf("Starting the tuning...\n");
  std::unordered_map<std::string,size_t> parameters;
  auto queue_plain = queue();
  auto status = clblast::TuneCopyMatrixFast<float>(&queue_plain, m, n, fraction, parameters);

  // Tuning completed. See "clblast.h" for status codes (0 -> success).
  printf("Completed TuneCopyMatrixFast with status %d (0 == OK), found parameters:\n", static_cast<int>(status));
  for (const auto &parameter: parameters) { // by reference: avoids copying every map entry
    printf("> %s = %zu\n", parameter.first.c_str(), parameter.second);
  }

  // Set the new parameters
  status = clblast::OverrideParameters(device(), "Copy", clblast::Precision::kSingle, parameters);
  printf("Completed OverrideParameters with status %d (0 == OK)\n", static_cast<int>(status));
  return 0;
}
// =================================================================================================

View File

@ -121,6 +121,16 @@ void Tuner(int argc, char* argv[], const int V,
SetConstraintsFunc SetConstraints,
SetArgumentsFunc<T> SetArguments);
// Function to run the tuners through the CLBlast API, no I/O. Tuning is performed on the device
// and context belonging to 'queue'. The callbacks supply the tuner defaults and settings, the
// argument validation, the parameter constraints and the kernel-argument setup; 'V' is forwarded
// unchanged to each of them. Returns a status code and, on success, stores the best-performing
// parameter configuration in 'parameters'.
template <typename T>
StatusCode TunerAPI(Queue &queue, const Arguments<T> &args, const int V,
const GetTunerDefaultsFunc GetTunerDefaults,
const GetTunerSettingsFunc<T> GetTunerSettings,
const TestValidArgumentsFunc<T> TestValidArguments,
const SetConstraintsFunc SetConstraints,
const SetArgumentsFunc<T> SetArguments,
std::unordered_map<std::string,size_t> &parameters);
// =================================================================================================
} // namespace clblast

View File

@ -0,0 +1,232 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
// Author(s):
// Cedric Nugteren <>
// This file implements the run-time tuning API of CLBlast: the public TuneCopyMatrixFast entry
// point and the TunerAPI driver behind it, which performs the tuning without any console I/O.
// =================================================================================================
#include <vector>
#include <string>
#include <random>
#include <utility>
#include <algorithm>
#include <cstdio>
#include "tuning/tuning.hpp"
#include "tuning/kernels/copy_fast.hpp"
namespace clblast {
// =================================================================================================
// Public tuning entry point for the fast copy kernel. Packs the problem size (m, n) and the
// search fraction into an Arguments<T> structure, wraps the caller's raw command queue, and
// forwards everything to TunerAPI with V = 0 together with the tuner callbacks (presumably the
// ones declared in "tuning/kernels/copy_fast.hpp" - confirm). The best parameters end up in
// 'parameters'; the status code of TunerAPI is returned as-is.
// NOTE(review): 'queue' is dereferenced without a null check.
template <typename T>
StatusCode TuneCopyMatrixFast(RawCommandQueue * queue, const size_t m, const size_t n,
const double fraction, std::unordered_map<std::string,size_t> &parameters) {
auto args = Arguments<T>();
args.m = m;
args.n = n;
args.fraction = fraction;
auto queue_cpp = Queue(*queue); // wraps the raw queue handle in the internal Queue class
return TunerAPI<T>(queue_cpp, args, 0, GetTunerDefaults, GetTunerSettings<T>,
TestValidArguments<T>, SetConstraints, SetArguments<T>, parameters);
// Compiles the above: explicit instantiations of TuneCopyMatrixFast for each supported data-type
// (half, float, double, float2, double2), so that callers only need the declaration.
template StatusCode TuneCopyMatrixFast<half>(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map<std::string,size_t>&);
template StatusCode TuneCopyMatrixFast<float>(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map<std::string,size_t>&);
template StatusCode TuneCopyMatrixFast<double>(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map<std::string,size_t>&);
template StatusCode TuneCopyMatrixFast<float2>(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map<std::string,size_t>&);
template StatusCode TuneCopyMatrixFast<double2>(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map<std::string,size_t>&);
// =================================================================================================
// The main tuner API, similar to the one in tuning.cpp, but without I/O. Runs a reference kernel
// once, then compiles, times and verifies every candidate configuration, and finally writes the
// parameters of the fastest accepted configuration into 'parameters'.
// Returns:
//  - kSuccess when a best configuration was found and stored;
//  - kNoDoublePrecision / kNoHalfPrecision when T is not supported by the device;
//  - the status produced by DispatchExceptionCatchAll when the reference run throws;
//  - kUnexpectedError when no configuration yielded a result or the best time equals 0.0.
// 'V' is forwarded unchanged to every callback.
template <typename T>
StatusCode TunerAPI(Queue &queue, const Arguments<T> &args, const int V,
const GetTunerDefaultsFunc GetTunerDefaults,
const GetTunerSettingsFunc<T> GetTunerSettings,
const TestValidArgumentsFunc<T> TestValidArguments,
const SetConstraintsFunc SetConstraints,
const SetArgumentsFunc<T> SetArguments,
std::unordered_map<std::string,size_t> &parameters) {
// Retrieves the tuner defaults and the tuner settings through the supplied callbacks
// NOTE(review): 'defaults' is not referenced anywhere in the visible part of this function.
const TunerDefaults defaults = GetTunerDefaults(V);
const TunerSettings settings = GetTunerSettings(V, args);
// Tests validity of the given arguments
TestValidArguments(V, args);
// Retrieves OpenCL classes
const auto device = queue.GetDevice();
const auto context = queue.GetContext();
// Inspects whether or not FP64 is supported in case of double precision
if ((PrecisionValue<T>() == Precision::kDouble && !PrecisionSupported<double>(device)) ||
(PrecisionValue<T>() == Precision::kComplexDouble && !PrecisionSupported<double2>(device))) {
return StatusCode::kNoDoublePrecision;
// As above, but for FP16 (half precision)
if (PrecisionValue<T>() == Precision::kHalf && !PrecisionSupported<half>(device)) {
return StatusCode::kNoHalfPrecision;
// Retrieves properties
// NOTE(review): none of these four values is used in the visible part of this function.
const auto device_type = GetDeviceType(device);
const auto device_vendor = GetDeviceVendor(device);
const auto device_architecture = GetDeviceArchitecture(device);
const auto device_name = GetDeviceName(device);
// Creates input buffers with random data: one host vector and one device buffer per entry of
// 'buffer_sizes', filled from a Mersenne Twister seeded with the current time
const auto buffer_sizes = std::vector<size_t>{
settings.size_x, settings.size_y,
settings.size_a, settings.size_b, settings.size_c,
const auto seed = static_cast<unsigned long>(time(nullptr));
std::mt19937 mt(seed);
std::uniform_real_distribution<double> dist(kTestDataLowerLimit, kTestDataUpperLimit);
auto source_buffers = std::vector<std::vector<T>>();
auto reference_buffers = std::vector<std::vector<T>>();
auto result_buffers = std::vector<std::vector<T>>();
auto device_buffers = std::vector<Buffer<T>>();
// NOTE(review): no statement visible here appends to source_buffers, reference_buffers or
// result_buffers, although all three are indexed further down - confirm they are filled in this
// loop.
for (const auto size : buffer_sizes) {
auto host_buffer = std::vector<T>(size);
PopulateVector(host_buffer, mt, dist);
device_buffers.push_back(Buffer<T>(context, size));
// Sets the tunable parameters and their possible values
auto configurations = SetConfigurations(settings.parameters, SetConstraints(V));
// Select the search method (full search or a random fraction): a fraction of exactly 0.0 or 1.0
// leaves the list of configurations untouched
// NOTE(review): 'new_size' is computed but not used in the visible code - presumably the
// shuffled list is meant to be truncated to it; confirm.
if (args.fraction != 0.0 && args.fraction != 1.0) {
const auto new_size = static_cast<size_t>(configurations.size() * args.fraction);
auto rng = std::default_random_engine{}; // default-seeded, unlike the time-seeded 'mt' above
std::shuffle(std::begin(configurations), std::end(configurations), rng);
// First runs a reference example to compare against
try {
// Sets the input
for (const auto id : settings.inputs) {
device_buffers[id].Write(queue, buffer_sizes[id], source_buffers[id]);
// Compiles the kernel
auto compiler_options = std::vector<std::string>();
const auto program = CompileFromSource(settings.sources, args.precision, settings.kernel_name,
device, context, compiler_options, 0);
auto kernel = Kernel(program, settings.kernel_name);
SetArguments(V, kernel, args, device_buffers);
// Runs the kernel (silently); TimeKernel reports a failed run by returning -1.0
const auto time_ms = TimeKernel(args.num_runs, kernel, queue, device,
settings.global_size_ref, settings.local_size_ref, true);
if (time_ms == -1.0) { throw std::runtime_error("Error in reference implementation"); }
// Saves the result
for (const auto id : settings.outputs) {
device_buffers[id].Read(queue, buffer_sizes[id], reference_buffers[id]);
// Any exception in the reference run aborts the tuning and is returned as a status code
catch (...) {
const auto status_code = DispatchExceptionCatchAll(true);
return status_code;
// Starts the tuning process: one compile/run/verify cycle per configuration
auto results = std::vector<TuningResult>();
for (auto config_id = size_t{0}; config_id < configurations.size(); ++config_id) {
try {
auto configuration = configurations[config_id];
// Sets the input
for (const auto id : settings.inputs) {
device_buffers[id].Write(queue, buffer_sizes[id], source_buffers[id]);
// Sets the thread configuration
const auto global = SetThreadConfiguration(configuration, settings.global_size,
settings.mul_global, settings.div_global);
const auto local = SetThreadConfiguration(configuration, settings.local_size,
settings.mul_local, settings.div_local);
// Sets the parameters for this configuration: each one becomes a '#define' that is placed in
// front of the kernel sources
auto kernel_source = std::string{""};
for (const auto &parameter : configuration) {
kernel_source += "#define " + parameter.first + " " + ToString(parameter.second) + "\n";
kernel_source += settings.sources;
// Compiles the kernel
auto compiler_options = std::vector<std::string>();
const auto program = CompileFromSource(kernel_source, args.precision, settings.kernel_name,
device, context, compiler_options, 0, true);
auto kernel = Kernel(program, settings.kernel_name);
// Runs the kernel
SetArguments(V, kernel, args, device_buffers);
const auto time_ms = TimeKernel(args.num_runs, kernel, queue, device, global, local, true);
// Kernel run was not successful
// NOTE(review): the body of this 'if' is not visible here - presumably it throws so that the
// configuration is skipped; confirm.
if (time_ms == -1.0) {
// Compares the results against the reference output: the squared differences are accumulated
// and divided by the buffer size; a NaN or a value above 1.0e-4 rejects this configuration
auto l2_error = 0.0;
for (const auto id : settings.outputs) {
device_buffers[id].Read(queue, buffer_sizes[id], result_buffers[id]);
for (auto index = size_t{0}; index<buffer_sizes[id]; ++index) {
const auto diff = SquaredDifference(result_buffers[id][index], reference_buffers[id][index]);
l2_error += diff;
l2_error /= static_cast<double>(buffer_sizes[id]);
if (std::isnan(l2_error) || l2_error > 1.0e-4) {
throw std::runtime_error("L2 error too large");
// Only configurations that reach this point are candidates for the best result
results.push_back(TuningResult{settings.kernel_name, time_ms, configuration});
// NOTE(review): the handler body is not visible here - presumably a failing configuration is
// skipped and the loop continues; confirm.
catch (...) {
// Completed the tuning process
if (results.size() == 0) { return StatusCode::kUnexpectedError; }
// Computes the best results: the entry with the lowest score (the measured time)
auto comparison = [](const TuningResult& lhs, const TuningResult& rhs) { return lhs.score < rhs.score; };
const auto best_configuration = std::min_element(results.begin(), results.end(), comparison);
const auto best_time_ms = best_configuration->score;
if (best_time_ms == 0.0) { return StatusCode::kUnexpectedError; } // a time of zero is rejected
// Stores the best parameters; entries already present under the same name are overwritten and
// other existing entries are left untouched
for (const auto config : best_configuration->config) {
parameters[config.first] = config.second;
return StatusCode::kSuccess;
// Compiles the above function: explicit instantiations of TunerAPI for each supported data-type
// (half, float, double, float2, double2)
template StatusCode TunerAPI<half>(Queue &queue, const Arguments<half> &args, const int V, const GetTunerDefaultsFunc GetTunerDefaults, const GetTunerSettingsFunc<half> GetTunerSettings, const TestValidArgumentsFunc<half> TestValidArguments, const SetConstraintsFunc SetConstraints, const SetArgumentsFunc<half> SetArguments, std::unordered_map<std::string,size_t>&);
template StatusCode TunerAPI<float>(Queue &queue, const Arguments<float> &args, const int V, const GetTunerDefaultsFunc GetTunerDefaults, const GetTunerSettingsFunc<float> GetTunerSettings, const TestValidArgumentsFunc<float> TestValidArguments, const SetConstraintsFunc SetConstraints, const SetArgumentsFunc<float> SetArguments, std::unordered_map<std::string,size_t>&);
template StatusCode TunerAPI<double>(Queue &queue, const Arguments<double> &args, const int V, const GetTunerDefaultsFunc GetTunerDefaults, const GetTunerSettingsFunc<double> GetTunerSettings, const TestValidArgumentsFunc<double> TestValidArguments, const SetConstraintsFunc SetConstraints, const SetArgumentsFunc<double> SetArguments, std::unordered_map<std::string,size_t>&);
template StatusCode TunerAPI<float2>(Queue &queue, const Arguments<float2> &args, const int V, const GetTunerDefaultsFunc GetTunerDefaults, const GetTunerSettingsFunc<float2> GetTunerSettings, const TestValidArgumentsFunc<float2> TestValidArguments, const SetConstraintsFunc SetConstraints, const SetArgumentsFunc<float2> SetArguments, std::unordered_map<std::string,size_t>&);
template StatusCode TunerAPI<double2>(Queue &queue, const Arguments<double2> &args, const int V, const GetTunerDefaultsFunc GetTunerDefaults, const GetTunerSettingsFunc<double2> GetTunerSettings, const TestValidArgumentsFunc<double2> TestValidArguments, const SetConstraintsFunc SetConstraints, const SetArgumentsFunc<double2> SetArguments, std::unordered_map<std::string,size_t>&);
// =================================================================================================
} // namespace clblast

View File

@ -62,15 +62,16 @@ double RunKernelTimed(const size_t num_runs, Kernel &kernel, Queue &queue, const
double TimeKernel(const size_t num_runs, Kernel &kernel, Queue &queue, const Device &device,
std::vector<size_t> global, const std::vector<size_t> &local) {
std::vector<size_t> global, const std::vector<size_t> &local,
const bool silent) {
try {
const auto time_ms = RunKernelTimed(num_runs, kernel, queue, device, global, local);
printf(" %9.2lf ms |", time_ms);
if (!silent) { printf(" %9.2lf ms |", time_ms); }
return time_ms;
catch (...) {
const auto status_code = DispatchExceptionCatchAll(true);
printf(" error %-5d |", static_cast<int>(status_code));
if (!silent) { printf(" error %-5d |", static_cast<int>(status_code)); }
return -1.0; // invalid

View File

@ -44,7 +44,8 @@ double RunKernelTimed(const size_t num_runs, Kernel &kernel, Queue &queue, const
std::vector<size_t> global, const std::vector<size_t> &local);
double TimeKernel(const size_t num_runs, Kernel &kernel, Queue &queue, const Device &device,
std::vector<size_t> global, const std::vector<size_t> &local);
std::vector<size_t> global, const std::vector<size_t> &local,
const bool silent = false);
// =================================================================================================