From 0e1a1520233934e1d11bf2c32fc909617ff751ac Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Tue, 6 Mar 2018 20:52:12 +0100 Subject: [PATCH] First version of the tuning API, added interface for copy-kernel, added sample --- CMakeLists.txt | 10 +- include/clblast.h | 6 + samples/tuning_api.cpp | 77 +++++++++++++ src/tuning/tuning.hpp | 10 ++ src/tuning/tuning_api.cpp | 232 ++++++++++++++++++++++++++++++++++++++ src/utilities/timing.cpp | 7 +- src/utilities/timing.hpp | 3 +- 7 files changed, 340 insertions(+), 5 deletions(-) create mode 100644 samples/tuning_api.cpp create mode 100644 src/tuning/tuning_api.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 2e685d76..eb04287e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -210,7 +210,7 @@ set(PRECISIONS 32 64 3232 6464 16) # Sample programs if(OPENCL) - set(SAMPLE_PROGRAMS_CPP sgemm sgemm_batched dtrsm) + set(SAMPLE_PROGRAMS_CPP sgemm sgemm_batched dtrsm tuning_api) set(SAMPLE_PROGRAMS_C sasum dgemv sgemm haxpy cache) if(NETLIB) set(SAMPLE_PROGRAMS_C ${SAMPLE_PROGRAMS_C} sgemm_netlib) @@ -235,6 +235,8 @@ set(SOURCES src/kernel_preprocessor.cpp src/routine.cpp src/routines/levelx/xinvert.cpp # only source, don't include it as a test + src/tuning/configurations.cpp + src/tuning/tuning_api.cpp ) set(HEADERS # such that they can be discovered by IDEs such as CLion and Visual Studio include/clblast_half.h @@ -258,6 +260,9 @@ set(HEADERS # such that they can be discovered by IDEs such as CLion and Visual src/kernel_preprocessor.hpp src/cxpp11_common.hpp src/routine.hpp + src/tuning/configurations.hpp + src/tuning/tuning.hpp + src/tuning/routines/routine_tuner.hpp ) if(OPENCL) set(SOURCES ${SOURCES} src/clblast.cpp src/clblast_c.cpp) @@ -295,6 +300,9 @@ foreach(DATABASE ${DATABASES}) set(HEADERS ${HEADERS} src/database/kernels/${DATABASE}/${DATABASE}_3232.hpp) set(HEADERS ${HEADERS} src/database/kernels/${DATABASE}/${DATABASE}_6464.hpp) endforeach() +foreach(KERNEL ${KERNELS}) + set(HEADERS ${HEADERS} 
src/tuning/kernels/${KERNEL}.hpp)
+endforeach()
 
 # Creates and links the library
 if(BUILD_SHARED_LIBS)
diff --git a/include/clblast.h b/include/clblast.h
index 9d3b9ea0..d6118e19 100644
--- a/include/clblast.h
+++ b/include/clblast.h
@@ -705,6 +705,12 @@ StatusCode PUBLIC_API OverrideParameters(const cl_device_id device, const std::s
 
 // =================================================================================================
 
+template <typename T>
+StatusCode PUBLIC_API TuneCopyMatrixFast(cl_command_queue* queue, const size_t m, const size_t n,
+                                         const double fraction, std::unordered_map<std::string,size_t> &parameters);
+
+// =================================================================================================
+
 } // namespace clblast
 
 // CLBLAST_CLBLAST_H_
diff --git a/samples/tuning_api.cpp b/samples/tuning_api.cpp
new file mode 100644
index 00000000..3c9ba876
--- /dev/null
+++ b/samples/tuning_api.cpp
@@ -0,0 +1,77 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file demonstrates the use of the runtime tuning API. It is a stand-alone example, but it
+// does require the Khronos C++ OpenCL API header file (downloaded by CMake).
+//
+// =================================================================================================
+
+#include <cstdio>
+#include <chrono>
+#include <vector>
+
+#define CL_USE_DEPRECATED_OPENCL_1_1_APIS // to disable deprecation warnings
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings
+
+// Includes the C++ OpenCL API. If not yet available, it can be found here:
+// https://www.khronos.org/registry/cl/api/1.1/cl.hpp
+#include "cl.hpp"
+
+// Includes the CLBlast library
+#include <clblast.h>
+
+// =================================================================================================
+
+int main() {
+
+  // OpenCL platform/device settings
+  const auto platform_id = 0;
+  const auto device_id = 0;
+
+  // Example arguments
+  const size_t m = 128;
+  const size_t n = 64;
+  const auto fraction = 1.0; // between 0.0 and 1.0
+
+  // Initializes the OpenCL platform
+  auto platforms = std::vector<cl::Platform>();
+  cl::Platform::get(&platforms);
+  if (platforms.size() == 0 || platform_id >= platforms.size()) { return 1; }
+  auto platform = platforms[platform_id];
+
+  // Initializes the OpenCL device
+  auto devices = std::vector<cl::Device>();
+  platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
+  if (devices.size() == 0 || device_id >= devices.size()) { return 1; }
+  auto device = devices[device_id];
+
+  // Creates the OpenCL context, queue, and an event
+  auto device_as_vector = std::vector<cl::Device>{device};
+  auto context = cl::Context(device_as_vector);
+  auto queue = cl::CommandQueue(context, device);
+
+  // Performs the tuning
+  printf("Starting the tuning...\n");
+  std::unordered_map<std::string,size_t> parameters;
+  auto queue_plain = queue();
+  auto status = clblast::TuneCopyMatrixFast<float>(&queue_plain, m, n, fraction, parameters);
+
+  // Tuning completed. See "clblast.h" for status codes (0 -> success).
+  printf("Completed TuneCopyMatrixFast with status %d (0 == OK), found parameters:\n", static_cast<int>(status));
+  for (const auto parameter: parameters) {
+    printf("> %s = %zu\n", parameter.first.c_str(), parameter.second);
+  }
+
+  // Set the new parameters
+  status = clblast::OverrideParameters(device(), "Copy", clblast::Precision::kSingle, parameters);
+  printf("Completed OverrideParameters with status %d (0 == OK)\n", static_cast<int>(status));
+  return 0;
+}
+
+// =================================================================================================
diff --git a/src/tuning/tuning.hpp b/src/tuning/tuning.hpp
index ee7e0087..cbecc300 100644
--- a/src/tuning/tuning.hpp
+++ b/src/tuning/tuning.hpp
@@ -121,6 +121,16 @@ void Tuner(int argc, char* argv[], const int V,
            SetConstraintsFunc SetConstraints, SetArgumentsFunc SetArguments);
 
+// Function to run the tuners through the CLBlast API, no I/O
+template <typename T>
+StatusCode TunerAPI(Queue &queue, const Arguments<T> &args, const int V,
+                    const GetTunerDefaultsFunc GetTunerDefaults,
+                    const GetTunerSettingsFunc<T> GetTunerSettings,
+                    const TestValidArgumentsFunc<T> TestValidArguments,
+                    const SetConstraintsFunc SetConstraints,
+                    const SetArgumentsFunc<T> SetArguments,
+                    std::unordered_map<std::string,size_t> &parameters);
+
 // =================================================================================================
 
 } // namespace clblast
diff --git a/src/tuning/tuning_api.cpp b/src/tuning/tuning_api.cpp
new file mode 100644
index 00000000..94a9a367
--- /dev/null
+++ b/src/tuning/tuning_api.cpp
@@ -0,0 +1,232 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the run-time tuning API, allowing kernels to be tuned through the CLBlast
+// library interface without any I/O. It is the counterpart of the stand-alone tuners (tuning.cpp).
+//
+// =================================================================================================
+
+#include <string>
+#include <vector>
+#include <random>
+#include <algorithm>
+#include <ctime>
+#include <cmath>
+
+#include "tuning/tuning.hpp"
+#include "tuning/kernels/copy_fast.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+template <typename T>
+StatusCode TuneCopyMatrixFast(RawCommandQueue * queue, const size_t m, const size_t n,
+                              const double fraction, std::unordered_map<std::string,size_t> &parameters) {
+  auto args = Arguments<T>();
+  args.m = m;
+  args.n = n;
+  args.fraction = fraction;
+  auto queue_cpp = Queue(*queue);
+  return TunerAPI<T>(queue_cpp, args, 0, GetTunerDefaults, GetTunerSettings<T>,
+                     TestValidArguments<T>, SetConstraints, SetArguments<T>, parameters);
+}
+
+// Compiles the above
+template StatusCode TuneCopyMatrixFast<half>(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map<std::string,size_t>&);
+template StatusCode TuneCopyMatrixFast<float>(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map<std::string,size_t>&);
+template StatusCode TuneCopyMatrixFast<double>(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map<std::string,size_t>&);
+template StatusCode TuneCopyMatrixFast<float2>(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map<std::string,size_t>&);
+template StatusCode TuneCopyMatrixFast<double2>(RawCommandQueue*, const size_t, const size_t, const double, std::unordered_map<std::string,size_t>&);
+
+// =================================================================================================
+
+// The main tuner API, similar to the one in tuning.cpp, but without I/O
+template <typename T>
+StatusCode TunerAPI(Queue &queue, const Arguments<T> &args, const int V,
+                    const GetTunerDefaultsFunc GetTunerDefaults,
+                    const GetTunerSettingsFunc<T> GetTunerSettings,
+                    const TestValidArgumentsFunc<T> TestValidArguments,
+                    const SetConstraintsFunc SetConstraints,
+                    const SetArgumentsFunc<T> SetArguments,
+                    std::unordered_map<std::string,size_t> &parameters) {
+
+  // Sets the parameters and platform/device for which to tune (command-line options)
+  const TunerDefaults defaults = GetTunerDefaults(V);
+  const TunerSettings settings = GetTunerSettings(V, args);
+
+  // Tests validity of the given arguments
+  TestValidArguments(V, args);
+
+  // Retrieves OpenCL classes
+  const auto device = queue.GetDevice();
+  const auto context = queue.GetContext();
+
+  // Inspects whether or not FP64 is supported in case of double precision
+  if ((PrecisionValue<T>() == Precision::kDouble && !PrecisionSupported<T>(device)) ||
+      (PrecisionValue<T>() == Precision::kComplexDouble && !PrecisionSupported<T>(device))) {
+    return StatusCode::kNoDoublePrecision;
+  }
+
+  // As above, but for FP16 (half precision)
+  if (PrecisionValue<T>() == Precision::kHalf && !PrecisionSupported<T>(device)) {
+    return StatusCode::kNoHalfPrecision;
+  }
+
+  // Retrieves properties
+  const auto device_type = GetDeviceType(device);
+  const auto device_vendor = GetDeviceVendor(device);
+  const auto device_architecture = GetDeviceArchitecture(device);
+  const auto device_name = GetDeviceName(device);
+
+  // Creates input buffers with random data
+  const auto buffer_sizes = std::vector<size_t>{
+    settings.size_x, settings.size_y,
+    settings.size_a, settings.size_b, settings.size_c,
+    settings.size_temp
+  };
+  const auto seed = static_cast<unsigned long>(time(nullptr));
+  std::mt19937 mt(seed);
+  std::uniform_real_distribution<double> dist(kTestDataLowerLimit, kTestDataUpperLimit);
+  auto source_buffers = std::vector<std::vector<T>>();
+  auto reference_buffers = std::vector<std::vector<T>>();
+  auto result_buffers = std::vector<std::vector<T>>();
+  auto device_buffers = std::vector<Buffer<T>>();
+  for (const auto size : buffer_sizes) {
+    auto host_buffer = std::vector<T>(size);
+    PopulateVector(host_buffer, mt, dist);
+    source_buffers.push_back(host_buffer);
+    reference_buffers.push_back(std::vector<T>(size));
+    result_buffers.push_back(std::vector<T>(size));
+    device_buffers.push_back(Buffer<T>(context, size));
+  }
+
+  // Sets the tunable parameters and their possible values
+  auto configurations = SetConfigurations(settings.parameters, SetConstraints(V));
+
+  // Select the search method (full search or a random fraction)
+  if (args.fraction != 0.0 && args.fraction != 1.0) {
+    const auto new_size = static_cast<size_t>(configurations.size() * args.fraction);
+    auto rng = std::default_random_engine{};
+    std::shuffle(std::begin(configurations), std::end(configurations), rng);
+    configurations.resize(new_size);
+  }
+
+  // First runs a reference example to compare against
+  try {
+
+    // Sets the input
+    for (const auto id : settings.inputs) {
+      device_buffers[id].Write(queue, buffer_sizes[id], source_buffers[id]);
+    }
+
+    // Compiles the kernel
+    auto compiler_options = std::vector<std::string>();
+    const auto program = CompileFromSource(settings.sources, args.precision, settings.kernel_name,
+                                           device, context, compiler_options, 0);
+    auto kernel = Kernel(program, settings.kernel_name);
+    SetArguments(V, kernel, args, device_buffers);
+
+    // Runs the kernel
+    const auto time_ms = TimeKernel(args.num_runs, kernel, queue, device,
+                                    settings.global_size_ref, settings.local_size_ref, true);
+    if (time_ms == -1.0) { throw std::runtime_error("Error in reference implementation"); }
+
+    // Saves the result
+    for (const auto id : settings.outputs) {
+      device_buffers[id].Read(queue, buffer_sizes[id], reference_buffers[id]);
+    }
+  }
+  catch (...) {
+    const auto status_code = DispatchExceptionCatchAll(true);
+    return status_code;
+  }
+
+  // Starts the tuning process
+  auto results = std::vector<TuningResult>();
+  for (auto config_id = size_t{0}; config_id < configurations.size(); ++config_id) {
+    try {
+      auto configuration = configurations[config_id];
+
+      // Sets the input
+      for (const auto id : settings.inputs) {
+        device_buffers[id].Write(queue, buffer_sizes[id], source_buffers[id]);
+      }
+
+      // Sets the thread configuration
+      const auto global = SetThreadConfiguration(configuration, settings.global_size,
+                                                 settings.mul_global, settings.div_global);
+      const auto local = SetThreadConfiguration(configuration, settings.local_size,
+                                                settings.mul_local, settings.div_local);
+
+      // Sets the parameters for this configuration
+      auto kernel_source = std::string{""};
+      for (const auto &parameter : configuration) {
+        kernel_source += "#define " + parameter.first + " " + ToString(parameter.second) + "\n";
+      }
+      kernel_source += settings.sources;
+
+      // Compiles the kernel
+      auto compiler_options = std::vector<std::string>();
+      const auto program = CompileFromSource(kernel_source, args.precision, settings.kernel_name,
+                                             device, context, compiler_options, 0, true);
+      auto kernel = Kernel(program, settings.kernel_name);
+
+      // Runs the kernel
+      SetArguments(V, kernel, args, device_buffers);
+      const auto time_ms = TimeKernel(args.num_runs, kernel, queue, device, global, local, true);
+
+      // Kernel run was not successful
+      if (time_ms == -1.0) {
+        continue;
+      }
+
+      // Compares the results
+      auto l2_error = 0.0;
+      for (const auto id : settings.outputs) {
+        device_buffers[id].Read(queue, buffer_sizes[id], result_buffers[id]);
+        for (auto index = size_t{0}; index<buffer_sizes[id]; ++index) {
+          const auto diff = SquaredDifference(result_buffers[id][index], reference_buffers[id][index]);
+          l2_error += diff;
+        }
+        l2_error /= static_cast<double>(buffer_sizes[id]);
+        if (std::isnan(l2_error) || l2_error > 1.0e-4) {
+          throw std::runtime_error("L2 error too large");
+        }
+      }
+      results.push_back(TuningResult{settings.kernel_name, time_ms, configuration});
+    }
+    catch (...) {
+    }
+  }
+
+  // Completed the tuning process
+  if (results.size() == 0) { return StatusCode::kUnexpectedError; }
+
+  // Computes the best results
+  auto comparison = [](const TuningResult& lhs, const TuningResult& rhs) { return lhs.score < rhs.score; };
+  const auto best_configuration = std::min_element(results.begin(), results.end(), comparison);
+  const auto best_time_ms = best_configuration->score;
+  if (best_time_ms == 0.0) { return StatusCode::kUnexpectedError; }
+
+  // Stores the best parameters
+  for (const auto config : best_configuration->config) {
+    parameters[config.first] = config.second;
+  }
+  return StatusCode::kSuccess;
+}
+
+// Compiles the above function
+template StatusCode TunerAPI<half>(Queue &queue, const Arguments<half> &args, const int V, const GetTunerDefaultsFunc GetTunerDefaults, const GetTunerSettingsFunc<half> GetTunerSettings, const TestValidArgumentsFunc<half> TestValidArguments, const SetConstraintsFunc SetConstraints, const SetArgumentsFunc<half> SetArguments, std::unordered_map<std::string,size_t>&);
+template StatusCode TunerAPI<float>(Queue &queue, const Arguments<float> &args, const int V, const GetTunerDefaultsFunc GetTunerDefaults, const GetTunerSettingsFunc<float> GetTunerSettings, const TestValidArgumentsFunc<float> TestValidArguments, const SetConstraintsFunc SetConstraints, const SetArgumentsFunc<float> SetArguments, std::unordered_map<std::string,size_t>&);
+template StatusCode TunerAPI<double>(Queue &queue, const Arguments<double> &args, const int V, const GetTunerDefaultsFunc GetTunerDefaults, const GetTunerSettingsFunc<double> GetTunerSettings, const TestValidArgumentsFunc<double> TestValidArguments, const SetConstraintsFunc SetConstraints, const SetArgumentsFunc<double> SetArguments, std::unordered_map<std::string,size_t>&);
+template StatusCode TunerAPI<float2>(Queue &queue, const Arguments<float2> &args, const int V, const GetTunerDefaultsFunc GetTunerDefaults, const GetTunerSettingsFunc<float2> GetTunerSettings, const TestValidArgumentsFunc<float2> TestValidArguments, const SetConstraintsFunc SetConstraints, const SetArgumentsFunc<float2> SetArguments, std::unordered_map<std::string,size_t>&);
+template StatusCode TunerAPI<double2>(Queue &queue, const Arguments<double2> &args, const int V, const GetTunerDefaultsFunc GetTunerDefaults, const GetTunerSettingsFunc<double2> GetTunerSettings, const TestValidArgumentsFunc<double2> TestValidArguments, const SetConstraintsFunc SetConstraints, const SetArgumentsFunc<double2> SetArguments, std::unordered_map<std::string,size_t>&);
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/utilities/timing.cpp b/src/utilities/timing.cpp
index af6a8ff2..1afb0d08 100644
--- a/src/utilities/timing.cpp
+++ b/src/utilities/timing.cpp
@@ -62,15 +62,16 @@ double RunKernelTimed(const size_t num_runs, Kernel &kernel, Queue &queue, const
 }
 
 double TimeKernel(const size_t num_runs, Kernel &kernel, Queue &queue, const Device &device,
-                  std::vector<size_t> global, const std::vector<size_t> &local) {
+                  std::vector<size_t> global, const std::vector<size_t> &local,
+                  const bool silent) {
   try {
     const auto time_ms = RunKernelTimed(num_runs, kernel, queue, device, global, local);
-    printf(" %9.2lf ms |", time_ms);
+    if (!silent) { printf(" %9.2lf ms |", time_ms); }
     return time_ms;
   }
   catch (...) {
     const auto status_code = DispatchExceptionCatchAll(true);
-    printf(" error %-5d |", static_cast<int>(status_code));
+    if (!silent) { printf(" error %-5d |", static_cast<int>(status_code)); }
     return -1.0; // invalid
   }
 }
diff --git a/src/utilities/timing.hpp b/src/utilities/timing.hpp
index c167cd5f..7761fd83 100644
--- a/src/utilities/timing.hpp
+++ b/src/utilities/timing.hpp
@@ -44,7 +44,8 @@ double RunKernelTimed(const size_t num_runs, Kernel &kernel, Queue &queue, const
                       std::vector<size_t> global, const std::vector<size_t> &local);
 
 double TimeKernel(const size_t num_runs, Kernel &kernel, Queue &queue, const Device &device,
-                  std::vector<size_t> global, const std::vector<size_t> &local);
+                  std::vector<size_t> global, const std::vector<size_t> &local,
+                  const bool silent = false);
 
 // =================================================================================================