
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
// Author(s):
// Cedric Nugteren <>
// This file implements the kernel compilation functions (see the header for more information).
// =================================================================================================
#include <vector>
#include <chrono>
#include "routines/common.hpp"
#include "kernel_preprocessor.hpp"
namespace clblast {
// =================================================================================================
// Compiles a program from source code
std::shared_ptr<Program> CompileFromSource(
const std::string &source_string, const Precision precision,
const std::string &routine_name,
const Device& device, const Context& context,
std::vector<std::string>& options,
const size_t run_preprocessor, // 0: platform dependent, 1: always, 2: never
const bool silent) {
auto header_string = std::string{""};
header_string += "#define PRECISION " + ToString(static_cast<int>(precision)) + "\n";
// Adds the name of the routine as a define
header_string += "#define ROUTINE_" + routine_name + "\n";
// Not all OpenCL compilers support the 'inline' keyword. The keyword is only used for devices on
// which it is known to work with all OpenCL platforms.
if (device.IsNVIDIA() || device.IsARM() || device.IsQualcomm()) {
header_string += "#define USE_INLINE_KEYWORD 1\n";
// For specific devices, use the non-IEE754 compliant OpenCL mad() instruction. This can improve
// performance, but might result in a reduced accuracy.
if ((device.IsAMD() && device.IsGPU()) || device.IsQualcomm()) {
header_string += "#define USE_CL_MAD 1\n";
// For specific devices, use staggered/shuffled workgroup indices.
if (device.IsAMD() && device.IsGPU()) {
header_string += "#define USE_STAGGERED_INDICES 1\n";
// For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
// performance through better cache behaviour
if ((device.IsARM() && device.IsGPU()) || device.IsQualcomm()) {
header_string += "#define GLOBAL_MEM_FENCE 1\n";
// For Intel GPUs with subgroup support, use subgroup shuffling.
if (device.IsGPU() && device.HasExtension(kKhronosIntelSubgroups) &&
(precision == Precision::kSingle || precision == Precision::kHalf)) {
header_string += "#define USE_SUBGROUP_SHUFFLING 1\n";
header_string += "#define SUBGROUP_SHUFFLING_INTEL 1\n";
// For NVIDIA GPUs, inline PTX can provide subgroup support
if (device.IsGPU() && device.IsNVIDIA() && precision == Precision::kSingle) {
header_string += "#define USE_SUBGROUP_SHUFFLING 1\n";
// Nvidia needs to check pre or post volta due to new shuffle commands
if (device.IsPostNVIDIAVolta()) {
header_string += "#define SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA 1\n";
else {
header_string += "#define SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA 1\n";
// For Qualcomm devices, specifying the OpenCL kernel attribute reqd_work_group_size reduces performance.
// This option compiles without the workgroup size requirement and does not affect correctness.
if (device.IsQualcomm()) {
header_string += "#define RELAX_WORKGROUP_SIZE 1\n";
// Optionally adds a translation header from OpenCL kernels to CUDA kernels
#ifdef CUDA_API
2017-12-24 12:10:55 +01:00
header_string +=
#include "kernels/opencl_to_cuda.h"
// Loads the common header (typedefs and defines and such)
header_string +=
#include "kernels/common.opencl"
// Prints details of the routine to compile in case of debugging in verbose mode
#ifdef VERBOSE
printf("[DEBUG] Compiling routine '%s-%s'\n",
routine_name.c_str(), ToString(precision).c_str());
const auto start_time = std::chrono::steady_clock::now();
// Runs a pre-processor to unroll loops and perform array-to-register promotion. Most OpenCL
// compilers do this, but some don't.
auto do_run_preprocessor = false;
if (run_preprocessor == 0) { do_run_preprocessor = (device.IsARM() && device.IsGPU()); }
if (run_preprocessor == 1) { do_run_preprocessor = true; }
auto kernel_string = header_string + source_string;
if (do_run_preprocessor) {
log_debug("Running built-in pre-processor");
kernel_string = PreprocessKernelSource(kernel_string);
// Compiles the kernel
auto program = std::make_shared<Program>(context, kernel_string);
try {
SetOpenCLKernelStandard(device, options);
program->Build(device, options);
} catch (const CLCudaAPIBuildError &e) {
if (program->StatusIsCompilationWarningOrError(e.status()) && !silent) {
fprintf(stdout, "OpenCL compiler error/warning:\n%s\n",
// Prints the elapsed compilation time in case of debugging in verbose mode
#ifdef VERBOSE
const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
printf("[DEBUG] Completed compilation in %.2lf ms\n", timing);
return program;
// =================================================================================================
} // namespace clblast