142 lines
5.6 KiB
C++
142 lines
5.6 KiB
C++
|
|
// =================================================================================================
|
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
|
// width of 100 characters per line.
|
|
//
|
|
// Author(s):
|
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
|
//
|
|
// This file implements the kernel compilation functions (see the header for more information).
|
|
//
|
|
// =================================================================================================
|
|
|
|
#include <vector>
|
|
#include <chrono>
|
|
|
|
#include "routines/common.hpp"
|
|
#include "kernel_preprocessor.hpp"
|
|
|
|
namespace clblast {
|
|
// =================================================================================================
|
|
|
|
// Compiles a program from source code
|
|
std::shared_ptr<Program> CompileFromSource(
|
|
const std::string &source_string, const Precision precision,
|
|
const std::string &routine_name,
|
|
const Device& device, const Context& context,
|
|
std::vector<std::string>& options,
|
|
const size_t run_preprocessor, // 0: platform dependent, 1: always, 2: never
|
|
const bool silent) {
|
|
auto header_string = std::string{""};
|
|
|
|
header_string += "#define PRECISION " + ToString(static_cast<int>(precision)) + "\n";
|
|
|
|
// Adds the name of the routine as a define
|
|
header_string += "#define ROUTINE_" + routine_name + "\n";
|
|
|
|
// Not all OpenCL compilers support the 'inline' keyword. The keyword is only used for devices on
|
|
// which it is known to work with all OpenCL platforms.
|
|
if (device.IsNVIDIA() || device.IsARM() || device.IsQualcomm()) {
|
|
header_string += "#define USE_INLINE_KEYWORD 1\n";
|
|
}
|
|
|
|
// For specific devices, use the non-IEE754 compliant OpenCL mad() instruction. This can improve
|
|
// performance, but might result in a reduced accuracy.
|
|
if ((device.IsAMD() && device.IsGPU()) || (device.IsQualcomm() && device.IsGPU())) {
|
|
header_string += "#define USE_CL_MAD 1\n";
|
|
}
|
|
|
|
// For specific devices, use staggered/shuffled workgroup indices.
|
|
if (device.IsAMD() && device.IsGPU()) {
|
|
header_string += "#define USE_STAGGERED_INDICES 1\n";
|
|
}
|
|
|
|
// For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
|
|
// performance through better cache behaviour
|
|
if ((device.IsARM() && device.IsGPU()) || (device.IsQualcomm() && device.IsGPU())) {
|
|
header_string += "#define GLOBAL_MEM_FENCE 1\n";
|
|
}
|
|
|
|
// For Intel GPUs with subgroup support, use subgroup shuffling.
|
|
if (device.IsGPU() && device.HasExtension(kKhronosIntelSubgroups) &&
|
|
(precision == Precision::kSingle || precision == Precision::kHalf)) {
|
|
header_string += "#define USE_SUBGROUP_SHUFFLING 1\n";
|
|
header_string += "#define SUBGROUP_SHUFFLING_INTEL 1\n";
|
|
}
|
|
|
|
// For NVIDIA GPUs, inline PTX can provide subgroup support
|
|
if (device.IsGPU() && device.IsNVIDIA() && precision == Precision::kSingle) {
|
|
header_string += "#define USE_SUBGROUP_SHUFFLING 1\n";
|
|
|
|
// Nvidia needs to check pre or post volta due to new shuffle commands
|
|
if (device.IsPostNVIDIAVolta()) {
|
|
header_string += "#define SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA 1\n";
|
|
}
|
|
else {
|
|
header_string += "#define SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA 1\n";
|
|
}
|
|
}
|
|
|
|
// For Qualcomm devices, specifying the OpenCL kernel attribute reqd_work_group_size reduces performance.
|
|
// This option compiles without the workgroup size requirement and does not affect correctness.
|
|
if (device.IsQualcomm()) {
|
|
header_string += "#define RELAX_WORKGROUP_SIZE 1\n";
|
|
}
|
|
|
|
// Optionally adds a translation header from OpenCL kernels to CUDA kernels
|
|
#ifdef CUDA_API
|
|
header_string +=
|
|
#include "kernels/opencl_to_cuda.h"
|
|
;
|
|
#endif
|
|
|
|
// Loads the common header (typedefs and defines and such)
|
|
header_string +=
|
|
#include "kernels/common.opencl"
|
|
;
|
|
|
|
// Prints details of the routine to compile in case of debugging in verbose mode
|
|
#ifdef VERBOSE
|
|
printf("[DEBUG] Compiling routine '%s-%s'\n",
|
|
routine_name.c_str(), ToString(precision).c_str());
|
|
const auto start_time = std::chrono::steady_clock::now();
|
|
#endif
|
|
|
|
// Runs a pre-processor to unroll loops and perform array-to-register promotion. Most OpenCL
|
|
// compilers do this, but some don't.
|
|
auto do_run_preprocessor = false;
|
|
if (run_preprocessor == 0) { do_run_preprocessor = (device.IsARM() && device.IsGPU()); }
|
|
if (run_preprocessor == 1) { do_run_preprocessor = true; }
|
|
auto kernel_string = header_string + source_string;
|
|
if (do_run_preprocessor) {
|
|
log_debug("Running built-in pre-processor");
|
|
kernel_string = PreprocessKernelSource(kernel_string);
|
|
}
|
|
|
|
// Compiles the kernel
|
|
auto program = std::make_shared<Program>(context, kernel_string);
|
|
try {
|
|
SetOpenCLKernelStandard(device, options);
|
|
program->Build(device, options);
|
|
} catch (const CLCudaAPIBuildError &e) {
|
|
if (program->StatusIsCompilationWarningOrError(e.status()) && !silent) {
|
|
fprintf(stdout, "OpenCL compiler error/warning:\n%s\n",
|
|
program->GetBuildInfo(device).c_str());
|
|
}
|
|
throw;
|
|
}
|
|
|
|
// Prints the elapsed compilation time in case of debugging in verbose mode
|
|
#ifdef VERBOSE
|
|
const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
|
|
const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
|
|
printf("[DEBUG] Completed compilation in %.2lf ms\n", timing);
|
|
#endif
|
|
|
|
return program;
|
|
}
|
|
|
|
// =================================================================================================
|
|
} // namespace clblast
|