Moved the compilation function to a separate file; removed the tuners' dependency on the CLBlast library
parent
d9cf206979
commit
f94d498a37
|
@ -224,6 +224,7 @@ endif()
|
|||
set(SOURCES
|
||||
src/database/database.cpp
|
||||
src/routines/common.cpp
|
||||
src/utilities/compile.cpp
|
||||
src/utilities/clblast_exceptions.cpp
|
||||
src/utilities/timing.cpp
|
||||
src/utilities/utilities.cpp
|
||||
|
@ -244,6 +245,7 @@ set(HEADERS # such that they can be discovered by IDEs such as CLion and Visual
|
|||
src/routines/common.hpp
|
||||
src/routines/routines.hpp
|
||||
src/utilities/buffer_test.hpp
|
||||
src/utilities/compile.hpp
|
||||
src/utilities/clblast_exceptions.hpp
|
||||
src/utilities/device_mapping.hpp
|
||||
src/utilities/msvc.hpp
|
||||
|
@ -366,29 +368,42 @@ endif()
|
|||
# ==================================================================================================
|
||||
|
||||
# This section contains all the code related to the tuners
|
||||
# TODO: Remove dependency on CLBlast
|
||||
if(TUNERS)
|
||||
|
||||
set(TUNERS_COMMON
|
||||
src/utilities/compile.cpp
|
||||
src/utilities/clblast_exceptions.cpp
|
||||
src/utilities/timing.cpp
|
||||
src/utilities/utilities.cpp
|
||||
src/tuning/configurations.cpp
|
||||
src/tuning/tuning.cpp)
|
||||
set(TUNERS_HEADERS # such that they can be discovered by IDEs such as CLion and Visual Studio
|
||||
src/utilities/compile.hpp
|
||||
src/utilities/clblast_exceptions.hpp
|
||||
src/utilities/timing.hpp
|
||||
src/utilities/utilities.hpp
|
||||
src/tuning/configurations.hpp
|
||||
src/tuning/tuning.cpp
|
||||
src/tuning/tuning.hpp)
|
||||
|
||||
# Visual Studio requires the sources of non-exported objects/libraries
|
||||
if(MSVC)
|
||||
set(TUNERS_COMMON ${TUNERS_COMMON} src/utilities/utilities.cpp)
|
||||
endif()
|
||||
|
||||
# Adds tuning executables
|
||||
foreach(KERNEL ${KERNELS})
|
||||
add_executable(clblast_tuner_${KERNEL} ${TUNERS_COMMON} src/tuning/kernels/${KERNEL}.cpp)
|
||||
target_link_libraries(clblast_tuner_${KERNEL} clblast ${API_LIBRARIES})
|
||||
add_executable(clblast_tuner_${KERNEL} ${TUNERS_COMMON} ${TUNERS_HEADERS}
|
||||
src/tuning/kernels/${KERNEL}.cpp)
|
||||
target_include_directories(clblast_tuner_${KERNEL} PUBLIC
|
||||
$<BUILD_INTERFACE:${clblast_SOURCE_DIR}/include>
|
||||
$<BUILD_INTERFACE:${clblast_SOURCE_DIR}/src>
|
||||
${API_INCLUDE_DIRS})
|
||||
target_link_libraries(clblast_tuner_${KERNEL} ${API_LIBRARIES})
|
||||
install(TARGETS clblast_tuner_${KERNEL} DESTINATION bin)
|
||||
endforeach()
|
||||
foreach(ROUTINE_TUNER ${ROUTINE_TUNERS})
|
||||
add_executable(clblast_tuner_routine_${ROUTINE_TUNER} ${TUNERS_COMMON} src/tuning/routines/${ROUTINE_TUNER}.cpp)
|
||||
target_link_libraries(clblast_tuner_routine_${ROUTINE_TUNER} clblast ${API_LIBRARIES})
|
||||
add_executable(clblast_tuner_routine_${ROUTINE_TUNER} ${TUNERS_COMMON} ${TUNERS_HEADERS}
|
||||
src/tuning/routines/${ROUTINE_TUNER}.cpp)
|
||||
target_include_directories(clblast_tuner_routine_${ROUTINE_TUNER} PUBLIC
|
||||
$<BUILD_INTERFACE:${clblast_SOURCE_DIR}/include>
|
||||
$<BUILD_INTERFACE:${clblast_SOURCE_DIR}/src>
|
||||
${API_INCLUDE_DIRS})
|
||||
target_link_libraries(clblast_tuner_routine_${ROUTINE_TUNER} ${API_LIBRARIES})
|
||||
install(TARGETS clblast_tuner_routine_${ROUTINE_TUNER} DESTINATION bin)
|
||||
endforeach()
|
||||
|
||||
|
|
|
@ -19,84 +19,6 @@
|
|||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Compiles a program from source code.
//
// A header is generated and prepended to 'source_string' before compilation. It contains, in order:
// the PRECISION define, a ROUTINE_<routine_name> define, device-specific defines, the optional
// OpenCL-to-CUDA translation header (CUDA_API builds only) and the common kernel header.
//
// The program is built for 'device' with the given 'options'. If the build throws a
// CLCudaAPIBuildError whose status is a compilation warning/error, the build log is printed to
// stdout; the exception is re-thrown in all cases.
Program CompileFromSource(const std::string &source_string, const Precision precision,
                          const std::string &routine_name,
                          const Device& device, const Context& context,
                          std::vector<std::string>& options) {
  auto header_string = std::string{""};

  header_string += "#define PRECISION " + ToString(static_cast<int>(precision)) + "\n";

  // Adds the name of the routine as a define
  header_string += "#define ROUTINE_" + routine_name + "\n";

  // Not all OpenCL compilers support the 'inline' keyword. The keyword is only used for devices on
  // which it is known to work with all OpenCL platforms.
  if (device.IsNVIDIA() || device.IsARM()) {
    header_string += "#define USE_INLINE_KEYWORD 1\n";
  }

  // For specific devices, use the non-IEEE 754 compliant OpenCL mad() instruction. This can improve
  // performance, but might result in a reduced accuracy.
  if (device.IsAMD() && device.IsGPU()) {
    header_string += "#define USE_CL_MAD 1\n";
  }

  // For specific devices, use staggered/shuffled workgroup indices.
  if (device.IsAMD() && device.IsGPU()) {
    header_string += "#define USE_STAGGERED_INDICES 1\n";
  }

  // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
  // performance through better cache behaviour
  if (device.IsARM() && device.IsGPU()) {
    header_string += "#define GLOBAL_MEM_FENCE 1\n";
  }

  // Optionally adds a translation header from OpenCL kernels to CUDA kernels. This is appended to
  // the header: 'source_string' is a const reference and cannot be modified.
  #ifdef CUDA_API
    header_string +=
      #include "kernels/opencl_to_cuda.h"
    ;
  #endif

  // Loads the common header (typedefs and defines and such)
  header_string +=
    #include "kernels/common.opencl"
  ;

  // Prints details of the routine to compile in case of debugging in verbose mode
  #ifdef VERBOSE
    printf("[DEBUG] Compiling routine '%s-%s'\n",
           routine_name.c_str(), ToString(precision).c_str());
    const auto start_time = std::chrono::steady_clock::now();
  #endif

  // Compiles the kernel
  auto program = Program(context, header_string + source_string);
  try {
    program.Build(device, options);
  } catch (const CLCudaAPIBuildError &e) {
    // Only print the build log when the failure status is a compilation warning/error
    if (program.StatusIsCompilationWarningOrError(e.status())) {
      fprintf(stdout, "OpenCL compiler error/warning: %s\n",
              program.GetBuildInfo(device).c_str());
    }
    throw;
  }

  // Prints the elapsed compilation time in case of debugging in verbose mode
  #ifdef VERBOSE
    const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
    const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
    printf("[DEBUG] Completed compilation in %.2lf ms\n", timing);
  #endif

  return program;
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Enqueues a kernel, waits for completion, and checks for errors
|
||||
void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
|
||||
std::vector<size_t> global, const std::vector<size_t> &local,
|
||||
|
|
|
@ -20,17 +20,12 @@
|
|||
#include <vector>
|
||||
|
||||
#include "utilities/utilities.hpp"
|
||||
#include "utilities/compile.hpp"
|
||||
#include "database/database.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Compiles a program from source code. The definition prepends a generated header (the PRECISION
// and ROUTINE_<routine_name> defines, device-specific defines and the common kernel header) to
// 'source_string', builds the result for 'device' with the given 'options' and returns the
// program. If the build throws a CLCudaAPIBuildError, the build log is printed to stdout when the
// status is a compilation warning/error, and the exception is re-thrown.
|
||||
Program CompileFromSource(const std::string &source_string, const Precision precision,
|
||||
const std::string &routine_name,
|
||||
const Device& device, const Context& context,
|
||||
std::vector<std::string>& options);
|
||||
|
||||
// Enqueues a kernel, waits for completion, and checks for errors
|
||||
void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
|
||||
std::vector<size_t> global, const std::vector<size_t> &local,
|
||||
|
|
|
@ -23,6 +23,7 @@
|
|||
#include <iostream>
|
||||
|
||||
#include "utilities/utilities.hpp"
|
||||
#include "utilities/compile.hpp"
|
||||
#include "utilities/timing.hpp"
|
||||
#include "tuning/configurations.hpp"
|
||||
|
||||
|
|
|
@ -0,0 +1,99 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file implements the kernel compilation functions (see the header for more information).
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include <vector>
|
||||
#include <chrono>
|
||||
|
||||
#include "routines/common.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Compiles a program from source code.
//
// A header is generated and prepended to 'source_string' before compilation. It contains, in order:
// the PRECISION define, a ROUTINE_<routine_name> define, device-specific defines, the optional
// OpenCL-to-CUDA translation header (CUDA_API builds only) and the common kernel header.
//
// The program is built for 'device' with the given 'options'. If the build throws a
// CLCudaAPIBuildError whose status is a compilation warning/error, the build log is printed to
// stdout; the exception is re-thrown in all cases.
Program CompileFromSource(const std::string &source_string, const Precision precision,
                          const std::string &routine_name,
                          const Device& device, const Context& context,
                          std::vector<std::string>& options) {
  auto header_string = std::string{""};

  header_string += "#define PRECISION " + ToString(static_cast<int>(precision)) + "\n";

  // Adds the name of the routine as a define
  header_string += "#define ROUTINE_" + routine_name + "\n";

  // Not all OpenCL compilers support the 'inline' keyword. The keyword is only used for devices on
  // which it is known to work with all OpenCL platforms.
  if (device.IsNVIDIA() || device.IsARM()) {
    header_string += "#define USE_INLINE_KEYWORD 1\n";
  }

  // For specific devices, use the non-IEEE 754 compliant OpenCL mad() instruction. This can improve
  // performance, but might result in a reduced accuracy.
  if (device.IsAMD() && device.IsGPU()) {
    header_string += "#define USE_CL_MAD 1\n";
  }

  // For specific devices, use staggered/shuffled workgroup indices.
  if (device.IsAMD() && device.IsGPU()) {
    header_string += "#define USE_STAGGERED_INDICES 1\n";
  }

  // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
  // performance through better cache behaviour
  if (device.IsARM() && device.IsGPU()) {
    header_string += "#define GLOBAL_MEM_FENCE 1\n";
  }

  // Optionally adds a translation header from OpenCL kernels to CUDA kernels. This is appended to
  // the header: 'source_string' is a const reference and cannot be modified.
  #ifdef CUDA_API
    header_string +=
      #include "kernels/opencl_to_cuda.h"
    ;
  #endif

  // Loads the common header (typedefs and defines and such)
  header_string +=
    #include "kernels/common.opencl"
  ;

  // Prints details of the routine to compile in case of debugging in verbose mode
  #ifdef VERBOSE
    printf("[DEBUG] Compiling routine '%s-%s'\n",
           routine_name.c_str(), ToString(precision).c_str());
    const auto start_time = std::chrono::steady_clock::now();
  #endif

  // Compiles the kernel
  auto program = Program(context, header_string + source_string);
  try {
    program.Build(device, options);
  } catch (const CLCudaAPIBuildError &e) {
    // Only print the build log when the failure status is a compilation warning/error
    if (program.StatusIsCompilationWarningOrError(e.status())) {
      fprintf(stdout, "OpenCL compiler error/warning: %s\n",
              program.GetBuildInfo(device).c_str());
    }
    throw;
  }

  // Prints the elapsed compilation time in case of debugging in verbose mode
  #ifdef VERBOSE
    const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
    const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
    printf("[DEBUG] Completed compilation in %.2lf ms\n", timing);
  #endif

  return program;
}
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
|
@ -0,0 +1,36 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file contains the CLBlast way to compile a kernel from source, used for the library and for
|
||||
// the auto-tuners.
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#ifndef CLBLAST_UTILITIES_COMPILE_H_
|
||||
#define CLBLAST_UTILITIES_COMPILE_H_
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "utilities/utilities.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Compiles a program from source code. The definition prepends a generated header (the PRECISION
// and ROUTINE_<routine_name> defines, device-specific defines and the common kernel header) to
// 'source_string', builds the result for 'device' with the given 'options' and returns the
// program. If the build throws a CLCudaAPIBuildError, the build log is printed to stdout when the
// status is a compilation warning/error, and the exception is re-thrown.
|
||||
Program CompileFromSource(const std::string &source_string, const Precision precision,
|
||||
const std::string &routine_name,
|
||||
const Device& device, const Context& context,
|
||||
std::vector<std::string>& options);
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
||||
|
||||
// CLBLAST_UTILITIES_COMPILE_H_
|
||||
#endif
|
|
@ -21,7 +21,6 @@
|
|||
#include <chrono>
|
||||
|
||||
#include "utilities/utilities.hpp"
|
||||
#include "routines/common.hpp"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
|
Loading…
Reference in New Issue