Moved compilation function to separate file; removed dependency of tuners on the CLBlast library

pull/216/head
Cedric Nugteren 2017-11-17 20:57:46 +01:00
parent d9cf206979
commit f94d498a37
7 changed files with 163 additions and 96 deletions

View File

@ -224,6 +224,7 @@ endif()
set(SOURCES
src/database/database.cpp
src/routines/common.cpp
src/utilities/compile.cpp
src/utilities/clblast_exceptions.cpp
src/utilities/timing.cpp
src/utilities/utilities.cpp
@ -244,6 +245,7 @@ set(HEADERS # such that they can be discovered by IDEs such as CLion and Visual
src/routines/common.hpp
src/routines/routines.hpp
src/utilities/buffer_test.hpp
src/utilities/compile.hpp
src/utilities/clblast_exceptions.hpp
src/utilities/device_mapping.hpp
src/utilities/msvc.hpp
@ -366,29 +368,42 @@ endif()
# ==================================================================================================
# This section contains all the code related to the tuners
# TODO: Remove dependency on CLBlast
if(TUNERS)
set(TUNERS_COMMON
src/utilities/compile.cpp
src/utilities/clblast_exceptions.cpp
src/utilities/timing.cpp
src/utilities/utilities.cpp
src/tuning/configurations.cpp
src/tuning/tuning.cpp)
set(TUNERS_HEADERS # such that they can be discovered by IDEs such as CLion and Visual Studio
src/utilities/compile.hpp
src/utilities/clblast_exceptions.hpp
src/utilities/timing.hpp
src/utilities/utilities.hpp
src/tuning/configurations.hpp
src/tuning/tuning.cpp
src/tuning/tuning.hpp)
# Visual Studio requires the sources of non-exported objects/libraries
if(MSVC)
set(TUNERS_COMMON ${TUNERS_COMMON} src/utilities/utilities.cpp)
endif()
# Adds tuning executables
foreach(KERNEL ${KERNELS})
add_executable(clblast_tuner_${KERNEL} ${TUNERS_COMMON} src/tuning/kernels/${KERNEL}.cpp)
target_link_libraries(clblast_tuner_${KERNEL} clblast ${API_LIBRARIES})
add_executable(clblast_tuner_${KERNEL} ${TUNERS_COMMON} ${TUNERS_HEADERS}
src/tuning/kernels/${KERNEL}.cpp)
target_include_directories(clblast_tuner_${KERNEL} PUBLIC
$<BUILD_INTERFACE:${clblast_SOURCE_DIR}/include>
$<BUILD_INTERFACE:${clblast_SOURCE_DIR}/src>
${API_INCLUDE_DIRS})
target_link_libraries(clblast_tuner_${KERNEL} ${API_LIBRARIES})
install(TARGETS clblast_tuner_${KERNEL} DESTINATION bin)
endforeach()
foreach(ROUTINE_TUNER ${ROUTINE_TUNERS})
add_executable(clblast_tuner_routine_${ROUTINE_TUNER} ${TUNERS_COMMON} src/tuning/routines/${ROUTINE_TUNER}.cpp)
target_link_libraries(clblast_tuner_routine_${ROUTINE_TUNER} clblast ${API_LIBRARIES})
add_executable(clblast_tuner_routine_${ROUTINE_TUNER} ${TUNERS_COMMON} ${TUNERS_HEADERS}
src/tuning/routines/${ROUTINE_TUNER}.cpp)
target_include_directories(clblast_tuner_routine_${ROUTINE_TUNER} PUBLIC
$<BUILD_INTERFACE:${clblast_SOURCE_DIR}/include>
$<BUILD_INTERFACE:${clblast_SOURCE_DIR}/src>
${API_INCLUDE_DIRS})
target_link_libraries(clblast_tuner_routine_${ROUTINE_TUNER} ${API_LIBRARIES})
install(TARGETS clblast_tuner_routine_${ROUTINE_TUNER} DESTINATION bin)
endforeach()

View File

@ -19,84 +19,6 @@
namespace clblast {
// =================================================================================================
// Compiles a program from source code.
//
// A header is generated and placed in front of 'source_string' before compilation. It contains the
// precision define, a 'ROUTINE_<routine_name>' define, a set of device-dependent defines, optionally
// (CUDA_API builds only) the OpenCL-to-CUDA translation header, and the common kernel header.
//
// Arguments:
//   source_string  the kernel source code; it is never modified
//   precision      its integer value is emitted as the 'PRECISION' define
//   routine_name   emitted as 'ROUTINE_<routine_name>' and used in the verbose debug output
//   device         queried for vendor/type to select the defines; also the build target
//   context        the context in which the program is created
//   options        forwarded as-is to 'Program::Build'
//
// Returns the built program. If the build throws a 'CLCudaAPIBuildError', the build log is printed
// to stdout when the status denotes a compilation warning/error, and the exception is re-thrown.
Program CompileFromSource(const std::string &source_string, const Precision precision,
                          const std::string &routine_name,
                          const Device& device, const Context& context,
                          std::vector<std::string>& options) {
  auto header_string = std::string{""};

  header_string += "#define PRECISION " + ToString(static_cast<int>(precision)) + "\n";

  // Adds the name of the routine as a define
  header_string += "#define ROUTINE_" + routine_name + "\n";

  // Not all OpenCL compilers support the 'inline' keyword. The keyword is only used for devices on
  // which it is known to work with all OpenCL platforms.
  if (device.IsNVIDIA() || device.IsARM()) {
    header_string += "#define USE_INLINE_KEYWORD 1\n";
  }

  // For specific devices, use the non-IEEE754 compliant OpenCL mad() instruction. This can improve
  // performance, but might result in a reduced accuracy.
  if (device.IsAMD() && device.IsGPU()) {
    header_string += "#define USE_CL_MAD 1\n";
  }

  // For specific devices, use staggered/shuffled workgroup indices.
  if (device.IsAMD() && device.IsGPU()) {
    header_string += "#define USE_STAGGERED_INDICES 1\n";
  }

  // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
  // performance through better cache behaviour
  if (device.IsARM() && device.IsGPU()) {
    header_string += "#define GLOBAL_MEM_FENCE 1\n";
  }

  // Optionally adds a translation header from OpenCL kernels to CUDA kernels. This is appended to
  // the header rather than to 'source_string': the latter is a const reference (appending to it
  // does not compile) and the translation has to precede the kernel code that relies on it.
  #ifdef CUDA_API
    header_string +=
      #include "kernels/opencl_to_cuda.h"
    ;
  #endif

  // Loads the common header (typedefs and defines and such)
  header_string +=
    #include "kernels/common.opencl"
  ;

  // Prints details of the routine to compile in case of debugging in verbose mode
  #ifdef VERBOSE
    printf("[DEBUG] Compiling routine '%s-%s'\n",
           routine_name.c_str(), ToString(precision).c_str());
    const auto start_time = std::chrono::steady_clock::now();
  #endif

  // Compiles the kernel
  auto program = Program(context, header_string + source_string);
  try {
    program.Build(device, options);
  } catch (const CLCudaAPIBuildError &e) {
    // Shows the compiler output before passing the original exception on to the caller
    if (program.StatusIsCompilationWarningOrError(e.status())) {
      fprintf(stdout, "OpenCL compiler error/warning: %s\n",
              program.GetBuildInfo(device).c_str());
    }
    throw;
  }

  // Prints the elapsed compilation time in case of debugging in verbose mode
  #ifdef VERBOSE
    const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
    const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
    printf("[DEBUG] Completed compilation in %.2lf ms\n", timing);
  #endif

  return program;
}
// =================================================================================================
// Enqueues a kernel, waits for completion, and checks for errors
void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
std::vector<size_t> global, const std::vector<size_t> &local,

View File

@ -20,17 +20,12 @@
#include <vector>
#include "utilities/utilities.hpp"
#include "utilities/compile.hpp"
#include "database/database.hpp"
namespace clblast {
// =================================================================================================
// Compiles a program from source code. A generated header (precision and routine-name defines,
// device-dependent defines and the common kernel header) is placed in front of 'source_string', after
// which the program is created in 'context' and built for 'device' with the given 'options'. An
// exception raised by the build is propagated to the caller.
Program CompileFromSource(const std::string &source_string, const Precision precision,
const std::string &routine_name,
const Device& device, const Context& context,
std::vector<std::string>& options);
// Enqueues a kernel, waits for completion, and checks for errors
void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
std::vector<size_t> global, const std::vector<size_t> &local,

View File

@ -23,6 +23,7 @@
#include <iostream>
#include "utilities/utilities.hpp"
#include "utilities/compile.hpp"
#include "utilities/timing.hpp"
#include "tuning/configurations.hpp"

View File

@ -0,0 +1,99 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the kernel compilation functions (see the header for more information).
//
// =================================================================================================
#include <vector>
#include <chrono>
#include "routines/common.hpp"
namespace clblast {
// =================================================================================================
// Compiles a program from source code.
//
// A header is generated and placed in front of 'source_string' before compilation. It contains the
// precision define, a 'ROUTINE_<routine_name>' define, a set of device-dependent defines, optionally
// (CUDA_API builds only) the OpenCL-to-CUDA translation header, and the common kernel header.
//
// Arguments:
//   source_string  the kernel source code; it is never modified
//   precision      its integer value is emitted as the 'PRECISION' define
//   routine_name   emitted as 'ROUTINE_<routine_name>' and used in the verbose debug output
//   device         queried for vendor/type to select the defines; also the build target
//   context        the context in which the program is created
//   options        forwarded as-is to 'Program::Build'
//
// Returns the built program. If the build throws a 'CLCudaAPIBuildError', the build log is printed
// to stdout when the status denotes a compilation warning/error, and the exception is re-thrown.
Program CompileFromSource(const std::string &source_string, const Precision precision,
                          const std::string &routine_name,
                          const Device& device, const Context& context,
                          std::vector<std::string>& options) {
  auto header_string = std::string{""};

  header_string += "#define PRECISION " + ToString(static_cast<int>(precision)) + "\n";

  // Adds the name of the routine as a define
  header_string += "#define ROUTINE_" + routine_name + "\n";

  // Not all OpenCL compilers support the 'inline' keyword. The keyword is only used for devices on
  // which it is known to work with all OpenCL platforms.
  if (device.IsNVIDIA() || device.IsARM()) {
    header_string += "#define USE_INLINE_KEYWORD 1\n";
  }

  // For specific devices, use the non-IEEE754 compliant OpenCL mad() instruction. This can improve
  // performance, but might result in a reduced accuracy.
  if (device.IsAMD() && device.IsGPU()) {
    header_string += "#define USE_CL_MAD 1\n";
  }

  // For specific devices, use staggered/shuffled workgroup indices.
  if (device.IsAMD() && device.IsGPU()) {
    header_string += "#define USE_STAGGERED_INDICES 1\n";
  }

  // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
  // performance through better cache behaviour
  if (device.IsARM() && device.IsGPU()) {
    header_string += "#define GLOBAL_MEM_FENCE 1\n";
  }

  // Optionally adds a translation header from OpenCL kernels to CUDA kernels. This is appended to
  // the header rather than to 'source_string': the latter is a const reference (appending to it
  // does not compile) and the translation has to precede the kernel code that relies on it.
  #ifdef CUDA_API
    header_string +=
      #include "kernels/opencl_to_cuda.h"
    ;
  #endif

  // Loads the common header (typedefs and defines and such)
  header_string +=
    #include "kernels/common.opencl"
  ;

  // Prints details of the routine to compile in case of debugging in verbose mode
  #ifdef VERBOSE
    printf("[DEBUG] Compiling routine '%s-%s'\n",
           routine_name.c_str(), ToString(precision).c_str());
    const auto start_time = std::chrono::steady_clock::now();
  #endif

  // Compiles the kernel
  auto program = Program(context, header_string + source_string);
  try {
    program.Build(device, options);
  } catch (const CLCudaAPIBuildError &e) {
    // Shows the compiler output before passing the original exception on to the caller
    if (program.StatusIsCompilationWarningOrError(e.status())) {
      fprintf(stdout, "OpenCL compiler error/warning: %s\n",
              program.GetBuildInfo(device).c_str());
    }
    throw;
  }

  // Prints the elapsed compilation time in case of debugging in verbose mode
  #ifdef VERBOSE
    const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
    const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
    printf("[DEBUG] Completed compilation in %.2lf ms\n", timing);
  #endif

  return program;
}
// =================================================================================================
} // namespace clblast

View File

@ -0,0 +1,36 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains the CLBlast way to compile a kernel from source, used for the library and for
// the auto-tuners.
//
// =================================================================================================
#ifndef CLBLAST_UTILITIES_COMPILE_H_
#define CLBLAST_UTILITIES_COMPILE_H_
#include <string>
#include <vector>
#include "utilities/utilities.hpp"
namespace clblast {
// =================================================================================================
// Compiles a program from source code. A generated header (precision and routine-name defines,
// device-dependent defines and the common kernel header) is placed in front of 'source_string', after
// which the program is created in 'context' and built for 'device' with the given 'options'. An
// exception raised by the build is propagated to the caller.
Program CompileFromSource(const std::string &source_string, const Precision precision,
const std::string &routine_name,
const Device& device, const Context& context,
std::vector<std::string>& options);
// =================================================================================================
} // namespace clblast
// CLBLAST_UTILITIES_COMPILE_H_
#endif

View File

@ -21,7 +21,6 @@
#include <chrono>
#include "utilities/utilities.hpp"
#include "routines/common.hpp"
namespace clblast {
// =================================================================================================