Merge branch 'master' into CLBlast-267-convgemm

pull/319/head
Cedric Nugteren 2018-06-03 15:53:27 +02:00
commit 1c9a741470
26 changed files with 453 additions and 150 deletions

View File

@ -57,8 +57,8 @@ build_script:
after_build:
- ps: pushd $env:CLBLAST_BUILD
- 7z a CLBlast-1.3.0-Windows-x64.zip .\install_dir\*
- ps: mv CLBlast-1.3.0-Windows-x64.zip $env:APPVEYOR_BUILD_FOLDER
- 7z a CLBlast-1.4.0-Windows-x64.zip .\install_dir\*
- ps: mv CLBlast-1.4.0-Windows-x64.zip $env:APPVEYOR_BUILD_FOLDER
artifacts:
- path: '*.zip'

View File

@ -21,7 +21,7 @@ matrix:
env:
global:
- CLBLAST_VERSION=1.3.0
- CLBLAST_VERSION=1.4.0
- CLBLAST_ROOT=${TRAVIS_BUILD_DIR}/bin/clblast
- CLBLAST_INSTALL=${TRAVIS_BUILD_DIR}/bin/CLBlast-${CLBLAST_VERSION}
- CLBLAST_TAR=CLBlast-${CLBLAST_VERSION}-${TRAVIS_OS_NAME}-x64.tar.gz

View File

@ -1,5 +1,5 @@
Development (next version)
Version 1.4.0
- Added Python interface to CLBlast 'PyCLBlast'
- Added CLBlast to Ubuntu PPA and macOS Homebrew package managers
- Added an API to run the tuners programmatically without any I/O
@ -8,7 +8,8 @@ Development (next version)
- Re-added a local memory size constraint to the tuners
- The routine tuners now automatically pick up tuning results from disk from the kernel tuners
- Updated and reorganised the CLBlast documentation
- Added a 'canary' region to check for overflows in the tuner and tests (insipred by clARMOR)
- Added a 'canary' region to check for overflows in the tuner and tests (inspired by clARMOR)
- Added an option to test against and compare performance with Intel's MKL
- Fixed an access violation when compiled with Visual Studio upon releasing the OpenCL program
- Fixed incorrect releasing of the OpenCL program resulting in segfaults / access violations
- Various minor fixes and enhancements

View File

@ -21,7 +21,7 @@ endif()
# CMake project details
project("clblast" C CXX)
set(clblast_VERSION_MAJOR 1)
set(clblast_VERSION_MINOR 3)
set(clblast_VERSION_MINOR 4)
set(clblast_VERSION_PATCH 0)
set(clblast_VERSION "${clblast_VERSION_MAJOR}.${clblast_VERSION_MINOR}.${clblast_VERSION_PATCH}")
set(clblast_SOVERSION ${clblast_VERSION_MAJOR})
@ -170,16 +170,17 @@ if(${CMAKE_SYSTEM_NAME} STREQUAL Android)
else()
# Locates the reference BLAS libraries in case the tests need to be compiled. The "FindclBLAS.cmake",
# "FindCBLAS.cmake" and "FindcuBLAS.cmake" are included.
# "FindCBLAS.cmake", "FindMKL.cmake", and "FindcuBLAS.cmake" are included.
if(CLIENTS OR TESTS)
find_package(CBLAS)
find_package(MKL)
if(OPENCL)
find_package(clBLAS)
endif()
if(CUBLAS)
find_package(cuBLAS)
endif()
if(NOT CLBLAS_FOUND AND NOT CBLAS_FOUND)
if(NOT CLBLAS_FOUND AND NOT CBLAS_FOUND AND NOT MKL_FOUND)
if(TESTS)
message(STATUS "Could NOT find clBLAS nor a CPU BLAS, disabling the compilation of the tests")
set(TESTS OFF)
@ -423,12 +424,14 @@ if(TUNERS)
target_include_directories(clblast_tuner_${KERNEL} PUBLIC $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES> ${API_INCLUDE_DIRS})
install(TARGETS clblast_tuner_${KERNEL} DESTINATION bin)
endforeach()
foreach(ROUTINE_TUNER ${ROUTINE_TUNERS})
add_executable(clblast_tuner_routine_${ROUTINE_TUNER} ${TUNERS_COMMON} src/tuning/routines/${ROUTINE_TUNER}.cpp test/test_utilities.cpp)
target_link_libraries(clblast_tuner_routine_${ROUTINE_TUNER} clblast)
target_include_directories(clblast_tuner_routine_${ROUTINE_TUNER} PUBLIC $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES> ${API_INCLUDE_DIRS} ${clblast_SOURCE_DIR})
install(TARGETS clblast_tuner_routine_${ROUTINE_TUNER} DESTINATION bin)
endforeach()
if(OPENCL)
foreach(ROUTINE_TUNER ${ROUTINE_TUNERS})
add_executable(clblast_tuner_routine_${ROUTINE_TUNER} ${TUNERS_COMMON} src/tuning/routines/${ROUTINE_TUNER}.cpp test/test_utilities.cpp)
target_link_libraries(clblast_tuner_routine_${ROUTINE_TUNER} clblast)
target_include_directories(clblast_tuner_routine_${ROUTINE_TUNER} PUBLIC $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES> ${API_INCLUDE_DIRS} ${clblast_SOURCE_DIR})
install(TARGETS clblast_tuner_routine_${ROUTINE_TUNER} DESTINATION bin)
endforeach()
endif()
# Adds 'alltuners' target: runs all tuners for all precisions
set(ALLTUNERS )
@ -439,12 +442,14 @@ if(TUNERS)
endforeach()
set(ALLTUNERSDEPENDS clblast_tuner_${KERNEL})
endforeach()
foreach(ROUTINE_TUNER ${ROUTINE_TUNERS})
foreach(PRECISION ${PRECISIONS})
set(ALLTUNERS ${ALLTUNERS} COMMAND clblast_tuner_routine_${ROUTINE_TUNER} -precision ${PRECISION})
if(OPENCL)
foreach(ROUTINE_TUNER ${ROUTINE_TUNERS})
foreach(PRECISION ${PRECISIONS})
set(ALLTUNERS ${ALLTUNERS} COMMAND clblast_tuner_routine_${ROUTINE_TUNER} -precision ${PRECISION})
endforeach()
set(ALLTUNERSDEPENDS clblast_tuner_routine_${ROUTINE_TUNER})
endforeach()
set(ALLTUNERSDEPENDS clblast_tuner_routine_${ROUTINE_TUNER})
endforeach()
endif()
add_custom_target(alltuners ${ALLTUNERS} DEPENDS ${ALLTUNERSDEPENDS})
endif()
@ -468,9 +473,19 @@ if(CLIENTS OR TESTS)
add_definitions(" -DCLBLAST_REF_CLBLAS")
endif()
endif()
if(CBLAS_FOUND)
set(REF_INCLUDES ${REF_INCLUDES} ${CBLAS_INCLUDE_DIRS})
set(REF_LIBRARIES ${REF_LIBRARIES} ${CBLAS_LIBRARIES})
if(CBLAS_FOUND OR MKL_FOUND)
if(MKL_FOUND) # prefers MKL over another CBLAS implementation
set(REF_INCLUDES ${REF_INCLUDES} ${MKL_INCLUDE_DIRS})
set(REF_LIBRARIES ${REF_LIBRARIES} ${MKL_LIBRARIES})
if(MSVC)
add_definitions(" /DCLBLAST_REF_CBLAS_MKL")
else()
add_definitions(" -DCLBLAST_REF_CBLAS_MKL")
endif()
else()
set(REF_INCLUDES ${REF_INCLUDES} ${CBLAS_INCLUDE_DIRS})
set(REF_LIBRARIES ${REF_LIBRARIES} ${CBLAS_LIBRARIES})
endif()
set(WRAPPERS ${WRAPPERS} test/wrapper_cblas.hpp)
if(MSVC)
add_definitions(" /DCLBLAST_REF_CBLAS")

View File

@ -2,9 +2,9 @@
CLBlast: The tuned OpenCL BLAS library
================
| | Build status | Tests on Intel CPU | Tests on NVIDIA GPU | Tests on Intel GPU |
| | Build status | Tests on Intel CPU | Tests on NVIDIA GPU | Other tests |
|-----|-----|-----|-----|-----|
| Windows | [![Build Status](https://ci.appveyor.com/api/projects/status/github/cnugteren/clblast?branch=master&svg=true)](https://ci.appveyor.com/project/CNugteren/clblast) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-windows-intel-i7-4790k.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-windows-intel-i7-4790k) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-windows-nvidia-k5000.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-windows-nvidia-k5000) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-windows-intel-HD4600.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-windows-intel-HD4600) |
| Windows | [![Build Status](https://ci.appveyor.com/api/projects/status/github/cnugteren/clblast?branch=master&svg=true)](https://ci.appveyor.com/project/CNugteren/clblast) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-windows-intel-i7-4790k.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-windows-intel-i7-4790k) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-windows-nvidia-k5000.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-windows-nvidia-k5000) | N/A |
| Linux | [![Build Status](https://travis-ci.org/CNugteren/CLBlast.svg?branch=master)](https://travis-ci.org/CNugteren/CLBlast/branches) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-linux-intel-e5-2620-v4.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-linux-intel-e5-2620-v4) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-linux-nvidia-k80.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-linux-nvidia-k80) | N/A |
| OS X | [![Build Status](https://travis-ci.org/CNugteren/CLBlast.svg?branch=master)](https://travis-ci.org/CNugteren/CLBlast/branches) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-osx-intel-i5-4278U.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-osx-intel-i5-4278U) | N/A | N/A |

View File

@ -18,8 +18,8 @@ This file gives an overview of the main features planned for addition to CLBlast
| [#223](https://github.com/CNugteren/CLBlast/issues/223) | Feb '18 | CNugteren | ✔ | Python OpenCL interface |
| [#237](https://github.com/CNugteren/CLBlast/issues/237) | Mar '18 | CNugteren | ✔ | Making tuning possible from the CLBlast API |
| [#228](https://github.com/CNugteren/CLBlast/issues/228) | Mar-Apr '18 | CNugteren | ✔ | Improving performance for Qualcomm Adreno GPUs |
| [#267](https://github.com/CNugteren/CLBlast/issues/267) | May '18 | CNugteren | | Merge im2col and GEMM into a direct kernel |
| [#270](https://github.com/CNugteren/CLBlast/issues/270) | July '18 | CNugteren | | Implement col2im |
| - | July '18 | CNugteren | | Add a SYCL interface to the library |
| [#267](https://github.com/CNugteren/CLBlast/issues/267) | July '18 | CNugteren | | Merge im2col and GEMM into a direct kernel |
| [#270](https://github.com/CNugteren/CLBlast/issues/270) | Aug '18 | CNugteren | | Implement col2im |
| - | Aug '18 | CNugteren | | Add a SYCL interface to the library |
| [#136](https://github.com/CNugteren/CLBlast/issues/136) | ?? | CNugteren | | Implement xAXPBY and xSET |
| [#169](https://github.com/CNugteren/CLBlast/issues/169) | ?? | dividiti | | Problem-specific tuning parameter selection |

View File

@ -48,7 +48,7 @@ mark_as_advanced(CBLAS_INCLUDE_DIRS)
# Finds the library
find_library(CBLAS_LIBRARIES
NAMES cblas blas mkl blis openblas accelerate
NAMES cblas blas blis openblas accelerate
HINTS ${CBLAS_HINTS}
PATH_SUFFIXES
lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 lib/import lib64/import

View File

@ -0,0 +1,72 @@
# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
# width of 100 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
#
# ==================================================================================================
#
# CMake find-module for the Intel MKL BLAS library (used as a CPU BLAS reference for the tests).
#
# Defines the following variables:
# MKL_FOUND Boolean holding whether or not the Intel MKL BLAS library was found
# MKL_INCLUDE_DIRS The Intel MKL BLAS include directory
# MKL_LIBRARIES The Intel MKL BLAS library
#
# In case MKL is not installed in the default directory, set the MKL_ROOT variable to point to
# the root of MKL, such that 'mkl_cblas.h' can be found in $MKL_ROOT/include. This can either be
# done using an environment variable (e.g. export MKL_ROOT=/path/to/MKL) or using a CMake
# variable (e.g. cmake -DMKL_ROOT=/path/to/MKL ..).
#
# ==================================================================================================
# Sets the possible install locations. HINTS (the user-specified MKL_ROOT, as either a CMake or an
# environment variable) are searched by find_path/find_library before the system-wide PATHS below.
set(MKL_HINTS
${MKL_ROOT}
$ENV{MKL_ROOT}
)
set(MKL_PATHS
/usr
/usr/local
/usr/local/opt
/usr/local/mkl
/opt/intel
/opt/intel/mkl
)
# Finds the include directory containing the MKL CBLAS header
find_path(MKL_INCLUDE_DIRS
NAMES mkl_cblas.h
HINTS ${MKL_HINTS}
PATH_SUFFIXES
include inc include/x86_64 include/x64
PATHS ${MKL_PATHS}
DOC "Intel MKL CBLAS include header mkl_cblas.h"
)
mark_as_advanced(MKL_INCLUDE_DIRS)
# Finds the libraries. Linking MKL requires several components at once: the LP64 interface layer,
# the threading layer, the core library, and Intel's OpenMP runtime (iomp5).
set(MKL_LIB_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 lib/import lib64/import lib/intel64)
find_library(MKL_LIBRARIES_LP64 NAMES mkl_intel_lp64 HINTS ${MKL_HINTS} PATH_SUFFIXES ${MKL_LIB_SUFFIXES} PATHS ${MKL_PATHS} DOC "Intel MKL lp64 library")
find_library(MKL_LIBRARIES_THREAD NAMES mkl_intel_thread HINTS ${MKL_HINTS} PATH_SUFFIXES ${MKL_LIB_SUFFIXES} PATHS ${MKL_PATHS} DOC "Intel MKL thread library")
find_library(MKL_LIBRARIES_CORE NAMES mkl_core HINTS ${MKL_HINTS} PATH_SUFFIXES ${MKL_LIB_SUFFIXES} PATHS ${MKL_PATHS} DOC "Intel MKL core library")
find_library(MKL_LIBRARIES_OMP NAMES iomp5 HINTS ${MKL_HINTS} PATH_SUFFIXES ${MKL_LIB_SUFFIXES} PATHS ${MKL_PATHS} DOC "Intel OpenMP library")
# NOTE(review): if one component is missing, its <var>-NOTFOUND value ends up inside this list —
# presumably the found-check below then fails as intended; verify behaviour for partial installs.
set(MKL_LIBRARIES ${MKL_LIBRARIES_LP64} ${MKL_LIBRARIES_THREAD} ${MKL_LIBRARIES_CORE} ${MKL_LIBRARIES_OMP})
mark_as_advanced(MKL_LIBRARIES)
# ==================================================================================================
# Notification messages for the case the header or the libraries were not found
if(NOT MKL_INCLUDE_DIRS)
message(STATUS "Could NOT find 'mkl_cblas.h', install MKL or set MKL_ROOT")
endif()
if(NOT MKL_LIBRARIES)
message(STATUS "Could NOT find the Intel MKL BLAS library, install it or set MKL_ROOT")
endif()
# Determines whether or not MKL was found: sets MKL_FOUND based on the two result variables above
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(MKL DEFAULT_MSG MKL_INCLUDE_DIRS MKL_LIBRARIES)
# ==================================================================================================

View File

@ -84,6 +84,74 @@ The kernels `gemm` and `gemm_direct` have too many parameters to explore. Theref
There are also several routine-level tuners. They tune inter-kernel parameters and should only be run after the kernels are tuned. However, they do automatically pick up kernel tuning results from the current folder if there are any. An example is the GEMM routine tuner, which determines when to use the direct or the indirect GEMM kernel.
Here are all the tuners included in the `make alltuners` target (in the same order) with all their precision arguments:
./clblast_tuner_copy_fast -precision 32
./clblast_tuner_copy_fast -precision 64
./clblast_tuner_copy_fast -precision 3232
./clblast_tuner_copy_fast -precision 6464
./clblast_tuner_copy_fast -precision 16
./clblast_tuner_copy_pad -precision 32
./clblast_tuner_copy_pad -precision 64
./clblast_tuner_copy_pad -precision 3232
./clblast_tuner_copy_pad -precision 6464
./clblast_tuner_copy_pad -precision 16
./clblast_tuner_transpose_fast -precision 32
./clblast_tuner_transpose_fast -precision 64
./clblast_tuner_transpose_fast -precision 3232
./clblast_tuner_transpose_fast -precision 6464
./clblast_tuner_transpose_fast -precision 16
./clblast_tuner_transpose_pad -precision 32
./clblast_tuner_transpose_pad -precision 64
./clblast_tuner_transpose_pad -precision 3232
./clblast_tuner_transpose_pad -precision 6464
./clblast_tuner_transpose_pad -precision 16
./clblast_tuner_xaxpy -precision 32
./clblast_tuner_xaxpy -precision 64
./clblast_tuner_xaxpy -precision 3232
./clblast_tuner_xaxpy -precision 6464
./clblast_tuner_xaxpy -precision 16
./clblast_tuner_xdot -precision 32
./clblast_tuner_xdot -precision 64
./clblast_tuner_xdot -precision 3232
./clblast_tuner_xdot -precision 6464
./clblast_tuner_xdot -precision 16
./clblast_tuner_xger -precision 32
./clblast_tuner_xger -precision 64
./clblast_tuner_xger -precision 3232
./clblast_tuner_xger -precision 6464
./clblast_tuner_xger -precision 16
./clblast_tuner_xgemm -precision 32
./clblast_tuner_xgemm -precision 64
./clblast_tuner_xgemm -precision 3232
./clblast_tuner_xgemm -precision 6464
./clblast_tuner_xgemm -precision 16
./clblast_tuner_xgemm_direct -precision 32
./clblast_tuner_xgemm_direct -precision 64
./clblast_tuner_xgemm_direct -precision 3232
./clblast_tuner_xgemm_direct -precision 6464
./clblast_tuner_xgemm_direct -precision 16
./clblast_tuner_xgemv -precision 32
./clblast_tuner_xgemv -precision 64
./clblast_tuner_xgemv -precision 3232
./clblast_tuner_xgemv -precision 6464
./clblast_tuner_xgemv -precision 16
./clblast_tuner_invert -precision 32
./clblast_tuner_invert -precision 64
./clblast_tuner_invert -precision 3232
./clblast_tuner_invert -precision 6464
./clblast_tuner_invert -precision 16
./clblast_tuner_routine_xgemm -precision 32
./clblast_tuner_routine_xgemm -precision 64
./clblast_tuner_routine_xgemm -precision 3232
./clblast_tuner_routine_xgemm -precision 6464
./clblast_tuner_routine_xgemm -precision 16
./clblast_tuner_routine_xtrsv -precision 32
./clblast_tuner_routine_xtrsv -precision 64
./clblast_tuner_routine_xtrsv -precision 3232
./clblast_tuner_routine_xtrsv -precision 6464
./clblast_tuner_routine_xtrsv -precision 16
Using the tuning results
-------------

View File

@ -678,8 +678,8 @@ public:
}
// Regular constructor with memory management
explicit Kernel(const Program &program, const std::string &name): name_(name) {
CheckError(cuModuleGetFunction(&kernel_, program.GetModule(), name.c_str()));
explicit Kernel(const std::shared_ptr<Program> program, const std::string &name): name_(name) {
CheckError(cuModuleGetFunction(&kernel_, program->GetModule(), name.c_str()));
}
// Sets a kernel argument at the indicated position. This stores both the value of the argument

View File

@ -41,7 +41,7 @@ const DatabaseEntry XgerApple = {
"Xger", Precision::kAny, {"WGS1", "WGS2", "WPT"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 64, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } } } } }
};
const DatabaseEntry XtrsvApple = {
"Xtrsv", Precision::kAny, {"TRSV_BLOCK_SIZE"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } } } } }
"Xtrsv", Precision::kAny, {"TRSV_BLOCK_SIZE"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } } } } }
};
const DatabaseEntry XgemmApple = {
"Xgemm", Precision::kAny, {"GEMMK", "KREG", "KWG", "KWI", "MDIMA", "MDIMC", "MWG", "NDIMB", "NDIMC", "NWG", "SA", "SB", "STRM", "STRN", "VWM", "VWN"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1 } } } } } } }
@ -62,7 +62,10 @@ const DatabaseEntry PadtransposeApple = {
"Padtranspose", Precision::kAny, {"PADTRA_PAD", "PADTRA_TILE", "PADTRA_WPT"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } } } } }
};
const DatabaseEntry InvertApple = {
"Invert", Precision::kAny, {"INTERNAL_BLOCK_SIZE"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } } } } }
"Invert", Precision::kAny, {"INTERNAL_BLOCK_SIZE"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } } } } }
};
const DatabaseEntry TrsvRoutineApple = {
"TrsvRoutine", Precision::kAny, {"TRSV_BLOCK_SIZE"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } } } } }
};
// =================================================================================================

View File

@ -45,7 +45,8 @@ const std::vector<database::DatabaseEntry> Database::apple_cpu_fallback = std::v
database::XgemvApple, database::XgemvFastApple, database::XgemvFastRotApple, database::XgerApple, database::XtrsvApple,
database::XgemmApple, database::XgemmDirectApple,
database::CopyApple, database::PadApple, database::TransposeApple, database::PadtransposeApple,
database::InvertApple
database::InvertApple,
database::TrsvRoutineApple
};
// The default values
@ -98,7 +99,8 @@ Database::Database(const Device &device, const std::string &kernel_name,
if (device.Type() == "CPU") {
const auto extensions = device.Capabilities();
const auto is_apple = (extensions.find("cl_APPLE_SetMemObjectDestructor") == std::string::npos) ? false : true;
if (is_apple) {
const auto is_likely_apple = device.MaxWorkGroupSize() <= 32;
if (is_apple || is_likely_apple) {
databases.push_front(apple_cpu_fallback);
}
}

View File

@ -18,7 +18,7 @@ R"(
// =================================================================================================
#if defined(ROUTINE_TRSV)
__kernel __attribute__((reqd_work_group_size(16, 1, 1)))
__kernel
void FillVector(const int n, const int inc, const int offset,
__global real* restrict dest, const real_arg arg_value) {
const real value = GetRealArg(arg_value);

View File

@ -19,7 +19,7 @@ R"(
#if defined(ROUTINE_INVERT)
// B21 = A21 * B11
__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
__kernel
void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@ -28,7 +28,7 @@ void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const in
}
// B21 = -B22 * B21
__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
__kernel
void TripleMatMul16Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
@ -36,7 +36,7 @@ void TripleMatMul16Part2Lower(int n, __global real* restrict dest, int current_s
}
// B21 = A21 * B11
__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
__kernel
void TripleMatMul32Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@ -45,7 +45,7 @@ void TripleMatMul32Part1Lower(int n, __global const real* restrict src, const in
}
// B21 = -B22 * B21
__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
__kernel
void TripleMatMul32Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
@ -53,7 +53,7 @@ void TripleMatMul32Part2Lower(int n, __global real* restrict dest, int current_s
}
// B21 = A21 * B11
__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
__kernel
void TripleMatMul64Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@ -62,7 +62,7 @@ void TripleMatMul64Part1Lower(int n, __global const real* restrict src, const in
}
// B21 = -B22 * B21
__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
__kernel
void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
@ -72,7 +72,7 @@ void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_s
// =================================================================================================
// B12 = A12 * B22
__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
__kernel
void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@ -81,7 +81,7 @@ void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const in
}
// B12 = -B11 * B12
__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
__kernel
void TripleMatMul16Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
@ -89,7 +89,7 @@ void TripleMatMul16Part2Upper(int n, __global real* restrict dest, int current_s
}
// B12 = A12 * B22
__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
__kernel
void TripleMatMul32Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@ -98,7 +98,7 @@ void TripleMatMul32Part1Upper(int n, __global const real* restrict src, const in
}
// B12 = -B11 * B12
__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
__kernel
void TripleMatMul32Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
@ -106,7 +106,7 @@ void TripleMatMul32Part2Upper(int n, __global real* restrict dest, int current_s
}
// B12 = A12 * B22
__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
__kernel
void TripleMatMul64Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@ -115,7 +115,7 @@ void TripleMatMul64Part1Upper(int n, __global const real* restrict src, const in
}
// B12 = -B11 * B12
__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
__kernel
void TripleMatMul64Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];

View File

@ -76,7 +76,7 @@ R"(
// =================================================================================================
#if defined(ROUTINE_INVERT) || defined(ROUTINE_TRSM)
__kernel __attribute__((reqd_work_group_size(16, 1, 1)))
__kernel
void FillMatrix(const int m, const int n, const int ld, const int offset,
__global real* restrict dest, const real_arg arg_value) {
const real value = GetRealArg(arg_value);

View File

@ -13,6 +13,7 @@
#include <vector>
#include <chrono>
#include <iostream>
#include "routines/common.hpp"
@ -38,13 +39,22 @@ void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
auto local_size = size_t{1};
for (auto &item: local) { local_size *= item; }
if (local_size > device.MaxWorkGroupSize()) {
throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsTotal);
throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsTotal,
ToString(local_size) + " is larger than " + ToString(device.MaxWorkGroupSize()));
}
// Make sure the global thread sizes are at least equal to the local sizes
for (auto i=size_t{0}; i<global.size(); ++i) {
if (global[i] < local[i]) { global[i] = local[i]; }
}
// Verify that the global thread sizes are a multiple of the local sizes
for (auto i=size_t{0}; i<global.size(); ++i) {
if ((global[i] / local[i]) * local[i] != global[i]) {
throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsDim,
ToString(global[i]) + " is not divisible by " + ToString(local[i]));
}
}
}
// Tests for local memory usage
@ -77,11 +87,10 @@ void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
// Sets all elements of a matrix to a constant value
template <typename T>
void FillMatrix(Queue &queue, const Device &device,
const std::shared_ptr<Program> program, const Databases &,
const std::shared_ptr<Program> program,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t m, const size_t n, const size_t ld, const size_t offset,
const Buffer<T> &dest,
const T constant_value) {
const Buffer<T> &dest, const T constant_value, const size_t local_size) {
auto kernel = Kernel(program, "FillMatrix");
kernel.SetArgument(0, static_cast<int>(m));
kernel.SetArgument(1, static_cast<int>(n));
@ -89,63 +98,62 @@ void FillMatrix(Queue &queue, const Device &device,
kernel.SetArgument(3, static_cast<int>(offset));
kernel.SetArgument(4, dest());
kernel.SetArgument(5, GetRealArg(constant_value));
auto local = std::vector<size_t>{16, 1};
auto global = std::vector<size_t>{Ceil(m, 16), n};
auto local = std::vector<size_t>{local_size, 1};
auto global = std::vector<size_t>{Ceil(m, local_size), n};
RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
// Compiles the above function
template void FillMatrix<half>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
template void FillMatrix<half>(Queue&, const Device&, const std::shared_ptr<Program>,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const size_t, const Buffer<half>&, const half);
template void FillMatrix<float>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
const size_t, const size_t, const Buffer<half>&, const half, const size_t);
template void FillMatrix<float>(Queue&, const Device&, const std::shared_ptr<Program>,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const size_t, const Buffer<float>&, const float);
template void FillMatrix<double>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
const size_t, const size_t, const Buffer<float>&, const float, const size_t);
template void FillMatrix<double>(Queue&, const Device&, const std::shared_ptr<Program>,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const size_t, const Buffer<double>&, const double);
template void FillMatrix<float2>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
const size_t, const size_t, const Buffer<double>&, const double, const size_t);
template void FillMatrix<float2>(Queue&, const Device&, const std::shared_ptr<Program>,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const size_t, const Buffer<float2>&, const float2);
template void FillMatrix<double2>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
const size_t, const size_t, const Buffer<float2>&, const float2, const size_t);
template void FillMatrix<double2>(Queue&, const Device&, const std::shared_ptr<Program>,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const size_t, const Buffer<double2>&, const double2);
const size_t, const size_t, const Buffer<double2>&, const double2, const size_t);
// Sets all elements of a vector to a constant value
template <typename T>
void FillVector(Queue &queue, const Device &device,
const std::shared_ptr<Program> program, const Databases &,
const std::shared_ptr<Program> program,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t n, const size_t inc, const size_t offset,
const Buffer<T> &dest,
const T constant_value) {
const Buffer<T> &dest, const T constant_value, const size_t local_size) {
auto kernel = Kernel(program, "FillVector");
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, static_cast<int>(inc));
kernel.SetArgument(2, static_cast<int>(offset));
kernel.SetArgument(3, dest());
kernel.SetArgument(4, GetRealArg(constant_value));
auto local = std::vector<size_t>{16};
auto global = std::vector<size_t>{Ceil(n, 16)};
auto local = std::vector<size_t>{local_size};
auto global = std::vector<size_t>{Ceil(n, local_size)};
RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
// Compiles the above function
template void FillVector<half>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
template void FillVector<half>(Queue&, const Device&, const std::shared_ptr<Program>,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const Buffer<half>&, const half);
template void FillVector<float>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
const size_t, const Buffer<half>&, const half, const size_t);
template void FillVector<float>(Queue&, const Device&, const std::shared_ptr<Program>,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const Buffer<float>&, const float);
template void FillVector<double>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
const size_t, const Buffer<float>&, const float, const size_t);
template void FillVector<double>(Queue&, const Device&, const std::shared_ptr<Program>,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const Buffer<double>&, const double);
template void FillVector<float2>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
const size_t, const Buffer<double>&, const double, const size_t);
template void FillVector<float2>(Queue&, const Device&, const std::shared_ptr<Program>,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const Buffer<float2>&, const float2);
template void FillVector<double2>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
const size_t, const Buffer<float2>&, const float2, const size_t);
template void FillVector<double2>(Queue&, const Device&, const std::shared_ptr<Program>,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const Buffer<double2>&, const double2);
const size_t, const Buffer<double2>&, const double2, const size_t);
// =================================================================================================
} // namespace clblast

View File

@ -36,20 +36,18 @@ void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
// Sets all elements of a matrix to a constant value
template <typename T>
void FillMatrix(Queue &queue, const Device &device,
const std::shared_ptr<Program> program, const Databases &,
const std::shared_ptr<Program> program,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t m, const size_t n, const size_t ld, const size_t offset,
const Buffer<T> &dest,
const T constant_value);
const Buffer<T> &dest, const T constant_value, const size_t local_size);
// Sets all elements of a vector to a constant value
template <typename T>
void FillVector(Queue &queue, const Device &device,
const std::shared_ptr<Program> program, const Databases &,
const std::shared_ptr<Program> program,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t n, const size_t inc, const size_t offset,
const Buffer<T> &dest,
const T constant_value);
const Buffer<T> &dest, const T constant_value, const size_t local_size);
// =================================================================================================

View File

@ -68,7 +68,7 @@ void Xtrsv<T>::Substitution(const Layout layout, const Triangle triangle,
// Launches the kernel
const auto local = std::vector<size_t>{db_["TRSV_BLOCK_SIZE"]};
const auto global = std::vector<size_t>{1};
const auto global = std::vector<size_t>{Ceil(n, db_["TRSV_BLOCK_SIZE"])};
auto event = Event();
RunKernel(kernel, queue_, device_, global, local, event.pointer());
event.WaitForCompletion();
@ -87,6 +87,11 @@ void Xtrsv<T>::DoTrsv(const Layout layout, const Triangle triangle,
// Makes sure all dimensions are larger than zero
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Some parts of this kernel are not tunable and thus require some minimal OpenCL properties
if (device_.MaxWorkGroupSize() < 16) { // minimum of total local work size of 16
throw RuntimeErrorCode(StatusCode::kNotImplemented);
}
// Tests the matrix and vector
TestMatrixA(n, n, a_buffer, a_offset, a_ld);
TestVectorX(n, b_buffer, b_offset, b_inc);
@ -102,8 +107,8 @@ void Xtrsv<T>::DoTrsv(const Layout layout, const Triangle triangle,
// Fills the output buffer with zeros
auto eventWaitList = std::vector<Event>();
auto fill_vector_event = Event();
FillVector(queue_, device_, program_, db_, fill_vector_event.pointer(), eventWaitList,
n, x_inc, x_offset, x_buffer, ConstantZero<T>());
FillVector(queue_, device_, program_, fill_vector_event.pointer(), eventWaitList,
n, x_inc, x_offset, x_buffer, ConstantZero<T>(), 16);
fill_vector_event.WaitForCompletion();
// Derives properties based on the arguments

View File

@ -25,9 +25,9 @@ class Xgemm: public Routine {
public:
// Defines the assumptions of the GEMM kernels
static const bool a_want_rotated_(const size_t gemm_kernel_id) { return gemm_kernel_id == 1; }
static const bool b_want_rotated_(const size_t gemm_kernel_id) { return true; }
static const bool c_want_rotated_(const size_t gemm_kernel_id) { return gemm_kernel_id == 1; }
static bool a_want_rotated_(const size_t gemm_kernel_id) { return gemm_kernel_id == 1; }
static bool b_want_rotated_(const size_t) { return true; }
static bool c_want_rotated_(const size_t gemm_kernel_id) { return gemm_kernel_id == 1; }
// Computes the size of the temporary GEMM buffer based on user-arguments
static size_t GetTempSize(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,

View File

@ -78,6 +78,11 @@ void Xtrsm<T>::TrsmColMajor(const Side side, const Triangle triangle,
// Makes sure all dimensions are larger than zero
if ((m == 0) || (n == 0)) { throw BLASError(StatusCode::kInvalidDimension); }
// Some parts of this kernel are not tunable and thus require some minimal OpenCL properties
if (device_.MaxWorkGroupSize() < 16) { // minimum of total local work size of 16
throw RuntimeErrorCode(StatusCode::kNotImplemented);
}
// Computes the k dimension. This is based on whether or not matrix is A (on the left)
// or B (on the right) in the Xgemm routine.
const auto k = (side == Side::kLeft) ? m : n;
@ -105,8 +110,8 @@ void Xtrsm<T>::TrsmColMajor(const Side side, const Triangle triangle,
// Fills the output buffer with zeros
auto eventWaitList = std::vector<Event>();
auto fill_matrix_event = Event();
FillMatrix(queue_, device_, program_, db_, fill_matrix_event.pointer(), eventWaitList,
x_one, x_two, x_ld, x_offset, x_buffer, ConstantZero<T>());
FillMatrix(queue_, device_, program_, fill_matrix_event.pointer(), eventWaitList,
x_one, x_two, x_ld, x_offset, x_buffer, ConstantZero<T>(), 16);
fill_matrix_event.WaitForCompletion();
// Inverts the diagonal blocks

View File

@ -49,9 +49,16 @@ void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle
throw BLASError(StatusCode::kInvalidDimension);
}
// Some parts of this kernel are not tunable and thus require some minimal OpenCL properties
if (device_.MaxWorkGroupSize() < 16) { // minimum of total local work size of 16
throw RuntimeErrorCode(StatusCode::kNotImplemented);
}
// Helper variables
const auto internal_block_size = static_cast<size_t>(db_["INTERNAL_BLOCK_SIZE"]);
assert(internal_block_size == 16);
if (internal_block_size != 16) {
throw RuntimeErrorCode(StatusCode::kNotImplemented); // e.g. Apple CPU OpenCL with a WGS of 1
} // when barriers are present
const auto num_blocks = CeilDiv(n, block_size);
const auto num_internal_blocks = CeilDiv(n, internal_block_size);
const auto unit_diagonal = (diag == Diagonal::kUnit) ? true : false;
@ -75,8 +82,9 @@ void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle
// Fills the output buffer with zeros
auto event_wait_list = std::vector<Event>();
auto fill_matrix_event = Event();
FillMatrix(queue_, device_, program_, db_, fill_matrix_event.pointer(), event_wait_list,
block_size, num_blocks * block_size, block_size, 0, dest, ConstantZero<T>());
FillMatrix(queue_, device_, program_, fill_matrix_event.pointer(), event_wait_list,
block_size, num_blocks * block_size, block_size, 0, dest, ConstantZero<T>(),
16);
event_wait_list.push_back(fill_matrix_event);
// Inverts the diagonal IB by IB inner blocks of the matrix: one block per work-group
@ -89,11 +97,11 @@ void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle
kernel.SetArgument(5, static_cast<int>(block_size));
kernel.SetArgument(6, static_cast<int>(unit_diagonal));
kernel.SetArgument(7, static_cast<int>(is_upper));
const auto local = std::vector<size_t>{internal_block_size};
const auto global = std::vector<size_t>{num_internal_blocks * internal_block_size};
const auto local_invert = std::vector<size_t>{internal_block_size};
const auto global_invert = std::vector<size_t>{num_internal_blocks * internal_block_size};
auto base_kernel_event = Event();
auto base_kernel_event_pointer = (internal_block_size == block_size) ? event_ : base_kernel_event.pointer();
RunKernel(kernel, queue_, device_, global, local, base_kernel_event_pointer, event_wait_list);
RunKernel(kernel, queue_, device_, global_invert, local_invert, base_kernel_event_pointer, event_wait_list);
if (internal_block_size == block_size) { event_wait_list.push_back(base_kernel_event); }
// Builds up block_size x block_size blocks. For example, internal_block_size=16:
@ -107,7 +115,8 @@ void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle
const auto npages = CeilDiv(n, current_size*2);
const auto local0 = (current_size <= 32) ? current_size/4 : 16;
const auto local = std::vector<size_t>{local0, 4};
const auto global = std::vector<size_t>{(current_size/local[1]), npages*(current_size/16)*local[1]};
const auto global = std::vector<size_t>{Ceil(current_size/local[1], local[0]),
Ceil(npages*(current_size/16)*local[1], local[1])};
// Part 1
auto kernel1 = Kernel(program_, "TripleMatMul" + ToString(current_size) + "Part1" + name_postfix);

View File

@ -25,14 +25,15 @@ namespace clblast {
// =================================================================================================
template <typename T>
void RunGemmRoutine(const size_t value, const Queue& queue, const std::vector<Buffer<T>>& buffers) {
void RunGemmRoutineMNK(const size_t m, const size_t n, const size_t k,
const Queue& queue, const std::vector<Buffer<T>>& buffers) {
auto queue_plain = queue();
auto event = cl_event{};
auto status = Gemm(Layout::kRowMajor, Transpose::kNo, Transpose::kNo,
value, value, value, ConstantOne<T>(),
buffers[0](), 0, value,
buffers[1](), 0, value, ConstantOne<T>(),
buffers[2](), 0, value,
m, n, k, ConstantOne<T>(),
buffers[0](), 0, k,
buffers[1](), 0, n, ConstantOne<T>(),
buffers[2](), 0, n,
&queue_plain, &event);
if (status != StatusCode::kSuccess) {
throw RuntimeError("Gemm failed with status " + ToString(status));
@ -40,6 +41,10 @@ void RunGemmRoutine(const size_t value, const Queue& queue, const std::vector<Bu
clWaitForEvents(1, &event);
clReleaseEvent(event);
}
template <typename T>
void RunGemmRoutine(const size_t value, const Queue& queue, const std::vector<Buffer<T>>& buffers) {
RunGemmRoutineMNK(value, value, value, queue, buffers);
}
template <typename T, size_t batch_count>
void RunGemmBatchedRoutine(const size_t value, const Queue& queue, const std::vector<Buffer<T>>& buffers) {
@ -80,6 +85,55 @@ void RunGemmStridedBatchedRoutine(const size_t value, const Queue& queue, const
clWaitForEvents(1, &event);
clReleaseEvent(event);
}
// =================================================================================================
template <typename T>
void TuneGemmSingleSize(const Platform& platform, const Device& device, const Context& context, Queue& queue,
const size_t m, const size_t n, const size_t k, const size_t num_runs) {
// Buffers
auto buffers = std::vector<Buffer<T>>{
Buffer<T>(context, m * k),
Buffer<T>(context, k * n),
Buffer<T>(context, m * n)
};
const auto FunctionToTune = [&]() { RunGemmRoutineMNK(m, n, k, queue, buffers); };
// Collects the timings for two methods
auto scores = std::vector<TuningResult>();
const auto methods = std::vector<std::string>{"in-direct", "direct"};
for (auto& method: methods) {
printf("* Testing the %s routine\n", method.c_str());
const auto limit = (method == "in-direct") ? 0 : std::max(std::max(m, n), k) + 1; // small or large number
ForceSelectIndirectFrom<T>(limit, device, "GemmRoutine", "XGEMM_MIN_INDIRECT_SIZE");
auto time_ms = -1.0;
try {
time_ms = TimeFunction(num_runs, FunctionToTune);
printf(" --> %9.2lf ms\n", time_ms);
}
catch (...) {
const auto status_code = DispatchExceptionCatchAll(true);
printf(" --> error %-5d\n", static_cast<int>(status_code));
}
auto tuning_results = Configuration();
tuning_results["XGEMM_MIN_INDIRECT_SIZE"] = limit;
tuning_results["PRECISION"] = static_cast<size_t>(PrecisionValue<T>());
scores.push_back(TuningResult{"gemm_kernel_selection_single_size", time_ms, tuning_results});
}
// Outputs the results as JSON to disk, including some meta-data
const auto precision_string = std::to_string(static_cast<size_t>(PrecisionValue<T>()));
auto metadata = std::vector<std::pair<std::string,std::string>>{
{"kernel_family", "gemm_routine_single_size"},
{"precision", precision_string},
{"arg_m", ToString(m)},
{"arg_n", ToString(n)},
{"arg_k", ToString(k)},
};
PrintTimingsToFileAsJSON("clblast_gemm_routine_single_size_" + precision_string + ".json",
device, platform, metadata, scores);
}
// =================================================================================================
@ -91,6 +145,9 @@ void TuneXgemm(int argc, char* argv[]) {
const auto device_id = GetArgument(command_line_args, help, kArgDevice, ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0}));
const auto precision = GetArgument(command_line_args, help, kArgPrecision, Precision::kSingle);
const auto num_runs = GetArgument(command_line_args, help, kArgNumRuns, size_t{10});
const auto arg_m = GetArgument(command_line_args, help, kArgM, -1); // optional
const auto arg_n = GetArgument(command_line_args, help, kArgN, -1); // optional
const auto arg_k = GetArgument(command_line_args, help, kArgK, -1); // optional
fprintf(stdout, "%s\n", help.c_str());
// OpenCL initialisation
@ -119,16 +176,29 @@ void TuneXgemm(int argc, char* argv[]) {
}
}
// Run the tuners for the XGEMM routines
TuneKernelSelection<T>(platform, device, context, queue, precision, RunGemmRoutine<T>,
64, 2048, 64, 1, num_runs,
"gemm", "GemmRoutine", "gemm_routine", "XGEMM_MIN_INDIRECT_SIZE");
//TuneKernelSelection<T>(platform, device, context, queue, precision, RunGemmBatchedRoutine<T, 30>,
// 16, 128, 32, 30, num_runs,
// "gemmbatched", "GemmRoutine", "gemm_routine_2", "XGEMMBATCHED_MIN_INDIRECT_SIZE");
//TuneKernelSelection<T>(platform, device, context, queue, precision, RunGemmStridedBatchedRoutine<T, 30>,
// 16, 128, 32, 30, num_runs,
// "gemmstridedbatched", "GemmRoutine", "gemm_routine_3", "XGEMMSTRIDEDBATCHED_MIN_INDIRECT_SIZE");
// Test for only one m/n/k size
if (arg_m != -1 || arg_n != -1 || arg_k != -1) {
printf("* Tuning for one specific size: m=%d, n=%d, k=%d\n", arg_m, arg_n, arg_k);
if (arg_m == -1 || arg_n == -1 || arg_k == -1) {
printf("* Error: If one of m/n/k specified, please specify all three\n");
return;
}
TuneGemmSingleSize<T>(platform, device, context, queue, static_cast<size_t>(arg_m),
static_cast<size_t>(arg_n), static_cast<size_t>(arg_k), num_runs);
}
else {
// Run the tuners for the XGEMM routines
TuneKernelSelection<T>(platform, device, context, queue, precision, RunGemmRoutine<T>,
64, 2048, 64, 1, num_runs,
"gemm", "GemmRoutine", "gemm_routine", "XGEMM_MIN_INDIRECT_SIZE");
//TuneKernelSelection<T>(platform, device, context, queue, precision, RunGemmBatchedRoutine<T, 30>,
// 16, 128, 32, 30, num_runs,
// "gemmbatched", "GemmRoutine", "gemm_routine_2", "XGEMMBATCHED_MIN_INDIRECT_SIZE");
//TuneKernelSelection<T>(platform, device, context, queue, precision, RunGemmStridedBatchedRoutine<T, 30>,
// 16, 128, 32, 30, num_runs,
// "gemmstridedbatched", "GemmRoutine", "gemm_routine_3", "XGEMMSTRIDEDBATCHED_MIN_INDIRECT_SIZE");
}
printf("* Completed tuning process\n");
printf("\n");

View File

@ -122,6 +122,7 @@ constexpr auto kArgHelp = "h";
constexpr auto kArgQuiet = "q";
constexpr auto kArgNoAbbreviations = "no_abbrv";
constexpr auto kArgNumRuns = "runs";
constexpr auto kArgFullStatistics = "full_statistics";
// The buffer names
constexpr auto kBufVecX = "X";
@ -245,6 +246,7 @@ struct Arguments {
size_t num_steps = 0;
size_t num_runs = 10;
std::vector<std::string> tuner_files = {};
bool full_statistics = false;
#ifdef CLBLAST_REF_CUBLAS
void* cublas_handle; // cublasHandle_t
#endif

View File

@ -17,6 +17,7 @@
#include <algorithm>
#include <chrono>
#include <random>
#include <tuning/tuning.hpp>
#include "utilities/utilities.hpp"
#include "test/performance/client.hpp"
@ -145,6 +146,7 @@ Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const size_t le
args.print_help = CheckArgument(command_line_args, help, kArgHelp);
args.silent = CheckArgument(command_line_args, help, kArgQuiet);
args.no_abbrv = CheckArgument(command_line_args, help, kArgNoAbbreviations);
args.full_statistics= CheckArgument(command_line_args, help, kArgFullStatistics);
warm_up_ = CheckArgument(command_line_args, help, kArgWarmUp);
// Parse the optional JSON file name arguments
@ -253,32 +255,32 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
auto buffers = Buffers<T>{x_vec, y_vec, a_mat, b_mat, c_mat, ap_mat, scalar};
// Runs the routines and collects the timings
auto timings = std::vector<std::pair<std::string, double>>();
auto ms_clblast = TimedExecution(args.num_runs, args, buffers, queue, run_routine_, "CLBlast");
timings.push_back(std::pair<std::string, double>("CLBlast", ms_clblast));
auto timings = std::vector<std::pair<std::string, TimeResult>>();
auto time_clblast = TimedExecution(args.num_runs, args, buffers, queue, run_routine_, "CLBlast");
timings.push_back(std::pair<std::string, TimeResult>("CLBlast", time_clblast));
if (args.compare_clblas) {
auto ms_clblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference1_, "clBLAS");
timings.push_back(std::pair<std::string, double>("clBLAS", ms_clblas));
auto time_clblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference1_, "clBLAS");
timings.push_back(std::pair<std::string, TimeResult>("clBLAS", time_clblas));
}
if (args.compare_cblas) {
auto buffers_host = BuffersHost<T>();
DeviceToHost(args, buffers, buffers_host, queue, buffers_in_);
auto ms_cblas = TimedExecution(args.num_runs, args, buffers_host, queue, run_reference2_, "CPU BLAS");
auto time_cblas = TimedExecution(args.num_runs, args, buffers_host, queue, run_reference2_, "CPU BLAS");
HostToDevice(args, buffers, buffers_host, queue, buffers_out_);
timings.push_back(std::pair<std::string, double>("CPU BLAS", ms_cblas));
timings.push_back(std::pair<std::string, TimeResult>("CPU BLAS", time_cblas));
}
if (args.compare_cublas) {
auto buffers_host = BuffersHost<T>();
auto buffers_cuda = BuffersCUDA<T>();
DeviceToHost(args, buffers, buffers_host, queue, buffers_in_);
HostToCUDA(args, buffers_cuda, buffers_host, buffers_in_);
auto ms_cublas = 0.0;
TimeResult time_cublas;
try {
ms_cublas = TimedExecution(args.num_runs, args, buffers_cuda, queue, run_reference3_, "cuBLAS");
time_cublas = TimedExecution(args.num_runs, args, buffers_cuda, queue, run_reference3_, "cuBLAS");
} catch (std::runtime_error e) { }
CUDAToHost(args, buffers_cuda, buffers_host, buffers_out_);
HostToDevice(args, buffers, buffers_host, queue, buffers_out_);
timings.push_back(std::pair<std::string, double>("cuBLAS", ms_cublas));
timings.push_back(std::pair<std::string, TimeResult>("cuBLAS", time_cublas));
}
// Prints the performance of the tested libraries
@ -311,9 +313,9 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
// value found in the vector of timing results. The return value is in milliseconds.
template <typename T, typename U>
template <typename BufferType, typename RoutineType>
double Client<T,U>::TimedExecution(const size_t num_runs, const Arguments<U> &args,
BufferType &buffers, Queue &queue,
RoutineType run_blas, const std::string &library_name) {
typename Client<T,U>::TimeResult Client<T,U>::TimedExecution(const size_t num_runs, const Arguments<U> &args,
BufferType &buffers, Queue &queue,
RoutineType run_blas, const std::string &library_name) {
auto status = StatusCode::kSuccess;
// Do an optional warm-up to omit compilation times and initialisations from the measurements
@ -343,7 +345,19 @@ double Client<T,U>::TimedExecution(const size_t num_runs, const Arguments<U> &ar
auto elapsed_time = std::chrono::steady_clock::now() - start_time;
timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
}
return *std::min_element(timings.begin(), timings.end());
// Compute statistics
auto result = TimeResult();
const auto sum = std::accumulate(timings.begin(), timings.end(), 0.0);
const auto mean = sum / timings.size();
std::vector<double> diff(timings.size());
std::transform(timings.begin(), timings.end(), diff.begin(), [mean](double x) { return x - mean; });
const auto sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
result.mean = mean;
result.standard_deviation = std::sqrt(sq_sum / timings.size());
result.minimum = *std::min_element(timings.begin(), timings.end());
result.maximum = *std::max_element(timings.begin(), timings.end());
return result;
}
// =================================================================================================
@ -355,26 +369,42 @@ void Client<T,U>::PrintTableHeader(const Arguments<U>& args) {
// First line (optional)
if (!args.silent) {
for (auto i=size_t{0}; i<options_.size(); ++i) { fprintf(stdout, "%9s ", ""); }
fprintf(stdout, " | <-- CLBlast -->");
if (args.compare_clblas) { fprintf(stdout, " | <-- clBLAS -->"); }
if (args.compare_cblas) { fprintf(stdout, " | <-- CPU BLAS -->"); }
if (args.compare_cublas) { fprintf(stdout, " | <-- cuBLAS -->"); }
if (args.full_statistics) {
fprintf(stdout, " | <-- CLBlast -->");
if (args.compare_clblas) { fprintf(stdout, " | <-- clBLAS -->"); }
if (args.compare_cblas) { fprintf(stdout, " | <-- CPU BLAS -->"); }
if (args.compare_cublas) { fprintf(stdout, " | <-- cuBLAS -->"); }
}
else {
fprintf(stdout, " | <-- CLBlast -->");
if (args.compare_clblas) { fprintf(stdout, " | <-- clBLAS -->"); }
if (args.compare_cblas) { fprintf(stdout, " | <-- CPU BLAS -->"); }
if (args.compare_cublas) { fprintf(stdout, " | <-- cuBLAS -->"); }
}
fprintf(stdout, " |\n");
}
// Second line
for (auto &option: options_) { fprintf(stdout, "%9s;", option.c_str()); }
fprintf(stdout, "%9s;%9s;%9s", "ms_1", "GFLOPS_1", "GBs_1");
if (args.compare_clblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_2", "GFLOPS_2", "GBs_2"); }
if (args.compare_cblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_3", "GFLOPS_3", "GBs_3"); }
if (args.compare_cublas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_4", "GFLOPS_4", "GBs_4"); }
if (args.full_statistics) {
fprintf(stdout, "%9s;%9s;%9s;%9s", "min_ms_1", "max_ms_1", "mean_1", "stddev_1");
if (args.compare_clblas) { fprintf(stdout, ";%9s;%9s;%9s;%9s", "min_ms_2", "max_ms_2", "mean_2", "stddev_2"); }
if (args.compare_cblas) { fprintf(stdout, ";%9s;%9s;%9s;%9s", "min_ms_3", "max_ms_3", "mean_3", "stddev_3"); }
if (args.compare_cublas) { fprintf(stdout, ";%9s;%9s;%9s;%9s", "min_ms_4", "max_ms_4", "mean_4", "stddev_4"); }
}
else {
fprintf(stdout, "%9s;%9s;%9s", "ms_1", "GFLOPS_1", "GBs_1");
if (args.compare_clblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_2", "GFLOPS_2", "GBs_2"); }
if (args.compare_cblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_3", "GFLOPS_3", "GBs_3"); }
if (args.compare_cublas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_4", "GFLOPS_4", "GBs_4"); }
}
fprintf(stdout, "\n");
}
// Print a performance-result row
template <typename T, typename U>
void Client<T,U>::PrintTableRow(const Arguments<U>& args,
const std::vector<std::pair<std::string, double>>& timings) {
const std::vector<std::pair<std::string, TimeResult>>& timings) {
// Creates a vector of relevant variables
auto integers = std::vector<size_t>{};
@ -443,16 +473,26 @@ void Client<T,U>::PrintTableRow(const Arguments<U>& args,
// Loops over all tested libraries
for (const auto& timing : timings) {
const auto library_name = timing.first;
const auto minimum_ms = timing.second.minimum;
if (library_name != "CLBlast") { fprintf(stdout, ";"); }
// Computes the GFLOPS and GB/s metrics
auto flops = get_flops_(args);
auto bytes = get_bytes_(args);
auto gflops = (timing.second != 0.0) ? (flops*1e-6)/timing.second : 0;
auto gbs = (timing.second != 0.0) ? (bytes*1e-6)/timing.second : 0;
// Either output full statistics
if (args.full_statistics) {
const auto maximum_ms = timing.second.maximum;
const auto mean_ms = timing.second.mean;
const auto standard_deviation = timing.second.standard_deviation;
fprintf(stdout, "%9.3lf;%9.3lf;%9.3lf;%9.3lf", minimum_ms, maximum_ms, mean_ms, standard_deviation);
}
// Outputs the performance numbers
if (timing.first != "CLBlast") { fprintf(stdout, ";"); }
fprintf(stdout, "%9.2lf;%9.1lf;%9.1lf", timing.second, gflops, gbs);
// ... or outputs minimum time and the GFLOPS and GB/s metrics
else {
const auto flops = get_flops_(args);
const auto bytes = get_bytes_(args);
const auto gflops = (minimum_ms != 0.0) ? (flops*1e-6)/minimum_ms : 0;
const auto gbs = (minimum_ms != 0.0) ? (bytes*1e-6)/minimum_ms : 0;
fprintf(stdout, "%9.2lf;%9.1lf;%9.1lf", minimum_ms, gflops, gbs);
}
}
fprintf(stdout, "\n");
}

View File

@ -42,6 +42,7 @@ template <typename T, typename U>
class Client {
public:
static const int kSeed;
struct TimeResult { double minimum; double maximum; double mean; double standard_deviation; };
// Shorthand for the routine-specific functions passed to the tester
using Routine = std::function<StatusCode(const Arguments<U>&, Buffers<T>&, Queue&)>;
@ -72,15 +73,15 @@ class Client {
// Runs a function a given number of times and returns the execution time of the shortest instance
template <typename BufferType, typename RoutineType>
double TimedExecution(const size_t num_runs, const Arguments<U> &args, BufferType &buffers,
Queue &queue, RoutineType run_blas, const std::string &library_name);
TimeResult TimedExecution(const size_t num_runs, const Arguments<U> &args, BufferType &buffers,
Queue &queue, RoutineType run_blas, const std::string &library_name);
// Prints the header of a performance-data table
void PrintTableHeader(const Arguments<U>& args);
// Prints a row of performance data, including results of two libraries
void PrintTableRow(const Arguments<U>& args,
const std::vector<std::pair<std::string, double>>& timings);
const std::vector<std::pair<std::string, TimeResult>>& timings);
// The routine-specific functions passed to the tester
const Routine run_routine_;

View File

@ -17,7 +17,11 @@
extern "C"
{
#include <cblas.h>
#ifdef CLBLAST_REF_CBLAS_MKL
#include <mkl_cblas.h>
#else
#include <cblas.h>
#endif
}
#include "utilities/utilities.hpp"