Merge branch 'master' into CLBlast-267-convgemm
commit
1c9a741470
|
@ -57,8 +57,8 @@ build_script:
|
|||
|
||||
after_build:
|
||||
- ps: pushd $env:CLBLAST_BUILD
|
||||
- 7z a CLBlast-1.3.0-Windows-x64.zip .\install_dir\*
|
||||
- ps: mv CLBlast-1.3.0-Windows-x64.zip $env:APPVEYOR_BUILD_FOLDER
|
||||
- 7z a CLBlast-1.4.0-Windows-x64.zip .\install_dir\*
|
||||
- ps: mv CLBlast-1.4.0-Windows-x64.zip $env:APPVEYOR_BUILD_FOLDER
|
||||
|
||||
artifacts:
|
||||
- path: '*.zip'
|
||||
|
|
|
@ -21,7 +21,7 @@ matrix:
|
|||
|
||||
env:
|
||||
global:
|
||||
- CLBLAST_VERSION=1.3.0
|
||||
- CLBLAST_VERSION=1.4.0
|
||||
- CLBLAST_ROOT=${TRAVIS_BUILD_DIR}/bin/clblast
|
||||
- CLBLAST_INSTALL=${TRAVIS_BUILD_DIR}/bin/CLBlast-${CLBLAST_VERSION}
|
||||
- CLBLAST_TAR=CLBlast-${CLBLAST_VERSION}-${TRAVIS_OS_NAME}-x64.tar.gz
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
|
||||
Development (next version)
|
||||
Version 1.4.0
|
||||
- Added Python interface to CLBlast 'PyCLBlast'
|
||||
- Added CLBlast to Ubuntu PPA and macOS Homebrew package managers
|
||||
- Added an API to run the tuners programmatically without any I/O
|
||||
|
@ -8,7 +8,8 @@ Development (next version)
|
|||
- Re-added a local memory size constraint to the tuners
|
||||
- The routine tuners now automatically pick up tuning results from disk from the kernel tuners
|
||||
- Updated and reorganised the CLBlast documentation
|
||||
- Added a 'canary' region to check for overflows in the tuner and tests (insipred by clARMOR)
|
||||
- Added a 'canary' region to check for overflows in the tuner and tests (inspired by clARMOR)
|
||||
- Added an option to test against and compare performance with Intel's MKL
|
||||
- Fixed an access violation when compiled with Visual Studio upon releasing the OpenCL program
|
||||
- Fixed incorrect releasing of the OpenCL program resulting in segfaults / access violations
|
||||
- Various minor fixes and enhancements
|
||||
|
|
|
@ -21,7 +21,7 @@ endif()
|
|||
# CMake project details
|
||||
project("clblast" C CXX)
|
||||
set(clblast_VERSION_MAJOR 1)
|
||||
set(clblast_VERSION_MINOR 3)
|
||||
set(clblast_VERSION_MINOR 4)
|
||||
set(clblast_VERSION_PATCH 0)
|
||||
set(clblast_VERSION "${clblast_VERSION_MAJOR}.${clblast_VERSION_MINOR}.${clblast_VERSION_PATCH}")
|
||||
set(clblast_SOVERSION ${clblast_VERSION_MAJOR})
|
||||
|
@ -170,16 +170,17 @@ if(${CMAKE_SYSTEM_NAME} STREQUAL Android)
|
|||
else()
|
||||
|
||||
# Locates the reference BLAS libraries in case the tests need to be compiled. The "FindclBLAS.cmake",
|
||||
# "FindCBLAS.cmake" and "FindcuBLAS.cmake" are included.
|
||||
# "FindCBLAS.cmake", "FindMKL.cmake", and "FindcuBLAS.cmake" are included.
|
||||
if(CLIENTS OR TESTS)
|
||||
find_package(CBLAS)
|
||||
find_package(MKL)
|
||||
if(OPENCL)
|
||||
find_package(clBLAS)
|
||||
endif()
|
||||
if(CUBLAS)
|
||||
find_package(cuBLAS)
|
||||
endif()
|
||||
if(NOT CLBLAS_FOUND AND NOT CBLAS_FOUND)
|
||||
if(NOT CLBLAS_FOUND AND NOT CBLAS_FOUND AND NOT MKL_FOUND)
|
||||
if(TESTS)
|
||||
message(STATUS "Could NOT find clBLAS nor a CPU BLAS, disabling the compilation of the tests")
|
||||
set(TESTS OFF)
|
||||
|
@ -423,12 +424,14 @@ if(TUNERS)
|
|||
target_include_directories(clblast_tuner_${KERNEL} PUBLIC $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES> ${API_INCLUDE_DIRS})
|
||||
install(TARGETS clblast_tuner_${KERNEL} DESTINATION bin)
|
||||
endforeach()
|
||||
foreach(ROUTINE_TUNER ${ROUTINE_TUNERS})
|
||||
add_executable(clblast_tuner_routine_${ROUTINE_TUNER} ${TUNERS_COMMON} src/tuning/routines/${ROUTINE_TUNER}.cpp test/test_utilities.cpp)
|
||||
target_link_libraries(clblast_tuner_routine_${ROUTINE_TUNER} clblast)
|
||||
target_include_directories(clblast_tuner_routine_${ROUTINE_TUNER} PUBLIC $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES> ${API_INCLUDE_DIRS} ${clblast_SOURCE_DIR})
|
||||
install(TARGETS clblast_tuner_routine_${ROUTINE_TUNER} DESTINATION bin)
|
||||
endforeach()
|
||||
if(OPENCL)
|
||||
foreach(ROUTINE_TUNER ${ROUTINE_TUNERS})
|
||||
add_executable(clblast_tuner_routine_${ROUTINE_TUNER} ${TUNERS_COMMON} src/tuning/routines/${ROUTINE_TUNER}.cpp test/test_utilities.cpp)
|
||||
target_link_libraries(clblast_tuner_routine_${ROUTINE_TUNER} clblast)
|
||||
target_include_directories(clblast_tuner_routine_${ROUTINE_TUNER} PUBLIC $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES> ${API_INCLUDE_DIRS} ${clblast_SOURCE_DIR})
|
||||
install(TARGETS clblast_tuner_routine_${ROUTINE_TUNER} DESTINATION bin)
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
# Adds 'alltuners' target: runs all tuners for all precisions
|
||||
set(ALLTUNERS )
|
||||
|
@ -439,12 +442,14 @@ if(TUNERS)
|
|||
endforeach()
|
||||
set(ALLTUNERSDEPENDS clblast_tuner_${KERNEL})
|
||||
endforeach()
|
||||
foreach(ROUTINE_TUNER ${ROUTINE_TUNERS})
|
||||
foreach(PRECISION ${PRECISIONS})
|
||||
set(ALLTUNERS ${ALLTUNERS} COMMAND clblast_tuner_routine_${ROUTINE_TUNER} -precision ${PRECISION})
|
||||
if(OPENCL)
|
||||
foreach(ROUTINE_TUNER ${ROUTINE_TUNERS})
|
||||
foreach(PRECISION ${PRECISIONS})
|
||||
set(ALLTUNERS ${ALLTUNERS} COMMAND clblast_tuner_routine_${ROUTINE_TUNER} -precision ${PRECISION})
|
||||
endforeach()
|
||||
set(ALLTUNERSDEPENDS clblast_tuner_routine_${ROUTINE_TUNER})
|
||||
endforeach()
|
||||
set(ALLTUNERSDEPENDS clblast_tuner_routine_${ROUTINE_TUNER})
|
||||
endforeach()
|
||||
endif()
|
||||
add_custom_target(alltuners ${ALLTUNERS} DEPENDS ${ALLTUNERSDEPENDS})
|
||||
|
||||
endif()
|
||||
|
@ -468,9 +473,19 @@ if(CLIENTS OR TESTS)
|
|||
add_definitions(" -DCLBLAST_REF_CLBLAS")
|
||||
endif()
|
||||
endif()
|
||||
if(CBLAS_FOUND)
|
||||
set(REF_INCLUDES ${REF_INCLUDES} ${CBLAS_INCLUDE_DIRS})
|
||||
set(REF_LIBRARIES ${REF_LIBRARIES} ${CBLAS_LIBRARIES})
|
||||
if(CBLAS_FOUND OR MKL_FOUND)
|
||||
if(MKL_FOUND) # prefers MKL over another CBLAS implementation
|
||||
set(REF_INCLUDES ${REF_INCLUDES} ${MKL_INCLUDE_DIRS})
|
||||
set(REF_LIBRARIES ${REF_LIBRARIES} ${MKL_LIBRARIES})
|
||||
if(MSVC)
|
||||
add_definitions(" /DCLBLAST_REF_CBLAS_MKL")
|
||||
else()
|
||||
add_definitions(" -DCLBLAST_REF_CBLAS_MKL")
|
||||
endif()
|
||||
else()
|
||||
set(REF_INCLUDES ${REF_INCLUDES} ${CBLAS_INCLUDE_DIRS})
|
||||
set(REF_LIBRARIES ${REF_LIBRARIES} ${CBLAS_LIBRARIES})
|
||||
endif()
|
||||
set(WRAPPERS ${WRAPPERS} test/wrapper_cblas.hpp)
|
||||
if(MSVC)
|
||||
add_definitions(" /DCLBLAST_REF_CBLAS")
|
||||
|
|
|
@ -2,9 +2,9 @@
|
|||
CLBlast: The tuned OpenCL BLAS library
|
||||
================
|
||||
|
||||
| | Build status | Tests on Intel CPU | Tests on NVIDIA GPU | Tests on Intel GPU |
|
||||
| | Build status | Tests on Intel CPU | Tests on NVIDIA GPU | Other tests |
|
||||
|-----|-----|-----|-----|-----|
|
||||
| Windows | [![Build Status](https://ci.appveyor.com/api/projects/status/github/cnugteren/clblast?branch=master&svg=true)](https://ci.appveyor.com/project/CNugteren/clblast) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-windows-intel-i7-4790k.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-windows-intel-i7-4790k) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-windows-nvidia-k5000.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-windows-nvidia-k5000) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-windows-intel-HD4600.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-windows-intel-HD4600) |
|
||||
| Windows | [![Build Status](https://ci.appveyor.com/api/projects/status/github/cnugteren/clblast?branch=master&svg=true)](https://ci.appveyor.com/project/CNugteren/clblast) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-windows-intel-i7-4790k.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-windows-intel-i7-4790k) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-windows-nvidia-k5000.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-windows-nvidia-k5000) | N/A |
|
||||
| Linux | [![Build Status](https://travis-ci.org/CNugteren/CLBlast.svg?branch=master)](https://travis-ci.org/CNugteren/CLBlast/branches) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-linux-intel-e5-2620-v4.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-linux-intel-e5-2620-v4) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-linux-nvidia-k80.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-linux-nvidia-k80) | N/A |
|
||||
| OS X | [![Build Status](https://travis-ci.org/CNugteren/CLBlast.svg?branch=master)](https://travis-ci.org/CNugteren/CLBlast/branches) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-osx-intel-i5-4278U.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-osx-intel-i5-4278U) | N/A | N/A |
|
||||
|
||||
|
|
|
@ -18,8 +18,8 @@ This file gives an overview of the main features planned for addition to CLBlast
|
|||
| [#223](https://github.com/CNugteren/CLBlast/issues/223) | Feb '18 | CNugteren | ✔ | Python OpenCL interface |
|
||||
| [#237](https://github.com/CNugteren/CLBlast/issues/237) | Mar '18 | CNugteren | ✔ | Making tuning possible from the CLBlast API |
|
||||
| [#228](https://github.com/CNugteren/CLBlast/issues/228) | Mar-Apr '18 | CNugteren | ✔ | Improving performance for Qualcomm Adreno GPUs |
|
||||
| [#267](https://github.com/CNugteren/CLBlast/issues/267) | May '18 | CNugteren | | Merge im2col and GEMM into a direct kernel |
|
||||
| [#270](https://github.com/CNugteren/CLBlast/issues/270) | July '18 | CNugteren | | Implement col2im |
|
||||
| - | July '18 | CNugteren | | Add a SYCL interface to the library |
|
||||
| [#267](https://github.com/CNugteren/CLBlast/issues/267) | July '18 | CNugteren | | Merge im2col and GEMM into a direct kernel |
|
||||
| [#270](https://github.com/CNugteren/CLBlast/issues/270) | Aug '18 | CNugteren | | Implement col2im |
|
||||
| - | Aug '18 | CNugteren | | Add a SYCL interface to the library |
|
||||
| [#136](https://github.com/CNugteren/CLBlast/issues/136) | ?? | CNugteren | | Implement xAXPBY and xSET |
|
||||
| [#169](https://github.com/CNugteren/CLBlast/issues/169) | ?? | dividiti | | Problem-specific tuning parameter selection |
|
||||
|
|
|
@ -48,7 +48,7 @@ mark_as_advanced(CBLAS_INCLUDE_DIRS)
|
|||
|
||||
# Finds the library
|
||||
find_library(CBLAS_LIBRARIES
|
||||
NAMES cblas blas mkl blis openblas accelerate
|
||||
NAMES cblas blas blis openblas accelerate
|
||||
HINTS ${CBLAS_HINTS}
|
||||
PATH_SUFFIXES
|
||||
lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 lib/import lib64/import
|
||||
|
|
|
@ -0,0 +1,72 @@
|
|||
|
||||
# ==================================================================================================
|
||||
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
# project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
# width of 100 characters per line.
|
||||
#
|
||||
# Author(s):
|
||||
# Cedric Nugteren <www.cedricnugteren.nl>
|
||||
#
|
||||
# ==================================================================================================
|
||||
#
|
||||
# Defines the following variables:
|
||||
# MKL_FOUND Boolean holding whether or not the Intel MKL BLAS library was found
|
||||
# MKL_INCLUDE_DIRS The Intel MKL BLAS include directory
|
||||
# MKL_LIBRARIES The Intel MKL BLAS library
|
||||
#
|
||||
# In case MKL is not installed in the default directory, set the MKL_ROOT variable to point to
|
||||
# the root of MKL, such that 'mkl_cblas.h' can be found in $MKL_ROOT/include. This can either be
|
||||
# done using an environmental variable (e.g. export MKL_ROOT=/path/to/MKL) or using a CMake
|
||||
# variable (e.g. cmake -DMKL_ROOT=/path/to/MKL ..).
|
||||
#
|
||||
# ==================================================================================================
|
||||
|
||||
# Sets the possible install locations
|
||||
set(MKL_HINTS
|
||||
${MKL_ROOT}
|
||||
$ENV{MKL_ROOT}
|
||||
)
|
||||
set(MKL_PATHS
|
||||
/usr
|
||||
/usr/local
|
||||
/usr/local/opt
|
||||
/usr/local/mkl
|
||||
/opt/intel
|
||||
/opt/intel/mkl
|
||||
)
|
||||
|
||||
# Finds the include directories
|
||||
find_path(MKL_INCLUDE_DIRS
|
||||
NAMES mkl_cblas.h
|
||||
HINTS ${MKL_HINTS}
|
||||
PATH_SUFFIXES
|
||||
include inc include/x86_64 include/x64
|
||||
PATHS ${MKL_PATHS}
|
||||
DOC "Intel MKL CBLAS include header mkl_cblas.h"
|
||||
)
|
||||
mark_as_advanced(MKL_INCLUDE_DIRS)
|
||||
|
||||
# Finds the libraries
|
||||
set(MKL_LIB_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 lib/import lib64/import lib/intel64)
|
||||
find_library(MKL_LIBRARIES_LP64 NAMES mkl_intel_lp64 HINTS ${MKL_HINTS} PATH_SUFFIXES ${MKL_LIB_SUFFIXES} PATHS ${MKL_PATHS} DOC "Intel MKL lp64 library")
|
||||
find_library(MKL_LIBRARIES_THREAD NAMES mkl_intel_thread HINTS ${MKL_HINTS} PATH_SUFFIXES ${MKL_LIB_SUFFIXES} PATHS ${MKL_PATHS} DOC "Intel MKL thread library")
|
||||
find_library(MKL_LIBRARIES_CORE NAMES mkl_core HINTS ${MKL_HINTS} PATH_SUFFIXES ${MKL_LIB_SUFFIXES} PATHS ${MKL_PATHS} DOC "Intel MKL core library")
|
||||
find_library(MKL_LIBRARIES_OMP NAMES iomp5 HINTS ${MKL_HINTS} PATH_SUFFIXES ${MKL_LIB_SUFFIXES} PATHS ${MKL_PATHS} DOC "Intel OpenMP library")
|
||||
set(MKL_LIBRARIES ${MKL_LIBRARIES_LP64} ${MKL_LIBRARIES_THREAD} ${MKL_LIBRARIES_CORE} ${MKL_LIBRARIES_OMP})
|
||||
mark_as_advanced(MKL_LIBRARIES)
|
||||
|
||||
# ==================================================================================================
|
||||
|
||||
# Notification messages
|
||||
if(NOT MKL_INCLUDE_DIRS)
|
||||
message(STATUS "Could NOT find 'mkl_cblas.h', install MKL or set MKL_ROOT")
|
||||
endif()
|
||||
if(NOT MKL_LIBRARIES)
|
||||
message(STATUS "Could NOT find the Intel MKL BLAS library, install it or set MKL_ROOT")
|
||||
endif()
|
||||
|
||||
# Determines whether or not MKL was found
|
||||
include(FindPackageHandleStandardArgs)
|
||||
find_package_handle_standard_args(MKL DEFAULT_MSG MKL_INCLUDE_DIRS MKL_LIBRARIES)
|
||||
|
||||
# ==================================================================================================
|
|
@ -84,6 +84,74 @@ The kernels `gemm` and `gemm_direct` have too many parameters to explore. Theref
|
|||
|
||||
There are also several routine-level tuners. They tune inter-kernel parameters and should only be run after the kernels are tuned. However, they do automatically pick up kernel tuning results from the current folder if there are any. An example is the GEMM routine tuner, which determines when to use the direct or the in-direct GEMM kernel.
|
||||
|
||||
Here are all the tuners included in the `make alltuners` target (in the same order) with all their precision arguments:
|
||||
|
||||
./clblast_tuner_copy_fast -precision 32
|
||||
./clblast_tuner_copy_fast -precision 64
|
||||
./clblast_tuner_copy_fast -precision 3232
|
||||
./clblast_tuner_copy_fast -precision 6464
|
||||
./clblast_tuner_copy_fast -precision 16
|
||||
./clblast_tuner_copy_pad -precision 32
|
||||
./clblast_tuner_copy_pad -precision 64
|
||||
./clblast_tuner_copy_pad -precision 3232
|
||||
./clblast_tuner_copy_pad -precision 6464
|
||||
./clblast_tuner_copy_pad -precision 16
|
||||
./clblast_tuner_transpose_fast -precision 32
|
||||
./clblast_tuner_transpose_fast -precision 64
|
||||
./clblast_tuner_transpose_fast -precision 3232
|
||||
./clblast_tuner_transpose_fast -precision 6464
|
||||
./clblast_tuner_transpose_fast -precision 16
|
||||
./clblast_tuner_transpose_pad -precision 32
|
||||
./clblast_tuner_transpose_pad -precision 64
|
||||
./clblast_tuner_transpose_pad -precision 3232
|
||||
./clblast_tuner_transpose_pad -precision 6464
|
||||
./clblast_tuner_transpose_pad -precision 16
|
||||
./clblast_tuner_xaxpy -precision 32
|
||||
./clblast_tuner_xaxpy -precision 64
|
||||
./clblast_tuner_xaxpy -precision 3232
|
||||
./clblast_tuner_xaxpy -precision 6464
|
||||
./clblast_tuner_xaxpy -precision 16
|
||||
./clblast_tuner_xdot -precision 32
|
||||
./clblast_tuner_xdot -precision 64
|
||||
./clblast_tuner_xdot -precision 3232
|
||||
./clblast_tuner_xdot -precision 6464
|
||||
./clblast_tuner_xdot -precision 16
|
||||
./clblast_tuner_xger -precision 32
|
||||
./clblast_tuner_xger -precision 64
|
||||
./clblast_tuner_xger -precision 3232
|
||||
./clblast_tuner_xger -precision 6464
|
||||
./clblast_tuner_xger -precision 16
|
||||
./clblast_tuner_xgemm -precision 32
|
||||
./clblast_tuner_xgemm -precision 64
|
||||
./clblast_tuner_xgemm -precision 3232
|
||||
./clblast_tuner_xgemm -precision 6464
|
||||
./clblast_tuner_xgemm -precision 16
|
||||
./clblast_tuner_xgemm_direct -precision 32
|
||||
./clblast_tuner_xgemm_direct -precision 64
|
||||
./clblast_tuner_xgemm_direct -precision 3232
|
||||
./clblast_tuner_xgemm_direct -precision 6464
|
||||
./clblast_tuner_xgemm_direct -precision 16
|
||||
./clblast_tuner_xgemv -precision 32
|
||||
./clblast_tuner_xgemv -precision 64
|
||||
./clblast_tuner_xgemv -precision 3232
|
||||
./clblast_tuner_xgemv -precision 6464
|
||||
./clblast_tuner_xgemv -precision 16
|
||||
./clblast_tuner_invert -precision 32
|
||||
./clblast_tuner_invert -precision 64
|
||||
./clblast_tuner_invert -precision 3232
|
||||
./clblast_tuner_invert -precision 6464
|
||||
./clblast_tuner_invert -precision 16
|
||||
./clblast_tuner_routine_xgemm -precision 32
|
||||
./clblast_tuner_routine_xgemm -precision 64
|
||||
./clblast_tuner_routine_xgemm -precision 3232
|
||||
./clblast_tuner_routine_xgemm -precision 6464
|
||||
./clblast_tuner_routine_xgemm -precision 16
|
||||
./clblast_tuner_routine_xtrsv -precision 32
|
||||
./clblast_tuner_routine_xtrsv -precision 64
|
||||
./clblast_tuner_routine_xtrsv -precision 3232
|
||||
./clblast_tuner_routine_xtrsv -precision 6464
|
||||
./clblast_tuner_routine_xtrsv -precision 16
|
||||
|
||||
|
||||
Using the tuning results
|
||||
-------------
|
||||
|
|
|
@ -678,8 +678,8 @@ public:
|
|||
}
|
||||
|
||||
// Regular constructor with memory management
|
||||
explicit Kernel(const Program &program, const std::string &name): name_(name) {
|
||||
CheckError(cuModuleGetFunction(&kernel_, program.GetModule(), name.c_str()));
|
||||
explicit Kernel(const std::shared_ptr<Program> program, const std::string &name): name_(name) {
|
||||
CheckError(cuModuleGetFunction(&kernel_, program->GetModule(), name.c_str()));
|
||||
}
|
||||
|
||||
// Sets a kernel argument at the indicated position. This stores both the value of the argument
|
||||
|
|
|
@ -41,7 +41,7 @@ const DatabaseEntry XgerApple = {
|
|||
"Xger", Precision::kAny, {"WGS1", "WGS2", "WPT"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 64, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } } } } }
|
||||
};
|
||||
const DatabaseEntry XtrsvApple = {
|
||||
"Xtrsv", Precision::kAny, {"TRSV_BLOCK_SIZE"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } } } } }
|
||||
"Xtrsv", Precision::kAny, {"TRSV_BLOCK_SIZE"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } } } } }
|
||||
};
|
||||
const DatabaseEntry XgemmApple = {
|
||||
"Xgemm", Precision::kAny, {"GEMMK", "KREG", "KWG", "KWI", "MDIMA", "MDIMC", "MWG", "NDIMB", "NDIMC", "NWG", "SA", "SB", "STRM", "STRN", "VWM", "VWN"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1 } } } } } } }
|
||||
|
@ -62,7 +62,10 @@ const DatabaseEntry PadtransposeApple = {
|
|||
"Padtranspose", Precision::kAny, {"PADTRA_PAD", "PADTRA_TILE", "PADTRA_WPT"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } } } } }
|
||||
};
|
||||
const DatabaseEntry InvertApple = {
|
||||
"Invert", Precision::kAny, {"INTERNAL_BLOCK_SIZE"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } } } } }
|
||||
"Invert", Precision::kAny, {"INTERNAL_BLOCK_SIZE"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } } } } }
|
||||
};
|
||||
const DatabaseEntry TrsvRoutineApple = {
|
||||
"TrsvRoutine", Precision::kAny, {"TRSV_BLOCK_SIZE"}, { { kDeviceTypeAll, "default", { { "default", { { kDeviceNameDefault, Params{ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } } } } }
|
||||
};
|
||||
|
||||
// =================================================================================================
|
||||
|
|
|
@ -45,7 +45,8 @@ const std::vector<database::DatabaseEntry> Database::apple_cpu_fallback = std::v
|
|||
database::XgemvApple, database::XgemvFastApple, database::XgemvFastRotApple, database::XgerApple, database::XtrsvApple,
|
||||
database::XgemmApple, database::XgemmDirectApple,
|
||||
database::CopyApple, database::PadApple, database::TransposeApple, database::PadtransposeApple,
|
||||
database::InvertApple
|
||||
database::InvertApple,
|
||||
database::TrsvRoutineApple
|
||||
};
|
||||
|
||||
// The default values
|
||||
|
@ -98,7 +99,8 @@ Database::Database(const Device &device, const std::string &kernel_name,
|
|||
if (device.Type() == "CPU") {
|
||||
const auto extensions = device.Capabilities();
|
||||
const auto is_apple = (extensions.find("cl_APPLE_SetMemObjectDestructor") == std::string::npos) ? false : true;
|
||||
if (is_apple) {
|
||||
const auto is_likely_apple = device.MaxWorkGroupSize() <= 32;
|
||||
if (is_apple || is_likely_apple) {
|
||||
databases.push_front(apple_cpu_fallback);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -18,7 +18,7 @@ R"(
|
|||
// =================================================================================================
|
||||
#if defined(ROUTINE_TRSV)
|
||||
|
||||
__kernel __attribute__((reqd_work_group_size(16, 1, 1)))
|
||||
__kernel
|
||||
void FillVector(const int n, const int inc, const int offset,
|
||||
__global real* restrict dest, const real_arg arg_value) {
|
||||
const real value = GetRealArg(arg_value);
|
||||
|
|
|
@ -19,7 +19,7 @@ R"(
|
|||
#if defined(ROUTINE_INVERT)
|
||||
|
||||
// B21 = A21 * B11
|
||||
__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
|
||||
__kernel
|
||||
void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
|
||||
__global real* restrict dest, int current_size, int num_pages, const int block_size)
|
||||
{
|
||||
|
@ -28,7 +28,7 @@ void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const in
|
|||
}
|
||||
|
||||
// B21 = -B22 * B21
|
||||
__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
|
||||
__kernel
|
||||
void TripleMatMul16Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
|
||||
{
|
||||
__local real lm[LOCALY * LOCALX];
|
||||
|
@ -36,7 +36,7 @@ void TripleMatMul16Part2Lower(int n, __global real* restrict dest, int current_s
|
|||
}
|
||||
|
||||
// B21 = A21 * B11
|
||||
__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
|
||||
__kernel
|
||||
void TripleMatMul32Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
|
||||
__global real* restrict dest, int current_size, int num_pages, const int block_size)
|
||||
{
|
||||
|
@ -45,7 +45,7 @@ void TripleMatMul32Part1Lower(int n, __global const real* restrict src, const in
|
|||
}
|
||||
|
||||
// B21 = -B22 * B21
|
||||
__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
|
||||
__kernel
|
||||
void TripleMatMul32Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
|
||||
{
|
||||
__local real lm[LOCALY * LOCALX];
|
||||
|
@ -53,7 +53,7 @@ void TripleMatMul32Part2Lower(int n, __global real* restrict dest, int current_s
|
|||
}
|
||||
|
||||
// B21 = A21 * B11
|
||||
__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
|
||||
__kernel
|
||||
void TripleMatMul64Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
|
||||
__global real* restrict dest, int current_size, int num_pages, const int block_size)
|
||||
{
|
||||
|
@ -62,7 +62,7 @@ void TripleMatMul64Part1Lower(int n, __global const real* restrict src, const in
|
|||
}
|
||||
|
||||
// B21 = -B22 * B21
|
||||
__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
|
||||
__kernel
|
||||
void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
|
||||
{
|
||||
__local real lm[LOCALY * LOCALX];
|
||||
|
@ -72,7 +72,7 @@ void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_s
|
|||
// =================================================================================================
|
||||
|
||||
// B12 = A12 * B22
|
||||
__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
|
||||
__kernel
|
||||
void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
|
||||
__global real* restrict dest, int current_size, int num_pages, const int block_size)
|
||||
{
|
||||
|
@ -81,7 +81,7 @@ void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const in
|
|||
}
|
||||
|
||||
// B12 = -B11 * B12
|
||||
__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
|
||||
__kernel
|
||||
void TripleMatMul16Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
|
||||
{
|
||||
__local real lm[LOCALY * LOCALX];
|
||||
|
@ -89,7 +89,7 @@ void TripleMatMul16Part2Upper(int n, __global real* restrict dest, int current_s
|
|||
}
|
||||
|
||||
// B12 = A12 * B22
|
||||
__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
|
||||
__kernel
|
||||
void TripleMatMul32Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
|
||||
__global real* restrict dest, int current_size, int num_pages, const int block_size)
|
||||
{
|
||||
|
@ -98,7 +98,7 @@ void TripleMatMul32Part1Upper(int n, __global const real* restrict src, const in
|
|||
}
|
||||
|
||||
// B12 = -B11 * B12
|
||||
__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
|
||||
__kernel
|
||||
void TripleMatMul32Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
|
||||
{
|
||||
__local real lm[LOCALY * LOCALX];
|
||||
|
@ -106,7 +106,7 @@ void TripleMatMul32Part2Upper(int n, __global real* restrict dest, int current_s
|
|||
}
|
||||
|
||||
// B12 = A12 * B22
|
||||
__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
|
||||
__kernel
|
||||
void TripleMatMul64Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
|
||||
__global real* restrict dest, int current_size, int num_pages, const int block_size)
|
||||
{
|
||||
|
@ -115,7 +115,7 @@ void TripleMatMul64Part1Upper(int n, __global const real* restrict src, const in
|
|||
}
|
||||
|
||||
// B12 = -B11 * B12
|
||||
__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
|
||||
__kernel
|
||||
void TripleMatMul64Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
|
||||
{
|
||||
__local real lm[LOCALY * LOCALX];
|
||||
|
|
|
@ -76,7 +76,7 @@ R"(
|
|||
// =================================================================================================
|
||||
#if defined(ROUTINE_INVERT) || defined(ROUTINE_TRSM)
|
||||
|
||||
__kernel __attribute__((reqd_work_group_size(16, 1, 1)))
|
||||
__kernel
|
||||
void FillMatrix(const int m, const int n, const int ld, const int offset,
|
||||
__global real* restrict dest, const real_arg arg_value) {
|
||||
const real value = GetRealArg(arg_value);
|
||||
|
|
|
@ -13,6 +13,7 @@
|
|||
|
||||
#include <vector>
|
||||
#include <chrono>
|
||||
#include <iostream>
|
||||
|
||||
#include "routines/common.hpp"
|
||||
|
||||
|
@ -38,13 +39,22 @@ void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
|
|||
auto local_size = size_t{1};
|
||||
for (auto &item: local) { local_size *= item; }
|
||||
if (local_size > device.MaxWorkGroupSize()) {
|
||||
throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsTotal);
|
||||
throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsTotal,
|
||||
ToString(local_size) + " is larger than " + ToString(device.MaxWorkGroupSize()));
|
||||
}
|
||||
|
||||
// Make sure the global thread sizes are at least equal to the local sizes
|
||||
for (auto i=size_t{0}; i<global.size(); ++i) {
|
||||
if (global[i] < local[i]) { global[i] = local[i]; }
|
||||
}
|
||||
|
||||
// Verify that the global thread sizes are a multiple of the local sizes
|
||||
for (auto i=size_t{0}; i<global.size(); ++i) {
|
||||
if ((global[i] / local[i]) * local[i] != global[i]) {
|
||||
throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsDim,
|
||||
ToString(global[i]) + " is not divisible by " + ToString(local[i]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Tests for local memory usage
|
||||
|
@ -77,11 +87,10 @@ void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
|
|||
// Sets all elements of a matrix to a constant value
|
||||
template <typename T>
|
||||
void FillMatrix(Queue &queue, const Device &device,
|
||||
const std::shared_ptr<Program> program, const Databases &,
|
||||
const std::shared_ptr<Program> program,
|
||||
EventPointer event, const std::vector<Event> &waitForEvents,
|
||||
const size_t m, const size_t n, const size_t ld, const size_t offset,
|
||||
const Buffer<T> &dest,
|
||||
const T constant_value) {
|
||||
const Buffer<T> &dest, const T constant_value, const size_t local_size) {
|
||||
auto kernel = Kernel(program, "FillMatrix");
|
||||
kernel.SetArgument(0, static_cast<int>(m));
|
||||
kernel.SetArgument(1, static_cast<int>(n));
|
||||
|
@ -89,63 +98,62 @@ void FillMatrix(Queue &queue, const Device &device,
|
|||
kernel.SetArgument(3, static_cast<int>(offset));
|
||||
kernel.SetArgument(4, dest());
|
||||
kernel.SetArgument(5, GetRealArg(constant_value));
|
||||
auto local = std::vector<size_t>{16, 1};
|
||||
auto global = std::vector<size_t>{Ceil(m, 16), n};
|
||||
auto local = std::vector<size_t>{local_size, 1};
|
||||
auto global = std::vector<size_t>{Ceil(m, local_size), n};
|
||||
RunKernel(kernel, queue, device, global, local, event, waitForEvents);
|
||||
}
|
||||
|
||||
// Compiles the above function
|
||||
template void FillMatrix<half>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
|
||||
template void FillMatrix<half>(Queue&, const Device&, const std::shared_ptr<Program>,
|
||||
EventPointer, const std::vector<Event>&, const size_t, const size_t,
|
||||
const size_t, const size_t, const Buffer<half>&, const half);
|
||||
template void FillMatrix<float>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
|
||||
const size_t, const size_t, const Buffer<half>&, const half, const size_t);
|
||||
template void FillMatrix<float>(Queue&, const Device&, const std::shared_ptr<Program>,
|
||||
EventPointer, const std::vector<Event>&, const size_t, const size_t,
|
||||
const size_t, const size_t, const Buffer<float>&, const float);
|
||||
template void FillMatrix<double>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
|
||||
const size_t, const size_t, const Buffer<float>&, const float, const size_t);
|
||||
template void FillMatrix<double>(Queue&, const Device&, const std::shared_ptr<Program>,
|
||||
EventPointer, const std::vector<Event>&, const size_t, const size_t,
|
||||
const size_t, const size_t, const Buffer<double>&, const double);
|
||||
template void FillMatrix<float2>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
|
||||
const size_t, const size_t, const Buffer<double>&, const double, const size_t);
|
||||
template void FillMatrix<float2>(Queue&, const Device&, const std::shared_ptr<Program>,
|
||||
EventPointer, const std::vector<Event>&, const size_t, const size_t,
|
||||
const size_t, const size_t, const Buffer<float2>&, const float2);
|
||||
template void FillMatrix<double2>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
|
||||
const size_t, const size_t, const Buffer<float2>&, const float2, const size_t);
|
||||
template void FillMatrix<double2>(Queue&, const Device&, const std::shared_ptr<Program>,
|
||||
EventPointer, const std::vector<Event>&, const size_t, const size_t,
|
||||
const size_t, const size_t, const Buffer<double2>&, const double2);
|
||||
const size_t, const size_t, const Buffer<double2>&, const double2, const size_t);
|
||||
|
||||
// Sets all elements of a vector to a constant value
|
||||
template <typename T>
|
||||
void FillVector(Queue &queue, const Device &device,
|
||||
const std::shared_ptr<Program> program, const Databases &,
|
||||
const std::shared_ptr<Program> program,
|
||||
EventPointer event, const std::vector<Event> &waitForEvents,
|
||||
const size_t n, const size_t inc, const size_t offset,
|
||||
const Buffer<T> &dest,
|
||||
const T constant_value) {
|
||||
const Buffer<T> &dest, const T constant_value, const size_t local_size) {
|
||||
auto kernel = Kernel(program, "FillVector");
|
||||
kernel.SetArgument(0, static_cast<int>(n));
|
||||
kernel.SetArgument(1, static_cast<int>(inc));
|
||||
kernel.SetArgument(2, static_cast<int>(offset));
|
||||
kernel.SetArgument(3, dest());
|
||||
kernel.SetArgument(4, GetRealArg(constant_value));
|
||||
auto local = std::vector<size_t>{16};
|
||||
auto global = std::vector<size_t>{Ceil(n, 16)};
|
||||
auto local = std::vector<size_t>{local_size};
|
||||
auto global = std::vector<size_t>{Ceil(n, local_size)};
|
||||
RunKernel(kernel, queue, device, global, local, event, waitForEvents);
|
||||
}
|
||||
|
||||
// Compiles the above function
|
||||
template void FillVector<half>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
|
||||
template void FillVector<half>(Queue&, const Device&, const std::shared_ptr<Program>,
|
||||
EventPointer, const std::vector<Event>&, const size_t, const size_t,
|
||||
const size_t, const Buffer<half>&, const half);
|
||||
template void FillVector<float>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
|
||||
const size_t, const Buffer<half>&, const half, const size_t);
|
||||
template void FillVector<float>(Queue&, const Device&, const std::shared_ptr<Program>,
|
||||
EventPointer, const std::vector<Event>&, const size_t, const size_t,
|
||||
const size_t, const Buffer<float>&, const float);
|
||||
template void FillVector<double>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
|
||||
const size_t, const Buffer<float>&, const float, const size_t);
|
||||
template void FillVector<double>(Queue&, const Device&, const std::shared_ptr<Program>,
|
||||
EventPointer, const std::vector<Event>&, const size_t, const size_t,
|
||||
const size_t, const Buffer<double>&, const double);
|
||||
template void FillVector<float2>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
|
||||
const size_t, const Buffer<double>&, const double, const size_t);
|
||||
template void FillVector<float2>(Queue&, const Device&, const std::shared_ptr<Program>,
|
||||
EventPointer, const std::vector<Event>&, const size_t, const size_t,
|
||||
const size_t, const Buffer<float2>&, const float2);
|
||||
template void FillVector<double2>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
|
||||
const size_t, const Buffer<float2>&, const float2, const size_t);
|
||||
template void FillVector<double2>(Queue&, const Device&, const std::shared_ptr<Program>,
|
||||
EventPointer, const std::vector<Event>&, const size_t, const size_t,
|
||||
const size_t, const Buffer<double2>&, const double2);
|
||||
const size_t, const Buffer<double2>&, const double2, const size_t);
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
||||
|
|
|
@ -36,20 +36,18 @@ void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
|
|||
// Sets all elements of a matrix to a constant value
|
||||
template <typename T>
|
||||
void FillMatrix(Queue &queue, const Device &device,
|
||||
const std::shared_ptr<Program> program, const Databases &,
|
||||
const std::shared_ptr<Program> program,
|
||||
EventPointer event, const std::vector<Event> &waitForEvents,
|
||||
const size_t m, const size_t n, const size_t ld, const size_t offset,
|
||||
const Buffer<T> &dest,
|
||||
const T constant_value);
|
||||
const Buffer<T> &dest, const T constant_value, const size_t local_size);
|
||||
|
||||
// Sets all elements of a vector to a constant value
|
||||
template <typename T>
|
||||
void FillVector(Queue &queue, const Device &device,
|
||||
const std::shared_ptr<Program> program, const Databases &,
|
||||
const std::shared_ptr<Program> program,
|
||||
EventPointer event, const std::vector<Event> &waitForEvents,
|
||||
const size_t n, const size_t inc, const size_t offset,
|
||||
const Buffer<T> &dest,
|
||||
const T constant_value);
|
||||
const Buffer<T> &dest, const T constant_value, const size_t local_size);
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
|
|
|
@ -68,7 +68,7 @@ void Xtrsv<T>::Substitution(const Layout layout, const Triangle triangle,
|
|||
|
||||
// Launches the kernel
|
||||
const auto local = std::vector<size_t>{db_["TRSV_BLOCK_SIZE"]};
|
||||
const auto global = std::vector<size_t>{1};
|
||||
const auto global = std::vector<size_t>{Ceil(n, db_["TRSV_BLOCK_SIZE"])};
|
||||
auto event = Event();
|
||||
RunKernel(kernel, queue_, device_, global, local, event.pointer());
|
||||
event.WaitForCompletion();
|
||||
|
@ -87,6 +87,11 @@ void Xtrsv<T>::DoTrsv(const Layout layout, const Triangle triangle,
|
|||
// Makes sure all dimensions are larger than zero
|
||||
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
|
||||
|
||||
// Some parts of this kernel are not tunable and thus require some minimal OpenCL properties
|
||||
if (device_.MaxWorkGroupSize() < 16) { // minimum of total local work size of 16
|
||||
throw RuntimeErrorCode(StatusCode::kNotImplemented);
|
||||
}
|
||||
|
||||
// Tests the matrix and vector
|
||||
TestMatrixA(n, n, a_buffer, a_offset, a_ld);
|
||||
TestVectorX(n, b_buffer, b_offset, b_inc);
|
||||
|
@ -102,8 +107,8 @@ void Xtrsv<T>::DoTrsv(const Layout layout, const Triangle triangle,
|
|||
// Fills the output buffer with zeros
|
||||
auto eventWaitList = std::vector<Event>();
|
||||
auto fill_vector_event = Event();
|
||||
FillVector(queue_, device_, program_, db_, fill_vector_event.pointer(), eventWaitList,
|
||||
n, x_inc, x_offset, x_buffer, ConstantZero<T>());
|
||||
FillVector(queue_, device_, program_, fill_vector_event.pointer(), eventWaitList,
|
||||
n, x_inc, x_offset, x_buffer, ConstantZero<T>(), 16);
|
||||
fill_vector_event.WaitForCompletion();
|
||||
|
||||
// Derives properties based on the arguments
|
||||
|
|
|
@ -25,9 +25,9 @@ class Xgemm: public Routine {
|
|||
public:
|
||||
|
||||
// Defines the assumptions of the GEMM kernels
|
||||
static const bool a_want_rotated_(const size_t gemm_kernel_id) { return gemm_kernel_id == 1; }
|
||||
static const bool b_want_rotated_(const size_t gemm_kernel_id) { return true; }
|
||||
static const bool c_want_rotated_(const size_t gemm_kernel_id) { return gemm_kernel_id == 1; }
|
||||
static bool a_want_rotated_(const size_t gemm_kernel_id) { return gemm_kernel_id == 1; }
|
||||
static bool b_want_rotated_(const size_t) { return true; }
|
||||
static bool c_want_rotated_(const size_t gemm_kernel_id) { return gemm_kernel_id == 1; }
|
||||
|
||||
// Computes the size of the temporary GEMM buffer based on user-arguments
|
||||
static size_t GetTempSize(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
|
||||
|
|
|
@ -78,6 +78,11 @@ void Xtrsm<T>::TrsmColMajor(const Side side, const Triangle triangle,
|
|||
// Makes sure all dimensions are larger than zero
|
||||
if ((m == 0) || (n == 0)) { throw BLASError(StatusCode::kInvalidDimension); }
|
||||
|
||||
// Some parts of this kernel are not tunable and thus require some minimal OpenCL properties
|
||||
if (device_.MaxWorkGroupSize() < 16) { // minimum of total local work size of 16
|
||||
throw RuntimeErrorCode(StatusCode::kNotImplemented);
|
||||
}
|
||||
|
||||
// Computes the k dimension. This is based on whether or not matrix is A (on the left)
|
||||
// or B (on the right) in the Xgemm routine.
|
||||
const auto k = (side == Side::kLeft) ? m : n;
|
||||
|
@ -105,8 +110,8 @@ void Xtrsm<T>::TrsmColMajor(const Side side, const Triangle triangle,
|
|||
// Fills the output buffer with zeros
|
||||
auto eventWaitList = std::vector<Event>();
|
||||
auto fill_matrix_event = Event();
|
||||
FillMatrix(queue_, device_, program_, db_, fill_matrix_event.pointer(), eventWaitList,
|
||||
x_one, x_two, x_ld, x_offset, x_buffer, ConstantZero<T>());
|
||||
FillMatrix(queue_, device_, program_, fill_matrix_event.pointer(), eventWaitList,
|
||||
x_one, x_two, x_ld, x_offset, x_buffer, ConstantZero<T>(), 16);
|
||||
fill_matrix_event.WaitForCompletion();
|
||||
|
||||
// Inverts the diagonal blocks
|
||||
|
|
|
@ -49,9 +49,16 @@ void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle
|
|||
throw BLASError(StatusCode::kInvalidDimension);
|
||||
}
|
||||
|
||||
// Some parts of this kernel are not tunable and thus require some minimal OpenCL properties
|
||||
if (device_.MaxWorkGroupSize() < 16) { // minimum of total local work size of 16
|
||||
throw RuntimeErrorCode(StatusCode::kNotImplemented);
|
||||
}
|
||||
|
||||
// Helper variables
|
||||
const auto internal_block_size = static_cast<size_t>(db_["INTERNAL_BLOCK_SIZE"]);
|
||||
assert(internal_block_size == 16);
|
||||
if (internal_block_size != 16) {
|
||||
throw RuntimeErrorCode(StatusCode::kNotImplemented); // e.g. Apple CPU OpenCL with a WGS of 1
|
||||
} // when barriers are present
|
||||
const auto num_blocks = CeilDiv(n, block_size);
|
||||
const auto num_internal_blocks = CeilDiv(n, internal_block_size);
|
||||
const auto unit_diagonal = (diag == Diagonal::kUnit) ? true : false;
|
||||
|
@ -75,8 +82,9 @@ void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle
|
|||
// Fills the output buffer with zeros
|
||||
auto event_wait_list = std::vector<Event>();
|
||||
auto fill_matrix_event = Event();
|
||||
FillMatrix(queue_, device_, program_, db_, fill_matrix_event.pointer(), event_wait_list,
|
||||
block_size, num_blocks * block_size, block_size, 0, dest, ConstantZero<T>());
|
||||
FillMatrix(queue_, device_, program_, fill_matrix_event.pointer(), event_wait_list,
|
||||
block_size, num_blocks * block_size, block_size, 0, dest, ConstantZero<T>(),
|
||||
16);
|
||||
event_wait_list.push_back(fill_matrix_event);
|
||||
|
||||
// Inverts the diagonal IB by IB inner blocks of the matrix: one block per work-group
|
||||
|
@ -89,11 +97,11 @@ void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle
|
|||
kernel.SetArgument(5, static_cast<int>(block_size));
|
||||
kernel.SetArgument(6, static_cast<int>(unit_diagonal));
|
||||
kernel.SetArgument(7, static_cast<int>(is_upper));
|
||||
const auto local = std::vector<size_t>{internal_block_size};
|
||||
const auto global = std::vector<size_t>{num_internal_blocks * internal_block_size};
|
||||
const auto local_invert = std::vector<size_t>{internal_block_size};
|
||||
const auto global_invert = std::vector<size_t>{num_internal_blocks * internal_block_size};
|
||||
auto base_kernel_event = Event();
|
||||
auto base_kernel_event_pointer = (internal_block_size == block_size) ? event_ : base_kernel_event.pointer();
|
||||
RunKernel(kernel, queue_, device_, global, local, base_kernel_event_pointer, event_wait_list);
|
||||
RunKernel(kernel, queue_, device_, global_invert, local_invert, base_kernel_event_pointer, event_wait_list);
|
||||
if (internal_block_size == block_size) { event_wait_list.push_back(base_kernel_event); }
|
||||
|
||||
// Builds up block_size x block_size blocks. For example, internal_block_size=16:
|
||||
|
@ -107,7 +115,8 @@ void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle
|
|||
const auto npages = CeilDiv(n, current_size*2);
|
||||
const auto local0 = (current_size <= 32) ? current_size/4 : 16;
|
||||
const auto local = std::vector<size_t>{local0, 4};
|
||||
const auto global = std::vector<size_t>{(current_size/local[1]), npages*(current_size/16)*local[1]};
|
||||
const auto global = std::vector<size_t>{Ceil(current_size/local[1], local[0]),
|
||||
Ceil(npages*(current_size/16)*local[1], local[1])};
|
||||
|
||||
// Part 1
|
||||
auto kernel1 = Kernel(program_, "TripleMatMul" + ToString(current_size) + "Part1" + name_postfix);
|
||||
|
|
|
@ -25,14 +25,15 @@ namespace clblast {
|
|||
// =================================================================================================
|
||||
|
||||
template <typename T>
|
||||
void RunGemmRoutine(const size_t value, const Queue& queue, const std::vector<Buffer<T>>& buffers) {
|
||||
void RunGemmRoutineMNK(const size_t m, const size_t n, const size_t k,
|
||||
const Queue& queue, const std::vector<Buffer<T>>& buffers) {
|
||||
auto queue_plain = queue();
|
||||
auto event = cl_event{};
|
||||
auto status = Gemm(Layout::kRowMajor, Transpose::kNo, Transpose::kNo,
|
||||
value, value, value, ConstantOne<T>(),
|
||||
buffers[0](), 0, value,
|
||||
buffers[1](), 0, value, ConstantOne<T>(),
|
||||
buffers[2](), 0, value,
|
||||
m, n, k, ConstantOne<T>(),
|
||||
buffers[0](), 0, k,
|
||||
buffers[1](), 0, n, ConstantOne<T>(),
|
||||
buffers[2](), 0, n,
|
||||
&queue_plain, &event);
|
||||
if (status != StatusCode::kSuccess) {
|
||||
throw RuntimeError("Gemm failed with status " + ToString(status));
|
||||
|
@ -40,6 +41,10 @@ void RunGemmRoutine(const size_t value, const Queue& queue, const std::vector<Bu
|
|||
clWaitForEvents(1, &event);
|
||||
clReleaseEvent(event);
|
||||
}
|
||||
template <typename T>
|
||||
void RunGemmRoutine(const size_t value, const Queue& queue, const std::vector<Buffer<T>>& buffers) {
|
||||
RunGemmRoutineMNK(value, value, value, queue, buffers);
|
||||
}
|
||||
|
||||
template <typename T, size_t batch_count>
|
||||
void RunGemmBatchedRoutine(const size_t value, const Queue& queue, const std::vector<Buffer<T>>& buffers) {
|
||||
|
@ -80,6 +85,55 @@ void RunGemmStridedBatchedRoutine(const size_t value, const Queue& queue, const
|
|||
clWaitForEvents(1, &event);
|
||||
clReleaseEvent(event);
|
||||
}
|
||||
// =================================================================================================
|
||||
|
||||
template <typename T>
|
||||
void TuneGemmSingleSize(const Platform& platform, const Device& device, const Context& context, Queue& queue,
|
||||
const size_t m, const size_t n, const size_t k, const size_t num_runs) {
|
||||
|
||||
// Buffers
|
||||
auto buffers = std::vector<Buffer<T>>{
|
||||
Buffer<T>(context, m * k),
|
||||
Buffer<T>(context, k * n),
|
||||
Buffer<T>(context, m * n)
|
||||
};
|
||||
const auto FunctionToTune = [&]() { RunGemmRoutineMNK(m, n, k, queue, buffers); };
|
||||
|
||||
// Collects the timings for two methods
|
||||
auto scores = std::vector<TuningResult>();
|
||||
const auto methods = std::vector<std::string>{"in-direct", "direct"};
|
||||
for (auto& method: methods) {
|
||||
|
||||
printf("* Testing the %s routine\n", method.c_str());
|
||||
const auto limit = (method == "in-direct") ? 0 : std::max(std::max(m, n), k) + 1; // small or large number
|
||||
ForceSelectIndirectFrom<T>(limit, device, "GemmRoutine", "XGEMM_MIN_INDIRECT_SIZE");
|
||||
auto time_ms = -1.0;
|
||||
try {
|
||||
time_ms = TimeFunction(num_runs, FunctionToTune);
|
||||
printf(" --> %9.2lf ms\n", time_ms);
|
||||
}
|
||||
catch (...) {
|
||||
const auto status_code = DispatchExceptionCatchAll(true);
|
||||
printf(" --> error %-5d\n", static_cast<int>(status_code));
|
||||
}
|
||||
auto tuning_results = Configuration();
|
||||
tuning_results["XGEMM_MIN_INDIRECT_SIZE"] = limit;
|
||||
tuning_results["PRECISION"] = static_cast<size_t>(PrecisionValue<T>());
|
||||
scores.push_back(TuningResult{"gemm_kernel_selection_single_size", time_ms, tuning_results});
|
||||
}
|
||||
|
||||
// Outputs the results as JSON to disk, including some meta-data
|
||||
const auto precision_string = std::to_string(static_cast<size_t>(PrecisionValue<T>()));
|
||||
auto metadata = std::vector<std::pair<std::string,std::string>>{
|
||||
{"kernel_family", "gemm_routine_single_size"},
|
||||
{"precision", precision_string},
|
||||
{"arg_m", ToString(m)},
|
||||
{"arg_n", ToString(n)},
|
||||
{"arg_k", ToString(k)},
|
||||
};
|
||||
PrintTimingsToFileAsJSON("clblast_gemm_routine_single_size_" + precision_string + ".json",
|
||||
device, platform, metadata, scores);
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
|
@ -91,6 +145,9 @@ void TuneXgemm(int argc, char* argv[]) {
|
|||
const auto device_id = GetArgument(command_line_args, help, kArgDevice, ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0}));
|
||||
const auto precision = GetArgument(command_line_args, help, kArgPrecision, Precision::kSingle);
|
||||
const auto num_runs = GetArgument(command_line_args, help, kArgNumRuns, size_t{10});
|
||||
const auto arg_m = GetArgument(command_line_args, help, kArgM, -1); // optional
|
||||
const auto arg_n = GetArgument(command_line_args, help, kArgN, -1); // optional
|
||||
const auto arg_k = GetArgument(command_line_args, help, kArgK, -1); // optional
|
||||
fprintf(stdout, "%s\n", help.c_str());
|
||||
|
||||
// OpenCL initialisation
|
||||
|
@ -119,16 +176,29 @@ void TuneXgemm(int argc, char* argv[]) {
|
|||
}
|
||||
}
|
||||
|
||||
// Run the tuners for the XGEMM routines
|
||||
TuneKernelSelection<T>(platform, device, context, queue, precision, RunGemmRoutine<T>,
|
||||
64, 2048, 64, 1, num_runs,
|
||||
"gemm", "GemmRoutine", "gemm_routine", "XGEMM_MIN_INDIRECT_SIZE");
|
||||
//TuneKernelSelection<T>(platform, device, context, queue, precision, RunGemmBatchedRoutine<T, 30>,
|
||||
// 16, 128, 32, 30, num_runs,
|
||||
// "gemmbatched", "GemmRoutine", "gemm_routine_2", "XGEMMBATCHED_MIN_INDIRECT_SIZE");
|
||||
//TuneKernelSelection<T>(platform, device, context, queue, precision, RunGemmStridedBatchedRoutine<T, 30>,
|
||||
// 16, 128, 32, 30, num_runs,
|
||||
// "gemmstridedbatched", "GemmRoutine", "gemm_routine_3", "XGEMMSTRIDEDBATCHED_MIN_INDIRECT_SIZE");
|
||||
// Test for only one m/n/k size
|
||||
if (arg_m != -1 || arg_n != -1 || arg_k != -1) {
|
||||
printf("* Tuning for one specific size: m=%d, n=%d, k=%d\n", arg_m, arg_n, arg_k);
|
||||
if (arg_m == -1 || arg_n == -1 || arg_k == -1) {
|
||||
printf("* Error: If one of m/n/k specified, please specify all three\n");
|
||||
return;
|
||||
}
|
||||
TuneGemmSingleSize<T>(platform, device, context, queue, static_cast<size_t>(arg_m),
|
||||
static_cast<size_t>(arg_n), static_cast<size_t>(arg_k), num_runs);
|
||||
}
|
||||
|
||||
else {
|
||||
// Run the tuners for the XGEMM routines
|
||||
TuneKernelSelection<T>(platform, device, context, queue, precision, RunGemmRoutine<T>,
|
||||
64, 2048, 64, 1, num_runs,
|
||||
"gemm", "GemmRoutine", "gemm_routine", "XGEMM_MIN_INDIRECT_SIZE");
|
||||
//TuneKernelSelection<T>(platform, device, context, queue, precision, RunGemmBatchedRoutine<T, 30>,
|
||||
// 16, 128, 32, 30, num_runs,
|
||||
// "gemmbatched", "GemmRoutine", "gemm_routine_2", "XGEMMBATCHED_MIN_INDIRECT_SIZE");
|
||||
//TuneKernelSelection<T>(platform, device, context, queue, precision, RunGemmStridedBatchedRoutine<T, 30>,
|
||||
// 16, 128, 32, 30, num_runs,
|
||||
// "gemmstridedbatched", "GemmRoutine", "gemm_routine_3", "XGEMMSTRIDEDBATCHED_MIN_INDIRECT_SIZE");
|
||||
}
|
||||
|
||||
printf("* Completed tuning process\n");
|
||||
printf("\n");
|
||||
|
|
|
@ -122,6 +122,7 @@ constexpr auto kArgHelp = "h";
|
|||
constexpr auto kArgQuiet = "q";
|
||||
constexpr auto kArgNoAbbreviations = "no_abbrv";
|
||||
constexpr auto kArgNumRuns = "runs";
|
||||
constexpr auto kArgFullStatistics = "full_statistics";
|
||||
|
||||
// The buffer names
|
||||
constexpr auto kBufVecX = "X";
|
||||
|
@ -245,6 +246,7 @@ struct Arguments {
|
|||
size_t num_steps = 0;
|
||||
size_t num_runs = 10;
|
||||
std::vector<std::string> tuner_files = {};
|
||||
bool full_statistics = false;
|
||||
#ifdef CLBLAST_REF_CUBLAS
|
||||
void* cublas_handle; // cublasHandle_t
|
||||
#endif
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
#include <algorithm>
|
||||
#include <chrono>
|
||||
#include <random>
|
||||
#include <tuning/tuning.hpp>
|
||||
|
||||
#include "utilities/utilities.hpp"
|
||||
#include "test/performance/client.hpp"
|
||||
|
@ -145,6 +146,7 @@ Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const size_t le
|
|||
args.print_help = CheckArgument(command_line_args, help, kArgHelp);
|
||||
args.silent = CheckArgument(command_line_args, help, kArgQuiet);
|
||||
args.no_abbrv = CheckArgument(command_line_args, help, kArgNoAbbreviations);
|
||||
args.full_statistics= CheckArgument(command_line_args, help, kArgFullStatistics);
|
||||
warm_up_ = CheckArgument(command_line_args, help, kArgWarmUp);
|
||||
|
||||
// Parse the optional JSON file name arguments
|
||||
|
@ -253,32 +255,32 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
|
|||
auto buffers = Buffers<T>{x_vec, y_vec, a_mat, b_mat, c_mat, ap_mat, scalar};
|
||||
|
||||
// Runs the routines and collects the timings
|
||||
auto timings = std::vector<std::pair<std::string, double>>();
|
||||
auto ms_clblast = TimedExecution(args.num_runs, args, buffers, queue, run_routine_, "CLBlast");
|
||||
timings.push_back(std::pair<std::string, double>("CLBlast", ms_clblast));
|
||||
auto timings = std::vector<std::pair<std::string, TimeResult>>();
|
||||
auto time_clblast = TimedExecution(args.num_runs, args, buffers, queue, run_routine_, "CLBlast");
|
||||
timings.push_back(std::pair<std::string, TimeResult>("CLBlast", time_clblast));
|
||||
if (args.compare_clblas) {
|
||||
auto ms_clblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference1_, "clBLAS");
|
||||
timings.push_back(std::pair<std::string, double>("clBLAS", ms_clblas));
|
||||
auto time_clblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference1_, "clBLAS");
|
||||
timings.push_back(std::pair<std::string, TimeResult>("clBLAS", time_clblas));
|
||||
}
|
||||
if (args.compare_cblas) {
|
||||
auto buffers_host = BuffersHost<T>();
|
||||
DeviceToHost(args, buffers, buffers_host, queue, buffers_in_);
|
||||
auto ms_cblas = TimedExecution(args.num_runs, args, buffers_host, queue, run_reference2_, "CPU BLAS");
|
||||
auto time_cblas = TimedExecution(args.num_runs, args, buffers_host, queue, run_reference2_, "CPU BLAS");
|
||||
HostToDevice(args, buffers, buffers_host, queue, buffers_out_);
|
||||
timings.push_back(std::pair<std::string, double>("CPU BLAS", ms_cblas));
|
||||
timings.push_back(std::pair<std::string, TimeResult>("CPU BLAS", time_cblas));
|
||||
}
|
||||
if (args.compare_cublas) {
|
||||
auto buffers_host = BuffersHost<T>();
|
||||
auto buffers_cuda = BuffersCUDA<T>();
|
||||
DeviceToHost(args, buffers, buffers_host, queue, buffers_in_);
|
||||
HostToCUDA(args, buffers_cuda, buffers_host, buffers_in_);
|
||||
auto ms_cublas = 0.0;
|
||||
TimeResult time_cublas;
|
||||
try {
|
||||
ms_cublas = TimedExecution(args.num_runs, args, buffers_cuda, queue, run_reference3_, "cuBLAS");
|
||||
time_cublas = TimedExecution(args.num_runs, args, buffers_cuda, queue, run_reference3_, "cuBLAS");
|
||||
} catch (std::runtime_error e) { }
|
||||
CUDAToHost(args, buffers_cuda, buffers_host, buffers_out_);
|
||||
HostToDevice(args, buffers, buffers_host, queue, buffers_out_);
|
||||
timings.push_back(std::pair<std::string, double>("cuBLAS", ms_cublas));
|
||||
timings.push_back(std::pair<std::string, TimeResult>("cuBLAS", time_cublas));
|
||||
}
|
||||
|
||||
// Prints the performance of the tested libraries
|
||||
|
@ -311,9 +313,9 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
|
|||
// value found in the vector of timing results. The return value is in milliseconds.
|
||||
template <typename T, typename U>
|
||||
template <typename BufferType, typename RoutineType>
|
||||
double Client<T,U>::TimedExecution(const size_t num_runs, const Arguments<U> &args,
|
||||
BufferType &buffers, Queue &queue,
|
||||
RoutineType run_blas, const std::string &library_name) {
|
||||
typename Client<T,U>::TimeResult Client<T,U>::TimedExecution(const size_t num_runs, const Arguments<U> &args,
|
||||
BufferType &buffers, Queue &queue,
|
||||
RoutineType run_blas, const std::string &library_name) {
|
||||
auto status = StatusCode::kSuccess;
|
||||
|
||||
// Do an optional warm-up to omit compilation times and initialisations from the measurements
|
||||
|
@ -343,7 +345,19 @@ double Client<T,U>::TimedExecution(const size_t num_runs, const Arguments<U> &ar
|
|||
auto elapsed_time = std::chrono::steady_clock::now() - start_time;
|
||||
timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
|
||||
}
|
||||
return *std::min_element(timings.begin(), timings.end());
|
||||
|
||||
// Compute statistics
|
||||
auto result = TimeResult();
|
||||
const auto sum = std::accumulate(timings.begin(), timings.end(), 0.0);
|
||||
const auto mean = sum / timings.size();
|
||||
std::vector<double> diff(timings.size());
|
||||
std::transform(timings.begin(), timings.end(), diff.begin(), [mean](double x) { return x - mean; });
|
||||
const auto sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
|
||||
result.mean = mean;
|
||||
result.standard_deviation = std::sqrt(sq_sum / timings.size());
|
||||
result.minimum = *std::min_element(timings.begin(), timings.end());
|
||||
result.maximum = *std::max_element(timings.begin(), timings.end());
|
||||
return result;
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
@ -355,26 +369,42 @@ void Client<T,U>::PrintTableHeader(const Arguments<U>& args) {
|
|||
// First line (optional)
|
||||
if (!args.silent) {
|
||||
for (auto i=size_t{0}; i<options_.size(); ++i) { fprintf(stdout, "%9s ", ""); }
|
||||
fprintf(stdout, " | <-- CLBlast -->");
|
||||
if (args.compare_clblas) { fprintf(stdout, " | <-- clBLAS -->"); }
|
||||
if (args.compare_cblas) { fprintf(stdout, " | <-- CPU BLAS -->"); }
|
||||
if (args.compare_cublas) { fprintf(stdout, " | <-- cuBLAS -->"); }
|
||||
if (args.full_statistics) {
|
||||
fprintf(stdout, " | <-- CLBlast -->");
|
||||
if (args.compare_clblas) { fprintf(stdout, " | <-- clBLAS -->"); }
|
||||
if (args.compare_cblas) { fprintf(stdout, " | <-- CPU BLAS -->"); }
|
||||
if (args.compare_cublas) { fprintf(stdout, " | <-- cuBLAS -->"); }
|
||||
}
|
||||
else {
|
||||
fprintf(stdout, " | <-- CLBlast -->");
|
||||
if (args.compare_clblas) { fprintf(stdout, " | <-- clBLAS -->"); }
|
||||
if (args.compare_cblas) { fprintf(stdout, " | <-- CPU BLAS -->"); }
|
||||
if (args.compare_cublas) { fprintf(stdout, " | <-- cuBLAS -->"); }
|
||||
}
|
||||
fprintf(stdout, " |\n");
|
||||
}
|
||||
|
||||
// Second line
|
||||
for (auto &option: options_) { fprintf(stdout, "%9s;", option.c_str()); }
|
||||
fprintf(stdout, "%9s;%9s;%9s", "ms_1", "GFLOPS_1", "GBs_1");
|
||||
if (args.compare_clblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_2", "GFLOPS_2", "GBs_2"); }
|
||||
if (args.compare_cblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_3", "GFLOPS_3", "GBs_3"); }
|
||||
if (args.compare_cublas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_4", "GFLOPS_4", "GBs_4"); }
|
||||
if (args.full_statistics) {
|
||||
fprintf(stdout, "%9s;%9s;%9s;%9s", "min_ms_1", "max_ms_1", "mean_1", "stddev_1");
|
||||
if (args.compare_clblas) { fprintf(stdout, ";%9s;%9s;%9s;%9s", "min_ms_2", "max_ms_2", "mean_2", "stddev_2"); }
|
||||
if (args.compare_cblas) { fprintf(stdout, ";%9s;%9s;%9s;%9s", "min_ms_3", "max_ms_3", "mean_3", "stddev_3"); }
|
||||
if (args.compare_cublas) { fprintf(stdout, ";%9s;%9s;%9s;%9s", "min_ms_4", "max_ms_4", "mean_4", "stddev_4"); }
|
||||
}
|
||||
else {
|
||||
fprintf(stdout, "%9s;%9s;%9s", "ms_1", "GFLOPS_1", "GBs_1");
|
||||
if (args.compare_clblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_2", "GFLOPS_2", "GBs_2"); }
|
||||
if (args.compare_cblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_3", "GFLOPS_3", "GBs_3"); }
|
||||
if (args.compare_cublas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_4", "GFLOPS_4", "GBs_4"); }
|
||||
}
|
||||
fprintf(stdout, "\n");
|
||||
}
|
||||
|
||||
// Print a performance-result row
|
||||
template <typename T, typename U>
|
||||
void Client<T,U>::PrintTableRow(const Arguments<U>& args,
|
||||
const std::vector<std::pair<std::string, double>>& timings) {
|
||||
const std::vector<std::pair<std::string, TimeResult>>& timings) {
|
||||
|
||||
// Creates a vector of relevant variables
|
||||
auto integers = std::vector<size_t>{};
|
||||
|
@ -443,16 +473,26 @@ void Client<T,U>::PrintTableRow(const Arguments<U>& args,
|
|||
|
||||
// Loops over all tested libraries
|
||||
for (const auto& timing : timings) {
|
||||
const auto library_name = timing.first;
|
||||
const auto minimum_ms = timing.second.minimum;
|
||||
if (library_name != "CLBlast") { fprintf(stdout, ";"); }
|
||||
|
||||
// Computes the GFLOPS and GB/s metrics
|
||||
auto flops = get_flops_(args);
|
||||
auto bytes = get_bytes_(args);
|
||||
auto gflops = (timing.second != 0.0) ? (flops*1e-6)/timing.second : 0;
|
||||
auto gbs = (timing.second != 0.0) ? (bytes*1e-6)/timing.second : 0;
|
||||
// Either output full statistics
|
||||
if (args.full_statistics) {
|
||||
const auto maximum_ms = timing.second.maximum;
|
||||
const auto mean_ms = timing.second.mean;
|
||||
const auto standard_deviation = timing.second.standard_deviation;
|
||||
fprintf(stdout, "%9.3lf;%9.3lf;%9.3lf;%9.3lf", minimum_ms, maximum_ms, mean_ms, standard_deviation);
|
||||
}
|
||||
|
||||
// Outputs the performance numbers
|
||||
if (timing.first != "CLBlast") { fprintf(stdout, ";"); }
|
||||
fprintf(stdout, "%9.2lf;%9.1lf;%9.1lf", timing.second, gflops, gbs);
|
||||
// ... or outputs minimum time and the GFLOPS and GB/s metrics
|
||||
else {
|
||||
const auto flops = get_flops_(args);
|
||||
const auto bytes = get_bytes_(args);
|
||||
const auto gflops = (minimum_ms != 0.0) ? (flops*1e-6)/minimum_ms : 0;
|
||||
const auto gbs = (minimum_ms != 0.0) ? (bytes*1e-6)/minimum_ms : 0;
|
||||
fprintf(stdout, "%9.2lf;%9.1lf;%9.1lf", minimum_ms, gflops, gbs);
|
||||
}
|
||||
}
|
||||
fprintf(stdout, "\n");
|
||||
}
|
||||
|
|
|
@ -42,6 +42,7 @@ template <typename T, typename U>
|
|||
class Client {
|
||||
public:
|
||||
static const int kSeed;
|
||||
struct TimeResult { double minimum; double maximum; double mean; double standard_deviation; };
|
||||
|
||||
// Shorthand for the routine-specific functions passed to the tester
|
||||
using Routine = std::function<StatusCode(const Arguments<U>&, Buffers<T>&, Queue&)>;
|
||||
|
@ -72,15 +73,15 @@ class Client {
|
|||
|
||||
// Runs a function a given number of times and returns the execution time of the shortest instance
|
||||
template <typename BufferType, typename RoutineType>
|
||||
double TimedExecution(const size_t num_runs, const Arguments<U> &args, BufferType &buffers,
|
||||
Queue &queue, RoutineType run_blas, const std::string &library_name);
|
||||
TimeResult TimedExecution(const size_t num_runs, const Arguments<U> &args, BufferType &buffers,
|
||||
Queue &queue, RoutineType run_blas, const std::string &library_name);
|
||||
|
||||
// Prints the header of a performance-data table
|
||||
void PrintTableHeader(const Arguments<U>& args);
|
||||
|
||||
// Prints a row of performance data, including results of two libraries
|
||||
void PrintTableRow(const Arguments<U>& args,
|
||||
const std::vector<std::pair<std::string, double>>& timings);
|
||||
const std::vector<std::pair<std::string, TimeResult>>& timings);
|
||||
|
||||
// The routine-specific functions passed to the tester
|
||||
const Routine run_routine_;
|
||||
|
|
|
@ -17,7 +17,11 @@
|
|||
|
||||
extern "C"
|
||||
{
|
||||
#include <cblas.h>
|
||||
#ifdef CLBLAST_REF_CBLAS_MKL
|
||||
#include <mkl_cblas.h>
|
||||
#else
|
||||
#include <cblas.h>
|
||||
#endif
|
||||
}
|
||||
|
||||
#include "utilities/utilities.hpp"
|
||||
|
|
Loading…
Reference in New Issue