Merge pull request #150 from CNugteren/development

Update to version 0.11.0
pull/156/head 0.11.0
Cedric Nugteren 2017-05-02 22:39:50 +02:00 committed by GitHub
commit 606f2871dd
291 changed files with 12269 additions and 3527 deletions

View File

@@ -2,14 +2,6 @@ language: cpp
sudo: required
dist: trusty
os:
  - linux
  - osx
compiler:
  - gcc
  - clang
addons:
  apt:
    sources:
@@ -19,6 +11,14 @@ addons:
      - cmake
      - ocl-icd-opencl-dev
matrix:
  include:
    - os: linux
      compiler: gcc
    - os: linux
      compiler: clang
    - os: osx
env:
  global:
    - CLBLAST_ROOT=${TRAVIS_BUILD_DIR}/bin/clblast

View File

@@ -1,4 +1,25 @@
Version 0.11.0
- Improved the internal program source and binary caches for scalability and speed (thanks to 'intelfx')
- Fixed a bug that caused the binary to be re-created even when it was already in the cache
- Fixed a bug when using offsets in the direct version of the GEMM kernels
- Fixed a missing cl_khr_fp64 when running double-precision on Intel CPUs
- Fixed tests on Apple's CPU OpenCL implementation; still not fast, but at least correct
- Fixed bugs in the half-precision routines HTBMV/HTPMV/HTRMV/HSYR2K/HTRMM
- Tests now also exit with an error code when OpenCL errors or compilation errors occur
- Tests now also check for the L2 error in case of half-precision
- Clients can now test against cuBLAS on NVIDIA systems for performance comparisons (-DCUBLAS=ON)
- Replaced the R graph scripts with Python/Matplotlib scripts
- Various minor fixes and enhancements
- Added tuned parameters for various devices (see README)
- Added the OverrideParameters function to the API to be able to supply custom tuning parameters
- Added triangular solver (level-2 & level-3) routines:
* STRSV/DTRSV/CTRSV/ZTRSV (experimental, un-optimized)
* STRSM/DTRSM/CTRSM/ZTRSM (experimental, un-optimized)
- Added batched (not part of the BLAS standard) routines:
* SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED (batched version of AXPY)
* SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED (batched version of GEMM)
Version 0.10.0
- Updated to version 8.0 of the CLCudaAPI C++11 OpenCL header
- Changed the enums in the C API to avoid potential name clashes with external code

View File

@@ -18,7 +18,7 @@ set(CMAKE_USER_MAKE_RULES_OVERRIDE_CXX ${CMAKE_CURRENT_SOURCE_DIR}/cmake/cxx_fla
# CMake project details
project("clblast" C CXX)
set(clblast_VERSION_MAJOR 0)
set(clblast_VERSION_MINOR 10)
set(clblast_VERSION_MINOR 11)
set(clblast_VERSION_PATCH 0)
# Options and their default values
@@ -28,6 +28,7 @@ option(TUNERS "Enable compilation of the tuners" OFF)
option(CLIENTS "Enable compilation of the clients to test and compare performance" OFF)
option(TESTS "Enable compilation of the correctness tests" OFF)
option(NETLIB "Enable compilation of the CBLAS Netlib API" OFF)
option(CUBLAS "Enables performance comparison against cuBLAS on NVIDIA GPUs" OFF)
# Compile in verbose mode with additional diagnostic messages
option(VERBOSE "Compile in verbose mode for additional diagnostic messages" OFF)
@@ -129,11 +130,14 @@ if(TUNERS)
endif()
endif()
# Locates the reference BLAS libraries in case the tests need to be compiled. The "FindclBLAS.cmake"
# and "FindCBLAS.cmake" are included.
# Locates the reference BLAS libraries in case the tests need to be compiled. The "FindclBLAS.cmake",
# "FindCBLAS.cmake" and "FindcuBLAS.cmake" are included.
if(CLIENTS OR TESTS)
find_package(clBLAS)
find_package(CBLAS)
if(CUBLAS)
find_package(cuBLAS)
endif()
if(NOT CLBLAS_FOUND AND NOT CBLAS_FOUND)
if(TESTS)
message(STATUS "Could NOT find clBLAS nor a CPU BLAS, disabling the compilation of the tests")
@@ -156,10 +160,10 @@ if(NETLIB)
set(SAMPLE_PROGRAMS_C ${SAMPLE_PROGRAMS_C} sgemm_netlib)
endif()
set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv
set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xtrsv
xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm)
set(LEVELX_ROUTINES xomatcopy)
set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm xtrsm)
set(LEVELX_ROUTINES xomatcopy xaxpybatched xgemmbatched)
set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES} ${LEVELX_ROUTINES})
set(PRECISIONS 32 64 3232 6464 16)
@@ -175,6 +179,7 @@ set(SOURCES
src/clblast.cpp
src/clblast_c.cpp
src/routine.cpp
src/routines/levelx/xinvert.cpp # only source, don't include it as a test
)
if(NETLIB)
set(SOURCES ${SOURCES} src/clblast_netlib_c.cpp)
@@ -241,7 +246,7 @@ endif()
if(SAMPLES)
# Downloads the cl.hpp file from Khronos
file(DOWNLOAD https://www.khronos.org/registry/cl/api/1.1/cl.hpp ${clblast_SOURCE_DIR}/samples/cl.hpp)
file(DOWNLOAD https://www.khronos.org/registry/OpenCL/api/2.1/cl.hpp ${clblast_SOURCE_DIR}/samples/cl.hpp)
# Adds sample programs (C++)
foreach(SAMPLE ${SAMPLE_PROGRAMS_CPP})
@@ -319,13 +324,22 @@ if(CLIENTS OR TESTS)
add_definitions(" -DCLBLAST_REF_CBLAS")
endif()
endif()
if(CUBLAS_FOUND)
set(REF_INCLUDES ${REF_INCLUDES} ${CUDA_INCLUDE_DIRS})
set(REF_LIBRARIES ${REF_LIBRARIES} ${CUDA_LIBRARIES} ${CUBLAS_LIBRARIES})
if(MSVC)
add_definitions(" /DCLBLAST_REF_CUBLAS")
else()
add_definitions(" -DCLBLAST_REF_CUBLAS")
endif()
endif()
endif()
# ==================================================================================================
# Section for the performance tests (i.e. the client). These optionally compare against a reference
# library, either clBLAS or a CPU BLAS.
# library, either clBLAS, a CPU BLAS, or CUDA's cuBLAS.
if(CLIENTS)
# Visual Studio requires the sources of non-exported objects/libraries
@@ -371,7 +385,7 @@ endif()
# ==================================================================================================
# Section for the correctness tests. Note that these tests require the presence of clBLAS and/or a
# CPU BLAS library to act as a reference.
# CPU BLAS library, and/or cuBLAS to act as a reference.
if(TESTS)
enable_testing()
@@ -414,6 +428,18 @@ if(TESTS)
add_test(clblast_test_${ROUTINE} clblast_test_${ROUTINE})
endforeach()
# Miscellaneous tests
set(MISC_TESTS override_parameters)
foreach(MISC_TEST ${MISC_TESTS})
add_executable(clblast_test_${MISC_TEST} ${TESTS_COMMON}
test/correctness/misc/${MISC_TEST}.cpp)
target_link_libraries(clblast_test_${MISC_TEST} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
target_include_directories(clblast_test_${MISC_TEST} PUBLIC
$<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES>
${clblast_SOURCE_DIR} ${REF_INCLUDES})
add_test(clblast_test_${MISC_TEST} clblast_test_${MISC_TEST})
endforeach()
# Adds 'alltests' target: runs all tests
set(ALLTESTS )
set(ALLTESTSDEPENDS )

View File

@@ -21,6 +21,7 @@ Use CLBlast instead of clBLAS:
* When you want to be able to inspect the BLAS kernels or easily customize them to your needs.
* When you run on exotic OpenCL devices for which you need to tune yourself.
* When you are still running on OpenCL 1.1 hardware.
* When you prefer a C++ API over a C API (C API also available in CLBlast).
* When you value an organized and modern C++ codebase.
* When you target Intel CPUs and GPUs or embedded devices.
* When you can benefit from the increased performance of half-precision fp16 data-types.
@@ -90,21 +91,23 @@ Or alternatively the plain C version:
#include <clblast_c.h>
Afterwards, any of CLBlast's routines can be called directly: there is no need to initialize the library. The available routines and the required arguments are described in the above mentioned include files and the included [API documentation](doc/clblast.md). Additionally, a couple of stand-alone example programs are included in the `samples` subfolder. They can optionally be compiled using the CMake infrastructure of CLBlast by providing the `-DSAMPLES=ON` flag, for example as follows:
Afterwards, any of CLBlast's routines can be called directly: there is no need to initialize the library. The available routines and the required arguments are described in the above mentioned include files and the included [API documentation](doc/clblast.md). The API is kept as close as possible to the Netlib BLAS and the cuBLAS/clBLAS APIs.
To get started quickly, a couple of stand-alone example programs are included in the `samples` subfolder. They can optionally be compiled using the CMake infrastructure of CLBlast by providing the `-DSAMPLES=ON` flag, for example as follows:
cmake -DSAMPLES=ON ..
Furthermore, it is possible to optionally set an OS environmental variable `CLBLAST_BUILD_OPTIONS` to pass specific build options to the OpenCL compiler.
There is also a Netlib CBLAS C API available. This is however not recommended for full control over performance, since at every call it will copy all buffers to and from the OpenCL device. Especially for level-1 and level-2 BLAS functions, performance will be impacted severely. However, it can be useful if you don't want to touch OpenCL at all. You can set the default device and platform by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables. This API can be used as follows after providing the `-DNETLIB=ON` flag to CMake:
#include <clblast_netlib_c.h>
For all of CLBlast's APIs, it is possible to optionally set an OS environmental variable `CLBLAST_BUILD_OPTIONS` to pass specific build options to the OpenCL compiler.
Using the tuners (optional)
-------------
The CLBlast library will be tuned in the future for the most commonly used OpenCL devices. This pre-release of CLBlast is only tuned for a limited number of devices, in particular those with the following `CL_DEVICE_NAME` values:
The CLBlast library is already tuned for the most commonly used OpenCL devices and it's gradually being extended to other devices as well. For unseen devices CLBlast will make use of common-best tuning values for similar devices (e.g. AMD GPUs), so performance might still be decent. The current release of CLBlast is tuned for devices with the following `CL_DEVICE_NAME` values:
* NVIDIA GPUs:
- GRID K520
@@ -115,18 +118,23 @@ The CLBlast library will be tuned in the future for the most commonly used OpenC
- GeForce GTX 750 Ti
- GeForce GTX 980
- GeForce GTX 1070
- GeForce GTX 1080
- GeForce GTX TITAN
- GeForce GTX TITAN Black
- GeForce GTX TITAN X
- TITAN X (Pascal)
- Tesla K20m
- Tesla K40m
* AMD GPUs:
- AMD Radeon R9 M370X Compute Engine
- ATI Radeon HD 6750M
- Ellesmere
- Hawaii
- Oland
- Pitcairn
- Tahiti
- Tonga
- Turks
* Intel GPUs:
- HD Graphics 530
- HD Graphics 5500 BroadWell U-Processor GT2
@@ -137,7 +145,9 @@ The CLBlast library will be tuned in the future for the most commonly used OpenC
- Iris Pro
* Intel CPUs:
- Core i5-6200U
- Core i7-2670QM
- Core i7-3770K
- Core i7-4790K
- Core i7-5930K
* Other devices:
- ARM Mali-T628 GPU
@ -151,7 +161,7 @@ Note that CLBlast's tuners are based on the [CLTune auto-tuning library](https:/
Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels correspond to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance. Running `make alltuners` runs all tuners for all precisions in one go. You can set the default device and platform for `alltuners` by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables.
The tuners output a JSON-file with the results. The best results need to be added to `src/database/kernels/xxxxx.hpp` in the appropriate section. However, this can be done automatically based on the JSON-data using a Python script in `scripts/database/database.py`. If you want the found parameters to be included in future releases of CLBlast, please attach the JSON files to the corresponding issue on GitHub or [email the main author](http://www.cedricnugteren.nl).
The tuners output a JSON-file with the results. The best results need to be added to `src/database/kernels/xxxxx.hpp` in the appropriate section. However, this can be done automatically based on the JSON-data using a Python (2.7 or 3.x) script in `scripts/database/database.py`. If you want the found parameters to be included in future releases of CLBlast, please attach the JSON files to the corresponding issue on GitHub or [email the main author](http://www.cedricnugteren.nl).
In summary, tuning the entire library for your device can be done as follows (starting from the root of the CLBlast folder):
@@ -163,6 +173,8 @@ In summary, tuning the entire library for your device can be done as follows (st
python ../scripts/database/database.py . ..
make
Alternatively, you can also supply your tuning parameters programmatically through the CLBlast API. This is especially useful if you tune for specific non-standard arguments (e.g. a rectangular or a very small matrix). To do so, you can call the `OverrideParameters` function, which will set new parameters for a specific kernel. At the next call of the target routine, CLBlast will compile a new binary and use it together with the new parameters from then on, until `OverrideParameters` is called again. See the [API documentation](doc/clblast.md#overrideparameters-override-tuning-parameters-auxiliary-function) for more details.
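As an illustration, here is a minimal sketch of such a call through the C++ API, re-using the `Copy` kernel parameters shown in the API documentation. The wrapper function and the chosen values are illustrative only, not actual tuning results for any device:

```
#include <clblast.h>
#include <unordered_map>

// Hypothetical example: overrides the 'Copy' kernel parameters for one device.
// Replace the values below with results found by the tuners for your hardware.
void set_copy_parameters(const cl_device_id device) {
  const auto parameters = std::unordered_map<std::string, size_t>{
      {"COPY_DIMX", 8}, {"COPY_DIMY", 32}, {"COPY_VW", 4}, {"COPY_WPT", 8}};
  const auto status = clblast::OverrideParameters(device, "Copy",
                                                  clblast::Precision::kSingle,
                                                  parameters);
  // status == clblast::StatusCode::kSuccess indicates the parameters were accepted
}
```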
Compiling the correctness tests (optional)
-------------
@@ -187,15 +199,15 @@ All tests can be run directly together in one go through the `make alltests` tar
Compiling the performance tests/clients (optional)
-------------
To test the performance of CLBlast and compare optionally against [clBLAS](http://github.com/clMathLibraries/clBLAS) or a CPU BLAS library (see above for requirements), compile with the clients enabled by specifying `-DCLIENTS=ON`, for example as follows:
To test the performance of CLBlast and compare optionally against [clBLAS](http://github.com/clMathLibraries/clBLAS), cuBLAS (if testing on an NVIDIA GPU and `-DCUBLAS=ON` set), or a CPU BLAS library (see above for requirements), compile with the clients enabled by specifying `-DCLIENTS=ON`, for example as follows:
cmake -DCLIENTS=ON ..
The performance tests come in the form of client executables named `clblast_client_xxxxx`, in which `xxxxx` is the name of a routine (e.g. `xgemm`). These clients take a number of configuration options and directly run CLBlast in a head-to-head performance test, optionally against clBLAS, a CPU BLAS library, and/or cuBLAS. You can use the command-line options `-clblas 1`, `-cblas 1`, or `-cublas 1` to select a library to test against.
The folder `doc/performance` contains some PDF files with performance results on tested devices. Performance is compared in this case against a tuned version of the clBLAS library. These graphs can be generated automatically on your own device. First, compile CLBlast with the clients enabled. Then, make sure your installation of the reference clBLAS is performance-tuned by running the `tune` executable. Finally, run one of the graph-scripts found in `scripts/graphs` using R. For example, to generate the Xgemm PDF on device 1 of platform 0 from the `build` subdirectory:
The folder `doc/performance` contains some PDF files with performance results on tested devices. Performance is compared in this case against a tuned version of the clBLAS library. These graphs can be generated automatically on your own device. First, compile CLBlast with the clients enabled. Then, make sure your installation of the reference clBLAS is performance-tuned by running the `tune` executable (shipped with clBLAS). Finally, run the Python/Matplotlib graph-script found in `scripts/benchmark/benchmark.py`. For example, to generate the SGEMM PDF on device 1 of platform 0 from the `build` subdirectory:
Rscript ../scripts/graphs/xgemm.r 0 1
python ../scripts/benchmark/benchmark.py --platform 0 --device 1 --benchmark gemm
Note that the CLBlast library provides pre-tuned parameter-values for some devices only: if your device is not among these, then out-of-the-box performance might be poor. See above under `Using the tuners` to find out how to tune for your device.
@@ -250,6 +262,7 @@ CLBlast supports almost all the Netlib BLAS routines plus a couple of extra non-
| xSPR | ✔ | ✔ | - | - | ✔ |
| xSYR2 | ✔ | ✔ | - | - | ✔ |
| xSPR2 | ✔ | ✔ | - | - | ✔ |
| xTRSV | ✔ | ✔ | ✔ | ✔ | - | (experimental, un-optimized)
| Level-3 | S | D | C | Z | H |
| ---------|---|---|---|---|---|
@@ -261,6 +274,14 @@ CLBlast supports almost all the Netlib BLAS routines plus a couple of extra non-
| xSYR2K | ✔ | ✔ | ✔ | ✔ | ✔ |
| xHER2K | - | - | ✔ | ✔ | - |
| xTRMM | ✔ | ✔ | ✔ | ✔ | ✔ |
| xTRSM | ✔ | ✔ | ✔ | ✔ | - | (experimental, un-optimized)
Furthermore, there are also batched versions of BLAS routines available, processing multiple smaller computations in one go for better performance:
| Batched | S | D | C | Z | H |
| -------------|---|---|---|---|---|
| xAXPYBATCHED | ✔ | ✔ | ✔ | ✔ | ✔ |
| xGEMMBATCHED | ✔ | ✔ | ✔ | ✔ | ✔ |
In addition, some extra non-BLAS routines are also supported by CLBlast, classified as level-X. They are experimental and should be used with care:
@@ -271,7 +292,7 @@ In addition, some extra non-BLAS routines are also supported by CLBlast, classif
| IxMIN | ✔ | ✔ | ✔ | ✔ | ✔ |
| xOMATCOPY | ✔ | ✔ | ✔ | ✔ | ✔ |
Some less commonly used BLAS routines are not yet supported yet by CLBlast. They are xROTG, xROTMG, xROT, xROTM, xTRSV, xTBSV, xTPSV, and xTRSM.
Some less commonly used BLAS routines are not yet supported by CLBlast. They are xROTG, xROTMG, xROT, xROTM, xTBSV, and xTPSV.
Half precision (fp16)

View File

@@ -0,0 +1,82 @@
# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
# width of 100 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
#
# ==================================================================================================
#
# Defines the following variables:
# CUBLAS_FOUND Boolean holding whether or not the cuBLAS library was found
# CUBLAS_INCLUDE_DIRS The CUDA and cuBLAS include directory
# CUDA_LIBRARIES The CUDA library
# CUBLAS_LIBRARIES The cuBLAS library
#
# In case CUDA is not installed in the default directory, set the CUDA_ROOT variable to point to
# the root of cuBLAS, such that 'cublas_v2.h' can be found in $CUDA_ROOT/include. This can either be
# done using an environmental variable (e.g. export CUDA_ROOT=/path/to/cuBLAS) or using a CMake
# variable (e.g. cmake -DCUDA_ROOT=/path/to/cuBLAS ..).
#
# ==================================================================================================
# Sets the possible install locations
set(CUBLAS_HINTS
${CUDA_ROOT}
$ENV{CUDA_ROOT}
$ENV{CUDA_TOOLKIT_ROOT_DIR}
)
set(CUBLAS_PATHS
/usr
/usr/local
/usr/local/cuda
)
# Finds the include directories
find_path(CUBLAS_INCLUDE_DIRS
NAMES cublas_v2.h cuda.h
HINTS ${CUBLAS_HINTS}
PATH_SUFFIXES include inc include/x86_64 include/x64
PATHS ${CUBLAS_PATHS}
DOC "cuBLAS include header cublas_v2.h"
)
mark_as_advanced(CUBLAS_INCLUDE_DIRS)
# Finds the libraries
find_library(CUDA_LIBRARIES
NAMES cudart
HINTS ${CUBLAS_HINTS}
PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 lib/import lib64/import
PATHS ${CUBLAS_PATHS}
DOC "CUDA library"
)
mark_as_advanced(CUDA_LIBRARIES)
find_library(CUBLAS_LIBRARIES
NAMES cublas
HINTS ${CUBLAS_HINTS}
PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 lib/import lib64/import
PATHS ${CUBLAS_PATHS}
DOC "cuBLAS library"
)
mark_as_advanced(CUBLAS_LIBRARIES)
# ==================================================================================================
# Notification messages
if(NOT CUBLAS_INCLUDE_DIRS)
message(STATUS "Could NOT find 'cuBLAS.h', install CUDA/cuBLAS or set CUDA_ROOT")
endif()
if(NOT CUDA_LIBRARIES)
message(STATUS "Could NOT find CUDA library, install it or set CUDA_ROOT")
endif()
if(NOT CUBLAS_LIBRARIES)
message(STATUS "Could NOT find cuBLAS library, install it or set CUDA_ROOT")
endif()
# Determines whether or not cuBLAS was found
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(cuBLAS DEFAULT_MSG CUBLAS_INCLUDE_DIRS CUDA_LIBRARIES CUBLAS_LIBRARIES)
# ==================================================================================================

View File

@@ -1445,6 +1445,63 @@ Arguments to TPMV:
xTRSV: Solves a triangular system of equations
-------------
C++ API:
```
template <typename T>
StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event)
```
C API:
```
CLBlastStatusCode CLBlastStrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const size_t n,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const size_t n,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastCtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const size_t n,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const size_t n,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event)
```
Arguments to TRSV:
* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
* `const Diagonal diagonal`: The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal.
* `const size_t n`: Integer size argument. This value must be positive.
* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
* `cl_mem x_buffer`: OpenCL buffer to store the output x vector.
* `const size_t x_offset`: The offset in elements from the start of the output x vector.
* `const size_t x_inc`: Stride/increment of the output x vector. This value must be greater than 0.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
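As an illustration, consider this minimal sketch of a single-precision TRSV call through the C++ API. It assumes an existing OpenCL command queue and buffers already filled with the matrix _A_ and the right-hand side _b_; the wrapper function and its name are illustrative only:

```
#include <clblast.h>

// A minimal sketch: solves A*x = b for an n-by-n lower-triangular A stored
// column-major in 'a_buffer'; 'x_buffer' holds b on entry and x on return.
// 'queue' is assumed to be an existing OpenCL command queue.
clblast::StatusCode solve_lower(const size_t n, const cl_mem a_buffer,
                                cl_mem x_buffer, cl_command_queue queue) {
  cl_event event = nullptr;
  const auto status = clblast::Trsv<float>(
      clblast::Layout::kColMajor, clblast::Triangle::kLower,
      clblast::Transpose::kNo, clblast::Diagonal::kNonUnit,
      n,
      a_buffer, 0, n,      // A with leading dimension n
      x_buffer, 0, 1,      // b on input, x on output
      &queue, &event);
  if (status == clblast::StatusCode::kSuccess) {
    clWaitForEvents(1, &event);  // the solution is now in x_buffer
  }
  return status;
}
```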
xGER: General rank-1 matrix update
-------------
@@ -2708,6 +2765,71 @@ Requirements for TRMM:
xTRSM: Solves a triangular system of equations
-------------
Solves the equation _A * X = alpha * B_ for the unknown _m_ by _n_ matrix _X_, in which _A_ is a unit or non-unit triangular matrix (_m_ by _m_ when `side` is left, _n_ by _n_ when `side` is right) and _B_ is an _m_ by _n_ matrix. The matrix _B_ is overwritten by the solution _X_.
C++ API:
```
template <typename T>
StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t m, const size_t n,
const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event)
```
C API:
```
CLBlastStatusCode CLBlastStrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const size_t m, const size_t n,
const float alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const size_t m, const size_t n,
const double alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastCtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const size_t m, const size_t n,
const cl_float2 alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const size_t m, const size_t n,
const cl_double2 alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event)
```
Arguments to TRSM:
* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const Side side`: The position of the triangular matrix in the operation, either on the `Side::kLeft` (141) or `Side::kRight` (142).
* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
* `const Diagonal diagonal`: The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal.
* `const size_t m`: Integer size argument. This value must be positive.
* `const size_t n`: Integer size argument. This value must be positive.
* `const T alpha`: Input scalar constant.
* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
* `cl_mem b_buffer`: OpenCL buffer to store the output B matrix.
* `const size_t b_offset`: The offset in elements from the start of the output B matrix.
* `const size_t b_ld`: Leading dimension of the output B matrix. This value must be greater than 0.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
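Analogous to the TRSV sketch above, here is a hedged example of a single-precision TRSM call solving _A * X = alpha * B_ with _A_ on the left. Again, the buffers and queue are assumed to exist already and the wrapper function is illustrative only:

```
#include <clblast.h>

// A minimal sketch: solves A*X = alpha*B for X, with A an m-by-m lower-triangular
// matrix and B an m-by-n matrix (both column-major). 'b_buffer' holds B on entry
// and the solution X on return.
clblast::StatusCode solve_left_lower(const size_t m, const size_t n,
                                     const cl_mem a_buffer, cl_mem b_buffer,
                                     cl_command_queue queue) {
  cl_event event = nullptr;
  const auto status = clblast::Trsm<float>(
      clblast::Layout::kColMajor, clblast::Side::kLeft,
      clblast::Triangle::kLower, clblast::Transpose::kNo,
      clblast::Diagonal::kNonUnit,
      m, n,
      1.0f,                // alpha
      a_buffer, 0, m,      // A with leading dimension m
      b_buffer, 0, m,      // B, overwritten by X
      &queue, &event);
  if (status == clblast::StatusCode::kSuccess) {
    clWaitForEvents(1, &event);
  }
  return status;
}
```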
xOMATCOPY: Scaling and out-place transpose/copy (non-BLAS function)
-------------
@@ -2781,3 +2903,231 @@ Requirements for OMATCOPY:
xAXPYBATCHED: Batched version of AXPY
-------------
As AXPY, but multiple operations are batched together for better performance.
C++ API:
```
template <typename T>
StatusCode AxpyBatched(const size_t n,
const T *alphas,
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event)
```
C API:
```
CLBlastStatusCode CLBlastSaxpyBatched(const size_t n,
const float *alphas,
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDaxpyBatched(const size_t n,
const double *alphas,
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastCaxpyBatched(const size_t n,
const cl_float2 *alphas,
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZaxpyBatched(const size_t n,
const cl_double2 *alphas,
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastHaxpyBatched(const size_t n,
const cl_half *alphas,
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event)
```
Arguments to AXPYBATCHED:
* `const size_t n`: Integer size argument. This value must be positive.
* `const T *alphas`: Input scalar constants.
* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
* `const size_t *x_offsets`: The offsets in elements from the start of the input x vector.
* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
* `cl_mem y_buffer`: OpenCL buffer to store the output y vector.
* `const size_t *y_offsets`: The offsets in elements from the start of the output y vector.
* `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0.
* `const size_t batch_count`: Number of batches. This value must be positive.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
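To illustrate how the offset arrays select the individual operations, here is a minimal sketch running `batch_count` single-precision AXPY operations on consecutive, non-overlapping slices of shared x and y buffers. The buffers and queue are assumed to exist with sufficient capacity; the wrapper function is illustrative:

```
#include <clblast.h>
#include <vector>

// Performs batch_count operations y = alpha*x + y, each on its own slice of
// the shared x and y buffers, selected through the per-batch offset arrays.
clblast::StatusCode axpy_batched_example(const size_t n, const size_t batch_count,
                                         const cl_mem x_buffer, cl_mem y_buffer,
                                         cl_command_queue queue) {
  auto alphas = std::vector<float>(batch_count, 2.0f);  // one alpha per batch
  auto x_offsets = std::vector<size_t>(batch_count);
  auto y_offsets = std::vector<size_t>(batch_count);
  for (size_t b = 0; b < batch_count; ++b) {
    x_offsets[b] = b * n;  // consecutive, non-overlapping slices
    y_offsets[b] = b * n;
  }
  return clblast::AxpyBatched(n, alphas.data(),
                              x_buffer, x_offsets.data(), 1,
                              y_buffer, y_offsets.data(), 1,
                              batch_count, &queue, nullptr);
}
```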
xGEMMBATCHED: Batched version of GEMM
-------------
As GEMM, but multiple operations are batched together for better performance.
C++ API:
```
template <typename T>
StatusCode GemmBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
const size_t m, const size_t n, const size_t k,
const T *alphas,
const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
const T *betas,
cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
const size_t batch_count,
cl_command_queue* queue, cl_event* event)
```
C API:
```
CLBlastStatusCode CLBlastSgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
const size_t m, const size_t n, const size_t k,
const float *alphas,
const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
const float *betas,
cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
const size_t batch_count,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastDgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
const size_t m, const size_t n, const size_t k,
const double *alphas,
const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
const double *betas,
cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
const size_t batch_count,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastCgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
const size_t m, const size_t n, const size_t k,
const cl_float2 *alphas,
const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
const cl_float2 *betas,
cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
const size_t batch_count,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastZgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
const size_t m, const size_t n, const size_t k,
const cl_double2 *alphas,
const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
const cl_double2 *betas,
cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
const size_t batch_count,
cl_command_queue* queue, cl_event* event)
CLBlastStatusCode CLBlastHgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
const size_t m, const size_t n, const size_t k,
const cl_half *alphas,
const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
const cl_half *betas,
cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
const size_t batch_count,
cl_command_queue* queue, cl_event* event)
```
Arguments to GEMMBATCHED:
* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
* `const Transpose b_transpose`: Transposing the input matrix B, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
* `const size_t m`: Integer size argument. This value must be positive.
* `const size_t n`: Integer size argument. This value must be positive.
* `const size_t k`: Integer size argument. This value must be positive.
* `const T *alphas`: Input scalar constants.
* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
* `const size_t *a_offsets`: The offsets in elements from the start of the input A matrix.
* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
* `const cl_mem b_buffer`: OpenCL buffer to store the input B matrix.
* `const size_t *b_offsets`: The offsets in elements from the start of the input B matrix.
* `const size_t b_ld`: Leading dimension of the input B matrix. This value must be greater than 0.
* `const T *betas`: Input scalar constants.
* `cl_mem c_buffer`: OpenCL buffer to store the output C matrix.
* `const size_t *c_offsets`: The offsets in elements from the start of the output C matrix.
* `const size_t c_ld`: Leading dimension of the output C matrix. This value must be greater than 0.
* `const size_t batch_count`: Number of batches. This value must be positive.
* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
Requirements for GEMMBATCHED:
* When `transpose_a == Transpose::kNo`, then `a_ld` must be at least `m`, otherwise `a_ld` must be at least `k`.
* When `transpose_b == Transpose::kNo`, then `b_ld` must be at least `k`, otherwise `b_ld` must be at least `n`.
* The value of `c_ld` must be at least `m`.
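Putting these requirements together, here is a minimal sketch of a single-precision GEMMBATCHED call on matrices stored back-to-back in shared buffers. With column-major storage and no transposes, `a_ld = m`, `b_ld = k` and `c_ld = m` satisfy the requirements above. The buffers and queue are assumed to exist with sufficient capacity:

```
#include <clblast.h>
#include <vector>

// Multiplies batch_count pairs of m-by-k and n-by-k matrices, each batch
// addressing its own matrix within the shared buffers via the offset arrays.
clblast::StatusCode gemm_batched_example(
    const size_t m, const size_t n, const size_t k, const size_t batch_count,
    const cl_mem a_buffer, const cl_mem b_buffer, cl_mem c_buffer,
    cl_command_queue queue) {
  auto alphas = std::vector<float>(batch_count, 1.0f);
  auto betas = std::vector<float>(batch_count, 0.0f);
  auto a_offsets = std::vector<size_t>(batch_count);
  auto b_offsets = std::vector<size_t>(batch_count);
  auto c_offsets = std::vector<size_t>(batch_count);
  for (size_t b = 0; b < batch_count; ++b) {
    a_offsets[b] = b * m * k;  // each batch starts after the previous matrix
    b_offsets[b] = b * k * n;
    c_offsets[b] = b * m * n;
  }
  return clblast::GemmBatched(clblast::Layout::kColMajor,
                              clblast::Transpose::kNo, clblast::Transpose::kNo,
                              m, n, k,
                              alphas.data(),
                              a_buffer, a_offsets.data(), m,
                              b_buffer, b_offsets.data(), k,
                              betas.data(),
                              c_buffer, c_offsets.data(), m,
                              batch_count, &queue, nullptr);
}
```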
ClearCache: Resets the cache of compiled binaries (auxiliary function)
-------------
CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on for the same device. This cache can be cleared to free up system memory, which can also be useful when debugging.
C++ API:
```
StatusCode ClearCache()
```
C API:
```
CLBlastStatusCode CLBlastClearCache()
```
FillCache: Populates the cache of compiled binaries for a specific device (auxiliary function)
-------------
CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on for the same device. This cache is automatically populated whenever a new binary is created. Thus, the first run of a specific kernel could take extra time. For debugging or performance evaluation purposes, it might be useful to populate the cache upfront. This function populates the cache for all kernels in CLBlast for all precisions, but for a specific device only.
C++ API:
```
StatusCode FillCache(const cl_device_id device)
```
C API:
```
CLBlastStatusCode CLBlastFillCache(const cl_device_id device)
```
Arguments to FillCache:
* `const cl_device_id device`: The OpenCL device to fill the cache for.
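For example, here is a minimal sketch that warms up the cache for the first device of the first OpenCL platform (error checking omitted for brevity; the function name is illustrative):

```
#include <clblast.h>

// Pre-compiles all CLBlast kernels for one device, so that later routine
// calls skip the compilation step.
void warm_up_first_device() {
  cl_platform_id platform = nullptr;
  cl_device_id device = nullptr;
  clGetPlatformIDs(1, &platform, nullptr);
  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, nullptr);
  clblast::FillCache(device);
}
```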
OverrideParameters: Override tuning parameters (auxiliary function)
-------------
This function overrides tuning parameters for a specific device-precision-kernel combination. The next time the target routine is called, it will be re-compiled and will use the new parameters. All subsequent calls (until `OverrideParameters` is called again) will load the kernel from the cache and thus continue to use the new parameters. Note that the first call after `OverrideParameters` may show a performance drop due to the re-compilation of the kernel.
C++ API:
```
StatusCode OverrideParameters(const cl_device_id device, const std::string &kernel_name,
const Precision precision,
const std::unordered_map<std::string,size_t> &parameters)
```
C API:
```
CLBlastStatusCode CLBlastOverrideParameters(const cl_device_id device, const char* kernel_name,
const CLBlastPrecision precision, const size_t num_parameters,
const char** parameters_names, const size_t* parameters_values)
```
Arguments to OverrideParameters (C++ version):
* `const cl_device_id device`: The OpenCL device to set the new parameters for.
* `const std::string &kernel_name`: The target kernel name. This has to be one of the existing CLBlast kernels (Xaxpy, Xdot, Xgemv, XgemvFast, XgemvFastRot, Xger, Copy, Pad, Transpose, Padtranspose, Xgemm, or XgemmDirect). If this argument is incorrect, this function will return with the `clblast::kInvalidOverrideKernel` status-code.
* `const Precision precision`: The CLBlast precision enum to set the new parameters for.
* `const std::unordered_map<std::string,size_t> &parameters`: An unordered map of strings to integers. This has to contain all the tuning parameters for a specific kernel as reported by the included tuners (e.g. `{ {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",8} }` for the `Copy` kernel). If this argument is incorrect, this function will return with the `clblast::kMissingOverrideParameter` status-code.

View File

@@ -17,6 +17,8 @@
#define CLBLAST_CLBLAST_H_
#include <cstdlib> // For size_t
#include <string> // For OverrideParameters function
#include <unordered_map> // For OverrideParameters function
// Includes the normal OpenCL C header
#if defined(__APPLE__) || defined(__MACOSX)
@@ -95,6 +97,9 @@ enum class StatusCode {
kInsufficientMemoryY = -1007, // Vector Y's OpenCL buffer is too small
// Custom additional status codes for CLBlast
kInvalidBatchCount = -2049, // The batch count needs to be positive
kInvalidOverrideKernel = -2048, // Trying to override parameters for an invalid kernel
kMissingOverrideParameter = -2047, // Missing override parameter(s) for the target kernel
kInvalidLocalMemUsage = -2046, // Not enough local memory available on this device
kNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device
kNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device
@@ -114,7 +119,7 @@ enum class Side { kLeft = 141, kRight = 142 };
// Precision scoped enum (values in bits)
enum class Precision { kHalf = 16, kSingle = 32, kDouble = 64,
kComplexSingle = 3232, kComplexDouble = 6464 };
kComplexSingle = 3232, kComplexDouble = 6464, kAny = -1 };
// =================================================================================================
// BLAS level-1 (vector-vector) routines
@@ -583,7 +588,7 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, c
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event = nullptr);
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
template <typename T>
StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t m, const size_t n,
@@ -605,6 +610,27 @@ StatusCode Omatcopy(const Layout layout, const Transpose a_transpose,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event = nullptr);
// Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED
template <typename T>
StatusCode AxpyBatched(const size_t n,
const T *alphas,
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event = nullptr);
// Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED
template <typename T>
StatusCode GemmBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
const size_t m, const size_t n, const size_t k,
const T *alphas,
const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
const T *betas,
cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
const size_t batch_count,
cl_command_queue* queue, cl_event* event = nullptr);
// =================================================================================================
// CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on
@@ -617,6 +643,14 @@ StatusCode PUBLIC_API FillCache(const cl_device_id device);
// =================================================================================================
// Overrides tuning parameters for a specific device-precision-kernel combination. The next time
// the target routine is called it will re-compile and use the new parameters from then on.
StatusCode PUBLIC_API OverrideParameters(const cl_device_id device, const std::string &kernel_name,
const Precision precision,
const std::unordered_map<std::string,size_t> &parameters);
// =================================================================================================
} // namespace clblast
// CLBLAST_CLBLAST_H_

View File

@@ -96,6 +96,9 @@ typedef enum CLBlastStatusCode_ {
CLBlastInsufficientMemoryY = -1007, // Vector Y's OpenCL buffer is too small
// Custom additional status codes for CLBlast
CLBlastInvalidBatchCount = -2049, // The batch count needs to be positive
CLBlastInvalidOverrideKernel = -2048, // Trying to override parameters for an invalid kernel
CLBlastMissingOverrideParameter = -2047, // Missing override parameter(s) for the target kernel
CLBlastInvalidLocalMemUsage = -2046, // Not enough local memory available on this device
CLBlastNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device
CLBlastNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device
@@ -117,6 +120,11 @@ typedef enum CLBlastDiagonal_ { CLBlastDiagonalNonUnit = 131,
CLBlastDiagonalUnit = 132 } CLBlastDiagonal;
typedef enum CLBlastSide_ { CLBlastSideLeft = 141, CLBlastSideRight = 142 } CLBlastSide;
// Precision enum (values in bits)
typedef enum CLBlastPrecision_ { CLBlastPrecisionHalf = 16, CLBlastPrecisionSingle = 32,
CLBlastPrecisionDouble = 64, CLBlastPrecisionComplexSingle = 3232,
CLBlastPrecisionComplexDouble = 6464 } CLBlastPrecision;
// =================================================================================================
// BLAS level-1 (vector-vector) routines
// =================================================================================================
@@ -1258,7 +1266,7 @@ CLBlastStatusCode PUBLIC_API CLBlastHtrmm(const CLBlastLayout layout, const CLBl
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event);
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
CLBlastStatusCode PUBLIC_API CLBlastStrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const size_t m, const size_t n,
const float alpha,
@@ -1283,12 +1291,6 @@ CLBlastStatusCode PUBLIC_API CLBlastZtrsm(const CLBlastLayout layout, const CLBl
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const size_t m, const size_t n,
const cl_half alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event);
// =================================================================================================
// Extra non-BLAS routines (level-X)
@@ -1326,6 +1328,85 @@ CLBlastStatusCode PUBLIC_API CLBlastHomatcopy(const CLBlastLayout layout, const
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event);
// Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED
CLBlastStatusCode PUBLIC_API CLBlastSaxpyBatched(const size_t n,
const float *alphas,
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDaxpyBatched(const size_t n,
const double *alphas,
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastCaxpyBatched(const size_t n,
const cl_float2 *alphas,
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZaxpyBatched(const size_t n,
const cl_double2 *alphas,
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHaxpyBatched(const size_t n,
const cl_half *alphas,
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event);
// Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED
CLBlastStatusCode PUBLIC_API CLBlastSgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
const size_t m, const size_t n, const size_t k,
const float *alphas,
const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
const float *betas,
cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
const size_t batch_count,
cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastDgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
const size_t m, const size_t n, const size_t k,
const double *alphas,
const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
const double *betas,
cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
const size_t batch_count,
cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastCgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
const size_t m, const size_t n, const size_t k,
const cl_float2 *alphas,
const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
const cl_float2 *betas,
cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
const size_t batch_count,
cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastZgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
const size_t m, const size_t n, const size_t k,
const cl_double2 *alphas,
const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
const cl_double2 *betas,
cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
const size_t batch_count,
cl_command_queue* queue, cl_event* event);
CLBlastStatusCode PUBLIC_API CLBlastHgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
const size_t m, const size_t n, const size_t k,
const cl_half *alphas,
const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
const cl_half *betas,
cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
const size_t batch_count,
cl_command_queue* queue, cl_event* event);
// =================================================================================================
// CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on
@@ -1338,6 +1419,14 @@ CLBlastStatusCode PUBLIC_API CLBlastFillCache(const cl_device_id device);
// =================================================================================================
// Overrides tuning parameters for a specific device-precision-kernel combination. The next time
// the target routine is called it will re-compile and use the new parameters from then on.
CLBlastStatusCode PUBLIC_API CLBlastOverrideParameters(const cl_device_id device, const char* kernel_name,
const CLBlastPrecision precision, const size_t num_parameters,
const char** parameters_names, const size_t* parameters_values);
// =================================================================================================
#ifdef __cplusplus
} // extern "C"
#endif

View File

@@ -32,9 +32,8 @@
// =================================================================================================
// Host data-type for half-precision floating-point (16-bit). This is based on the OpenCL type,
// which is a typedef for unsigned short.
typedef cl_half half;
// The host data-type for half-precision floating-point (16-bit) is based on the `cl_half` OpenCL
// type, which is a typedef for unsigned short.
// 32-bit union for conversions
typedef union ConversionBits_ {
@@ -47,7 +46,7 @@ typedef union ConversionBits_ {
// Converts an IEEE-compliant single-precision value to half-precision floating-point. This function
// applies simple truncation (round toward zero, but with overflows set to infinity) as rounding
// mode.
inline half FloatToHalf(const float value) {
inline cl_half FloatToHalf(const float value) {
static const unsigned short base_table[512] = {
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
@@ -108,7 +107,7 @@ inline half FloatToHalf(const float value) {
}
// Converts a half-precision value to IEEE-compliant single-precision floating-point
inline float HalfToFloat(const half value) {
inline float HalfToFloat(const cl_half value) {
static const unsigned int mantissa_table[2048] = {
0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34A00000, 0x34C00000, 0x34E00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, 0x35400000, 0x35500000, 0x35600000, 0x35700000,
0x35800000, 0x35880000, 0x35900000, 0x35980000, 0x35A00000, 0x35A80000, 0x35B00000, 0x35B80000, 0x35C00000, 0x35C80000, 0x35D00000, 0x35D80000, 0x35E00000, 0x35E80000, 0x35F00000, 0x35F80000,

View File

@@ -862,7 +862,7 @@ void PUBLIC_API cblas_ztrmm(const CLBlastLayout layout, const CLBlastSide side,
const void* a, const int a_ld,
void* b, const int b_ld);
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
void PUBLIC_API cblas_strsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int m, const int n,
const float alpha,

View File

@@ -20,6 +20,8 @@
#include <string.h>
#include <time.h>
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings
// Includes the CLBlast library (C interface)
#include <clblast_c.h>

View File

@@ -19,6 +19,8 @@
#include <stdio.h>
#include <string.h>
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings
// Includes the CLBlast library (C interface)
#include <clblast_c.h>

View File

@@ -18,6 +18,8 @@
#include <stdio.h>
#include <string.h>
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings
// Includes the CLBlast library (C interface)
#include <clblast_c.h>

View File

@@ -19,6 +19,8 @@
#include <stdio.h>
#include <string.h>
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings
// Includes the CLBlast library (C interface)
#include <clblast_c.h>

View File

@@ -19,6 +19,8 @@
#include <stdio.h>
#include <string.h>
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings
// Includes the CLBlast library (C interface)
#include <clblast_c.h>

View File

@@ -20,6 +20,9 @@
#include <chrono>
#include <vector>
#define CL_USE_DEPRECATED_OPENCL_1_1_APIS // to disable deprecation warnings
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings
// Includes the C++ OpenCL API. If not yet available, it can be found here:
// https://www.khronos.org/registry/OpenCL/api/2.1/cl.hpp
#include "cl.hpp"
@@ -103,7 +106,7 @@ int main() {
auto time_ms = std::chrono::duration<double,std::milli>(elapsed_time).count();
// Example completed. See "clblast.h" for status codes (0 -> success).
printf("Completed SGEMM in %.3lf ms with status %d\n", time_ms, status);
printf("Completed SGEMM in %.3lf ms with status %d\n", time_ms, static_cast<int>(status));
return 0;
}

View File

@@ -0,0 +1,151 @@
#!/usr/bin/env python
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the
# PEP8 Python style guide and uses a max-width of 120 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
import argparse
import json
import os
import sys
import settings
import plot
import utils
EXPERIMENTS = {
    "axpy": settings.AXPY,
    "axpybatched": settings.AXPYBATCHED,
    "gemv": settings.GEMV,
    "gemm": settings.GEMM,
    "gemm_small": settings.GEMM_SMALL,
    "gemmbatched": settings.GEMMBATCHED,
    "symm": settings.SYMM,
    "syrk": settings.SYRK,
    "summary": settings.SUMMARY,
}
def run_benchmark(name, arguments_list, precision, num_runs, platform, device):
binary = "./clblast_client_x" + name
# Loops over sub-benchmarks per benchmark
results = []
for arguments in arguments_list:
# Sets the arguments
constant_arguments = ["-warm_up", "-q", "-no_abbrv", "-cblas 0", "-cublas 0"]
common_arguments = ["-precision %d" % precision, "-runs %d" % num_runs]
opencl_arguments = ["-platform %d" % platform, "-device %d" % device]
all_arguments = opencl_arguments + common_arguments + constant_arguments
for name, value in arguments.items():
all_arguments.append("-" + name + " " + str(value))
# Calls the binary and parses the results
benchmark_output = utils.run_binary(binary, all_arguments)
result = utils.parse_results(benchmark_output)
# For half-precision: also runs single-precision for comparison
if precision == 16:
all_arguments = [arg if arg != "-precision 16" else "-precision 32" for arg in all_arguments]
benchmark_output = utils.run_binary(binary, all_arguments)
result_extra = utils.parse_results(benchmark_output)
for index in range(min(len(result), len(result_extra))):
result[index]["GBs_1_FP32"] = result_extra[index]["GBs_1"]
result[index]["GBs_2"] = result_extra[index]["GBs_2"]
result[index]["GFLOPS_1_FP32"] = result_extra[index]["GFLOPS_1"]
result[index]["GFLOPS_2"] = result_extra[index]["GFLOPS_2"]
results.extend(result)
return results
def parse_arguments(argv):
parser = argparse.ArgumentParser(description="Runs a full benchmark for a specific routine on a specific device")
parser.add_argument("-b", "--benchmark", required=True, help="The benchmark to perform (choose from %s)" % sorted(EXPERIMENTS.keys()))
parser.add_argument("-p", "--platform", required=True, type=int, help="The ID of the OpenCL platform to test on")
parser.add_argument("-d", "--device", required=True, type=int, help="The ID of the OpenCL device to test on")
parser.add_argument("-n", "--num_runs", type=int, default=None, help="Overrides the default number of benchmark repeats for averaging")
parser.add_argument("-x", "--precision", type=int, default=32, help="The precision to test for (choose from 16, 32, 64, 3232, 6464")
parser.add_argument("-l", "--load_from_disk", action="store_true", help="Increase verbosity of the script")
parser.add_argument("-t", "--plot_title", default="", help="The title for the plots, defaults to benchmark name")
parser.add_argument("-z", "--tight_plot", action="store_true", help="Enables tight plot layout for in paper or presentation")
parser.add_argument("-o", "--output_folder", default=os.getcwd(), help="Sets the folder for output plots (defaults to current folder)")
parser.add_argument("-v", "--verbose", action="store_true", help="Increase verbosity of the script")
cl_args = parser.parse_args(argv)
return vars(cl_args)
def benchmark_single(benchmark, platform, device, num_runs, precision, load_from_disk,
plot_title, tight_plot, output_folder, verbose):
# Sanity check
if not os.path.isdir(output_folder):
print("[benchmark] Error: folder '%s' doesn't exist" % output_folder)
return
# The benchmark name and plot title
benchmark_name = utils.precision_to_letter(precision) + benchmark.upper()
if benchmark.upper() != "SUMMARY":
plot_title = benchmark_name if not plot_title else benchmark_name + ": " + plot_title
# Retrieves the benchmark settings
if benchmark not in EXPERIMENTS.keys():
print("[benchmark] Invalid benchmark '%s', choose from %s" % (benchmark, EXPERIMENTS.keys()))
return
experiment = EXPERIMENTS[benchmark]
benchmarks = experiment["benchmarks"]
# Either run the benchmarks for this experiment or load old results from disk
json_file_name = os.path.join(output_folder, benchmark_name.lower() + "_benchmarks.json")
if load_from_disk and os.path.isfile(json_file_name):
print("[benchmark] Loading previous benchmark results from '" + json_file_name + "'")
with open(json_file_name) as f:
results = json.load(f)
else:
# Runs all the individual benchmarks
print("[benchmark] Running on platform %d, device %d" % (platform, device))
print("[benchmark] Running %d benchmarks for settings '%s'" % (len(benchmarks), benchmark))
results = {"label_names": experiment["label_names"], "num_rows": experiment["num_rows"],
"num_cols": experiment["num_cols"], "benchmarks": []}
for bench in benchmarks:
num_runs_benchmark = bench["num_runs"] if num_runs is None else num_runs
print("[benchmark] Running benchmark '%s:%s'" % (bench["name"], bench["title"]))
result = run_benchmark(bench["name"], bench["arguments"], precision, num_runs_benchmark,
platform, device)
results["benchmarks"].append(result)
# Stores the results to disk
print("[benchmark] Saving benchmark results to '" + json_file_name + "'")
with open(json_file_name, "wb") as f:
json.dump(results, f, sort_keys=True, indent=4)
# Retrieves the data from the benchmark settings
file_name_suffix = "_tight" if tight_plot else ""
pdf_file_name = os.path.join(output_folder, benchmark_name.lower() + "_plot" + file_name_suffix + ".pdf")
titles = [utils.precision_to_letter(precision) + b["name"].upper() + " " + b["title"] for b in benchmarks]
x_keys = [b["x_keys"] for b in benchmarks]
y_keys = [b["y_keys"] for b in benchmarks]
x_labels = [b["x_label"] for b in benchmarks]
y_labels = [b["y_label"] for b in benchmarks]
label_names = results["label_names"]
# For half-precision: also adds single-precision results for comparison
if precision == 16:
label_names = ["CLBlast FP16", "clBLAS FP32", "CLBlast FP32"]
y_keys = [y_key + [y_key[0] + "_FP32"] for y_key in y_keys]
# Plots the graphs
plot.plot_graphs(results["benchmarks"], pdf_file_name, results["num_rows"], results["num_cols"],
x_keys, y_keys, titles, x_labels, y_labels,
label_names, plot_title, tight_plot, verbose)
print("[benchmark] All done")
if __name__ == '__main__':
parsed_arguments = parse_arguments(sys.argv[1:])
benchmark_single(**parsed_arguments)
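For reference, benchmark.py is normally driven through its command line, but benchmark_single can also be called directly from Python; a minimal sketch (platform/device IDs and the output folder are placeholders):

from benchmark import benchmark_single

# Equivalent to: python benchmark.py -b axpy -p 0 -d 0
benchmark_single(benchmark="axpy", platform=0, device=0, num_runs=None,
                 precision=32, load_from_disk=False, plot_title="",
                 tight_plot=False, output_folder=".", verbose=False)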

View File

@ -0,0 +1,44 @@
#!/usr/bin/env python
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the
# PEP8 Python style guide and uses a max-width of 120 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
import argparse
import os
import sys
from benchmark import benchmark_single
BENCHMARKS = ["axpy", "gemv", "gemm", "summary", "axpybatched", "gemmbatched"]
def parse_arguments(argv):
parser = argparse.ArgumentParser(description="Runs all (main) benchmarks in one go for a given device")
parser.add_argument("-p", "--platform", required=True, type=int, help="The ID of the OpenCL platform to test on")
parser.add_argument("-d", "--device", required=True, type=int, help="The ID of the OpenCL device to test on")
parser.add_argument("-x", "--precision", type=int, default=32, help="The precision to test for (choose from 16, 32, 64, 3232, 6464")
parser.add_argument("-l", "--load_from_disk", action="store_true", help="Increase verbosity of the script")
parser.add_argument("-t", "--plot_title", default=None, help="The title for the plots, defaults to benchmark name")
parser.add_argument("-o", "--output_folder", default=os.getcwd(), help="Sets the folder for output plots (defaults to current folder)")
parser.add_argument("-v", "--verbose", action="store_true", help="Increase verbosity of the script")
cl_args = parser.parse_args(argv)
return vars(cl_args)
def benchmark_all(platform, device, precision, load_from_disk,
plot_title, output_folder, verbose):
for bench in BENCHMARKS:
from_disk = load_from_disk
for tight_plot in [True, False]: # two plots for a single benchmark
benchmark_single(bench, platform, device, None, precision, from_disk,
plot_title, tight_plot, output_folder, verbose)
from_disk = True # for the next plot of the same data
if __name__ == '__main__':
parsed_arguments = parse_arguments(sys.argv[1:])
benchmark_all(**parsed_arguments)
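Note the design of the loop above: each benchmark is measured only once, after which from_disk is forced to True so the second (regular-layout) plot re-uses the stored JSON. Called directly, the equivalent of `python benchmark_all.py -p 0 -d 0` would be (IDs are placeholders):

from benchmark_all import benchmark_all

benchmark_all(platform=0, device=0, precision=32, load_from_disk=False,
              plot_title=None, output_folder=".", verbose=False)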

View File

@ -0,0 +1,118 @@
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the
# PEP8 Python style guide and uses a max-width of 120 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
import utils
import matplotlib
matplotlib.use('Agg')
from matplotlib import rcParams
import matplotlib.pyplot as plt
# Colors
BLUEISH = [c / 255.0 for c in [71, 101, 177]] # #4765b1
REDISH = [c / 255.0 for c in [214, 117, 104]] # #d67568
PURPLISH = [c / 255.0 for c in [85, 0, 119]] # #550077
COLORS = [BLUEISH, REDISH, PURPLISH]
MARKERS = ["o-", "x-", ".-"]
def plot_graphs(results, file_name, num_rows, num_cols,
x_keys, y_keys, titles, x_labels, y_labels,
label_names, title, tight_plot, verbose):
assert len(results) == num_rows * num_cols
assert len(results) != 1
assert len(x_keys) == len(results)
assert len(y_keys) == len(results)
assert len(titles) == len(results)
assert len(x_labels) == len(results)
assert len(y_labels) == len(results)
# Tight plot (for in a paper or presentation) or regular (for display on a screen)
if tight_plot:
plot_size = 5
w_space = 0.20
h_space = 0.39
title_from_top = 0.11
legend_from_top = 0.17
legend_from_top_per_item = 0.04
x_label_from_bottom = 0.09
legend_spacing = 0.0
font_size = 15
font_size_legend = 13
font_size_title = font_size
bounding_box = "tight"
else:
plot_size = 8
w_space = 0.15
h_space = 0.22
title_from_top = 0.09
legend_from_top = 0.10
legend_from_top_per_item = 0.07
x_label_from_bottom = 0.06
legend_spacing = 0.8
font_size = 15
font_size_legend = font_size
font_size_title = 18
bounding_box = None # means not 'tight'
# Initializes the plot
size_x = plot_size * num_cols
size_y = plot_size * num_rows
fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(size_x, size_y), facecolor='w', edgecolor='k')
fig.text(.5, 0.92, title, horizontalalignment="center", fontsize=font_size_title)
plt.subplots_adjust(wspace=w_space, hspace=h_space)
rcParams.update({'font.size': font_size})
# Loops over each subplot
for row in range(num_rows):
for col in range(num_cols):
index = row * num_cols + col
result = results[index]
ax = axes.flat[index]
plt.sca(ax)
print("[plot] Plotting subplot %d" % index)
# Sets the x-axis labels
x_list = [[r[x_key] for r in result] for x_key in x_keys[index]]
x_ticks = [",".join([utils.float_to_kilo_mega(v) for v in values]) for values in zip(*x_list)]
x_location = range(len(x_ticks))
# Optional sparsifying of the labels on the x-axis
if tight_plot and len(x_location) > 10:
x_ticks = [v if not (i % 2) else "" for i, v in enumerate(x_ticks)]
# Sets the y-data
y_list = [[r[y_key] for r in result] for y_key in y_keys[index]]
y_max = max([max(y) for y in y_list])
# Sets the axes
y_rounding = 10 if y_max < 80 else 50 if y_max < 400 else 200
y_axis_limit = (y_max * 1.2) - ((y_max * 1.2) % y_rounding) + y_rounding
plt.ylim(ymin=0, ymax=y_axis_limit)
plt.xticks(x_location, x_ticks, rotation='vertical')
# Sets the labels
ax.set_title(titles[index], y=1.0 - title_from_top, fontsize=font_size)
if col == 0 or y_labels[index] != y_labels[index - 1]:
ax.set_ylabel(y_labels[index])
ax.set_xlabel(x_labels[index])
ax.xaxis.set_label_coords(0.5, x_label_from_bottom)
# Plots the graph
assert len(COLORS) >= len(y_keys[index])
assert len(MARKERS) >= len(y_keys[index])
assert len(label_names) == len(y_keys[index])
for i in range(len(y_keys[index])):
ax.plot(x_location, y_list[i], MARKERS[i], label=label_names[i], color=COLORS[i])
# Sets the legend
leg = ax.legend(loc=(0.02, 1.0 - legend_from_top - legend_from_top_per_item * len(y_keys[index])),
handletextpad=0.1, labelspacing=legend_spacing, fontsize=font_size_legend)
leg.draw_frame(False)
# Saves the plot to disk
print("[benchmark] Saving plot to '" + file_name + "'")
fig.savefig(file_name, bbox_inches=bounding_box)
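The y-axis limit above adds 20% headroom and rounds up to a magnitude-dependent multiple; a worked example with an assumed peak value:

y_max = 70                                                     # illustrative data peak
y_rounding = 10 if y_max < 80 else 50 if y_max < 400 else 200  # -> 10
y_axis_limit = (y_max * 1.2) - ((y_max * 1.2) % y_rounding) + y_rounding
print(y_axis_limit)                                            # 90.0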

View File

@ -0,0 +1,381 @@
#!/usr/bin/env python
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the
# PEP8 Python style guide and uses a max-width of 120 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
import utils
AXPY = {
"label_names": ["CLBlast", "clBLAS"],
"num_rows": 2, "num_cols": 3,
"benchmarks": [
{
"name": "axpy", "num_runs": 40,
"title": "multiples of 256K",
"x_label": "sizes (n)", "x_keys": ["n"],
"y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
"arguments": [{"n": utils.k(256), "incx": 1, "incy": 1, "step": utils.k(256), "num_steps": 16}],
},
{
"name": "axpy", "num_runs": 40,
"title": "multiples of 256K+1",
"x_label": "sizes (n)", "x_keys": ["n"],
"y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
"arguments": [{"n": utils.k(256) + 1, "incx": 1, "incy": 1, "step": utils.k(256) + 1, "num_steps": 16}],
},
{
"name": "axpy", "num_runs": 40,
"title": "around 1M",
"x_label": "sizes (n)", "x_keys": ["n"],
"y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
"arguments": [{"n": utils.m(1), "incx": 1, "incy": 1, "step": 1, "num_steps": 16}],
},
{
"name": "axpy", "num_runs": 20,
"title": "around 16M",
"x_label": "sizes (n)", "x_keys": ["n"],
"y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
"arguments": [{"n": utils.m(16), "incx": 1, "incy": 1, "step": 1, "num_steps": 16}],
},
{
"name": "axpy", "num_runs": 20,
"title": "strides n=8M",
"x_label": "increments for x,y", "x_keys": ["incx", "incy"],
"y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
"arguments": [{"n": utils.m(8), "incx": inc_x, "incy": inc_y, "step": 0, "num_steps": 1}
for inc_x in [1, 2, 4] for inc_y in [1, 2, 4]],
},
{
"name": "axpy", "num_runs": 40,
"title": "powers of 2",
"x_label": "sizes (n)", "x_keys": ["n"],
"y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
"arguments": [{"n": n, "incx": 1, "incy": 1, "step": 0, "num_steps": 1}
for n in utils.powers_of_2(utils.k(32), utils.m(64))],
}
]
}
AXPYBATCHED = {
"label_names": ["CLBlast", "clBLAS (non batched)"],
"num_rows": 1, "num_cols": 3,
"benchmarks": [
{
"name": "axpybatched", "num_runs": 30,
"title": "8 batches",
"x_label": "sizes (n)", "x_keys": ["n"],
"y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
"arguments": [{"batch_num": 8, "n": n, "incx": 1, "incy": 1, "step": 0, "num_steps": 1}
for n in utils.powers_of_2(utils.k(8), utils.m(4))],
},
{
"name": "axpybatched", "num_runs": 20,
"title": "64 batches",
"x_label": "sizes (n)", "x_keys": ["n"],
"y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
"arguments": [{"batch_num": 64, "n": n, "incx": 1, "incy": 1, "step": 0, "num_steps": 1}
for n in utils.powers_of_2(utils.k(8), utils.m(4))],
},
{
"name": "axpybatched", "num_runs": 40,
"title": "n=512K",
"x_label": "number of batches", "x_keys": ["batch_num"],
"y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
"arguments": [{"batch_num": b, "n": utils.k(512), "incx": 1, "incy": 1, "step": 1, "num_steps": 1}
for b in utils.powers_of_2(1, 512)],
}
]
}
GEMV = {
"label_names": ["CLBlast", "clBLAS"],
"num_rows": 2, "num_cols": 3,
"benchmarks": [
{
"name": "gemv", "num_runs": 40,
"title": "multiples of 256",
"x_label": "sizes (n=m)", "x_keys": ["n"],
"y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
"arguments": [{"n": 256, "m": 256, "incx": 1, "incy": 1, "layout": 102, "step": 256, "num_steps": 20}],
},
{
"name": "gemv", "num_runs": 40,
"title": "multiples of 257",
"x_label": "sizes (n=m)", "x_keys": ["n"],
"y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
"arguments": [{"n": 257, "m": 257, "incx": 1, "incy": 1, "layout": 102, "step": 257, "num_steps": 20}],
},
{
"name": "gemv", "num_runs": 20,
"title": "around 4K",
"x_label": "sizes (n=m)", "x_keys": ["n"],
"y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
"arguments": [{"n": 4096, "m": 4096, "incx": 1, "incy": 1, "layout": 102, "step": 1, "num_steps": 16}],
},
{
"name": "gemv", "num_runs": 40,
"title": "multiples of 256 rotated",
"x_label": "sizes (n=m)", "x_keys": ["n"],
"y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
"arguments": [{"n": 256, "m": 256, "incx": 1, "incy": 1, "layout": 101, "step": 256, "num_steps": 20}],
},
{
"name": "gemv", "num_runs": 40,
"title": "multiples of 257 rotated",
"x_label": "sizes (n=m)", "x_keys": ["n"],
"y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
"arguments": [{"n": 257, "m": 257, "incx": 1, "incy": 1, "layout": 101, "step": 257, "num_steps": 20}],
},
{
"name": "gemv", "num_runs": 20,
"title": "strides n=m=4K",
"x_label": "increments/strides for x,y", "x_keys": ["incx", "incy"],
"y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
"arguments": [{"n": 4096, "m": 4096, "incx": inc_x, "incy": inc_y, "layout": 102, "step": 0, "num_steps": 1}
for inc_x in [1, 2, 4] for inc_y in [1, 2, 4]],
}
]
}
GEMM = {
"label_names": ["CLBlast", "clBLAS"],
"num_rows": 2, "num_cols": 3,
"benchmarks": [
{
"name": "gemm", "num_runs": 20,
"title": "multiples of 128",
"x_label": "sizes (m=n=k)", "x_keys": ["m"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"m": 128, "n": 128, "k": 128, "layout": 102,
"transA": 111, "transB": 111, "step": 128, "num_steps": 20}],
},
{
"name": "gemm", "num_runs": 20,
"title": "multiples of 129",
"x_label": "sizes (m=n=k)", "x_keys": ["m"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"m": 129, "n": 129, "k": 129, "layout": 102,
"transA": 111, "transB": 111, "step": 129, "num_steps": 20}],
},
{
"name": "gemm", "num_runs": 20,
"title": "around 512",
"x_label": "sizes (m=n=k)", "x_keys": ["m"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"m": 512, "n": 512, "k": 512, "layout": 102,
"transA": 111, "transB": 111, "step": 1, "num_steps": 16}],
},
{
"name": "gemm", "num_runs": 10,
"title": "around 2048",
"x_label": "sizes (m=n=k)", "x_keys": ["m"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"m": 2048, "n": 2048, "k": 2048, "layout": 102,
"transA": 111, "transB": 111, "step": 1, "num_steps": 16}],
},
{
"name": "gemm", "num_runs": 10,
"title": "layouts/transpose",
"x_label": "layout, transA, transB", "x_keys": ["layout", "transA", "transB"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"m": 1024, "n": 1024, "k": 1024, "layout": layout,
"transA": transA, "transB": transB, "step": 0, "num_steps": 1}
for layout in [101, 102] for transA in [111, 112] for transB in [111, 112]],
},
{
"name": "gemm", "num_runs": 10,
"title": "powers of 2",
"x_label": "sizes (m=n=k)", "x_keys": ["m"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"m": n, "n": n, "k": n, "layout": 102,
"transA": 111, "transB": 111, "step": 0, "num_steps": 1}
for n in utils.powers_of_2(8, utils.k(4))],
}
]
}
GEMM_SMALL = {
"label_names": ["CLBlast", "clBLAS"],
"num_rows": 2, "num_cols": 1,
"benchmarks": [
{
"name": "gemm", "num_runs": 10,
"title": "small matrices in steps of 16",
"x_label": "sizes (m=n=k)", "x_keys": ["m"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"m": 128, "n": 128, "k": 128, "layout": 102,
"transA": 111, "transB": 111, "step": 16, "num_steps": 57}],
},
{
"name": "gemm", "num_runs": 10,
"title": "small matrices in steps of 1",
"x_label": "sizes (m=n=k)", "x_keys": ["m"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"m": 128, "n": 128, "k": 128, "layout": 102,
"transA": 111, "transB": 111, "step": 1, "num_steps": 385}],
},
]
}
GEMMBATCHED = {
"label_names": ["CLBlast", "clBLAS (non batched)"],
"num_rows": 1, "num_cols": 3,
"benchmarks": [
{
"name": "gemmbatched", "num_runs": 40,
"title": "8 batches",
"x_label": "sizes (m=n=k)", "x_keys": ["m"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"batch_num": 8, "m": 32, "n": 32, "k": 32, "layout": 102,
"transA": 111, "transB": 111, "step": 32, "num_steps": 20}],
},
{
"name": "gemmbatched", "num_runs": 20,
"title": "64 batches",
"x_label": "sizes (m=n=k)", "x_keys": ["m"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"batch_num": 64, "m": 32, "n": 32, "k": 32, "layout": 102,
"transA": 111, "transB": 111, "step": 32, "num_steps": 20}],
},
{
"name": "gemmbatched", "num_runs": 30,
"title": "m=n=k=128",
"x_label": "number of batches", "x_keys": ["batch_num"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"batch_num": b, "m": 128, "n": 128, "k": 128, "layout": 102,
"transA": 111, "transB": 111} for b in utils.powers_of_2(1, utils.k(16))],
}
]
}
SYMM = {
"label_names": ["CLBlast", "clBLAS"],
"num_rows": 2, "num_cols": 3,
"benchmarks": [
{
"name": "symm", "num_runs": 10,
"title": "multiples of 128",
"x_label": "sizes (m=n)", "x_keys": ["m"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"m": 128, "n": 128, "layout": 102,
"side": 141, "triangle": 121, "step": 128, "num_steps": 20}],
},
{
"name": "symm", "num_runs": 10,
"title": "multiples of 129",
"x_label": "sizes (m=n)", "x_keys": ["m"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"m": 129, "n": 129, "layout": 102,
"side": 141, "triangle": 121, "step": 129, "num_steps": 20}],
},
{
"name": "symm", "num_runs": 10,
"title": "around 512",
"x_label": "sizes (m=n)", "x_keys": ["m"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"m": 512, "n": 512, "layout": 102,
"side": 141, "triangle": 121, "step": 1, "num_steps": 16}],
},
{
"name": "symm", "num_runs": 10,
"title": "around 2048",
"x_label": "sizes (m=n)", "x_keys": ["m"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"m": 2048, "n": 2048, "layout": 102,
"side": 141, "triangle": 121, "step": 1, "num_steps": 16}],
},
{
"name": "symm", "num_runs": 10,
"title": "layouts/sides/triangles",
"x_label": "layout, side, triangle", "x_keys": ["layout", "side", "triangle"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"m": 1024, "n": 1024, "layout": layout,
"side": side, "triangle": triangle, "step": 0, "num_steps": 1}
for layout in [101, 102] for side in [141, 142] for triangle in [121, 122]],
},
{
"name": "symm", "num_runs": 10,
"title": "powers of 2",
"x_label": "sizes (m=n)", "x_keys": ["m"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"m": n, "n": n, "layout": 102,
"side": 141, "triangle": 121, "step": 0, "num_steps": 1}
for n in utils.powers_of_2(8, utils.k(4))],
}
]
}
SYRK = {
"label_names": ["CLBlast", "clBLAS"],
"num_rows": 2, "num_cols": 3,
"benchmarks": [
{
"name": "syrk", "num_runs": 10,
"title": "multiples of 128",
"x_label": "sizes (n=k)", "x_keys": ["n"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"n": 128, "k": 128, "layout": 102,
"side": 141, "triangle": 121, "step": 128, "num_steps": 20}],
},
{
"name": "syrk", "num_runs": 10,
"title": "multiples of 129",
"x_label": "sizes (n=k)", "x_keys": ["n"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"n": 129, "k": 129, "layout": 102,
"side": 141, "triangle": 121, "step": 129, "num_steps": 20}],
},
{
"name": "syrk", "num_runs": 10,
"title": "around 512",
"x_label": "sizes (n=k)", "x_keys": ["n"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"n": 512, "k": 512, "layout": 102,
"side": 141, "triangle": 121, "step": 1, "num_steps": 16}],
},
{
"name": "syrk", "num_runs": 10,
"title": "around 2048",
"x_label": "sizes (n=k)", "x_keys": ["n"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"n": 2048, "k": 2048, "layout": 102,
"side": 141, "triangle": 121, "step": 1, "num_steps": 16}],
},
{
"name": "syrk", "num_runs": 10,
"title": "layouts/sides/triangles",
"x_label": "layout, triangle, transA", "x_keys": ["layout", "triangle", "transA"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"n": 1024, "k": 1024, "layout": layout,
"triangle": triangle, "transA": transA, "step": 0, "num_steps": 1}
for layout in [101, 102] for triangle in [121, 122] for transA in [111, 112]],
},
{
"name": "syrk", "num_runs": 10,
"title": "powers of 2",
"x_label": "sizes (n=k)", "x_keys": ["n"],
"y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
"arguments": [{"n": n, "k": n, "layout": 102,
"side": 141, "triangle": 121, "step": 0, "num_steps": 1}
for n in utils.powers_of_2(8, utils.k(4))],
}
]
}
SUMMARY = {
"label_names": ["CLBlast", "clBLAS"],
"num_rows": 3, "num_cols": 2,
"benchmarks": [
AXPY["benchmarks"][0],
AXPY["benchmarks"][1],
GEMV["benchmarks"][0],
GEMV["benchmarks"][1],
GEMM["benchmarks"][0],
GEMM["benchmarks"][1],
]
}
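Each dictionary in the "arguments" lists above maps one-to-one onto command-line flags of the matching client binary, via the loop in run_benchmark; a small illustration using the first AXPY sub-benchmark (output order may vary under Python 2):

arguments = {"n": 256 * 1024, "incx": 1, "incy": 1, "step": 256 * 1024, "num_steps": 16}
flags = ["-" + name + " " + str(value) for name, value in arguments.items()]
print(flags)  # e.g. ['-n 262144', '-incx 1', '-incy 1', '-step 262144', '-num_steps 16']
# appended to the -platform/-device/-precision/-runs flags and passed to ./clblast_client_xaxpy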

View File

@ -0,0 +1,66 @@
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the
# PEP8 Python style guide and uses a max-width of 120 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
import csv
import subprocess
def k(value):
return value * 1024
def m(value):
return value * 1024 * 1024
def float_to_kilo_mega(value):
if value % 1024 or value <= 1024:
return "%.0f" % value
elif value % (1024 * 1024) or value <= (1024 * 1024):
return "%.0fK" % (value / 1024.0)
else:
return "%.0fM" % (value / (1024.0 * 1024.0))
def powers_of_2(start, stop):
while start <= stop:
yield start
start *= 2
def precision_to_letter(precision):
if precision == 16:
return "H"
elif precision == 32:
return "S"
elif precision == 64:
return "D"
elif precision == 3232:
return "C"
elif precision == 6464:
return "Z"
else:
return "X"
def run_binary(command, arguments):
full_command = command + " " + " ".join(arguments)
print("[benchmark] Calling binary: %s" % str(full_command))
try:
return subprocess.Popen(full_command, shell=True, stdout=subprocess.PIPE).stdout.read()
except OSError as e:
print("[benchmark] Error while running the binary, got exception: %s" + str(e))
return False
def parse_results(csv_data):
csv_data = csv_data.split("\n")
results = csv.DictReader(csv_data, delimiter=";", skipinitialspace=True)
results = [r for r in results]
for result in results:
for key in result:
result[key] = float(result[key]) if "." in result[key] else int(result[key])
return results
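To make the expected client output concrete, here is parse_results applied to a two-line semicolon-separated CSV (the header names are assumptions modeled on the keys used by the plotting scripts):

example_csv = "m; n; GFLOPS_1; GFLOPS_2\n128; 128; 937.4; 562.1"
print(parse_results(example_csv))
# [{'m': 128, 'n': 128, 'GFLOPS_1': 937.4, 'GFLOPS_2': 562.1}]  (key order may vary)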

View File

@ -29,12 +29,62 @@ VENDOR_TRANSLATION_TABLE = {
}
def remove_mismatched_arguments(database):
"""Checks for tuning results with mis-matched entries and removes them according to user preferences"""
kernel_attributes = clblast.DEVICE_TYPE_ATTRIBUTES + clblast.KERNEL_ATTRIBUTES + ["kernel"]
# For Python 2 and 3 compatibility
try:
user_input = raw_input
except NameError:
user_input = input
# Check for mis-matched entries
for kernel_group_name, kernel_group in db.group_by(database["sections"], kernel_attributes):
group_by_arguments = db.group_by(kernel_group, clblast.ARGUMENT_ATTRIBUTES)
if len(group_by_arguments) != 1:
print("[database] WARNING: entries for a single kernel with multiple argument values " + str(kernel_group_name))
print("[database] Either quit now, or remove all but one of the argument combinations below:")
for index, (attribute_group_name, mismatching_entries) in enumerate(group_by_arguments):
print("[database] %d: %s" % (index, attribute_group_name))
for attribute_group_name, mismatching_entries in group_by_arguments:
response = user_input("[database] Remove entries corresponding to %s, [y/n]? " % str(attribute_group_name))
if response == "y":
for entry in mismatching_entries:
database["sections"].remove(entry)
print("[database] Removed %d entry/entries" % len(mismatching_entries))
# Sanity-check: all mis-matched entries should be removed
for kernel_group_name, kernel_group in db.group_by(database["sections"], kernel_attributes):
group_by_arguments = db.group_by(kernel_group, clblast.ARGUMENT_ATTRIBUTES)
if len(group_by_arguments) != 1:
print("[database] ERROR: entries for a single kernel with multiple argument values " + str(kernel_group_name))
assert len(group_by_arguments) == 1
def remove_database_entries(database, remove_if_matches_fields):
assert len(remove_if_matches_fields.keys()) > 0
def remove_this_entry(section):
for key in remove_if_matches_fields.keys():
if section[key] != remove_if_matches_fields[key]:
return False
return True
old_length = len(database["sections"])
database["sections"] = [x for x in database["sections"] if not remove_this_entry(x)]
new_length = len(database["sections"])
print("[database] Removed %d entries from the database" % (old_length - new_length))
def main(argv):
# Parses the command-line arguments
parser = argparse.ArgumentParser()
parser.add_argument("source_folder", help="The folder with JSON files to parse to add to the database")
parser.add_argument("clblast_root", help="Root of the CLBlast sources")
parser.add_argument("-r", "--remove_device", type=str, default=None, help="Removes all entries for a specific device")
parser.add_argument("-v", "--verbose", action="store_true", help="Increase verbosity of the script")
cl_args = parser.parse_args(argv)
@ -76,10 +126,19 @@ def main(argv):
new_size = db.length(database)
print("with " + str(new_size - old_size) + " new items") # Newline printed here
# Checks for tuning results with mis-matched entries
remove_mismatched_arguments(database)
# Stores the modified database back to disk
if len(glob.glob(json_files)) >= 1:
io.save_database(database, database_filename)
# Removes database entries before continuing
if cl_args.remove_device is not None:
print("[database] Removing all results for device '%s'" % cl_args.remove_device)
remove_database_entries(database, {"device": cl_args.remove_device})
io.save_database(database, database_filename)
# Retrieves the best performing results
print("[database] Calculating the best results per device/kernel...")
database_best_results = bests.get_best_results(database)
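A toy example of the new -r/--remove_device path, exercising remove_database_entries directly (device names are illustrative):

database = {"sections": [{"device": "GeForce GTX 750 Ti", "kernel": "Xgemm"},
                         {"device": "Iris Pro", "kernel": "Xgemm"}]}
remove_database_entries(database, {"device": "Iris Pro"})  # prints: Removed 1 entries from the database
print(len(database["sections"]))                           # 1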

View File

@ -123,7 +123,7 @@ def print_cpp_database(database, output_dir):
devices = sorted(set([s["device"] for s in type_database]))
for device_name in devices:
device_database = [s for s in type_database if s["device"] == device_name]
device_name_quoted = "\"%s\"," % device_name
device_name_quoted = "\"%s\"," % device_name.strip()
device_name_cpp = " { %-50s { " % device_name_quoted
f.write(device_name_cpp)
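The added .strip() guards against OpenCL device names reported with trailing whitespace, which would otherwise end up inside the generated C++ string literal; for instance (the trailing space is illustrative):

device_name = "GeForce GTX 750 Ti "
print("\"%s\"," % device_name.strip())  # "GeForce GTX 750 Ti",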

View File

@ -5,6 +5,9 @@
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
import itertools
from operator import itemgetter
import clblast
@ -62,3 +65,14 @@ def combine_result(old_results, new_result):
# No match found: append a new result
old_results.append(new_result)
return old_results
def group_by(database, attributes):
"""Returns an list with the name of the group and the corresponding entries in the database"""
assert len(database) > 0
attributes = [a for a in attributes if a in database[0]]
database.sort(key=itemgetter(*attributes))
result = []
for key, data in itertools.groupby(database, key=itemgetter(*attributes)):
result.append((key, list(data)))
return result
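The grouping helper sorts by the requested attributes and then buckets consecutive equal keys; a minimal example on a toy database (field values are illustrative):

sections = [{"kernel_family": "xgemm", "device": "B"},
            {"kernel_family": "xgemm", "device": "A"},
            {"kernel_family": "xaxpy", "device": "A"}]
for name, entries in group_by(sections, ["kernel_family"]):
    print(name, len(entries))
# xaxpy 1
# xgemm 2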

View File

@ -56,5 +56,11 @@ def load_tuning_results(filename):
assert json_data["precision"] == str(result["parameters"]["PRECISION"])
result["parameters"].pop("PRECISION", None)
# Fixes the scalar argument values
for value, replacement in zip(["2.00", "2.00+0.50i"], ["2.000000", "2+0.5i"]):
for field in ["arg_alpha", "arg_beta"]:
if field in json_data.keys() and json_data[field] == value:
json_data[field] = replacement
# All done
return json_data
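Spelled out on an assumed tuner record, the scalar fix above rewrites the two legacy spellings in place:

json_data = {"arg_alpha": "2.00", "arg_beta": "2.00+0.50i"}
for value, replacement in zip(["2.00", "2.00+0.50i"], ["2.000000", "2+0.5i"]):
    for field in ["arg_alpha", "arg_beta"]:
        if field in json_data.keys() and json_data[field] == value:
            json_data[field] = replacement
print(json_data)  # {'arg_alpha': '2.000000', 'arg_beta': '2+0.5i'}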

View File

@ -38,11 +38,14 @@ FILES = [
"/src/clblast_c.cpp",
"/test/wrapper_clblas.hpp",
"/test/wrapper_cblas.hpp",
"/test/wrapper_cublas.hpp",
"/include/clblast_netlib_c.h",
"/src/clblast_netlib_c.cpp",
]
HEADER_LINES = [117, 73, 118, 22, 29, 41, 65, 32]
FOOTER_LINES = [17, 80, 19, 18, 6, 6, 9, 2]
HEADER_LINES = [122, 77, 126, 24, 29, 41, 29, 65, 32]
FOOTER_LINES = [25, 139, 27, 38, 6, 6, 6, 9, 2]
HEADER_LINES_DOC = 0
FOOTER_LINES_DOC = 63
# Different possibilities for requirements
ald_m = "The value of `a_ld` must be at least `m`."
@ -99,65 +102,69 @@ bmnn = size_helper("layout == CLBlastLayoutRowMajor", "((side == CLBlastSideLeft
# Populates a list of routines
ROUTINES = [
[ # Level 1: vector-vector
Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], ["1","1","1","1"], [], "", "Generate givens plane rotation", "", []),
Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], ["1","1","1","1","1"], [], "", "Generate modified givens plane rotation", "", []),
Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], [xn,yn], ["cos","sin"],"", "Apply givens plane rotation", "", []),
Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [xn,yn,"1"], [], "", "Apply modified givens plane rotation", "", []),
Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], [xn,yn], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []),
Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], [xn], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []),
Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [xn,yn], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []),
Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [xn,yn], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []),
Routine(True, True, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two vectors", "Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. The sum is stored in the _dot_ buffer.", []),
Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []),
Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []),
Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], [xn,"1"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []),
Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], [xn,"1"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []),
Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], [xn,"1"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []),
Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [xn,"1"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []),
Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [xn,"1"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []),
Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [xn,"1"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []),
Routine(False, True, False, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], ["1","1","1","1"], [], "", "Generate givens plane rotation", "", []),
Routine(False, True, False, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], ["1","1","1","1","1"], [], "", "Generate modified givens plane rotation", "", []),
Routine(False, True, False, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], [xn,yn], ["cos","sin"],"", "Apply givens plane rotation", "", []),
Routine(False, True, False, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [xn,yn,"1"], [], "", "Apply modified givens plane rotation", "", []),
Routine(True, True, False, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], [xn,yn], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []),
Routine(True, True, False, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], [xn], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []),
Routine(True, True, False, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [xn,yn], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []),
Routine(True, True, False, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [xn,yn], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []),
Routine(True, True, False, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two vectors", "Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. The sum is stored in the _dot_ buffer.", []),
Routine(True, True, False, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []),
Routine(True, True, False, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []),
Routine(True, True, False, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], [xn,"1"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []),
Routine(True, True, False, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], [xn,"1"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []),
Routine(True, False, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], [xn,"1"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []),
Routine(True, True, False, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [xn,"1"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []),
Routine(True, False, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [xn,"1"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []),
Routine(True, False, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [xn,"1"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []),
],
[ # Level 2: matrix-vector
Routine(True, True, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], [amn,xmn,ynm], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]),
Routine(True, True, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], [amn,xmn,ynm], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]),
Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]),
Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]),
Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], [apn,xn,yn], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2a", "symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]),
Routine(True, True, "2a", "sbmv", T, [S,D,H], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]),
Routine(True, True, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], [apn,xn,yn], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]),
Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]),
Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [apn,xn], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []),
Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a triangular system of equations", "", []),
Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]),
Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [apn,xn], [], "", "Solves a packed triangular system of equations", "", []),
Routine(True, True, False, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], [amn,xmn,ynm], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]),
Routine(True, True, False, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], [amn,xmn,ynm], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]),
Routine(True, True, False, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]),
Routine(True, True, False, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]),
Routine(True, True, False, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], [apn,xn,yn], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
Routine(True, True, False, "2a", "symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]),
Routine(True, True, False, "2a", "sbmv", T, [S,D,H], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]),
Routine(True, True, False, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], [apn,xn,yn], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
Routine(True, True, False, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]),
Routine(True, True, False, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]),
Routine(True, True, False, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [apn,xn], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []),
Routine(True, True, False, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a triangular system of equations", "", []),
Routine(False, True, False, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]),
Routine(False, True, False, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [apn,xn], [], "", "Solves a packed triangular system of equations", "", []),
# Level 2: matrix update
Routine(True, True, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]),
Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]),
Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]),
Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], [xn,an], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermetian matrix to be updated, and alpha is a scalar value.", [ald_n]),
Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], [xn,apn], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], [xn,yn,an], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]),
Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], [xn,yn,apn], ["alpha"], "", "Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], [xn,an], ["alpha"], "", "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]),
Routine(True, True, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], [xn,apn], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], [xn,yn,an], ["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]),
Routine(True, True, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], [xn,yn,apn], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
Routine(True, True, False, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]),
Routine(True, True, False, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]),
Routine(True, True, False, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]),
Routine(True, True, False, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], [xn,an], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermetian matrix to be updated, and alpha is a scalar value.", [ald_n]),
Routine(True, True, False, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], [xn,apn], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
Routine(True, True, False, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], [xn,yn,an], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]),
Routine(True, True, False, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], [xn,yn,apn], ["alpha"], "", "Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
Routine(True, True, False, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], [xn,an], ["alpha"], "", "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]),
Routine(True, True, False, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], [xn,apn], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
Routine(True, True, False, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], [xn,yn,an], ["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]),
Routine(True, True, False, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], [xn,yn,apn], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
],
[ # Level 3: matrix-matrix
Routine(True, True, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], [amk,bkn,cmn], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]),
Routine(True, True, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], [ammn,bmnn,cmn], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. Otherwise, in case of `side == kRight`, _A_ is a symmtric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]),
Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], [ammn,bmnn,cmn], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]),
Routine(True, True, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], [ank,cn], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]),
Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], [ank,cn], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]),
Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], [ankab,bnkab,cn],["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], [ankab,bnkab,cn],["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]),
Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Solves a triangular system of equations", "", []),
Routine(True, True, False, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], [amk,bkn,cmn], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]),
Routine(True, True, False, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], [ammn,bmnn,cmn], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. Otherwise, in case of `side == kRight`, _A_ is a symmtric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]),
Routine(True, True, False, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], [ammn,bmnn,cmn], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]),
Routine(True, True, False, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], [ank,cn], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]),
Routine(True, True, False, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], [ank,cn], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]),
Routine(True, True, False, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], [ankab,bnkab,cn],["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
Routine(True, True, False, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], [ankab,bnkab,cn],["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
Routine(True, True, False, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]),
Routine(True, True, False, "3", "trsm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Solves a triangular system of equations", "Solves the equation _A * X = alpha * B_ for the unknown _m_ by _n_ matrix X, in which _A_ is an _n_ by _n_ unit or non-unit triangular matrix and B is an _m_ by _n_ matrix. The matrix _B_ is overwritten by the solution _X_.", []),
],
[ # Level X: extra routines (not part of BLAS)
Routine(True, True, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], [amn,bnma], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]),
# Special routines:
Routine(True, True, False, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], [amn,bnma], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]),
# Batched routines:
Routine(True, True, True, "x", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [xn,yn], ["alpha"], "", "Batched version of AXPY", "As AXPY, but multiple operations are batched together for better performance.", []),
Routine(True, True, True, "x", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], [amk,bkn,cmn], ["alpha","beta"], "", "Batched version of GEMM", "As GEMM, but multiple operations are batched together for better performance.", [ald_transa_m_k, bld_transb_k_n, cld_m]),
]]
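# Illustrative sketch (not verbatim generator output): given the batch-aware rules in
# routine.py further below (b_star()/b_s() and batch_count_def()), a batched entry such as
# the S flavour of the AXPY batch routine is expected to produce a C API declaration
# along these lines:
#
#   CLBlastStatusCode CLBlastSaxpyBatched(const size_t n,
#                                         const cl_float *alphas,
#                                         const cl_mem x_buffer, const size_t *x_offsets,
#                                         const size_t x_inc,
#                                         cl_mem y_buffer, const size_t *y_offsets,
#                                         const size_t y_inc,
#                                         const size_t batch_count,
#                                         cl_command_queue* queue, cl_event* event)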
@ -188,7 +195,7 @@ def main(argv):
# Re-writes the body of the file
with open(library_root + FILES[i], "w") as f:
body = ""
levels = [1, 2, 3] if (i == 4 or i == 5) else [1, 2, 3, 4]
levels = [1, 2, 3] if (i == 4 or i == 5 or i == 6) else [1, 2, 3, 4]
for level in levels:
body += cpp.LEVEL_SEPARATORS[level - 1] + "\n"
for routine in ROUTINES[level - 1]:
@ -205,9 +212,13 @@ def main(argv):
if i == 5:
body += cpp.wrapper_cblas(routine)
if i == 6:
body += cpp.clblast_netlib_c_h(routine)
body += cpp.wrapper_cublas(routine)
if i == 7:
body += cpp.clblast_netlib_c_cc(routine)
if not routine.batched:
body += cpp.clblast_netlib_c_h(routine)
if i == 8:
if not routine.batched:
body += cpp.clblast_netlib_c_cc(routine)
f.write("".join(file_header))
f.write(body)
f.write("".join(file_footer))
@ -217,7 +228,7 @@ def main(argv):
for routine in ROUTINES[level - 1]:
if routine.has_tests:
level_string = cpp.LEVEL_NAMES[level - 1]
routine_suffix = "level" + level_string + "/x" + routine.name + ".cpp"
routine_suffix = "level" + level_string + "/x" + routine.lowercase_name() + ".cpp"
# Correctness tests
filename = library_root + "/test/correctness/routines/" + routine_suffix
@ -233,11 +244,20 @@ def main(argv):
f.write(cpp.performance_test(routine, level_string))
f.write(cpp.FOOTER)
# Outputs the API documentation
# API documentation
filename = cl_args.clblast_root + "/doc/clblast.md"
# Stores the header and the footer of the original documentation file
with open(filename) as f:
original = f.readlines()
file_header = original[:HEADER_LINES_DOC]
file_footer = original[-FOOTER_LINES_DOC:]
# Outputs the API documentation
with open(filename, "w") as f:
# Outputs the header
f.write("".join(file_header))
doc_header = doc.header()
f.write(doc_header)
@ -248,5 +268,8 @@ def main(argv):
doc_routine = doc.generate(routine)
f.write(doc_routine)
# Outputs the footer
f.write("".join(file_footer))
if __name__ == '__main__':
main(sys.argv[1:])

View File

@ -56,6 +56,19 @@ def option_to_cblas(x):
}[x]
def option_to_cublas(x):
"""As above, but for clBLAS data-types"""
return {
'layout': "Layout",
'a_transpose': "cublasOperation_t",
'b_transpose': "cublasOperation_t",
'ab_transpose': "cublasOperation_t",
'side': "cublasSideMode_t",
'triangle': "cublasFillMode_t",
'diagonal': "cublasDiagType_t",
}[x]
def option_to_documentation(x):
"""Translates an option name to a documentation string"""
return {

View File

@ -51,8 +51,10 @@ def clblast_cc(routine):
result += routine.routine_header_cpp(12, "") + " {" + NL
result += " try {" + NL
result += " auto queue_cpp = Queue(*queue);" + NL
result += " auto routine = X" + routine.name + "<" + routine.template.template + ">(queue_cpp, event);" + NL
result += " routine.Do" + routine.name.capitalize() + "("
result += " auto routine = X" + routine.plain_name() + "<" + routine.template.template + ">(queue_cpp, event);" + NL
if routine.batched:
result += " " + (NL + " ").join(routine.batched_transform_to_cpp()) + NL
result += " routine.Do" + routine.capitalized_name() + "("
result += ("," + NL + indent1).join([a for a in routine.arguments_clcudaapi()])
result += ");" + NL
result += " return StatusCode::kSuccess;" + NL
@ -63,7 +65,7 @@ def clblast_cc(routine):
result += "}" + NL
for flavour in routine.flavours:
indent2 = " " * (34 + routine.length() + len(flavour.template))
result += "template StatusCode PUBLIC_API " + routine.name.capitalize() + "<" + flavour.template + ">("
result += "template StatusCode PUBLIC_API " + routine.capitalized_name() + "<" + flavour.template + ">("
result += ("," + NL + indent2).join([a for a in routine.arguments_type(flavour)])
result += "," + NL + indent2 + "cl_command_queue*, cl_event*);" + NL
return result
@ -84,9 +86,11 @@ def clblast_c_cc(routine):
template = "<" + flavour.template + ">" if routine.no_scalars() else ""
indent = " " * (16 + routine.length() + len(template))
result += routine.routine_header_c(flavour, 27, "") + " {" + NL
if routine.batched:
result += " " + (NL + " ").join(routine.batched_transform_to_complex(flavour)) + NL
result += " try {" + NL
result += " return static_cast<CLBlastStatusCode>(" + NL
result += " clblast::" + routine.name.capitalize() + template + "("
result += " clblast::" + routine.capitalized_name() + template + "("
result += ("," + NL + indent).join([a for a in routine.arguments_cast(flavour, indent)])
result += "," + NL + indent + "queue, event)" + NL
result += " );" + NL
@ -286,14 +290,69 @@ def wrapper_cblas(routine):
return result
def wrapper_cublas(routine):
"""The wrapper to the reference cuBLAS routines (for performance/correctness testing)"""
result = ""
if routine.has_tests:
result += NL + "// Forwards the cuBLAS calls for %s" % routine.short_names_tested() + NL
if routine.no_scalars():
result += routine.routine_header_wrapper_cublas(routine.template, True, 23) + ";" + NL
for flavour in routine.flavours:
result += routine.routine_header_wrapper_cublas(flavour, False, 23) + " {" + NL
# There is a version available in cuBLAS
if flavour.precision_name in ["S", "D", "C", "Z"]:
indent = " " * (24 + routine.length())
arguments = routine.arguments_wrapper_cublas(flavour)
# Handles row-major
if routine.has_layout():
result += " if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }" + NL
# Complex scalars
for scalar in routine.scalars:
if flavour.is_complex(scalar):
cuda_complex = "cuDoubleComplex" if flavour.precision_name == "Z" else "cuComplex"
result += " " + cuda_complex + " " + scalar + "_cuda;" + NL
result += " " + scalar + "_cuda.x = " + scalar + ".real();" + NL
result += " " + scalar + "_cuda.y = " + scalar + ".imag();" + NL
# Calls the cuBLAS routine
result += " auto status = cublas" + flavour.name_cublas() + routine.name + "(handle, "
result += ("," + NL + indent).join([a for a in arguments]) + ");" + NL
result += " cudaDeviceSynchronize();" + NL
result += " return status;"
# There is no half-precision version available in cuBLAS: report as unsupported (a
# conversion-to-float fall-back is left commented out below)
else: # Half-precision
result += " return CUBLAS_STATUS_NOT_SUPPORTED;"
# indent = " " * (24 + routine.length())
# # Convert to float (note: also integer buffers are stored as half/float)
# for buf in routine.inputs + routine.outputs:
# result += " auto " + buf + "_buffer_bis = HalfToFloatBuffer(" + buf + "_buffer, queues[0]);" + NL
# # Call the float routine
# result += " return cublasX" + routine.name + "(handle,"
# result += ("," + NL + indent).join([a for a in routine.arguments_half()]) + ");" + NL
# result += " cudaDeviceSynchronize();" + NL
# result += " return status;"
# # Convert back to half
# for buf in routine.outputs:
# result += " FloatToHalfBuffer(" + buf + "_buffer, " + buf + "_buffer_bis, queues[0]);" + NL
# result += " return status;"
# Complete
result += NL + "}" + NL
return result
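# Illustrative sketch (assuming the S flavour of GEMM): the code above is expected to
# generate a wrapper roughly like the following, with the row-major guard, the cuBLAS
# call and the device synchronisation in place:
#
#   cublasStatus_t cublasXgemm(cublasHandle_t handle, const Layout layout,
#                              const cublasOperation_t a_transpose,
#                              const cublasOperation_t b_transpose,
#                              const size_t m, const size_t n, const size_t k,
#                              const float alpha,
#                              const float* a_buffer, const size_t a_offset, const size_t a_ld,
#                              const float* b_buffer, const size_t b_offset, const size_t b_ld,
#                              const float beta,
#                              float* c_buffer, const size_t c_offset, const size_t c_ld) {
#     if (layout == Layout::kRowMajor) { return CUBLAS_STATUS_NOT_SUPPORTED; }
#     auto status = cublasSgemm(handle, a_transpose, b_transpose,
#                               static_cast<int>(m), static_cast<int>(n), static_cast<int>(k),
#                               &alpha, &a_buffer[a_offset], a_ld,
#                               &b_buffer[b_offset], b_ld,
#                               &beta, &c_buffer[c_offset], c_ld);
#     cudaDeviceSynchronize();
#     return status;
#   }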
def performance_test(routine, level_string):
"""Generates the body of a performance test for a specific routine"""
result = ""
result += "#include \"test/performance/client.hpp\"" + NL
result += "#include \"test/routines/level" + level_string + "/x" + routine.name + ".hpp\"" + NL + NL
result += "// Shortcuts to the clblast namespace" + NL
result += "using float2 = clblast::float2;" + NL
result += "using double2 = clblast::double2;" + NL + NL
result += "#include \"test/routines/level" + level_string + "/x" + routine.lowercase_name() + ".hpp\"" + NL + NL
result += "// Main function (not within the clblast namespace)" + NL
result += "int main(int argc, char *argv[]) {" + NL
result += " const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);" + NL
@ -304,7 +363,7 @@ def performance_test(routine, level_string):
found = False
for flavour in routine.flavours:
if flavour.precision_name == precision:
result += NL + " clblast::RunClient<clblast::TestX" + routine.name + flavour.test_template()
result += NL + " clblast::RunClient<clblast::TestX" + routine.plain_name() + flavour.test_template()
result += ">(argc, argv); break;" + NL
found = True
if not found:
@ -319,17 +378,14 @@ def correctness_test(routine, level_string):
"""Generates the body of a correctness test for a specific routine"""
result = ""
result += "#include \"test/correctness/testblas.hpp\"" + NL
result += "#include \"test/routines/level" + level_string + "/x" + routine.name + ".hpp\"" + NL + NL
result += "// Shortcuts to the clblast namespace" + NL
result += "using float2 = clblast::float2;" + NL
result += "using double2 = clblast::double2;" + NL + NL
result += "#include \"test/routines/level" + level_string + "/x" + routine.lowercase_name() + ".hpp\"" + NL + NL
result += "// Main function (not within the clblast namespace)" + NL
result += "int main(int argc, char *argv[]) {" + NL
result += " auto errors = size_t{0};" + NL
not_first = "false"
for flavour in routine.flavours:
result += " errors += clblast::RunTests<clblast::TestX" + routine.name + flavour.test_template()
result += ">(argc, argv, " + not_first + ", \"" + flavour.name + routine.name.upper() + "\");" + NL
result += " errors += clblast::RunTests<clblast::TestX" + routine.plain_name() + flavour.test_template()
result += ">(argc, argv, " + not_first + ", \"" + flavour.name + routine.upper_name() + "\");" + NL
not_first = "true"
result += " if (errors > 0) { return 1; } else { return 0; }" + NL
result += "}" + NL

View File

@ -30,17 +30,17 @@ class DataType:
self.beta_cl = scalars[3]
self.buffer_type = buffer_type
def use_alpha(self):
def use_alpha(self, postfix=""):
"""Outputs the name of the data-type (alpha/beta), possibly transforming into the right type"""
if self.alpha_cpp in [D_FLOAT2, D_DOUBLE2]:
return self.alpha_cpp + "{alpha.s[0], alpha.s[1]}"
return "alpha"
return self.alpha_cpp + "{alpha" + postfix + ".s[0], alpha" + postfix + ".s[1]}"
return "alpha" + postfix
def use_beta(self):
def use_beta(self, postfix=""):
"""As above, but for beta instead of alpha"""
if self.beta_cpp in [D_FLOAT2, D_DOUBLE2]:
return self.beta_cpp + "{beta.s[0], beta.s[1]}"
return "beta"
return self.beta_cpp + "{beta" + postfix + ".s[0], beta" + postfix + ".s[1]}"
return "beta" + postfix
def use_alpha_opencl(self):
"""As above, but the transformation is in the opposite direction"""
@ -72,9 +72,11 @@ class DataType:
def test_template(self):
"""Returns the template as used in the correctness/performance tests"""
buffer_type = "clblast::" + self.buffer_type if self.is_non_standard() else self.buffer_type
beta_cpp = "clblast::" + self.beta_cpp if self.beta_cpp in [D_HALF, D_FLOAT2, D_DOUBLE2] else self.beta_cpp
if self.buffer_type != self.beta_cpp:
return "<" + self.buffer_type + "," + self.beta_cpp + ">, " + self.buffer_type + ", " + self.beta_cpp
return "<" + self.buffer_type + ">, " + self.buffer_type + ", " + self.beta_cpp
return "<" + buffer_type + "," + self.beta_cpp + ">, " + buffer_type + ", " + beta_cpp
return "<" + buffer_type + ">, " + buffer_type + ", " + beta_cpp
def is_complex(self, scalar):
"""Current scalar is complex"""
@ -85,6 +87,11 @@ class DataType:
"""Current type is of a non-standard type"""
return self.buffer_type in [D_HALF, D_FLOAT2, D_DOUBLE2]
def name_cublas(self):
"""Returns the flavour name as used in cuBLAS routine names, e.g. 'iS' becomes 'Is' (as in cublasIsamax)"""
if "i" in self.name:
return "I" + self.name[1].lower()
return self.name
# Regular data-types
H = DataType("H", "H", D_HALF, [D_HALF] * 2 + [D_HALF_OPENCL] * 2, D_HALF) # half (16)

View File

@ -20,7 +20,7 @@ def generate(routine):
result = ""
# Routine header
result += "x" + routine.name.upper() + ": " + routine.description + NL
result += "x" + routine.upper_name() + ": " + routine.description + NL
result += "-------------" + NL + NL
result += routine.details + NL + NL
@ -36,7 +36,7 @@ def generate(routine):
result += "```" + NL + NL
# Routine arguments
result += "Arguments to " + routine.name.upper() + ":" + NL + NL
result += "Arguments to " + routine.upper_name() + ":" + NL + NL
for argument in routine.arguments_doc():
result += "* " + argument + NL
result += "* `cl_command_queue* queue`: "
@ -47,7 +47,7 @@ def generate(routine):
# Routine requirements
if len(routine.requirements_doc()) > 0:
result += "Requirements for " + routine.name.upper() + ":" + NL + NL
result += "Requirements for " + routine.upper_name() + ":" + NL + NL
for requirement in routine.requirements_doc():
result += "* " + requirement + NL
result += NL

View File

@ -12,11 +12,12 @@ import generator.convert as convert
class Routine:
"""Class holding routine-specific information (e.g. name, which arguments, which precisions)"""
def __init__(self, implemented, has_tests, level, name, template, flavours, sizes, options,
def __init__(self, implemented, has_tests, batched, level, name, template, flavours, sizes, options,
inputs, outputs, buffer_sizes, scalars, scratch,
description, details, requirements):
self.implemented = implemented
self.has_tests = has_tests
self.batched = batched
self.level = level
self.name = name
self.template = template
@ -32,6 +33,69 @@ class Routine:
self.details = details
self.requirements = requirements
def lowercase_name(self):
postfix = "batched" if self.batched else ""
return self.name + postfix
def plain_name(self):
postfix = "Batched" if self.batched else ""
return self.name + postfix
def capitalized_name(self):
postfix = "Batched" if self.batched else ""
return self.name.capitalize() + postfix
def upper_name(self):
postfix = "BATCHED" if self.batched else ""
return self.name.upper() + postfix
def b_star(self):
return "*" if self.batched else ""
def b_s(self):
return "s" if self.batched else ""
def batch_count_def(self):
return ["const size_t batch_count"] if self.batched else []
def batch_count_list(self):
return ["batch_count"] if self.batched else []
def batch_count_type(self):
return ["const size_t"] if self.batched else []
def batch_count_doc(self):
return ["`const size_t batch_count`: Number of batches. This value must be positive."] if self.batched else []
def batched_transform_to_cpp(self):
result = []
for scalar in self.scalars:
result.append("auto " + scalar + "s_cpp = std::vector<T>();")
for buffer_name in self.inputs + self.outputs:
result.append("auto " + buffer_name + "_offsets_cpp = std::vector<size_t>();")
result.append("for (auto batch = size_t{0}; batch < batch_count; ++batch) {")
for scalar in self.scalars:
result.append(" " + scalar + "s_cpp.push_back(" + scalar + "s[batch]);")
for buffer_name in self.inputs + self.outputs:
result.append(" " + buffer_name + "_offsets_cpp.push_back(" + buffer_name + "_offsets[batch]);")
result.append("}")
return result
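# For instance, for the batched version of AXPY (scalar: alpha; buffers: x, y) this
# emits the following C++ lines:
#   auto alphas_cpp = std::vector<T>();
#   auto x_offsets_cpp = std::vector<size_t>();
#   auto y_offsets_cpp = std::vector<size_t>();
#   for (auto batch = size_t{0}; batch < batch_count; ++batch) {
#     alphas_cpp.push_back(alphas[batch]);
#     x_offsets_cpp.push_back(x_offsets[batch]);
#     y_offsets_cpp.push_back(y_offsets[batch]);
#   }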
def batched_transform_to_complex(self, flavour):
result = []
for scalar in self.scalars:
result.append("auto " + scalar + "s_cpp = std::vector<" + flavour.buffer_type + ">();")
result.append("for (auto batch = size_t{0}; batch < batch_count; ++batch) {")
for scalar in self.scalars:
content = scalar
if scalar == "alpha":
content = flavour.use_alpha(postfix="s[batch]")
elif scalar == "beta":
content = flavour.use_beta(postfix="s[batch]")
result.append(" " + scalar + "s_cpp.push_back(" + content + ");")
result.append("}")
return result
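# Same idea as above, but here the per-batch scalars are also converted from their
# C API representation, e.g. for a single-precision complex flavour:
#   alphas_cpp.push_back(float2{alphas[batch].s[0], alphas[batch].s[1]});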
@staticmethod
def scalar_buffers_first():
"""List of scalar buffers"""
@ -127,21 +191,25 @@ class Routine:
def length(self):
"""Retrieves the number of characters in the routine's name"""
return len(self.name)
return len(self.capitalized_name())
def no_scalars(self):
"""Determines whether or not this routine has scalar arguments (alpha/beta)"""
return self.scalars == []
def has_layout(self):
"""Determines whether the layout is an argument"""
return "layout" in self.options
def short_names(self):
"""Returns the upper-case names of these routines (all flavours)"""
return "/".join([f.name + self.name.upper() for f in self.flavours])
return "/".join([f.name + self.upper_name() for f in self.flavours])
def short_names_tested(self):
"""As above, but excludes some"""
names = [f.name + self.name.upper() for f in self.flavours]
if "H" + self.name.upper() in names:
names.remove("H" + self.name.upper())
names = [f.name + self.upper_name() for f in self.flavours]
if "H" + self.upper_name() in names:
names.remove("H" + self.upper_name())
return "/".join(names)
def buffers_first(self):
@ -159,7 +227,7 @@ class Routine:
"""Retrieves a variable name for a specific input/output vector/matrix (e.g. 'x')"""
if name in self.inputs or name in self.outputs:
a = [name + "_buffer"]
b = [name + "_offset"]
b = [name + "_offset" + self.b_s()]
c = [name + "_" + self.postfix(name)] if (name not in self.buffers_without_ld_inc()) else []
return [", ".join(a + b + c)]
return []
@ -187,13 +255,13 @@ class Routine:
prefix = "const " if name in self.inputs else ""
if name in self.inputs or name in self.outputs:
a = [prefix + "cl_mem " + name + "_buffer"]
b = ["const size_t " + name + "_offset"]
b = ["const size_t " + self.b_star() + name + "_offset" + self.b_s()]
c = ["const size_t " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else []
return [", ".join(a + b + c)]
return []
def buffer_def_wrapper_cl(self, name, flavour):
"""As above but with data-types"""
"""As above but for OpenCL"""
prefix = "const " if name in self.inputs else ""
if name in self.inputs or name in self.outputs:
a = [prefix + "Buffer<" + flavour.buffer_type + ">& " + name + "_buffer"]
@ -202,6 +270,16 @@ class Routine:
return [", ".join(a + b + c)]
return []
def buffer_def_wrapper_cuda(self, name, flavour):
"""As above but for CUDA"""
prefix = "const " if name in self.inputs else ""
if name in self.inputs or name in self.outputs:
a = [prefix + flavour.buffer_type + "* " + name + "_buffer"]
b = ["const size_t " + name + "_offset"]
c = ["const size_t " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else []
return [", ".join(a + b + c)]
return []
def buffer_def_vector(self, name, flavour):
"""As above but as vectors"""
prefix = "const " if name in self.inputs else ""
@ -228,7 +306,7 @@ class Routine:
if name in self.inputs or name in self.outputs:
buffer_type = "unsigned int" if (name in self.index_buffers()) else self.template.buffer_type
a = ["Buffer<" + buffer_type + ">(" + name + "_buffer)"]
b = [name + "_offset"]
b = [name + "_offsets_cpp"] if self.batched else [name + "_offset"]
c = [name + "_" + self.postfix(name)] if (name not in self.buffers_without_ld_inc()) else []
return [", ".join(a + b + c)]
return []
@ -265,12 +343,38 @@ class Routine:
return [", ".join(a + c)]
return []
def buffer_wrapper_cublas(self, name, flavour):
"""As above but for cuBLAS the wrapper"""
prefix = "const " if name in self.inputs else ""
if name in self.inputs or name in self.outputs:
if name in self.index_buffers():
a = ["reinterpret_cast<int*>(&" + name + "_buffer[" + name + "_offset])"]
elif name in self.outputs and flavour.name in ["Sc", "Dz"]:
dtype = "float" if flavour.name == "Sc" else "double"
a = ["reinterpret_cast<" + dtype + "*>(&" + name + "_buffer[" + name + "_offset])"]
elif flavour.precision_name in ["C", "Z"]:
cuda_complex = "cuDoubleComplex" if flavour.precision_name == "Z" else "cuComplex"
a = ["reinterpret_cast<" + prefix + cuda_complex + "*>" +
"(&" + name + "_buffer[" + name + "_offset])"]
else:
a = ["&" + name + "_buffer[" + name + "_offset]"]
c = []
if name in ["x", "y"]:
c = ["static_cast<int>(" + name + "_" + self.postfix(name) + ")"]
elif name in ["a", "b", "c"]:
c = [name + "_" + self.postfix(name)]
result = [", ".join(a + c)]
if self.name == "trmm" and name == "a":
result *= 2
return result
return []
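# For example, a single-precision complex input matrix 'a' is passed to cuBLAS as
# "reinterpret_cast<const cuComplex*>(&a_buffer[a_offset]), a_ld", while the vector
# increments of 'x'/'y' are cast to int as cuBLAS expects.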
def buffer_type(self, name):
"""As above, but only data-types"""
prefix = "const " if (name in self.inputs) else ""
if (name in self.inputs) or (name in self.outputs):
a = [prefix + "cl_mem"]
b = ["const size_t"]
b = ["const size_t" + self.b_star()]
c = ["const size_t"] if (name not in self.buffers_without_ld_inc()) else []
return [", ".join(a + b + c)]
return []
@ -283,18 +387,19 @@ class Routine:
math_name = name.upper() + " matrix" if (name in self.buffers_matrix()) else name + " vector"
inc_ld_description = "Leading dimension " if (name in self.buffers_matrix()) else "Stride/increment "
a = ["`" + prefix + "cl_mem " + name + "_buffer`: OpenCL buffer to store the " + inout + " " + math_name + "."]
b = ["`const size_t " + name + "_offset`: The offset in elements from the start of the " + inout + " " + math_name + "."]
b = ["`const size_t " + self.b_star() + name + "_offset" + self.b_s() + "`: The offset" + self.b_s() + " in elements from the start of the " + inout + " " + math_name + "."]
c = []
if name not in self.buffers_without_ld_inc():
c = ["`const size_t " + name + "_" + self.postfix(name) + "`: " +
inc_ld_description + "of the " + inout + " " + math_name + ". This value must be greater than 0."]
else:
c = []
return a + b + c
return []
def scalar(self, name):
"""Retrieves the name of a scalar (alpha/beta)"""
if name in self.scalars:
if self.batched:
return [name + "s_cpp"]
return [name]
return []
@ -314,8 +419,12 @@ class Routine:
"""Retrieves the use of a scalar (alpha/beta)"""
if name in self.scalars:
if name == "alpha":
if self.batched:
return ["alphas_cpp.data()"]
return [flavour.use_alpha()]
elif name == "beta":
if self.batched:
return ["betas_cpp.data()"]
return [flavour.use_beta()]
return [name]
return []
@ -338,20 +447,28 @@ class Routine:
return [name]
return []
def scalar_use_wrapper_cublas(self, name, flavour):
"""As above, but for the cuBLAS wrapper"""
if name in self.scalars:
if flavour.is_complex(name):
return ["&" + name + "_cuda"]
return ["&" + name]
return []
def scalar_def(self, name, flavour):
"""Retrieves the definition of a scalar (alpha/beta)"""
if name in self.scalars:
if name == "alpha":
return ["const " + flavour.alpha_cl + " " + name]
return ["const " + flavour.beta_cl + " " + name]
return ["const " + flavour.alpha_cl + " " + self.b_star() + name + self.b_s()]
return ["const " + flavour.beta_cl + " " + self.b_star() + name + self.b_s()]
return []
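# With the batched markers in place this turns e.g. "const cl_float alpha" into
# "const cl_float *alphas" for the batched C API: one scalar per batch.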
def scalar_def_plain(self, name, flavour):
"""As above, but without 'cl_' prefix"""
if name in self.scalars:
if name == "alpha":
return ["const " + flavour.alpha_cpp + " " + name]
return ["const " + flavour.beta_cpp + " " + name]
return ["const " + flavour.alpha_cpp + " " + self.b_star() + name + self.b_s()]
return ["const " + flavour.beta_cpp + " " + self.b_star() + name + self.b_s()]
return []
def scalar_def_void(self, name, flavour):
@ -368,16 +485,16 @@ class Routine:
"""Retrieves the type of a scalar (alpha/beta)"""
if name in self.scalars:
if name == "alpha":
return ["const " + flavour.alpha_cpp]
return ["const " + flavour.beta_cpp]
return ["const " + flavour.alpha_cpp + self.b_star()]
return ["const " + flavour.beta_cpp + self.b_star()]
return []
def scalar_doc(self, name):
"""Retrieves the documentation of a scalar"""
if name in self.scalars:
if name == "alpha":
return ["`const " + self.template.alpha_cpp + " " + name + "`: Input scalar constant."]
return ["`const " + self.template.beta_cpp + " " + name + "`: Input scalar constant."]
return ["`const " + self.template.alpha_cpp + " " + self.b_star() + name + self.b_s() + "`: Input scalar constant" + self.b_s() + "."]
return ["`const " + self.template.beta_cpp + " " + self.b_star() + name + self.b_s() + "`: Input scalar constant" + self.b_s() + "."]
return []
def scalar_create_cpp(self, flavour):
@ -396,6 +513,12 @@ class Routine:
return [", ".join([s for s in self.sizes])]
return []
def sizes_list_as_int(self):
"""Retrieves a list of comma-separated sizes (m, n, k) cast to integers"""
if self.sizes:
return [", ".join(["static_cast<int>(" + s + ")" for s in self.sizes])]
return []
def sizes_def(self):
"""Retrieves the definition of the sizes (m,n,k)"""
if self.sizes:
@ -427,6 +550,15 @@ class Routine:
return [", ".join(self.options)]
return []
def options_list_no_layout(self):
"""Retrieves a list of options"""
options = self.options[:]
if "layout" in options:
options.remove("layout")
if options:
return [", ".join(options)]
return []
def options_cast(self, indent):
"""As above, but now casted to CLBlast data-types"""
if self.options:
@ -462,6 +594,13 @@ class Routine:
return [", ".join(definitions)]
return []
def options_def_wrapper_cublas(self):
"""As above, but now using cuBLAS data-types"""
if self.options:
definitions = ["const " + convert.option_to_cublas(o) + " " + o for o in self.options]
return [", ".join(definitions)]
return []
def options_type(self):
"""Retrieves the types of the options (layout, transpose, side, etc.)"""
if self.options:
@ -507,7 +646,8 @@ class Routine:
self.scalar("beta") +
list(chain(*[self.buffer_clcudaapi(b) for b in self.buffers_second()])) +
list(chain(*[self.buffer_clcudaapi(b) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar(s) for s in self.other_scalars()])))
list(chain(*[self.scalar(s) for s in self.other_scalars()])) +
self.batch_count_list())
def arguments_cast(self, flavour, indent):
"""As above, but with CLBlast casts"""
@ -518,7 +658,8 @@ class Routine:
self.scalar_use("beta", flavour) +
list(chain(*[self.buffer(b) for b in self.buffers_second()])) +
list(chain(*[self.buffer(b) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar_use(s, flavour) for s in self.other_scalars()])))
list(chain(*[self.scalar_use(s, flavour) for s in self.other_scalars()])) +
self.batch_count_list())
def arguments_netlib(self, flavour, indent):
"""As above, but for the Netlib CBLAS API"""
@ -544,7 +685,7 @@ class Routine:
def arguments_wrapper_cblas(self, flavour):
"""As above, but for the CBLAS wrapper"""
return (self.options_list() + self.sizes_list() +
return (self.options_list() + self.sizes_list_as_int() +
self.scalar_use_wrapper_cblas("alpha", flavour) +
list(chain(*[self.buffer_wrapper_cblas(b, flavour) for b in self.buffers_first()])) +
self.scalar_use_wrapper_cblas("beta", flavour) +
@ -552,6 +693,17 @@ class Routine:
list(chain(*[self.buffer_wrapper_cblas(b, flavour) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar_use_wrapper_cblas(s, flavour) for s in self.other_scalars()])))
def arguments_wrapper_cublas(self, flavour):
"""As above, but for the cuBLAS wrapper"""
return (self.options_list_no_layout() + self.sizes_list_as_int() +
self.scalar_use_wrapper_cublas("alpha", flavour) +
list(chain(*[self.buffer_wrapper_cublas(b, flavour) for b in self.buffers_first()])) +
self.scalar_use_wrapper_cublas("beta", flavour) +
list(chain(*[self.buffer_wrapper_cublas(b, flavour) for b in self.buffers_second()])) +
list(chain(*[self.buffer_wrapper_cublas(b, flavour) for b in self.scalar_buffers_first()])) +
list(chain(*[self.buffer_wrapper_cublas(b, flavour) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar_use_wrapper_cublas(s, flavour) for s in self.other_scalars()])))
def arguments_def(self, flavour):
"""Retrieves a combination of all the argument definitions"""
return (self.options_def() + self.sizes_def() +
@ -561,7 +713,8 @@ class Routine:
self.scalar_def("beta", flavour) +
list(chain(*[self.buffer_def(b) for b in self.buffers_second()])) +
list(chain(*[self.buffer_def(b) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()])))
list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()])) +
self.batch_count_def())
def arguments_def_netlib(self, flavour):
"""As above, but for the Netlib CBLAS API"""
@ -574,6 +727,7 @@ class Routine:
list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()])))
if self.name in self.routines_scalar_no_return():
result += list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.scalar_buffers_first()]))
result += self.batch_count_def()
return result
def arguments_def_c(self, flavour):
@ -585,7 +739,8 @@ class Routine:
self.scalar_def("beta", flavour) +
list(chain(*[self.buffer_def(b) for b in self.buffers_second()])) +
list(chain(*[self.buffer_def(b) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()])))
list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()])) +
self.batch_count_def())
def arguments_def_wrapper_clblas(self, flavour):
"""As above, but clBLAS wrapper plain data-types"""
@ -609,6 +764,17 @@ class Routine:
list(chain(*[self.buffer_def_vector(b, flavour) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar_def_plain(s, flavour) for s in self.other_scalars()])))
def arguments_def_wrapper_cublas(self, flavour):
"""As above, but cuBLAS wrapper plain data-types"""
return (self.options_def_wrapper_cublas() + self.sizes_def() +
list(chain(*[self.buffer_def_wrapper_cuda(b, flavour) for b in self.scalar_buffers_first()])) +
self.scalar_def_plain("alpha", flavour) +
list(chain(*[self.buffer_def_wrapper_cuda(b, flavour) for b in self.buffers_first()])) +
self.scalar_def_plain("beta", flavour) +
list(chain(*[self.buffer_def_wrapper_cuda(b, flavour) for b in self.buffers_second()])) +
list(chain(*[self.buffer_def_wrapper_cuda(b, flavour) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar_def_plain(s, flavour) for s in self.other_scalars()])))
def arguments_type(self, flavour):
"""Retrieves a combination of all the argument types"""
return (self.options_type() + self.sizes_type() +
@ -618,7 +784,8 @@ class Routine:
self.scalar_type("beta", flavour) +
list(chain(*[self.buffer_type(b) for b in self.buffers_second()])) +
list(chain(*[self.buffer_type(b) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar_type(s, flavour) for s in self.other_scalars()])))
list(chain(*[self.scalar_type(s, flavour) for s in self.other_scalars()])) +
self.batch_count_type())
def arguments_doc(self):
"""Retrieves a combination of all the argument types"""
@ -630,7 +797,8 @@ class Routine:
self.scalar_doc("beta") +
list(chain(*[self.buffer_doc(b) for b in self.buffers_second()])) +
list(chain(*[self.buffer_doc(b) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar_doc(s) for s in self.other_scalars()])))
list(chain(*[self.scalar_doc(s) for s in self.other_scalars()])) +
self.batch_count_doc())
def requirements_doc(self):
"""Retrieves a list of routine requirements for documentation"""
@ -640,7 +808,7 @@ class Routine:
"""Retrieves the C++ templated definition for a routine"""
indent = " " * (spaces + self.length())
result = "template <" + self.template.name + ">\n"
result += "StatusCode " + self.name.capitalize() + "("
result += "StatusCode " + self.capitalized_name() + "("
result += (",\n" + indent).join([a for a in self.arguments_def(self.template)])
result += ",\n" + indent + "cl_command_queue* queue, cl_event* event" + default_event + ")"
return result
@ -649,7 +817,7 @@ class Routine:
"""As above, but now without variable names"""
indent = " " * (spaces + self.length())
result = "template <" + self.template.name + ">\n"
result += "StatusCode " + self.name.capitalize() + "("
result += "StatusCode " + self.capitalized_name() + "("
result += (",\n" + indent).join([a for a in self.arguments_type(self.template)])
result += ",\n" + indent + "cl_command_queue*, cl_event*)"
return result
@ -657,7 +825,7 @@ class Routine:
def routine_header_c(self, flavour, spaces, extra_qualifier):
"""As above, but now for C"""
indent = " " * (spaces + self.length())
result = "CLBlastStatusCode" + extra_qualifier + " CLBlast" + flavour.name + self.name + "("
result = "CLBlastStatusCode" + extra_qualifier + " CLBlast" + flavour.name + self.plain_name() + "("
result += (",\n" + indent).join([a for a in self.arguments_def_c(flavour)])
result += ",\n" + indent + "cl_command_queue* queue, cl_event* event)"
return result
@ -677,6 +845,8 @@ class Routine:
if self.name in self.routines_scalar_no_return():
routine_name += "_sub"
indent += " "
if self.batched:
routine_name += "batched"
result = return_type + extra_qualifier + " cblas_" + flavour.name.lower() + routine_name + "("
result += (",\n" + indent).join([a for a in self.arguments_def_netlib(flavour)]) + ")"
return result
@ -703,3 +873,17 @@ class Routine:
result = "void cblasX" + self.name + "("
result += (",\n" + indent).join([a for a in self.arguments_def_wrapper_cblas(flavour)]) + ")"
return result
def routine_header_wrapper_cublas(self, flavour, def_only, spaces):
"""As above, but now for the cuBLAS wrapper"""
template = "<" + flavour.template + ">" if self.no_scalars() and not def_only else ""
indent = " " * (spaces + self.length() + len(template))
result = ""
if self.no_scalars():
result += "template <"
if def_only:
result += flavour.name
result += ">\n"
result += "cublasStatus_t cublasX" + self.name + template + "(cublasHandle_t handle, "
result += (",\n" + indent).join([a for a in self.arguments_def_wrapper_cublas(flavour)]) + ")"
return result

View File

@ -1,262 +0,0 @@
# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project uses a tab-size of two spaces and a max-width of 100 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
#
# This file implements the common performance scripts, such as creating a graph.
#
# ==================================================================================================
# Colours
black = "#000000"
grey = "#888888"
purplish = "#550077" # [ 85, 0,119] lumi=26
blueish = "#4765b1" # [ 71,101,177] lumi=100
redish = "#d67568" # [214,117,104] lumi=136
greenish = "#9bd4ca" # [155,212,202] lumi=199
# Sets the graph markers (circles, triangles, etc.)
pchs = c(15, 18, 17, 12)
# Other constants
kilo = 1024
mega = 1024*1024
# R options
options("width"=170)
# ==================================================================================================
# Settings
num_runs <- 5
num_runs_short <- 50
xtics_subset_threshold <- 100
xtics_subset_stepsize <- 8
devices <- c("-platform","-device")
options_string <- "-q -no_abbrv -cblas 0"
# Command-line arguments
command_line <- commandArgs(trailingOnly=TRUE)
if (length(command_line) != 2) {
print("Usage for device Z on platform Y: Rscript xxxxx.r Y Z")
quit()
}
platform_id <- command_line[1]
device_id <- command_line[2]
# Selects the device
devices_values <- c(platform_id, device_id)
devices_string <- paste(devices, devices_values, collapse=" ")
# Filter the string: only lines containing a ";" can be valid lines
filter_string <- function(raw_result_string) {
result_string <- c()
for (line in raw_result_string) {
if (grepl(";",line)) {
result_string <- c(result_string, line)
}
}
return(result_string)
}
# ==================================================================================================
# The main function
main <- function(routine_name, precision, test_names, test_values,
test_xlabels, test_xaxis, metric_gflops) {
# Names
display_name <- toupper(routine_name)
if (precision == 16) { display_name <- gsub("^X","H",display_name); }
if (precision == 32) { display_name <- gsub("^X","S",display_name); }
if (precision == 64) { display_name <- gsub("^X","D",display_name); }
if (precision == 3232) { display_name <- gsub("^X","C",display_name); }
if (precision == 6464) { display_name <- gsub("^X","Z",display_name); }
executable <- paste("./clblast_client_", routine_name, sep="")
# Display
library_names <- c("CLBlast", "clBLAS")
if (precision == 16) { library_names <- c("CLBlast FP16", "CLBlast FP32", "clBLAS FP32"); }
colourset <- c(blueish, redish)
if (precision == 16) { colourset <- c(blueish, purplish, redish); }
# Configures the outputfile
file_name <- paste(display_name, ".pdf", sep="")
if (length(test_names) == 6) {
pdf(file_name, height=8, width=13)
par(mfrow=c(2, 3))
par(oma=c(0, 0, 0, 0))
par(mar=c(4.6, 4.4, 1.5, 0)) # bottom, left, top, right [c(5.1, 4.1, 4.1, 2.1)]
par(mgp=c(2.8, 0.6, 0)) # location of xlab/ylab, tick-mark labels, tick marks [c(3, 1, 0)]
}
else { # length(test_names) == 2
pdf(file_name, height=8, width=13)
par(mfrow=c(2, 1))
par(oma=c(0, 0, 0, 0))
par(mar=c(4.6, 4.4, 1.5, 0)) # bottom, left, top, right [c(5.1, 4.1, 4.1, 2.1)]
par(mgp=c(2.8, 0.6, 0)) # location of xlab/ylab, tick-mark labels, tick marks [c(3, 1, 0)]
}
# Loops over the test-cases
for (test_id in 1:length(test_names)) {
params_values <- test_values[[test_id]]
# Loops over the commands within a single list (within a case)
for (command_id in 1:length(params_values)) {
# Runs the client and captures the result
params_string <- paste(parameters, params_values[[command_id]], collapse=" ")
arguments <- paste(devices_string, params_string, options_string, sep=" ")
print(paste("Running", executable, arguments, sep=" "))
raw_result_string <- system2(command=executable, args=arguments, stdout=TRUE)
result_string <- filter_string(raw_result_string)
# Reads the result into a dataframe
command_db <- read.csv(text=result_string, sep=";")
# For half-precision: also runs the FP32 version for comparison
if (precision == 16) {
params_string <- gsub("-precision 16", "-precision 32", params_string)
arguments <- paste(devices_string, params_string, options_string, sep=" ")
print(paste("Running", executable, arguments, sep=" "))
raw_result_string <- system2(command=executable, args=arguments, stdout=TRUE)
result_string <- filter_string(raw_result_string)
# Reads the result into a dataframe
command_db_32 <- read.csv(text=result_string, sep=";")
stopifnot(nrow(command_db) == nrow(command_db_32))
# Combines the results
command_db["ms_FP32_1"] = command_db_32$ms_1
command_db["GFLOPS_FP32_1"] = command_db_32$GFLOPS_1
command_db["GBs_FP32_1"] = command_db_32$GBs_1
command_db["ms_FP32_2"] = command_db_32$ms_2
command_db["GFLOPS_FP32_2"] = command_db_32$GFLOPS_2
command_db["GBs_FP32_2"] = command_db_32$GBs_2
}
# Append the results to the final dataframe
if (command_id == 1) {
db <- command_db
} else {
db <- rbind(db, command_db)
}
}
print(db)
# Sets the values on the x-axis and their labels (test dependent)
if (is.character(test_xaxis[[test_id]][[1]])) {
xdata <- db[,test_xaxis[[test_id]][[1]]]
xtics <- xdata
log_scale <- test_xaxis[[test_id]][[2]]
}
else {
xdata <- test_xaxis[[test_id]][[1]]
xtics <- test_xaxis[[test_id]][[2]]
log_scale <- ""
}
# Plots the graph with GFLOPS on the Y-axis
if (metric_gflops) {
if (precision == 16) {
ydata = list(db$GFLOPS_1, db$GFLOPS_FP32_1, db$GFLOPS_FP32_2)
ymax = max(max(db$GFLOPS_1), max(db$GFLOPS_FP32_1), max(db$GFLOPS_FP32_2))
} else {
ydata = list(db$GFLOPS_1, db$GFLOPS_2)
ymax = max(max(db$GFLOPS_1), max(db$GFLOPS_2))
}
plot_graph(xdata=xdata, ydata=ydata, log_setting=log_scale,
xmin=min(xdata), xmax=max(xdata),
ymin=0, ymax=ymax,
xtics=xtics,
xlabel=test_xlabels[[test_id]], ylabel="GFLOPS (higher is better)",
graph_title=paste(display_name, test_names[[test_id]], sep=" "),
multiple=50, experiment_names=library_names, colourset=colourset)
# Plots the graph with GB/s on the Y-axis
} else {
if (precision == 16) {
ydata = list(db$GBs_1, db$GBs_FP32_1, db$GBs_FP32_2)
ymax = max(max(db$GBs_1), max(db$GBs_FP32_1), max(db$GBs_FP32_2))
} else {
ydata = list(db$GBs_1, db$GBs_2)
ymax = max(max(db$GBs_1), max(db$GBs_2))
}
plot_graph(xdata=xdata, ydata=ydata, log_setting=log_scale,
xmin=min(xdata), xmax=max(xdata),
ymin=0, ymax=ymax,
xtics=xtics,
xlabel=test_xlabels[[test_id]], ylabel="GB/s (higher is better)",
graph_title=paste(display_name, test_names[[test_id]], sep=" "),
multiple=10, experiment_names=library_names, colourset=colourset)
}
}
}
# ==================================================================================================
# Plots data
plot_graph <- function(xdata, ydata, log_setting,
xmin, xmax, ymin, ymax,
xtics, xlabel, ylabel,
graph_title,
multiple, experiment_names, colourset) {
# Update the ymax to the next multiple of something
ymax <- multiple*ceiling(ymax/multiple)
# Add kilo or mega to the x-labels
for (i in 1:length(xtics)) {
if (!is.na(as.numeric(xtics[i]))) {
if (as.numeric(xtics[i])%%mega == 0) {
xtics[i] <- paste(as.character(as.numeric(xtics[i])/mega), "M", sep="")
} else if (as.numeric(xtics[i])%%kilo == 0) {
xtics[i] <- paste(as.character(as.numeric(xtics[i])/kilo), "K", sep="")
}
}
}
# Creates an initial graph with axis but without data
par(new=F)
plot(x=xmin:xmax, y=rep(1, length(xmin:xmax)), log=log_setting,
main="", xlab="", ylab="",
ylim=c(ymin, ymax), xlim=c(xmin, xmax), axes=F, "n")
axis(side=2, las=2)
if (length(xdata) > xtics_subset_threshold) { # Too many indices to print, plot only every Nth
subset <- seq(from=1, to=length(xdata), by=xtics_subset_stepsize)
axis(side=1, at=xdata[subset], labels=xtics[subset], las=2)
} else {
axis(side=1, at=xdata, labels=xtics, las=2)
}
title(xlab=xlabel, line=-1)
title(ylab=ylabel, line=2)
title(graph_title, line=-2)
par(new=T)
# Loops over all experiments
num_experiments <- length(ydata)
for (id in 1:num_experiments) {
# Plots the data for this experiment
plot(x=xdata, y=ydata[[id]], log=log_setting,
col=colourset[id], pch=pchs[id], lty=1, lwd=1, cex=1,
xlab="", ylab="", ylim=c(ymin, ymax), xlim=c(xmin, xmax),
axes=F, "b", xpd=T)
par(new=T)
}
# Add a legend
legend("bottomright", experiment_names,
lwd=1, ncol=1, col=colourset, pch=pchs, lty=1, cex=1,
bty="n", xpd=T)
# Done
par(new=F)
}
# ==================================================================================================

View File

@ -1,96 +0,0 @@
# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project uses a tab-size of two spaces and a max-width of 100 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
#
# This file implements the performance script for the Xaxpy routine
#
# ==================================================================================================
# Includes the common functions
args <- commandArgs(trailingOnly = FALSE)
thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)])))
source(file.path(dirname(thisfile), "common.r"))
# ==================================================================================================
# Settings
routine_name <- "xaxpy"
parameters <- c("-n","-incx","-incy",
"-num_steps","-step","-runs","-precision")
precision <- 32
# Sets the names of the test-cases
test_names <- list(
"multiples of 256K",
"multiples of 256K (+1)",
"around n=1M",
"around n=16M",
"strides (n=8M)",
"powers of 2"
)
# Defines the test-cases
test_values <- list(
list(c(256*kilo, 1, 1, 16, 256*kilo, num_runs, precision)),
list(c(256*kilo+1, 1, 1, 16, 256*kilo, num_runs, precision)),
list(c(1*mega, 1, 1, 16, 1, num_runs, precision)),
list(c(16*mega, 1, 1, 16, 1, num_runs, precision)),
list(
c(8*mega, 1, 1, 1, 0, num_runs, precision),
c(8*mega, 2, 1, 1, 0, num_runs, precision),
c(8*mega, 4, 1, 1, 0, num_runs, precision),
c(8*mega, 8, 1, 1, 0, num_runs, precision),
c(8*mega, 1, 2, 1, 0, num_runs, precision),
c(8*mega, 1, 4, 1, 0, num_runs, precision),
c(8*mega, 1, 8, 1, 0, num_runs, precision),
c(8*mega, 2, 2, 1, 0, num_runs, precision),
c(8*mega, 4, 4, 1, 0, num_runs, precision),
c(8*mega, 8, 8, 1, 0, num_runs, precision)
),
list(
c(32*kilo, 1, 1, 1, 0, num_runs, precision),
c(64*kilo, 1, 1, 1, 0, num_runs, precision),
c(128*kilo, 1, 1, 1, 0, num_runs, precision),
c(256*kilo, 1, 1, 1, 0, num_runs, precision),
c(512*kilo, 1, 1, 1, 0, num_runs, precision),
c(1*mega, 1, 1, 1, 0, num_runs, precision),
c(2*mega, 1, 1, 1, 0, num_runs, precision),
c(4*mega, 1, 1, 1, 0, num_runs, precision),
c(8*mega, 1, 1, 1, 0, num_runs, precision),
c(16*mega, 1, 1, 1, 0, num_runs, precision),
c(32*mega, 1, 1, 1, 0, num_runs, precision),
c(64*mega, 1, 1, 1, 0, num_runs, precision)
)
)
# Defines the x-labels corresponding to the test-cases
test_xlabels <- list(
"vector sizes (n)",
"vector sizes (n)",
"vector sizes (n)",
"vector sizes (n)",
"increments/strides for x and y",
"vector sizes (n)"
)
# Defines the x-axis of the test-cases
test_xaxis <- list(
c("n", ""),
c("n", ""),
c("n", ""),
c("n", ""),
list(1:10, c("x1y1", "x2y1", "x4y1", "x8y1", "x1y2", "x1y4", "x1y8", "x2y2", "x4y4", "x8y8")),
c("n", "x")
)
# ==================================================================================================
# Start the script
main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values,
test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=FALSE)
# ==================================================================================================

View File

@ -1,94 +0,0 @@
# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project uses a tab-size of two spaces and a max-width of 100 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
#
# This file implements the performance script for the Xgemm routine
#
# ==================================================================================================
# Includes the common functions
args <- commandArgs(trailingOnly = FALSE)
thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)])))
source(file.path(dirname(thisfile), "common.r"))
# ==================================================================================================
# Settings
routine_name <- "xgemm"
parameters <- c("-m","-n","-k","-layout","-transA","-transB",
"-num_steps","-step","-runs","-precision")
precision <- 32
# Sets the names of the test-cases
test_names <- list(
"multiples of 128",
"multiples of 128 (+1)",
"around m=n=k=512",
"around m=n=k=2048",
"layouts and transposing (m=n=k=1024)",
"powers of 2"
)
# Defines the test-cases
test_values <- list(
list(c( 128, 128, 128, 102, 111, 111, 16, 128, num_runs, precision)),
list(c( 129, 129, 129, 102, 111, 111, 16, 128, num_runs, precision)),
list(c( 512, 512, 512, 102, 111, 111, 16, 1, num_runs, precision)),
list(c(2048, 2048, 2048, 102, 111, 111, 16, 1, num_runs, precision)),
list(
c(1024, 1024, 1024, 101, 111, 111, 1, 0, num_runs, precision),
c(1024, 1024, 1024, 101, 111, 112, 1, 0, num_runs, precision),
c(1024, 1024, 1024, 101, 112, 111, 1, 0, num_runs, precision),
c(1024, 1024, 1024, 101, 112, 112, 1, 0, num_runs, precision),
c(1024, 1024, 1024, 102, 111, 111, 1, 0, num_runs, precision),
c(1024, 1024, 1024, 102, 111, 112, 1, 0, num_runs, precision),
c(1024, 1024, 1024, 102, 112, 111, 1, 0, num_runs, precision),
c(1024, 1024, 1024, 102, 112, 112, 1, 0, num_runs, precision)
),
list(
c( 8, 8, 8, 102, 111, 111, 1, 0, num_runs, precision),
c( 16, 16, 16, 102, 111, 111, 1, 0, num_runs, precision),
c( 32, 32, 32, 102, 111, 111, 1, 0, num_runs, precision),
c( 64, 64, 64, 102, 111, 111, 1, 0, num_runs, precision),
c( 128, 128, 128, 102, 111, 111, 1, 0, num_runs, precision),
c( 256, 256, 256, 102, 111, 111, 1, 0, num_runs, precision),
c( 512, 512, 512, 102, 111, 111, 1, 0, num_runs, precision),
c(1024, 1024, 1024, 102, 111, 111, 1, 0, num_runs, precision),
c(2048, 2048, 2048, 102, 111, 111, 1, 0, num_runs, precision),
c(4096, 4096, 4096, 102, 111, 111, 1, 0, num_runs, precision),
c(8192, 8192, 8192, 102, 111, 111, 1, 0, num_runs, precision)
)
)
# Defines the x-labels corresponding to the test-cases
test_xlabels <- list(
"matrix sizes (m=n=k)",
"matrix sizes (m=n=k)",
"matrix sizes (m=n=k)",
"matrix sizes (m=n=k)",
"layout (row/col), transA (n/y), transB (n/y)",
"matrix sizes (m=n=k)"
)
# Defines the x-axis of the test-cases
test_xaxis <- list(
c("m", ""),
c("m", ""),
c("m", ""),
c("m", ""),
list(1:8, c("row,n,n", "row,n,y", "row,y,n", "row,y,y",
"col,n,n", "col,n,y", "col,y,n", "col,y,y")),
c("m", "x")
)
# ==================================================================================================
# Start the script
main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values,
test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE)
# ==================================================================================================

View File

@ -1,56 +0,0 @@
# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project uses a tab-size of two spaces and a max-width of 100 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
#
# This file implements the performance script for small sizes of Xgemm, testing the direct kernel
#
# ==================================================================================================
# Includes the common functions
args <- commandArgs(trailingOnly = FALSE)
thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)])))
source(file.path(dirname(thisfile), "common.r"))
# ==================================================================================================
# Settings
routine_name <- "xgemm"
parameters <- c("-m","-n","-k","-layout","-transA","-transB",
"-num_steps","-step","-runs","-precision")
precision <- 32
# Sets the names of the test-cases
test_names <- list(
"small matrices in steps of 16",
"small matrices in steps of 1"
)
# Defines the test-cases
test_values <- list(
list(c( 128, 128, 128, 102, 111, 111, 57, 16, num_runs_short, precision)),
list(c( 128, 128, 128, 102, 111, 111, 385, 1, num_runs_short, precision))
)
# Defines the x-labels corresponding to the test-cases
test_xlabels <- list(
"matrix sizes (m=n=k)",
"matrix sizes (m=n=k)"
)
# Defines the x-axis of the test-cases
test_xaxis <- list(
c("m", ""),
c("m", "")
)
# ==================================================================================================
# Start the script
main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values,
test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE)
# ==================================================================================================

View File

@ -1,83 +0,0 @@
# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project uses a tab-size of two spaces and a max-width of 100 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
#
# This file implements the performance script for the Xgemv routine
#
# ==================================================================================================
# Includes the common functions
args <- commandArgs(trailingOnly = FALSE)
thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)])))
source(file.path(dirname(thisfile), "common.r"))
# ==================================================================================================
# Settings
routine_name <- "xgemv"
parameters <- c("-n","-m","-incx","-incy","-layout",
"-num_steps","-step","-runs","-precision")
precision <- 32
# Sets the names of the test-cases
test_names <- list(
"multiples of 256",
"multiples of 256 (+1)",
"around n=m=2K",
"multiples of 256 [rotated]",
"multiples of 256 (+1) [rotated]",
"strides (n=2K)"
)
# Defines the test-cases
test_values <- list(
list(c(256, 256, 1, 1, 102, 16, 256, num_runs, precision)),
list(c(256+1, 256+1, 1, 1, 102, 16, 256, num_runs, precision)),
list(c(2*kilo, 2*kilo, 1, 1, 102, 16, 1, num_runs, precision)),
list(c(256, 256, 1, 1, 101, 16, 256, num_runs, precision)),
list(c(256+1, 256+1, 1, 1, 101, 16, 256, num_runs, precision)),
list(
c(2*kilo, 2*kilo, 1, 1, 102, 1, 0, num_runs, precision),
c(2*kilo, 2*kilo, 2, 1, 102, 1, 0, num_runs, precision),
c(2*kilo, 2*kilo, 4, 1, 102, 1, 0, num_runs, precision),
c(2*kilo, 2*kilo, 8, 1, 102, 1, 0, num_runs, precision),
c(2*kilo, 2*kilo, 1, 2, 102, 1, 0, num_runs, precision),
c(2*kilo, 2*kilo, 1, 4, 102, 1, 0, num_runs, precision),
c(2*kilo, 2*kilo, 1, 8, 102, 1, 0, num_runs, precision),
c(2*kilo, 2*kilo, 2, 2, 102, 1, 0, num_runs, precision),
c(2*kilo, 2*kilo, 4, 4, 102, 1, 0, num_runs, precision),
c(2*kilo, 2*kilo, 8, 8, 102, 1, 0, num_runs, precision)
)
)
# Defines the x-labels corresponding to the test-cases
test_xlabels <- list(
"vector sizes (n)",
"vector sizes (n)",
"vector sizes (n)",
"vector sizes (n)",
"vector sizes (n)",
"increments/strides for x and y"
)
# Defines the x-axis of the test-cases
test_xaxis <- list(
c("n", ""),
c("n", ""),
c("n", ""),
c("n", ""),
c("n", ""),
list(1:10, c("x1y1", "x2y1", "x4y1", "x8y1", "x1y2", "x1y4", "x1y8", "x2y2", "x4y4", "x8y8"))
)
# ==================================================================================================
# Start the script
main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values,
test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=FALSE)
# ==================================================================================================

View File

@ -1,94 +0,0 @@
# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project uses a tab-size of two spaces and a max-width of 100 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
#
# This file implements the performance script for the Xsymm routine
#
# ==================================================================================================
# Includes the common functions
args <- commandArgs(trailingOnly = FALSE)
thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)])))
source(file.path(dirname(thisfile), "common.r"))
# ==================================================================================================
# Settings
routine_name <- "xsymm"
parameters <- c("-m","-n","-layout","-side","-triangle",
"-num_steps","-step","-runs","-precision")
precision <- 32
# Sets the names of the test-cases
test_names <- list(
"multiples of 128",
"multiples of 128 (+1)",
"around m=n=512",
"around m=n=2048",
"layouts and side/triangle (m=n=1024)",
"powers of 2"
)
# Defines the test-cases
test_values <- list(
list(c( 128, 128, 102, 141, 121, 16, 128, num_runs, precision)),
list(c( 129, 129, 102, 141, 121, 16, 128, num_runs, precision)),
list(c( 512, 512, 102, 141, 121, 16, 1, num_runs, precision)),
list(c(2048, 2048, 102, 141, 121, 16, 1, num_runs, precision)),
list(
c(1024, 1024, 101, 141, 121, 1, 0, num_runs, precision),
c(1024, 1024, 101, 141, 122, 1, 0, num_runs, precision),
c(1024, 1024, 101, 142, 121, 1, 0, num_runs, precision),
c(1024, 1024, 101, 142, 122, 1, 0, num_runs, precision),
c(1024, 1024, 102, 141, 121, 1, 0, num_runs, precision),
c(1024, 1024, 102, 141, 122, 1, 0, num_runs, precision),
c(1024, 1024, 102, 142, 121, 1, 0, num_runs, precision),
c(1024, 1024, 102, 142, 122, 1, 0, num_runs, precision)
),
list(
c( 8, 8, 102, 141, 121, 1, 0, num_runs, precision),
c( 16, 16, 102, 141, 121, 1, 0, num_runs, precision),
c( 32, 32, 102, 141, 121, 1, 0, num_runs, precision),
c( 64, 64, 102, 141, 121, 1, 0, num_runs, precision),
c( 128, 128, 102, 141, 121, 1, 0, num_runs, precision),
c( 256, 256, 102, 141, 121, 1, 0, num_runs, precision),
c( 512, 512, 102, 141, 121, 1, 0, num_runs, precision),
c(1024, 1024, 102, 141, 121, 1, 0, num_runs, precision),
c(2048, 2048, 102, 141, 121, 1, 0, num_runs, precision),
c(4096, 4096, 102, 141, 121, 1, 0, num_runs, precision),
c(8192, 8192, 102, 141, 121, 1, 0, num_runs, precision)
)
)
# Defines the x-labels corresponding to the test-cases
test_xlabels <- list(
"matrix sizes (m=n)",
"matrix sizes (m=n)",
"matrix sizes (m=n)",
"matrix sizes (m=n)",
"layout (row/col), side (l/r), triangle (up/lo)",
"matrix sizes (m=n)"
)
# Defines the x-axis of the test-cases
test_xaxis <- list(
c("m", ""),
c("m", ""),
c("m", ""),
c("m", ""),
list(1:8, c("row,l,up", "row,r,up", "row,l,lo", "row,r,lo",
"col,l,up", "col,r,up", "col,l,lo", "col,r,lo")),
c("m", "x")
)
# ==================================================================================================
# Start the script
main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values,
test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE)
# ==================================================================================================

View File

@ -1,94 +0,0 @@
# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project uses a tab-size of two spaces and a max-width of 100 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
#
# This file implements the performance script for the Xsyr2k routine
#
# ==================================================================================================
# Includes the common functions
args <- commandArgs(trailingOnly = FALSE)
thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)])))
source(file.path(dirname(thisfile), "common.r"))
# ==================================================================================================
# Settings
routine_name <- "xsyr2k"
parameters <- c("-n","-k","-layout","-triangle","-transA",
"-num_steps","-step","-runs","-precision")
precision <- 32
# Sets the names of the test-cases
test_names <- list(
"multiples of 128",
"multiples of 128 (+1)",
"around n=k=512",
"around n=k=1536",
"layouts and transposing (n=k=1024)",
"powers of 2"
)
# Defines the test-cases
test_values <- list(
list(c( 128, 128, 102, 111, 111, 16, 128, num_runs, precision)),
list(c( 129, 129, 102, 111, 111, 16, 128, num_runs, precision)),
list(c( 512, 512, 102, 111, 111, 16, 1, num_runs, precision)),
list(c(1536, 1536, 102, 111, 111, 16, 1, num_runs, precision)),
list(
c(1024, 1024, 101, 111, 111, 1, 0, num_runs, precision),
c(1024, 1024, 101, 111, 112, 1, 0, num_runs, precision),
c(1024, 1024, 101, 112, 111, 1, 0, num_runs, precision),
c(1024, 1024, 101, 112, 112, 1, 0, num_runs, precision),
c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision),
c(1024, 1024, 102, 111, 112, 1, 0, num_runs, precision),
c(1024, 1024, 102, 112, 111, 1, 0, num_runs, precision),
c(1024, 1024, 102, 112, 112, 1, 0, num_runs, precision)
),
list(
c( 8, 8, 102, 111, 111, 1, 0, num_runs, precision),
c( 16, 16, 102, 111, 111, 1, 0, num_runs, precision),
c( 32, 32, 102, 111, 111, 1, 0, num_runs, precision),
c( 64, 64, 102, 111, 111, 1, 0, num_runs, precision),
c( 128, 128, 102, 111, 111, 1, 0, num_runs, precision),
c( 256, 256, 102, 111, 111, 1, 0, num_runs, precision),
c( 512, 512, 102, 111, 111, 1, 0, num_runs, precision),
c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision),
c(2048, 2048, 102, 111, 111, 1, 0, num_runs, precision),
c(4096, 4096, 102, 111, 111, 1, 0, num_runs, precision),
c(8192, 8192, 102, 111, 111, 1, 0, num_runs, precision)
)
)
# Defines the x-labels corresponding to the test-cases
test_xlabels <- list(
"matrix sizes (n=k)",
"matrix sizes (n=k)",
"matrix sizes (n=k)",
"matrix sizes (n=k)",
"layout (row/col), triangle (u/l), transA (n/y)",
"matrix sizes (n=k)"
)
# Defines the x-axis of the test-cases
test_xaxis <- list(
c("n", ""),
c("n", ""),
c("n", ""),
c("n", ""),
list(1:8, c("row,u,n", "row,u,y", "row,l,n", "row,l,y",
"col,u,n", "col,u,y", "col,l,n", "col,l,y")),
c("n", "x")
)
# ==================================================================================================
# Start the script
main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values,
test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE)
# ==================================================================================================

View File

@ -1,94 +0,0 @@
# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project uses a tab-size of two spaces and a max-width of 100 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
#
# This file implements the performance script for the Xsyrk routine
#
# ==================================================================================================
# Includes the common functions
args <- commandArgs(trailingOnly = FALSE)
thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)])))
source(file.path(dirname(thisfile), "common.r"))
# ==================================================================================================
# Settings
routine_name <- "xsyrk"
parameters <- c("-n","-k","-layout","-triangle","-transA",
"-num_steps","-step","-runs","-precision")
precision <- 32
# Sets the names of the test-cases
test_names <- list(
"multiples of 128",
"multiples of 128 (+1)",
"around n=k=512",
"around n=k=2048",
"layouts and transposing (n=k=1024)",
"powers of 2"
)
# Defines the test-cases
test_values <- list(
list(c( 128, 128, 102, 121, 111, 16, 128, num_runs, precision)),
list(c( 129, 129, 102, 121, 111, 16, 128, num_runs, precision)),
list(c( 512, 512, 102, 121, 111, 16, 1, num_runs, precision)),
list(c(2048, 2048, 102, 121, 111, 16, 1, num_runs, precision)),
list(
c(1024, 1024, 101, 121, 111, 1, 0, num_runs, precision),
c(1024, 1024, 101, 121, 112, 1, 0, num_runs, precision),
c(1024, 1024, 101, 122, 111, 1, 0, num_runs, precision),
c(1024, 1024, 101, 122, 112, 1, 0, num_runs, precision),
c(1024, 1024, 102, 121, 111, 1, 0, num_runs, precision),
c(1024, 1024, 102, 121, 112, 1, 0, num_runs, precision),
c(1024, 1024, 102, 122, 111, 1, 0, num_runs, precision),
c(1024, 1024, 102, 122, 112, 1, 0, num_runs, precision)
),
list(
c( 8, 8, 102, 121, 111, 1, 0, num_runs, precision),
c( 16, 16, 102, 121, 111, 1, 0, num_runs, precision),
c( 32, 32, 102, 121, 111, 1, 0, num_runs, precision),
c( 64, 64, 102, 121, 111, 1, 0, num_runs, precision),
c( 128, 128, 102, 121, 111, 1, 0, num_runs, precision),
c( 256, 256, 102, 121, 111, 1, 0, num_runs, precision),
c( 512, 512, 102, 121, 111, 1, 0, num_runs, precision),
c(1024, 1024, 102, 121, 111, 1, 0, num_runs, precision),
c(2048, 2048, 102, 121, 111, 1, 0, num_runs, precision),
c(4096, 4096, 102, 121, 111, 1, 0, num_runs, precision),
c(8192, 8192, 102, 121, 111, 1, 0, num_runs, precision)
)
)
# Defines the x-labels corresponding to the test-cases
test_xlabels <- list(
"matrix sizes (n=k)",
"matrix sizes (n=k)",
"matrix sizes (n=k)",
"matrix sizes (n=k)",
"layout (row/col), triangle (u/l), transA (n/y)",
"matrix sizes (n=k)"
)
# Defines the x-axis of the test-cases
test_xaxis <- list(
c("n", ""),
c("n", ""),
c("n", ""),
c("n", ""),
list(1:8, c("row,u,n", "row,u,y", "row,l,n", "row,l,y",
"col,u,n", "col,u,y", "col,l,n", "col,l,y")),
c("n", "x")
)
# ==================================================================================================
# Start the script
main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values,
test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE)
# ==================================================================================================

View File

@ -1,127 +0,0 @@
# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project uses a tab-size of two spaces and a max-width of 100 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
#
# This file implements the performance script for the Xtrmm routine
#
# ==================================================================================================
# Includes the common functions
args <- commandArgs(trailingOnly = FALSE)
thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)])))
source(file.path(dirname(thisfile), "common.r"))
# ==================================================================================================
# Settings
routine_name <- "xtrmm"
parameters <- c("-m","-n","-layout","-side","-triangle","-transA","-diagonal",
"-num_steps","-step","-runs","-precision")
precision <- 32
# Sets the names of the test-cases
test_names <- list(
"multiples of 128",
"multiples of 128 (+1)",
"around m=n=512",
"around m=n=2048",
"layouts and side/triangle (m=n=1024)",
"powers of 2"
)
# Defines the test-cases
test_values <- list(
list(c( 128, 128, 102, 141, 121, 111, 131, 16, 128, num_runs, precision)),
list(c( 129, 129, 102, 141, 121, 111, 131, 16, 128, num_runs, precision)),
list(c( 512, 512, 102, 141, 121, 111, 131, 16, 1, num_runs, precision)),
list(c(2048, 2048, 102, 141, 121, 111, 131, 16, 1, num_runs, precision)),
list(
c(1024, 1024, 101, 141, 121, 111, 131, 1, 0, num_runs, precision),
c(1024, 1024, 101, 141, 121, 111, 132, 1, 0, num_runs, precision),
c(1024, 1024, 101, 141, 121, 112, 131, 1, 0, num_runs, precision),
c(1024, 1024, 101, 141, 121, 112, 132, 1, 0, num_runs, precision),
c(1024, 1024, 101, 141, 122, 111, 131, 1, 0, num_runs, precision),
c(1024, 1024, 101, 141, 122, 111, 132, 1, 0, num_runs, precision),
c(1024, 1024, 101, 141, 122, 112, 131, 1, 0, num_runs, precision),
c(1024, 1024, 101, 141, 122, 112, 132, 1, 0, num_runs, precision),
c(1024, 1024, 101, 142, 121, 111, 131, 1, 0, num_runs, precision),
c(1024, 1024, 101, 142, 121, 111, 132, 1, 0, num_runs, precision),
c(1024, 1024, 101, 142, 121, 112, 131, 1, 0, num_runs, precision),
c(1024, 1024, 101, 142, 121, 112, 132, 1, 0, num_runs, precision),
c(1024, 1024, 101, 142, 122, 111, 131, 1, 0, num_runs, precision),
c(1024, 1024, 101, 142, 122, 111, 132, 1, 0, num_runs, precision),
c(1024, 1024, 101, 142, 122, 112, 131, 1, 0, num_runs, precision),
c(1024, 1024, 101, 142, 122, 112, 132, 1, 0, num_runs, precision),
c(1024, 1024, 102, 141, 121, 111, 131, 1, 0, num_runs, precision),
c(1024, 1024, 102, 141, 121, 111, 132, 1, 0, num_runs, precision),
c(1024, 1024, 102, 141, 121, 112, 131, 1, 0, num_runs, precision),
c(1024, 1024, 102, 141, 121, 112, 132, 1, 0, num_runs, precision),
c(1024, 1024, 102, 141, 122, 111, 131, 1, 0, num_runs, precision),
c(1024, 1024, 102, 141, 122, 111, 132, 1, 0, num_runs, precision),
c(1024, 1024, 102, 141, 122, 112, 131, 1, 0, num_runs, precision),
c(1024, 1024, 102, 141, 122, 112, 132, 1, 0, num_runs, precision),
c(1024, 1024, 102, 142, 121, 111, 131, 1, 0, num_runs, precision),
c(1024, 1024, 102, 142, 121, 111, 132, 1, 0, num_runs, precision),
c(1024, 1024, 102, 142, 121, 112, 131, 1, 0, num_runs, precision),
c(1024, 1024, 102, 142, 121, 112, 132, 1, 0, num_runs, precision),
c(1024, 1024, 102, 142, 122, 111, 131, 1, 0, num_runs, precision),
c(1024, 1024, 102, 142, 122, 111, 132, 1, 0, num_runs, precision),
c(1024, 1024, 102, 142, 122, 112, 131, 1, 0, num_runs, precision),
c(1024, 1024, 102, 142, 122, 112, 132, 1, 0, num_runs, precision)
),
list(
c( 8, 8, 102, 141, 121, 111, 131, 1, 0, num_runs, precision),
c( 16, 16, 102, 141, 121, 111, 131, 1, 0, num_runs, precision),
c( 32, 32, 102, 141, 121, 111, 131, 1, 0, num_runs, precision),
c( 64, 64, 102, 141, 121, 111, 131, 1, 0, num_runs, precision),
c( 128, 128, 102, 141, 121, 111, 131, 1, 0, num_runs, precision),
c( 256, 256, 102, 141, 121, 111, 131, 1, 0, num_runs, precision),
c( 512, 512, 102, 141, 121, 111, 131, 1, 0, num_runs, precision),
c(1024, 1024, 102, 141, 121, 111, 131, 1, 0, num_runs, precision),
c(2048, 2048, 102, 141, 121, 111, 131, 1, 0, num_runs, precision),
c(4096, 4096, 102, 141, 121, 111, 131, 1, 0, num_runs, precision),
c(8192, 8192, 102, 141, 121, 111, 131, 1, 0, num_runs, precision)
)
)
# Defines the x-labels corresponding to the test-cases
test_xlabels <- list(
"matrix sizes (m=n)",
"matrix sizes (m=n)",
"matrix sizes (m=n)",
"matrix sizes (m=n)",
"layout (row/col), side (l/r), triangle (up/lo), transA (n/y), diag (u/nu)",
"matrix sizes (m=n)"
)
# Defines the x-axis of the test-cases
test_xaxis <- list(
c("m", ""),
c("m", ""),
c("m", ""),
c("m", ""),
list(1:32, c("row,l,up,n,u", "row,l,up,n,nu", "row,l,up,y,u", "row,l,up,y,nu",
"row,r,up,n,u", "row,r,up,n,nu", "row,r,up,y,u", "row,r,up,y,nu",
"row,l,lo,n,u", "row,l,lo,n,nu", "row,l,lo,y,u", "row,l,lo,y,nu",
"row,r,lo,n,u", "row,r,lo,n,nu", "row,r,lo,y,u", "row,r,lo,y,nu",
"col,l,up,n,u", "col,l,up,n,nu", "col,l,up,y,u", "col,l,up,y,nu",
"col,r,up,n,u", "col,r,up,n,nu", "col,r,up,y,u", "col,r,up,y,nu",
"col,l,lo,n,u", "col,l,lo,n,nu", "col,l,lo,y,u", "col,l,lo,y,nu",
"col,r,lo,n,u", "col,r,lo,n,nu", "col,r,lo,y,u", "col,r,lo,y,nu")),
c("m", "x")
)
# ==================================================================================================
# Start the script
main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values,
test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE)
# ==================================================================================================

View File

@ -15,108 +15,116 @@
#include <vector>
#include <mutex>
#include "database/database.hpp"
#include "cache.hpp"
namespace clblast {
// =================================================================================================
// Stores the compiled binary or IR in the cache
void StoreBinaryToCache(const std::string &binary, const std::string &device_name,
const Precision &precision, const std::string &routine_name) {
#ifdef VERBOSE
printf("[DEBUG] Storing binary in cache\n");
#endif
binary_cache_mutex_.lock();
binary_cache_.push_back(BinaryCache{binary, device_name, precision, routine_name});
binary_cache_mutex_.unlock();
}
template <typename Key, typename Value>
template <typename U>
Value Cache<Key, Value>::Get(const U &key, bool *in_cache) const {
std::lock_guard<std::mutex> lock(cache_mutex_);
// Stores the compiled program in the cache
void StoreProgramToCache(const Program &program, const Context &context,
const Precision &precision, const std::string &routine_name) {
#ifdef VERBOSE
printf("[DEBUG] Storing program in cache\n");
#endif
program_cache_mutex_.lock();
program_cache_.push_back(ProgramCache{program, context(), precision, routine_name});
program_cache_mutex_.unlock();
}
// Queries the cache and retrieves a matching binary. Assumes that the match is available, throws
// otherwise.
const std::string& GetBinaryFromCache(const std::string &device_name, const Precision &precision,
const std::string &routine_name) {
#ifdef VERBOSE
printf("[DEBUG] Retrieving binary from cache\n");
#endif
binary_cache_mutex_.lock();
for (auto &cached_binary: binary_cache_) {
if (cached_binary.MatchInCache(device_name, precision, routine_name)) {
binary_cache_mutex_.unlock();
return cached_binary.binary;
#if __cplusplus >= 201402L
// the generalized (heterogeneous-lookup) std::map::find() of C++14
auto it = cache_.find(key);
#else
// O(n) lookup in a vector
auto it = std::find_if(cache_.begin(), cache_.end(), [&] (const std::pair<Key, Value> &pair) {
return pair.first == key;
});
#endif
if (it == cache_.end()) {
if (in_cache) {
*in_cache = false;
}
return Value();
}
binary_cache_mutex_.unlock();
throw LogicError("GetBinaryFromCache: Expected binary in cache, but found none");
if (in_cache) {
*in_cache = true;
}
return it->second;
}
// Queries the cache and retrieves a matching program. Assumes that the match is available, throws
// otherwise.
const Program& GetProgramFromCache(const Context &context, const Precision &precision,
const std::string &routine_name) {
#ifdef VERBOSE
printf("[DEBUG] Retrieving program from cache\n");
#endif
program_cache_mutex_.lock();
for (auto &cached_program: program_cache_) {
if (cached_program.MatchInCache(context(), precision, routine_name)) {
program_cache_mutex_.unlock();
return cached_program.program;
}
template <typename Key, typename Value>
void Cache<Key, Value>::Store(Key &&key, Value &&value) {
std::lock_guard<std::mutex> lock(cache_mutex_);
#if __cplusplus >= 201402L
// emplace() into a map
auto r = cache_.emplace(std::move(key), std::move(value));
if (!r.second) {
throw LogicError("Cache::Store: object already in cache");
}
program_cache_mutex_.unlock();
throw LogicError("GetProgramFromCache: Expected program in cache, but found none");
#else
// emplace_back() into a vector
cache_.emplace_back(std::move(key), std::move(value));
#endif
}
// Queries the cache to see whether or not the compiled kernel is already there
bool BinaryIsInCache(const std::string &device_name, const Precision &precision,
const std::string &routine_name) {
binary_cache_mutex_.lock();
for (auto &cached_binary: binary_cache_) {
if (cached_binary.MatchInCache(device_name, precision, routine_name)) {
binary_cache_mutex_.unlock();
return true;
template <typename Key, typename Value>
void Cache<Key, Value>::Remove(const Key &key) {
std::lock_guard<std::mutex> lock(cache_mutex_);
#if __cplusplus >= 201402L
cache_.erase(key);
#else
auto it = cache_.begin();
while (it != cache_.end()) {
if ((*it).first == key) {
it = cache_.erase(it);
}
else ++it;
}
binary_cache_mutex_.unlock();
return false;
#endif
}
// Queries the cache to see whether or not the compiled kernel is already there
bool ProgramIsInCache(const Context &context, const Precision &precision,
const std::string &routine_name) {
program_cache_mutex_.lock();
for (auto &cached_program: program_cache_) {
if (cached_program.MatchInCache(context(), precision, routine_name)) {
program_cache_mutex_.unlock();
return true;
template <typename Key, typename Value>
template <int I1, int I2>
void Cache<Key, Value>::RemoveBySubset(const Key &key) {
std::lock_guard<std::mutex> lock(cache_mutex_);
auto it = cache_.begin();
while (it != cache_.end()) {
const auto current_key = (*it).first;
if ((std::get<I1>(key) == std::get<I1>(current_key)) &&
(std::get<I2>(key) == std::get<I2>(current_key))) {
it = cache_.erase(it);
}
else ++it;
}
program_cache_mutex_.unlock();
return false;
}
template <typename Key, typename Value>
void Cache<Key, Value>::Invalidate() {
std::lock_guard<std::mutex> lock(cache_mutex_);
cache_.clear();
}
template <typename Key, typename Value>
Cache<Key, Value> &Cache<Key, Value>::Instance() {
return instance_;
}
template <typename Key, typename Value>
Cache<Key, Value> Cache<Key, Value>::instance_;
// =================================================================================================
// Clears the cache of stored binaries and programs
void CacheClearAll() {
binary_cache_mutex_.lock();
binary_cache_.clear();
binary_cache_mutex_.unlock();
program_cache_mutex_.lock();
program_cache_.clear();
program_cache_mutex_.unlock();
}
template class Cache<BinaryKey, std::string>;
template std::string BinaryCache::Get(const BinaryKeyRef &, bool *) const;
// =================================================================================================
template class Cache<ProgramKey, Program>;
template Program ProgramCache::Get(const ProgramKeyRef &, bool *) const;
template void ProgramCache::RemoveBySubset<1, 2>(const ProgramKey &); // precision and routine name
// =================================================================================================
template class Cache<DatabaseKey, Database>;
template Database DatabaseCache::Get(const DatabaseKeyRef &, bool *) const;
// =================================================================================================
} // namespace clblast
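To make the refactored API concrete, here is a minimal usage sketch (not part of the source) of the miss-then-store pattern that callers of the generic cache follow; CompileProgram() is a hypothetical stand-in for the actual kernel-compilation step, and the key types follow the definitions in cache.hpp.
// Sketch only: typical miss-then-store usage of the generic cache above.
// CompileProgram() is hypothetical; ProgramKey/ProgramKeyRef come from cache.hpp.
Program GetOrBuildProgram(const Context &context, const Precision precision,
                          const std::string &routine_name) {
  auto in_cache = false;
  auto program = ProgramCache::Instance().Get(
      ProgramKeyRef{context(), precision, routine_name}, &in_cache);
  if (!in_cache) {
    program = CompileProgram(context, precision, routine_name);  // hypothetical
    // Store() moves a freshly constructed key; under the C++14 map path a
    // second Store() of the same key throws LogicError (see above).
    ProgramCache::Instance().Store(ProgramKey{context(), precision, routine_name},
                                   Program(program));
  }
  return program;
}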

View File

@ -15,81 +15,92 @@
#define CLBLAST_CACHE_H_
#include <string>
#include <vector>
#include <mutex>
#include <map>
#include "utilities/utilities.hpp"
namespace clblast {
// =================================================================================================
// The cache of compiled OpenCL binaries, along with some meta-data
struct BinaryCache {
std::string binary;
std::string device_name;
Precision precision;
std::string routine_name_;
// The generic thread-safe cache. We assume that the Key may be a heavyweight struct that is not
// normally used by the caller, while the Value is either lightweight or ref-counted.
// Hence, searching by non-Key is supported (if there is a corresponding operator<()), and
// on Store() the Key instance is moved from the caller (because it will likely be constructed
// as temporary at the time of Store()).
template <typename Key, typename Value>
class Cache {
public:
// The cached object is returned by value to avoid racing with Invalidate().
// Due to the lack of std::optional<>, in case of a cache miss we return a default-constructed
// Value and set the flag to false.
template <typename U>
Value Get(const U &key, bool *in_cache) const;
// Finds out whether the properties match
bool MatchInCache(const std::string &ref_device, const Precision &ref_precision,
const std::string &ref_routine) {
return (device_name == ref_device &&
precision == ref_precision &&
routine_name_ == ref_routine);
}
};
// We do not return references to the just-stored object, to avoid racing with Invalidate().
// The caller is expected to store a temporary.
void Store(Key &&key, Value &&value);
void Invalidate();
// The actual cache, implemented as a vector of the above data-type, and its mutex
static std::vector<BinaryCache> binary_cache_;
static std::mutex binary_cache_mutex_;
// Removes all entries with a given key
void Remove(const Key &key);
template <int I1, int I2> void RemoveBySubset(const Key &key); // currently supports 2 indices
static Cache<Key, Value> &Instance();
private:
#if __cplusplus >= 201402L
// The std::less<void> comparator allows searching the cache by any object comparable with Key,
// without constructing a temporary Key
// (see http://en.cppreference.com/w/cpp/utility/functional/less_void,
// http://www.open-std.org/JTC1/SC22/WG21/docs/papers/2013/n3657.htm,
// http://stackoverflow.com/questions/10536788/avoiding-key-construction-for-stdmapfind)
std::map<Key, Value, std::less<void>> cache_;
#else
std::vector<std::pair<Key, Value>> cache_;
#endif
mutable std::mutex cache_mutex_;
static Cache<Key, Value> instance_;
}; // class Cache
// =================================================================================================
// The cache of compiled OpenCL programs, along with some meta-data
struct ProgramCache {
Program program;
cl_context context;
Precision precision;
std::string routine_name_;
// The key struct for the cache of compiled OpenCL binaries
// Order of fields: precision, routine_name, device_name (smaller fields first)
typedef std::tuple<Precision, std::string, std::string> BinaryKey;
typedef std::tuple<const Precision &, const std::string &, const std::string &> BinaryKeyRef;
// Finds out whether the properties match
bool MatchInCache(const cl_context ref_context, const Precision &ref_precision,
const std::string &ref_routine) {
return (context == ref_context &&
precision == ref_precision &&
routine_name_ == ref_routine);
}
};
typedef Cache<BinaryKey, std::string> BinaryCache;
// The actual cache, implemented as a vector of the above data-type, and its mutex
static std::vector<ProgramCache> program_cache_;
static std::mutex program_cache_mutex_;
extern template class Cache<BinaryKey, std::string>;
extern template std::string BinaryCache::Get(const BinaryKeyRef &, bool *) const;
// =================================================================================================
// Stores the compiled binary or program in the cache
void StoreBinaryToCache(const std::string &binary, const std::string &device_name,
const Precision &precision, const std::string &routine_name);
void StoreProgramToCache(const Program &program, const Context &context,
const Precision &precision, const std::string &routine_name);
// The key struct for the cache of compiled OpenCL programs (context-dependent)
// Order of fields: context, precision, routine_name (smaller fields first)
typedef std::tuple<cl_context, Precision, std::string> ProgramKey;
typedef std::tuple<const cl_context &, const Precision &, const std::string &> ProgramKeyRef;
// Queries the cache and retrieves a matching binary or program. Assumes that the match is
// available, throws otherwise.
const std::string& GetBinaryFromCache(const std::string &device_name, const Precision &precision,
const std::string &routine_name);
const Program& GetProgramFromCache(const Context &context, const Precision &precision,
const std::string &routine_name);
typedef Cache<ProgramKey, Program> ProgramCache;
// Queries the cache to see whether or not the compiled kernel is already there
bool BinaryIsInCache(const std::string &device_name, const Precision &precision,
const std::string &routine_name);
bool ProgramIsInCache(const Context &context, const Precision &precision,
const std::string &routine_name);
extern template class Cache<ProgramKey, Program>;
extern template Program ProgramCache::Get(const ProgramKeyRef &, bool *) const;
// =================================================================================================
// Clears the cache of stored binaries
void CacheClearAll();
class Database;
// The key struct for the cache of database maps.
// Order of fields: precision, device_name, kernel_name (smaller fields first)
typedef std::tuple<Precision, std::string, std::string> DatabaseKey;
typedef std::tuple<const Precision &, const std::string &, const std::string &> DatabaseKeyRef;
typedef Cache<DatabaseKey, Database> DatabaseCache;
extern template class Cache<DatabaseKey, Database>;
extern template Database DatabaseCache::Get(const DatabaseKeyRef &, bool *) const;
// =================================================================================================
} // namespace clblast
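As an aside on why the *KeyRef reference-tuples exist: a lookup through a value BinaryKey would construct and copy two std::string fields just to compare them, whereas the reference tuple lets Get() compare in place. A hedged sketch (the helper function itself is hypothetical):
// Sketch only: querying the binary cache without copying the key strings.
// Field order follows BinaryKey: precision, routine_name, device_name.
bool HasBinary(const Precision precision, const std::string &routine_name,
               const std::string &device_name) {
  auto in_cache = false;
  BinaryCache::Instance().Get(BinaryKeyRef{precision, routine_name, device_name},
                              &in_cache);  // the returned binary copy is discarded
  return in_cache;
}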

View File

@ -15,8 +15,8 @@
#include <string>
#include "clblast.h"
#include "cache.hpp"
#include "clblast.h"
// BLAS level-1 includes
#include "routines/level1/xswap.hpp"
@ -45,6 +45,7 @@
#include "routines/level2/xtrmv.hpp"
#include "routines/level2/xtbmv.hpp"
#include "routines/level2/xtpmv.hpp"
#include "routines/level2/xtrsv.hpp"
#include "routines/level2/xger.hpp"
#include "routines/level2/xgeru.hpp"
#include "routines/level2/xgerc.hpp"
@ -66,9 +67,12 @@
#include "routines/level3/xsyr2k.hpp"
#include "routines/level3/xher2k.hpp"
#include "routines/level3/xtrmm.hpp"
#include "routines/level3/xtrsm.hpp"
// Level-x includes (non-BLAS)
#include "routines/levelx/xomatcopy.hpp"
#include "routines/levelx/xaxpybatched.hpp"
#include "routines/levelx/xgemmbatched.hpp"
namespace clblast {
@ -1145,12 +1149,20 @@ template StatusCode PUBLIC_API Tpmv<half>(const Layout, const Triangle, const Tr
// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV
template <typename T>
StatusCode Trsv(const Layout, const Triangle, const Transpose, const Diagonal,
const size_t,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*) {
return StatusCode::kNotImplemented;
StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
try {
auto queue_cpp = Queue(*queue);
auto routine = Xtrsv<T>(queue_cpp, event);
routine.DoTrsv(layout, triangle, a_transpose, diagonal,
n,
Buffer<T>(a_buffer), a_offset, a_ld,
Buffer<T>(x_buffer), x_offset, x_inc);
return StatusCode::kSuccess;
} catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Trsv<float>(const Layout, const Triangle, const Transpose, const Diagonal,
const size_t,
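A hedged usage sketch of the newly implemented routine: solving the n-by-n lower-triangular system A*x = b in single precision. It assumes an already initialized OpenCL queue plus device buffers a_buffer (n*n floats, column-major) and x_buffer (n floats, holding b on entry and the solution x on exit); the wrapper function itself is illustrative only.
#include <clblast.h>

clblast::StatusCode SolveLower(const size_t n,
                               cl_mem a_buffer, cl_mem x_buffer,
                               cl_command_queue queue) {
  cl_event event = nullptr;
  const auto status = clblast::Trsv<float>(
      clblast::Layout::kColMajor, clblast::Triangle::kLower,
      clblast::Transpose::kNo, clblast::Diagonal::kNonUnit,
      n,
      a_buffer, 0, n,   // A, offset, leading dimension
      x_buffer, 0, 1,   // x, offset, increment
      &queue, &event);
  if (status == clblast::StatusCode::kSuccess) {
    clWaitForEvents(1, &event);  // the routine runs asynchronously
    clReleaseEvent(event);
  }
  return status;
}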
@ -2065,15 +2077,24 @@ template StatusCode PUBLIC_API Trmm<half>(const Layout, const Side, const Triang
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
template <typename T>
StatusCode Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal,
const size_t, const size_t,
const T,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*) {
return StatusCode::kNotImplemented;
StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t m, const size_t n,
const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event) {
try {
auto queue_cpp = Queue(*queue);
auto routine = Xtrsm<T>(queue_cpp, event);
routine.DoTrsm(layout, side, triangle, a_transpose, diagonal,
m, n,
alpha,
Buffer<T>(a_buffer), a_offset, a_ld,
Buffer<T>(b_buffer), b_offset, b_ld);
return StatusCode::kSuccess;
} catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API Trsm<float>(const Layout, const Side, const Triangle, const Transpose, const Diagonal,
const size_t, const size_t,
@ -2099,12 +2120,6 @@ template StatusCode PUBLIC_API Trsm<double2>(const Layout, const Side, const Tri
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Trsm<half>(const Layout, const Side, const Triangle, const Transpose, const Diagonal,
const size_t, const size_t,
const half,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// =================================================================================================
// Extra non-BLAS routines (level-X)
@ -2160,16 +2175,222 @@ template StatusCode PUBLIC_API Omatcopy<half>(const Layout, const Transpose,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED
template <typename T>
StatusCode AxpyBatched(const size_t n,
const T *alphas,
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event) {
try {
auto queue_cpp = Queue(*queue);
auto routine = XaxpyBatched<T>(queue_cpp, event);
auto alphas_cpp = std::vector<T>();
auto x_offsets_cpp = std::vector<size_t>();
auto y_offsets_cpp = std::vector<size_t>();
for (auto batch = size_t{0}; batch < batch_count; ++batch) {
alphas_cpp.push_back(alphas[batch]);
x_offsets_cpp.push_back(x_offsets[batch]);
y_offsets_cpp.push_back(y_offsets[batch]);
}
routine.DoAxpyBatched(n,
alphas_cpp,
Buffer<T>(x_buffer), x_offsets_cpp, x_inc,
Buffer<T>(y_buffer), y_offsets_cpp, y_inc,
batch_count);
return StatusCode::kSuccess;
} catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API AxpyBatched<float>(const size_t,
const float*,
const cl_mem, const size_t*, const size_t,
cl_mem, const size_t*, const size_t,
const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API AxpyBatched<double>(const size_t,
const double*,
const cl_mem, const size_t*, const size_t,
cl_mem, const size_t*, const size_t,
const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API AxpyBatched<float2>(const size_t,
const float2*,
const cl_mem, const size_t*, const size_t,
cl_mem, const size_t*, const size_t,
const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API AxpyBatched<double2>(const size_t,
const double2*,
const cl_mem, const size_t*, const size_t,
cl_mem, const size_t*, const size_t,
const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API AxpyBatched<half>(const size_t,
const half*,
const cl_mem, const size_t*, const size_t,
cl_mem, const size_t*, const size_t,
const size_t,
cl_command_queue*, cl_event*);
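A hedged usage sketch for the batched interface: three AXPY operations in one launch, with all sub-vectors packed into a single x and a single y device buffer and selected via per-batch offsets. Buffer contents and sizes (at least 3*n floats each) are assumed to be set up by the caller.
#include <clblast.h>

clblast::StatusCode RunAxpyBatch(const size_t n,
                                 cl_mem x_buffer, cl_mem y_buffer,
                                 cl_command_queue queue) {
  const size_t batch_count = 3;
  const float alphas[] = {1.0f, 2.0f, 3.0f};   // one alpha per batch
  const size_t x_offsets[] = {0, n, 2 * n};    // sub-vector starting points
  const size_t y_offsets[] = {0, n, 2 * n};
  cl_event event = nullptr;
  return clblast::AxpyBatched<float>(n, alphas,
                                     x_buffer, x_offsets, 1,
                                     y_buffer, y_offsets, 1,
                                     batch_count, &queue, &event);
}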
// Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED
template <typename T>
StatusCode GemmBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
const size_t m, const size_t n, const size_t k,
const T *alphas,
const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
const T *betas,
cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
const size_t batch_count,
cl_command_queue* queue, cl_event* event) {
try {
auto queue_cpp = Queue(*queue);
auto routine = XgemmBatched<T>(queue_cpp, event);
auto alphas_cpp = std::vector<T>();
auto betas_cpp = std::vector<T>();
auto a_offsets_cpp = std::vector<size_t>();
auto b_offsets_cpp = std::vector<size_t>();
auto c_offsets_cpp = std::vector<size_t>();
for (auto batch = size_t{0}; batch < batch_count; ++batch) {
alphas_cpp.push_back(alphas[batch]);
betas_cpp.push_back(betas[batch]);
a_offsets_cpp.push_back(a_offsets[batch]);
b_offsets_cpp.push_back(b_offsets[batch]);
c_offsets_cpp.push_back(c_offsets[batch]);
}
routine.DoGemmBatched(layout, a_transpose, b_transpose,
m, n, k,
alphas_cpp,
Buffer<T>(a_buffer), a_offsets_cpp, a_ld,
Buffer<T>(b_buffer), b_offsets_cpp, b_ld,
betas_cpp,
Buffer<T>(c_buffer), c_offsets_cpp, c_ld,
batch_count);
return StatusCode::kSuccess;
} catch (...) { return DispatchException(); }
}
template StatusCode PUBLIC_API GemmBatched<float>(const Layout, const Transpose, const Transpose,
const size_t, const size_t, const size_t,
const float*,
const cl_mem, const size_t*, const size_t,
const cl_mem, const size_t*, const size_t,
const float*,
cl_mem, const size_t*, const size_t,
const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API GemmBatched<double>(const Layout, const Transpose, const Transpose,
const size_t, const size_t, const size_t,
const double*,
const cl_mem, const size_t*, const size_t,
const cl_mem, const size_t*, const size_t,
const double*,
cl_mem, const size_t*, const size_t,
const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API GemmBatched<float2>(const Layout, const Transpose, const Transpose,
const size_t, const size_t, const size_t,
const float2*,
const cl_mem, const size_t*, const size_t,
const cl_mem, const size_t*, const size_t,
const float2*,
cl_mem, const size_t*, const size_t,
const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API GemmBatched<double2>(const Layout, const Transpose, const Transpose,
const size_t, const size_t, const size_t,
const double2*,
const cl_mem, const size_t*, const size_t,
const cl_mem, const size_t*, const size_t,
const double2*,
cl_mem, const size_t*, const size_t,
const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API GemmBatched<half>(const Layout, const Transpose, const Transpose,
const size_t, const size_t, const size_t,
const half*,
const cl_mem, const size_t*, const size_t,
const cl_mem, const size_t*, const size_t,
const half*,
cl_mem, const size_t*, const size_t,
const size_t,
cl_command_queue*, cl_event*);
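Similarly, a hedged sketch for batched GEMM: batch_count equally-sized column-major matrices packed back-to-back in each buffer, with per-batch offsets stepping by the matrix footprints. The buffers are assumed allocated and filled by the caller; alphas and betas may differ per batch but are kept uniform here.
#include <clblast.h>
#include <vector>

clblast::StatusCode RunGemmBatch(const size_t m, const size_t n, const size_t k,
                                 const size_t batch_count,
                                 cl_mem a, cl_mem b, cl_mem c,
                                 cl_command_queue queue) {
  std::vector<float> alphas(batch_count, 1.0f), betas(batch_count, 0.0f);
  std::vector<size_t> a_offs, b_offs, c_offs;
  for (size_t i = 0; i < batch_count; ++i) {
    a_offs.push_back(i * m * k);  // each A is m-by-k
    b_offs.push_back(i * k * n);  // each B is k-by-n
    c_offs.push_back(i * m * n);  // each C is m-by-n
  }
  cl_event event = nullptr;
  return clblast::GemmBatched<float>(
      clblast::Layout::kColMajor, clblast::Transpose::kNo, clblast::Transpose::kNo,
      m, n, k, alphas.data(),
      a, a_offs.data(), m,   // lda = m for column-major non-transposed A
      b, b_offs.data(), k,   // ldb = k
      c, c_offs.data(), m,   // ldc = m
      batch_count, &queue, &event);
}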
// =================================================================================================
// Clears the cache of stored binaries and programs
StatusCode ClearCache() {
try {
CacheClearAll();
ProgramCache::Instance().Invalidate();
BinaryCache::Instance().Invalidate();
} catch (...) { return DispatchException(); }
return StatusCode::kSuccess;
}
template <typename Real, typename Complex>
void FillCacheForPrecision(Queue &queue) {
try {
// Runs all the level 1 set-up functions
Xswap<Real>(queue, nullptr); Xswap<Complex>(queue, nullptr);
Xscal<Real>(queue, nullptr); Xscal<Complex>(queue, nullptr);
Xcopy<Real>(queue, nullptr); Xcopy<Complex>(queue, nullptr);
Xaxpy<Real>(queue, nullptr); Xaxpy<Complex>(queue, nullptr);
Xdot<Real>(queue, nullptr);
Xdotu<Complex>(queue, nullptr);
Xdotc<Complex>(queue, nullptr);
Xnrm2<Real>(queue, nullptr); Xnrm2<Complex>(queue, nullptr);
Xasum<Real>(queue, nullptr); Xasum<Complex>(queue, nullptr);
Xsum<Real>(queue, nullptr); Xsum<Complex>(queue, nullptr);
Xamax<Real>(queue, nullptr); Xamax<Complex>(queue, nullptr);
Xmax<Real>(queue, nullptr); Xmax<Complex>(queue, nullptr);
Xmin<Real>(queue, nullptr); Xmin<Complex>(queue, nullptr);
// Runs all the level 2 set-up functions
Xgemv<Real>(queue, nullptr); Xgemv<Complex>(queue, nullptr);
Xgbmv<Real>(queue, nullptr); Xgbmv<Complex>(queue, nullptr);
Xhemv<Complex>(queue, nullptr);
Xhbmv<Complex>(queue, nullptr);
Xhpmv<Complex>(queue, nullptr);
Xsymv<Real>(queue, nullptr);
Xsbmv<Real>(queue, nullptr);
Xspmv<Real>(queue, nullptr);
Xtrmv<Real>(queue, nullptr); Xtrmv<Complex>(queue, nullptr);
Xtbmv<Real>(queue, nullptr); Xtbmv<Complex>(queue, nullptr);
Xtpmv<Real>(queue, nullptr); Xtpmv<Complex>(queue, nullptr);
Xger<Real>(queue, nullptr);
Xgeru<Complex>(queue, nullptr);
Xgerc<Complex>(queue, nullptr);
Xher<Complex,Real>(queue, nullptr);
Xhpr<Complex,Real>(queue, nullptr);
Xher2<Complex>(queue, nullptr);
Xhpr2<Complex>(queue, nullptr);
Xsyr<Real>(queue, nullptr);
Xspr<Real>(queue, nullptr);
Xsyr2<Real>(queue, nullptr);
Xspr2<Real>(queue, nullptr);
// Runs all the level 3 set-up functions
Xgemm<Real>(queue, nullptr); Xgemm<Complex>(queue, nullptr);
Xsymm<Real>(queue, nullptr); Xsymm<Complex>(queue, nullptr);
Xhemm<Complex>(queue, nullptr);
Xsyrk<Real>(queue, nullptr); Xsyrk<Complex>(queue, nullptr);
Xherk<Complex,Real>(queue, nullptr);
Xsyr2k<Real>(queue, nullptr); Xsyr2k<Complex>(queue, nullptr);
Xher2k<Complex,Real>(queue, nullptr);
Xtrmm<Real>(queue, nullptr); Xtrmm<Complex>(queue, nullptr);
// Runs all the non-BLAS set-up functions
Xomatcopy<Real>(queue, nullptr); Xomatcopy<Complex>(queue, nullptr);
} catch(const RuntimeErrorCode &e) {
if (e.status() != StatusCode::kNoDoublePrecision &&
e.status() != StatusCode::kNoHalfPrecision) {
throw;
}
}
}
// Fills the cache with all binaries for a specific device
// TODO: Add half-precision FP16 set-up calls
StatusCode FillCache(const cl_device_id device) {
@ -2180,58 +2401,52 @@ StatusCode FillCache(const cl_device_id device) {
auto context = Context(device_cpp);
auto queue = Queue(context, device_cpp);
// Runs all the level 1 set-up functions
Xswap<float>(queue, nullptr); Xswap<double>(queue, nullptr); Xswap<float2>(queue, nullptr); Xswap<double2>(queue, nullptr);
Xscal<float>(queue, nullptr); Xscal<double>(queue, nullptr); Xscal<float2>(queue, nullptr); Xscal<double2>(queue, nullptr);
Xcopy<float>(queue, nullptr); Xcopy<double>(queue, nullptr); Xcopy<float2>(queue, nullptr); Xcopy<double2>(queue, nullptr);
Xaxpy<float>(queue, nullptr); Xaxpy<double>(queue, nullptr); Xaxpy<float2>(queue, nullptr); Xaxpy<double2>(queue, nullptr);
Xdot<float>(queue, nullptr); Xdot<double>(queue, nullptr);
Xdotu<float2>(queue, nullptr); Xdotu<double2>(queue, nullptr);
Xdotc<float2>(queue, nullptr); Xdotc<double2>(queue, nullptr);
Xnrm2<float>(queue, nullptr); Xnrm2<double>(queue, nullptr); Xnrm2<float2>(queue, nullptr); Xnrm2<double2>(queue, nullptr);
Xasum<float>(queue, nullptr); Xasum<double>(queue, nullptr); Xasum<float2>(queue, nullptr); Xasum<double2>(queue, nullptr);
Xsum<float>(queue, nullptr); Xsum<double>(queue, nullptr); Xsum<float2>(queue, nullptr); Xsum<double2>(queue, nullptr);
Xamax<float>(queue, nullptr); Xamax<double>(queue, nullptr); Xamax<float2>(queue, nullptr); Xamax<double2>(queue, nullptr);
Xmax<float>(queue, nullptr); Xmax<double>(queue, nullptr); Xmax<float2>(queue, nullptr); Xmax<double2>(queue, nullptr);
Xmin<float>(queue, nullptr); Xmin<double>(queue, nullptr); Xmin<float2>(queue, nullptr); Xmin<double2>(queue, nullptr);
FillCacheForPrecision<float, float2>(queue);
FillCacheForPrecision<double, double2>(queue);
// Runs all the level 2 set-up functions
Xgemv<float>(queue, nullptr); Xgemv<double>(queue, nullptr); Xgemv<float2>(queue, nullptr); Xgemv<double2>(queue, nullptr);
Xgbmv<float>(queue, nullptr); Xgbmv<double>(queue, nullptr); Xgbmv<float2>(queue, nullptr); Xgbmv<double2>(queue, nullptr);
Xhemv<float2>(queue, nullptr); Xhemv<double2>(queue, nullptr);
Xhbmv<float2>(queue, nullptr); Xhbmv<double2>(queue, nullptr);
Xhpmv<float2>(queue, nullptr); Xhpmv<double2>(queue, nullptr);
Xsymv<float>(queue, nullptr); Xsymv<double>(queue, nullptr);
Xsbmv<float>(queue, nullptr); Xsbmv<double>(queue, nullptr);
Xspmv<float>(queue, nullptr); Xspmv<double>(queue, nullptr);
Xtrmv<float>(queue, nullptr); Xtrmv<double>(queue, nullptr); Xtrmv<float2>(queue, nullptr); Xtrmv<double2>(queue, nullptr);
Xtbmv<float>(queue, nullptr); Xtbmv<double>(queue, nullptr); Xtbmv<float2>(queue, nullptr); Xtbmv<double2>(queue, nullptr);
Xtpmv<float>(queue, nullptr); Xtpmv<double>(queue, nullptr); Xtpmv<float2>(queue, nullptr); Xtpmv<double2>(queue, nullptr);
Xger<float>(queue, nullptr); Xger<double>(queue, nullptr);
Xgeru<float2>(queue, nullptr); Xgeru<double2>(queue, nullptr);
Xgerc<float2>(queue, nullptr); Xgerc<double2>(queue, nullptr);
Xher<float2,float>(queue, nullptr); Xher<double2,double>(queue, nullptr);
Xhpr<float2,float>(queue, nullptr); Xhpr<double2,double>(queue, nullptr);
Xher2<float2>(queue, nullptr); Xher2<double2>(queue, nullptr);
Xhpr2<float2>(queue, nullptr); Xhpr2<double2>(queue, nullptr);
Xsyr<float>(queue, nullptr); Xsyr<double>(queue, nullptr);
Xspr<float>(queue, nullptr); Xspr<double>(queue, nullptr);
Xsyr2<float>(queue, nullptr); Xsyr2<double>(queue, nullptr);
Xspr2<float>(queue, nullptr); Xspr2<double>(queue, nullptr);
} catch (...) { return DispatchException(); }
return StatusCode::kSuccess;
}
// Runs all the level 3 set-up functions
Xgemm<float>(queue, nullptr); Xgemm<double>(queue, nullptr); Xgemm<float2>(queue, nullptr); Xgemm<double2>(queue, nullptr);
Xsymm<float>(queue, nullptr); Xsymm<double>(queue, nullptr); Xsymm<float2>(queue, nullptr); Xsymm<double2>(queue, nullptr);
Xhemm<float2>(queue, nullptr); Xhemm<double2>(queue, nullptr);
Xsyrk<float>(queue, nullptr); Xsyrk<double>(queue, nullptr); Xsyrk<float2>(queue, nullptr); Xsyrk<double2>(queue, nullptr);
Xherk<float2,float>(queue, nullptr); Xherk<double2,double>(queue, nullptr);
Xsyr2k<float>(queue, nullptr); Xsyr2k<double>(queue, nullptr); Xsyr2k<float2>(queue, nullptr); Xsyr2k<double2>(queue, nullptr);
Xher2k<float2,float>(queue, nullptr); Xher2k<double2,double>(queue, nullptr);
Xtrmm<float>(queue, nullptr); Xtrmm<double>(queue, nullptr); Xtrmm<float2>(queue, nullptr); Xtrmm<double2>(queue, nullptr);
// =================================================================================================
// Runs all the non-BLAS set-up functions
Xomatcopy<float>(queue, nullptr); Xomatcopy<double>(queue, nullptr); Xomatcopy<float2>(queue, nullptr); Xomatcopy<double2>(queue, nullptr);
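A hedged sketch of how an application might use the pre-compilation entry point: warming the cache once per device at start-up so that the first real routine call does not pay the OpenCL build cost.
#include <clblast.h>
#include <cstdio>

void WarmUp(const cl_device_id device) {
  const auto status = clblast::FillCache(device);  // compiles all kernels once
  if (status != clblast::StatusCode::kSuccess) {
    std::fprintf(stderr, "FillCache failed: %d\n", static_cast<int>(status));
  }
}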
// Overrides the tuning parameters for this device-precision-kernel combination
StatusCode OverrideParameters(const cl_device_id device, const std::string &kernel_name,
const Precision precision,
const std::unordered_map<std::string,size_t> &parameters) {
try {
// Retrieves the device name
const auto device_cpp = Device(device);
const auto device_name = device_cpp.Name();
// Retrieves the current database values to verify whether the new ones are complete
auto in_cache = false;
const auto current_database = DatabaseCache::Instance().Get(DatabaseKeyRef{ precision, device_name, kernel_name }, &in_cache);
if (!in_cache) { return StatusCode::kInvalidOverrideKernel; }
for (const auto &current_param : current_database.GetParameterNames()) {
if (parameters.find(current_param) == parameters.end()) {
return StatusCode::kMissingOverrideParameter;
}
}
// Clears the existing program & binary cache for routines with the target kernel
const auto routine_names = Routine::routines_by_kernel.at(kernel_name);
for (const auto &routine_name : routine_names) {
ProgramCache::Instance().RemoveBySubset<1, 2>(ProgramKey{nullptr, precision, routine_name});
BinaryCache::Instance().Remove(BinaryKey{precision, routine_name, device_name});
}
// Creates a small custom database based on the provided parameters
const auto database_device = Database::DatabaseDevice{"default", parameters};
const auto database_vendor = Database::DatabaseVendor{database::kDeviceTypeAll, "default", {database_device}};
const auto database_entry = Database::DatabaseEntry{kernel_name, precision, {database_vendor}};
const auto database_entries = std::vector<Database::DatabaseEntry>{database_entry};
const auto database = Database(device_cpp, kernel_name, precision, database_entries);
// Removes the old database entry and stores the new one in the cache
DatabaseCache::Instance().Remove(DatabaseKey{ precision, device_name, kernel_name });
DatabaseCache::Instance().Store(DatabaseKey{ precision, device_name, kernel_name }, Database(database));
} catch (...) { return DispatchException(); }
return StatusCode::kSuccess;
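A hedged usage sketch for the new OverrideParameters entry point. The parameter names below (WGS/WPT/VW for the Xaxpy kernel) follow CLBlast's tuner conventions but should be verified against the database for the kernel being overridden; the values are illustrative, not tuned.
#include <clblast.h>
#include <string>
#include <unordered_map>

clblast::StatusCode UseCustomAxpyParams(const cl_device_id device) {
  const auto parameters = std::unordered_map<std::string, size_t>{
      {"WGS", 64}, {"WPT", 4}, {"VW", 2}};
  // Returns kInvalidOverrideKernel or kMissingOverrideParameter when the
  // kernel is unknown or a required parameter is absent (see above).
  return clblast::OverrideParameters(device, "Xaxpy",
                                     clblast::Precision::kSingle, parameters);
}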

View File

@ -12,12 +12,14 @@
// =================================================================================================
#include <string>
#include <unordered_map>
#include "utilities/utilities.hpp"
#include "clblast_c.h"
#include "clblast.h"
#include "utilities/utilities.hpp"
// Shortcuts to the clblast namespace
using half = clblast::half;
using float2 = clblast::float2;
using double2 = clblast::double2;
@ -3349,27 +3351,6 @@ CLBlastStatusCode CLBlastZtrsm(const CLBlastLayout layout, const CLBlastSide sid
);
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const size_t m, const size_t n,
const cl_half alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event) {
try {
return static_cast<CLBlastStatusCode>(
clblast::Trsm(static_cast<clblast::Layout>(layout),
static_cast<clblast::Side>(side),
static_cast<clblast::Triangle>(triangle),
static_cast<clblast::Transpose>(a_transpose),
static_cast<clblast::Diagonal>(diagonal),
m, n,
alpha,
a_buffer, a_offset, a_ld,
b_buffer, b_offset, b_ld,
queue, event)
);
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// =================================================================================================
// Extra non-BLAS routines (level-X)
@ -3467,6 +3448,270 @@ CLBlastStatusCode CLBlastHomatcopy(const CLBlastLayout layout, const CLBlastTran
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// AXPY
CLBlastStatusCode CLBlastSaxpyBatched(const size_t n,
const float *alphas,
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event) {
auto alphas_cpp = std::vector<float>();
for (auto batch = size_t{0}; batch < batch_count; ++batch) {
alphas_cpp.push_back(alphas[batch]);
}
try {
return static_cast<CLBlastStatusCode>(
clblast::AxpyBatched(n,
alphas_cpp.data(),
x_buffer, x_offsets, x_inc,
y_buffer, y_offsets, y_inc,
batch_count,
queue, event)
);
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDaxpyBatched(const size_t n,
const double *alphas,
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event) {
auto alphas_cpp = std::vector<double>();
for (auto batch = size_t{0}; batch < batch_count; ++batch) {
alphas_cpp.push_back(alphas[batch]);
}
try {
return static_cast<CLBlastStatusCode>(
clblast::AxpyBatched(n,
alphas_cpp.data(),
x_buffer, x_offsets, x_inc,
y_buffer, y_offsets, y_inc,
batch_count,
queue, event)
);
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastCaxpyBatched(const size_t n,
const cl_float2 *alphas,
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event) {
auto alphas_cpp = std::vector<float2>();
for (auto batch = size_t{0}; batch < batch_count; ++batch) {
alphas_cpp.push_back(float2{alphas[batch].s[0], alphas[batch].s[1]});
}
try {
return static_cast<CLBlastStatusCode>(
clblast::AxpyBatched(n,
alphas_cpp.data(),
x_buffer, x_offsets, x_inc,
y_buffer, y_offsets, y_inc,
batch_count,
queue, event)
);
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZaxpyBatched(const size_t n,
const cl_double2 *alphas,
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event) {
auto alphas_cpp = std::vector<double2>();
for (auto batch = size_t{0}; batch < batch_count; ++batch) {
alphas_cpp.push_back(double2{alphas[batch].s[0], alphas[batch].s[1]});
}
try {
return static_cast<CLBlastStatusCode>(
clblast::AxpyBatched(n,
alphas_cpp.data(),
x_buffer, x_offsets, x_inc,
y_buffer, y_offsets, y_inc,
batch_count,
queue, event)
);
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHaxpyBatched(const size_t n,
const cl_half *alphas,
const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
const size_t batch_count,
cl_command_queue* queue, cl_event* event) {
auto alphas_cpp = std::vector<half>();
for (auto batch = size_t{0}; batch < batch_count; ++batch) {
alphas_cpp.push_back(alphas[batch]);
}
try {
return static_cast<CLBlastStatusCode>(
clblast::AxpyBatched(n,
alphas_cpp.data(),
x_buffer, x_offsets, x_inc,
y_buffer, y_offsets, y_inc,
batch_count,
queue, event)
);
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
// GEMM
CLBlastStatusCode CLBlastSgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
const size_t m, const size_t n, const size_t k,
const float *alphas,
const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
const float *betas,
cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
const size_t batch_count,
cl_command_queue* queue, cl_event* event) {
auto alphas_cpp = std::vector<float>();
auto betas_cpp = std::vector<float>();
for (auto batch = size_t{0}; batch < batch_count; ++batch) {
alphas_cpp.push_back(alphas[batch]);
betas_cpp.push_back(betas[batch]);
}
try {
return static_cast<CLBlastStatusCode>(
clblast::GemmBatched(static_cast<clblast::Layout>(layout),
static_cast<clblast::Transpose>(a_transpose),
static_cast<clblast::Transpose>(b_transpose),
m, n, k,
alphas_cpp.data(),
a_buffer, a_offsets, a_ld,
b_buffer, b_offsets, b_ld,
betas_cpp.data(),
c_buffer, c_offsets, c_ld,
batch_count,
queue, event)
);
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastDgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
const size_t m, const size_t n, const size_t k,
const double *alphas,
const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
const double *betas,
cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
const size_t batch_count,
cl_command_queue* queue, cl_event* event) {
auto alphas_cpp = std::vector<double>();
auto betas_cpp = std::vector<double>();
for (auto batch = size_t{0}; batch < batch_count; ++batch) {
alphas_cpp.push_back(alphas[batch]);
betas_cpp.push_back(betas[batch]);
}
try {
return static_cast<CLBlastStatusCode>(
clblast::GemmBatched(static_cast<clblast::Layout>(layout),
static_cast<clblast::Transpose>(a_transpose),
static_cast<clblast::Transpose>(b_transpose),
m, n, k,
alphas_cpp.data(),
a_buffer, a_offsets, a_ld,
b_buffer, b_offsets, b_ld,
betas_cpp.data(),
c_buffer, c_offsets, c_ld,
batch_count,
queue, event)
);
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastCgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
const size_t m, const size_t n, const size_t k,
const cl_float2 *alphas,
const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
const cl_float2 *betas,
cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
const size_t batch_count,
cl_command_queue* queue, cl_event* event) {
auto alphas_cpp = std::vector<float2>();
auto betas_cpp = std::vector<float2>();
for (auto batch = size_t{0}; batch < batch_count; ++batch) {
alphas_cpp.push_back(float2{alphas[batch].s[0], alphas[batch].s[1]});
betas_cpp.push_back(float2{betas[batch].s[0], betas[batch].s[1]});
}
try {
return static_cast<CLBlastStatusCode>(
clblast::GemmBatched(static_cast<clblast::Layout>(layout),
static_cast<clblast::Transpose>(a_transpose),
static_cast<clblast::Transpose>(b_transpose),
m, n, k,
alphas_cpp.data(),
a_buffer, a_offsets, a_ld,
b_buffer, b_offsets, b_ld,
betas_cpp.data(),
c_buffer, c_offsets, c_ld,
batch_count,
queue, event)
);
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastZgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
const size_t m, const size_t n, const size_t k,
const cl_double2 *alphas,
const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
const cl_double2 *betas,
cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
const size_t batch_count,
cl_command_queue* queue, cl_event* event) {
auto alphas_cpp = std::vector<double2>();
auto betas_cpp = std::vector<double2>();
for (auto batch = size_t{0}; batch < batch_count; ++batch) {
alphas_cpp.push_back(double2{alphas[batch].s[0], alphas[batch].s[1]});
betas_cpp.push_back(double2{betas[batch].s[0], betas[batch].s[1]});
}
try {
return static_cast<CLBlastStatusCode>(
clblast::GemmBatched(static_cast<clblast::Layout>(layout),
static_cast<clblast::Transpose>(a_transpose),
static_cast<clblast::Transpose>(b_transpose),
m, n, k,
alphas_cpp.data(),
a_buffer, a_offsets, a_ld,
b_buffer, b_offsets, b_ld,
betas_cpp.data(),
c_buffer, c_offsets, c_ld,
batch_count,
queue, event)
);
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
CLBlastStatusCode CLBlastHgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
const size_t m, const size_t n, const size_t k,
const cl_half *alphas,
const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
const cl_half *betas,
cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
const size_t batch_count,
cl_command_queue* queue, cl_event* event) {
auto alphas_cpp = std::vector<half>();
auto betas_cpp = std::vector<half>();
for (auto batch = size_t{0}; batch < batch_count; ++batch) {
alphas_cpp.push_back(alphas[batch]);
betas_cpp.push_back(betas[batch]);
}
try {
return static_cast<CLBlastStatusCode>(
clblast::GemmBatched(static_cast<clblast::Layout>(layout),
static_cast<clblast::Transpose>(a_transpose),
static_cast<clblast::Transpose>(b_transpose),
m, n, k,
alphas_cpp.data(),
a_buffer, a_offsets, a_ld,
b_buffer, b_offsets, b_ld,
betas_cpp.data(),
c_buffer, c_offsets, c_ld,
batch_count,
queue, event)
);
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
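// Usage reference (not part of this commit): the same hedged sketch for batched GEMM. Unlike
// regular GEMM, alpha and beta are per-batch arrays here. Assumptions: 'queue' is valid and
// 'a_buf'/'b_buf'/'c_buf' hold batch_count matrices of m*k, k*n and m*n floats, back-to-back.
const size_t m = 64, n = 64, k = 64, batch_count = 8;
std::vector<float> alphas(batch_count, 1.0f);  // per-batch scalars
std::vector<float> betas(batch_count, 0.0f);
std::vector<size_t> a_offsets(batch_count), b_offsets(batch_count), c_offsets(batch_count);
for (size_t i = 0; i < batch_count; ++i) {
  a_offsets[i] = i * m * k;
  b_offsets[i] = i * k * n;
  c_offsets[i] = i * m * n;
}
cl_event event = nullptr;
const auto status = CLBlastSgemmBatched(CLBlastLayoutRowMajor,
                                        CLBlastTransposeNo, CLBlastTransposeNo,
                                        m, n, k,
                                        alphas.data(),
                                        a_buf, a_offsets.data(), k,  // leading dimensions for
                                        b_buf, b_offsets.data(), n,  // row-major storage
                                        betas.data(),
                                        c_buf, c_offsets.data(), n,
                                        batch_count, &queue, &event);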
// =================================================================================================
// Clears the cache of stored binaries
@ -3484,3 +3729,23 @@ CLBlastStatusCode CLBlastFillCache(const cl_device_id device) {
}
// =================================================================================================
// Overrides the tuning parameters for this device-precision-kernel combination
CLBlastStatusCode PUBLIC_API CLBlastOverrideParameters(const cl_device_id device, const char* kernel_name,
const CLBlastPrecision precision, const size_t num_parameters,
const char** parameters_names, const size_t* parameters_values) {
try {
const auto kernel_name_cpp = std::string(kernel_name);
const auto precision_cpp = static_cast<clblast::Precision>(precision);
auto parameters = std::unordered_map<std::string, size_t>();
for (auto i = size_t{0}; i < num_parameters; ++i) {
const auto parameter_name = std::string(parameters_names[i]);
const auto parameter_value = parameters_values[i];
parameters[parameter_name] = parameter_value;
}
const auto status = clblast::OverrideParameters(device, kernel_name_cpp, precision_cpp, parameters);
return static_cast<CLBlastStatusCode>(status);
} catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }
}
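// Usage reference (not part of this commit): a hedged sketch of supplying custom tuning
// parameters through this wrapper. The parameter names match the Xaxpy entries in the database
// files below; the values and the 'device' variable are illustrative assumptions only.
const char* names[] = {"VW", "WGS", "WPT"};
const size_t values[] = {4, 128, 2};
const auto status = CLBlastOverrideParameters(device, "Xaxpy", CLBlastPrecisionSingle,
                                              3, names, values);
// From here on, single-precision Xaxpy kernels on this device use these parameters.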
// =================================================================================================

View File

@ -164,6 +164,10 @@ class Platform {
platform_ = platforms[platform_id];
}
// Methods to retrieve platform information
std::string Name() const { return GetInfoString(CL_PLATFORM_NAME); }
std::string Vendor() const { return GetInfoString(CL_PLATFORM_VENDOR); }
// Returns the number of devices on this platform
size_t NumDevices() const {
auto result = cl_uint{0};
@ -175,6 +179,17 @@ class Platform {
const cl_platform_id& operator()() const { return platform_; }
private:
cl_platform_id platform_;
// Private helper functions
std::string GetInfoString(const cl_platform_info info) const {
auto bytes = size_t{0};
CheckError(clGetPlatformInfo(platform_, info, 0, nullptr, &bytes));
auto result = std::string{};
result.resize(bytes);
CheckError(clGetPlatformInfo(platform_, info, bytes, &result[0], nullptr));
result.resize(strlen(result.c_str())); // Removes any trailing '\0'-characters
return result;
}
};
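// Usage reference (hedged sketch, not part of this commit): the new platform-information
// methods combined with the GetAllPlatforms() helper declared just below; assumes <cstdio>.
for (auto &platform : GetAllPlatforms()) {
  printf("%s (%s): %zu device(s)\n", platform.Name().c_str(),
         platform.Vendor().c_str(), platform.NumDevices());
}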
// Retrieves a vector with all platforms
@ -333,7 +348,10 @@ class Context {
// Regular constructor with memory management
explicit Context(const Device &device):
context_(new cl_context, [](cl_context* c) { CheckErrorDtor(clReleaseContext(*c)); delete c; }) {
context_(new cl_context, [](cl_context* c) {
if (*c) { CheckErrorDtor(clReleaseContext(*c)); }
delete c;
}) {
auto status = CL_SUCCESS;
const cl_device_id dev = device();
*context_ = clCreateContext(nullptr, 1, &dev, nullptr, nullptr, &status);
@ -355,33 +373,37 @@ using ContextPointer = cl_context*;
// Enumeration of build statuses of the run-time compilation process
enum class BuildStatus { kSuccess, kError, kInvalid };
// C++11 version of 'cl_program'. Additionally holds the program's source code.
// C++11 version of 'cl_program'.
class Program {
public:
// Note that there is no constructor based on the regular OpenCL data-type because of extra state
Program() = default;
// Source-based constructor with memory management
explicit Program(const Context &context, std::string source):
program_(new cl_program, [](cl_program* p) { CheckErrorDtor(clReleaseProgram(*p)); delete p; }),
length_(source.length()),
source_(std::move(source)),
source_ptr_(&source_[0]) {
explicit Program(const Context &context, const std::string &source):
program_(new cl_program, [](cl_program* p) {
if (*p) { CheckErrorDtor(clReleaseProgram(*p)); }
delete p;
}) {
const char *source_ptr = &source[0];
size_t length = source.length();
auto status = CL_SUCCESS;
*program_ = clCreateProgramWithSource(context(), 1, &source_ptr_, &length_, &status);
*program_ = clCreateProgramWithSource(context(), 1, &source_ptr, &length, &status);
CLError::Check(status, "clCreateProgramWithSource");
}
// Binary-based constructor with memory management
explicit Program(const Device &device, const Context &context, const std::string& binary):
program_(new cl_program, [](cl_program* p) { CheckErrorDtor(clReleaseProgram(*p)); delete p; }),
length_(binary.length()),
source_(binary),
source_ptr_(&source_[0]) {
explicit Program(const Device &device, const Context &context, const std::string &binary):
program_(new cl_program, [](cl_program* p) {
if (*p) { CheckErrorDtor(clReleaseProgram(*p)); }
delete p;
}) {
const char *binary_ptr = &binary[0];
size_t length = binary.length();
auto status1 = CL_SUCCESS;
auto status2 = CL_SUCCESS;
const cl_device_id dev = device();
*program_ = clCreateProgramWithBinary(context(), 1, &dev, &length_,
reinterpret_cast<const unsigned char**>(&source_ptr_),
*program_ = clCreateProgramWithBinary(context(), 1, &dev, &length,
reinterpret_cast<const unsigned char**>(&binary_ptr),
&status1, &status2);
CLError::Check(status1, "clCreateProgramWithBinary (binary status)");
CLError::Check(status2, "clCreateProgramWithBinary");
@ -421,9 +443,6 @@ class Program {
const cl_program& operator()() const { return *program_; }
private:
std::shared_ptr<cl_program> program_;
size_t length_;
std::string source_; // Note: the source can also be a binary or IR
const char* source_ptr_;
};
// =================================================================================================
@ -440,8 +459,10 @@ class Queue {
// Regular constructor with memory management
explicit Queue(const Context &context, const Device &device):
queue_(new cl_command_queue, [](cl_command_queue* s) { CheckErrorDtor(clReleaseCommandQueue(*s));
delete s; }) {
queue_(new cl_command_queue, [](cl_command_queue* s) {
if (*s) { CheckErrorDtor(clReleaseCommandQueue(*s)); }
delete s;
}) {
auto status = CL_SUCCESS;
*queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
CLError::Check(status, "clCreateCommandQueue");
@ -594,9 +615,6 @@ class Buffer {
// Copies from host to device: writing the device buffer asynchronously
void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
if (access_ == BufferAccess::kReadOnly) {
throw LogicError("Buffer: writing to a read-only buffer");
}
if (GetSize() < (offset+size)*sizeof(T)) {
throw LogicError("Buffer: target device buffer is too small");
}
@ -665,7 +683,10 @@ class Kernel {
// Regular constructor with memory management
explicit Kernel(const Program &program, const std::string &name):
kernel_(new cl_kernel, [](cl_kernel* k) { CheckErrorDtor(clReleaseKernel(*k)); delete k; }) {
kernel_(new cl_kernel, [](cl_kernel* k) {
if (*k) { CheckErrorDtor(clReleaseKernel(*k)); }
delete k;
}) {
auto status = CL_SUCCESS;
*kernel_ = clCreateKernel(program(), name.c_str(), &status);
CLError::Check(status, "clCreateKernel");
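// Usage reference (hedged sketch, not part of this commit): the typical flow through these
// wrappers, from device selection to kernel creation. The trivial kernel source is illustrative;
// Build() is assumed to take a vector of compiler options, as used elsewhere in CLBlast.
auto platform = Platform(size_t{0});
auto device = Device(platform, size_t{0});
auto context = Context(device);
auto queue = Queue(context, device);
auto program = Program(context, "__kernel void dummy() { }");
auto options = std::vector<std::string>();
program.Build(device, options);  // throws on compilation failure
auto kernel = Kernel(program, "dummy");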

View File

@ -0,0 +1,70 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file provides overrides for Apple's OpenCL CPU implementation. It is a special case compared
// to all other implementations, as it only supports a 1-dimensional work-group size. In addition,
// that work-group size is limited to 1024 (in theory) or much lower (kernel resource dependent).
// Thus, instead of supporting this corner-case in the whole regular flow (starting from the tuner),
// we provide this file with some manual overrides.
//
// Note: These overrides are meant to make Apple's CPU implementation work and not crash; they are
// not in any way optimized parameters. For decent speed, don't use Apple's OpenCL CPU implementation.
//
// =================================================================================================
namespace clblast {
namespace database {
// =================================================================================================
const Database::DatabaseEntry XaxpyApple = {
"Xaxpy", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"VW",8}, {"WGS",1}, {"WPT",4} } } } } }
};
const Database::DatabaseEntry XdotApple = {
"Xdot", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"WGS1",1}, {"WGS2",1} } } } } }
};
const Database::DatabaseEntry XgemvApple = {
"Xgemv", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"WGS1",1}, {"WPT1",4}, {"UNROLL1", 1} } } } } }
};
const Database::DatabaseEntry XgemvFastApple = {
"XgemvFast", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"VW2",1}, {"WGS2",1}, {"WPT2",1} } } } } }
};
const Database::DatabaseEntry XgemvFastRotApple = {
"XgemvFastRot", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"VW3",1}, {"WGS3",1}, {"WPT3",1} } } } } }
};
const Database::DatabaseEntry XgerApple = {
"Xger", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } } } } }
};
const Database::DatabaseEntry XtrsvApple = {
"Xtrsv", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"TRSV_BLOCK_SIZE",32} } } } } }
};
const Database::DatabaseEntry XgemmApple = {
"Xgemm", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"KWG",1}, {"KWI",1}, {"MDIMA",1}, {"MDIMC",1}, {"MWG",1}, {"NDIMB",1}, {"NDIMC",1}, {"NWG",1}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } } } } }
};
const Database::DatabaseEntry XgemmDirectApple = {
"XgemmDirect", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"KWID",1}, {"MDIMAD",1}, {"MDIMCD",1}, {"NDIMBD",1}, {"NDIMCD",1}, {"PADA",0}, {"PADB",0}, {"VWMD",1}, {"VWND",1}, {"WGD",1} } } } } }
};
const Database::DatabaseEntry CopyApple = {
"Copy", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"COPY_DIMX",1}, {"COPY_DIMY",1}, {"COPY_VW",1}, {"COPY_WPT",1} } } } } }
};
const Database::DatabaseEntry PadApple = {
"Pad", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"PAD_DIMX",1}, {"PAD_DIMY",1}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } } } } }
};
const Database::DatabaseEntry TransposeApple = {
"Transpose", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"TRA_DIM",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } } } } }
};
const Database::DatabaseEntry PadtransposeApple = {
"Padtranspose", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",1}, {"PADTRA_WPT",1} } } } } }
};
const Database::DatabaseEntry InvertApple = {
"Invert", Precision::kAny, { { kDeviceTypeAll, "default", { { "default", { {"INTERNAL_BLOCK_SIZE",16} } } } } }
};
// =================================================================================================
} // namespace database
} // namespace clblast

View File

@ -11,6 +11,8 @@
//
// =================================================================================================
#include <list>
#include "utilities/utilities.hpp"
#include "database/database.hpp"
@ -20,35 +22,47 @@
#include "database/kernels/xgemv_fast.hpp"
#include "database/kernels/xgemv_fast_rot.hpp"
#include "database/kernels/xger.hpp"
#include "database/kernels/xtrsv.hpp"
#include "database/kernels/xgemm.hpp"
#include "database/kernels/xgemm_direct.hpp"
#include "database/kernels/copy.hpp"
#include "database/kernels/pad.hpp"
#include "database/kernels/transpose.hpp"
#include "database/kernels/padtranspose.hpp"
#include "database/kernels/invert.hpp"
#include "database/apple_cpu_fallback.hpp"
#include "database/kernel_selection.hpp"
namespace clblast {
// =================================================================================================
// Initializes the database
const std::vector<const Database::DatabaseEntry*> Database::database = {
&database::XaxpyHalf, &database::XaxpySingle, &database::XaxpyDouble, &database::XaxpyComplexSingle, &database::XaxpyComplexDouble,
&database::XdotHalf, &database::XdotSingle, &database::XdotDouble, &database::XdotComplexSingle, &database::XdotComplexDouble,
&database::XgemvHalf, &database::XgemvSingle, &database::XgemvDouble, &database::XgemvComplexSingle, &database::XgemvComplexDouble,
&database::XgemvFastHalf, &database::XgemvFastSingle, &database::XgemvFastDouble, &database::XgemvFastComplexSingle, &database::XgemvFastComplexDouble,
&database::XgemvFastRotHalf, &database::XgemvFastRotSingle, &database::XgemvFastRotDouble, &database::XgemvFastRotComplexSingle, &database::XgemvFastRotComplexDouble,
&database::XgerHalf, &database::XgerSingle, &database::XgerDouble, &database::XgerComplexSingle, &database::XgerComplexDouble,
&database::XgemmHalf, &database::XgemmSingle, &database::XgemmDouble, &database::XgemmComplexSingle, &database::XgemmComplexDouble,
&database::XgemmDirectHalf, &database::XgemmDirectSingle, &database::XgemmDirectDouble, &database::XgemmDirectComplexSingle, &database::XgemmDirectComplexDouble,
&database::CopyHalf, &database::CopySingle, &database::CopyDouble, &database::CopyComplexSingle, &database::CopyComplexDouble,
&database::PadHalf, &database::PadSingle, &database::PadDouble, &database::PadComplexSingle, &database::PadComplexDouble,
&database::TransposeHalf, &database::TransposeSingle, &database::TransposeDouble, &database::TransposeComplexSingle, &database::TransposeComplexDouble,
&database::PadtransposeHalf, &database::PadtransposeSingle, &database::PadtransposeDouble, &database::PadtransposeComplexSingle, &database::PadtransposeComplexDouble,
&database::KernelSelectionHalf, &database::KernelSelectionSingle, &database::KernelSelectionDouble, &database::KernelSelectionComplexSingle, &database::KernelSelectionComplexDouble
// Initializes the databases
const std::vector<Database::DatabaseEntry> Database::database = std::vector<Database::DatabaseEntry>{
database::XaxpyHalf, database::XaxpySingle, database::XaxpyDouble, database::XaxpyComplexSingle, database::XaxpyComplexDouble,
database::XdotHalf, database::XdotSingle, database::XdotDouble, database::XdotComplexSingle, database::XdotComplexDouble,
database::XgemvHalf, database::XgemvSingle, database::XgemvDouble, database::XgemvComplexSingle, database::XgemvComplexDouble,
database::XgemvFastHalf, database::XgemvFastSingle, database::XgemvFastDouble, database::XgemvFastComplexSingle, database::XgemvFastComplexDouble,
database::XgemvFastRotHalf, database::XgemvFastRotSingle, database::XgemvFastRotDouble, database::XgemvFastRotComplexSingle, database::XgemvFastRotComplexDouble,
database::XgerHalf, database::XgerSingle, database::XgerDouble, database::XgerComplexSingle, database::XgerComplexDouble,
database::XtrsvHalf, database::XtrsvSingle, database::XtrsvDouble, database::XtrsvComplexSingle, database::XtrsvComplexDouble,
database::XgemmHalf, database::XgemmSingle, database::XgemmDouble, database::XgemmComplexSingle, database::XgemmComplexDouble,
database::XgemmDirectHalf, database::XgemmDirectSingle, database::XgemmDirectDouble, database::XgemmDirectComplexSingle, database::XgemmDirectComplexDouble,
database::CopyHalf, database::CopySingle, database::CopyDouble, database::CopyComplexSingle, database::CopyComplexDouble,
database::PadHalf, database::PadSingle, database::PadDouble, database::PadComplexSingle, database::PadComplexDouble,
database::TransposeHalf, database::TransposeSingle, database::TransposeDouble, database::TransposeComplexSingle, database::TransposeComplexDouble,
database::PadtransposeHalf, database::PadtransposeSingle, database::PadtransposeDouble, database::PadtransposeComplexSingle, database::PadtransposeComplexDouble,
database::InvertHalf, database::InvertSingle, database::InvertDouble, database::InvertComplexSingle, database::InvertComplexDouble,
database::KernelSelectionHalf, database::KernelSelectionSingle, database::KernelSelectionDouble, database::KernelSelectionComplexSingle, database::KernelSelectionComplexDouble
};
const std::vector<Database::DatabaseEntry> Database::apple_cpu_fallback = std::vector<Database::DatabaseEntry>{
database::XaxpyApple, database::XdotApple,
database::XgemvApple, database::XgemvFastApple, database::XgemvFastRotApple, database::XgerApple, database::XtrsvApple,
database::XgemmApple, database::XgemmDirectApple,
database::CopyApple, database::PadApple, database::TransposeApple, database::PadtransposeApple,
database::InvertApple
};
// The OpenCL device vendors
// The default values
const std::string Database::kDeviceVendorAll = "default";
// Alternative names for some OpenCL vendors
@ -63,12 +77,11 @@ const std::unordered_map<std::string, std::string> Database::kVendorNames{
// Constructor, computing device properties and populating the parameter-vector from the database.
// This takes an optional overlay database in case of custom tuning or custom kernels.
Database::Database(const Queue &queue, const std::vector<std::string> &kernels,
const Precision precision, const std::vector<const DatabaseEntry*> &overlay):
parameters_{} {
Database::Database(const Device &device, const std::string &kernel_name,
const Precision precision, const std::vector<DatabaseEntry> &overlay):
parameters_(std::make_shared<Parameters>()) {
// Finds information of the current device
auto device = queue.GetDevice();
auto device_type = device.Type();
auto device_vendor = device.Vendor();
auto device_name = device.Name();
@ -80,20 +93,31 @@ Database::Database(const Queue &queue, const std::vector<std::string> &kernels,
}
}
// Iterates over all kernels to include, and retrieves the parameters for each of them
for (auto &kernel: kernels) {
auto search_result = ParametersPtr{};
// Sets the databases to search through
auto databases = std::list<std::vector<DatabaseEntry>>{overlay, database};
for (auto &db: { database, overlay}) {
search_result = Search(kernel, device_type, device_vendor, device_name, precision, db);
if (search_result) {
parameters_.insert(search_result->begin(), search_result->end());
break;
// Special case: modifies the database if the device is a CPU with Apple OpenCL
#if defined(__APPLE__) || defined(__MACOSX)
if (device.Type() == "CPU") {
auto extensions = device.Capabilities();
const auto is_apple = (extensions.find("cl_APPLE_SetMemObjectDestructor") != std::string::npos);
if (is_apple) {
databases.push_front(apple_cpu_fallback);
}
}
#endif
if (!search_result) { throw RuntimeErrorCode(StatusCode::kDatabaseError); }
// Searches potentially multiple databases
auto search_result = ParametersPtr{};
for (auto &db: databases) {
search_result = Search(kernel_name, device_type, device_vendor, device_name, precision, db);
if (search_result) {
parameters_->insert(search_result->begin(), search_result->end());
break;
}
}
if (!search_result) { throw RuntimeErrorCode(StatusCode::kDatabaseError); }
}
// =================================================================================================
@ -101,12 +125,21 @@ Database::Database(const Queue &queue, const std::vector<std::string> &kernels,
// Returns a list of OpenCL pre-processor defines in string form
std::string Database::GetDefines() const {
std::string defines{};
for (auto &parameter: parameters_) {
for (auto &parameter: *parameters_) {
defines += "#define "+parameter.first+" "+ToString(parameter.second)+"\n";
}
return defines;
}
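// For illustration (hedged; map iteration order is unspecified): with parameters
// { {"COPY_DIMX",16}, {"COPY_WPT",4} } the returned string contains the lines
//   #define COPY_DIMX 16
//   #define COPY_WPT 4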
// Retrieves the names of all the parameters
std::vector<std::string> Database::GetParameterNames() const {
auto parameter_names = std::vector<std::string>();
for (auto &parameter: *parameters_) {
parameter_names.push_back(parameter.first);
}
return parameter_names;
}
// =================================================================================================
// Searches a particular database for the right kernel and precision
@ -115,15 +148,16 @@ Database::ParametersPtr Database::Search(const std::string &this_kernel,
const std::string &this_vendor,
const std::string &this_device,
const Precision this_precision,
const std::vector<const DatabaseEntry*> &this_database) const {
const std::vector<DatabaseEntry> &this_database) const {
// Selects the right kernel
for (auto &db: this_database) {
if (db->kernel == this_kernel && db->precision == this_precision) {
if ((db.kernel == this_kernel) &&
(db.precision == this_precision || db.precision == Precision::kAny)) {
// Searches for the right vendor and device type, or selects the default if unavailable. This
// assumes that the default vendor / device type is last in the database.
for (auto &vendor: db->vendors) {
for (auto &vendor: db.vendors) {
if ((vendor.name == this_vendor || vendor.name == kDeviceVendorAll) &&
(vendor.type == this_type || vendor.type == database::kDeviceTypeAll)) {

View File

@ -70,27 +70,61 @@ class Database {
static const std::unordered_map<std::string, std::string> kVendorNames;
// The database consists of separate database entries, stored together in a vector
static const std::vector<const DatabaseEntry*> database;
static const std::vector<DatabaseEntry> database;
// Database for a special case: Apple CPUs support only a limited number of threads
static const std::vector<DatabaseEntry> apple_cpu_fallback;
Database() = default;
// The constructor with a user-provided database overlay (potentially an empty vector)
explicit Database(const Queue &queue, const std::vector<std::string> &routines,
const Precision precision, const std::vector<const DatabaseEntry*> &overlay);
explicit Database(const Device &device, const std::string &kernel_name,
const Precision precision, const std::vector<DatabaseEntry> &overlay);
// Accessor of values by key
size_t operator[](const std::string key) const { return parameters_.find(key)->second; }
size_t operator[](const std::string &key) const { return parameters_->find(key)->second; }
bool exists(const std::string &key) const { return (parameters_->count(key) == 1); }
// Obtain a list of OpenCL pre-processor defines based on the parameters
std::string GetDefines() const;
// Retrieves the names of all the parameters
std::vector<std::string> GetParameterNames() const;
private:
// Search method for a specified database, returning a pointer (possibly a nullptr)
ParametersPtr Search(const std::string &this_kernel, const std::string &this_type,
const std::string &this_vendor, const std::string &this_device,
const Precision this_precision,
const std::vector<const DatabaseEntry*> &db) const;
const std::vector<DatabaseEntry> &db) const;
// Found parameters suitable for this device/kernel
Parameters parameters_;
std::shared_ptr<Parameters> parameters_;
};
// =================================================================================================
// Multiple databases together in a map
class Databases {
public:
explicit Databases(const std::vector<std::string> &kernel_names): kernel_names_(kernel_names) { }
// Database accessor
Database& operator()(const std::string &kernel_name) { return databases_[kernel_name]; }
// Retrieves a parameter from the database
size_t operator[](const std::string &key) const {
for (const auto &kernel_name : kernel_names_) {
const auto &kernel_db = databases_.find(kernel_name)->second;
if (kernel_db.exists(key)) { return kernel_db[key]; }
}
throw RuntimeErrorCode(StatusCode::kDatabaseError);
}
private:
const std::vector<std::string> kernel_names_;
std::unordered_map<std::string, Database> databases_;
};
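// Usage reference (hedged sketch, not part of this commit): a routine built from several
// kernels can group their databases and resolve any parameter without knowing which kernel
// defines it. The 'device', 'precision' and 'overlay' variables are assumptions.
auto dbs = Databases({"Xgemm", "Pad"});
dbs("Xgemm") = Database(device, "Xgemm", precision, overlay);
dbs("Pad") = Database(device, "Pad", precision, overlay);
const auto mwg = dbs["MWG"];            // resolved from the Xgemm database
const auto pad_dimx = dbs["PAD_DIMX"];  // falls through to the Pad database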
// =================================================================================================

View File

@ -22,13 +22,12 @@ const Database::DatabaseEntry KernelSelectionHalf = {
"KernelSelection", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } },
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",1280*1280*1280} } },
}
},
{ // Default
@ -45,13 +44,12 @@ const Database::DatabaseEntry KernelSelectionSingle = {
"KernelSelection", Precision::kSingle, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } },
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",1280*1280*1280} } },
}
},
{ // Default
@ -68,13 +66,12 @@ const Database::DatabaseEntry KernelSelectionComplexSingle = {
"KernelSelection", Precision::kComplexSingle, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } },
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",1280*1280*1280} } },
}
},
{ // Default
@ -91,13 +88,12 @@ const Database::DatabaseEntry KernelSelectionDouble = {
"KernelSelection", Precision::kDouble, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } },
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",1280*1280*1280} } },
}
},
{ // Default
@ -114,13 +110,12 @@ const Database::DatabaseEntry KernelSelectionComplexDouble = {
"KernelSelection", Precision::kComplexDouble, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } },
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
{ "default", { {"XGEMM_MIN_INDIRECT_SIZE",1280*1280*1280} } },
}
},
{ // Default

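// Interpretation note (hedged, names simplified): XGEMM_MIN_INDIRECT_SIZE acts as a threshold
// on the total problem size m*n*k. Below it the direct GEMM kernel is selected; above it the
// indirect (pad/transpose-based) kernel is used. A sketch of that selection logic:
bool UseDirectKernel(const size_t m, const size_t n, const size_t k,
                     const size_t min_indirect_size) {
  // For small problems the cost of padding/transposing into the indirect kernel's
  // layout outweighs its higher raw throughput.
  return (m * n * k) < min_indirect_size;
}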
View File

@ -17,6 +17,12 @@ namespace database {
const Database::DatabaseEntry CopyHalf = {
"Copy", Precision::kHalf, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "Ellesmere", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",4} } },
@ -26,7 +32,7 @@ const Database::DatabaseEntry CopyHalf = {
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",8} } },
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
}
},
}
@ -39,12 +45,15 @@ const Database::DatabaseEntry CopySingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "ATI Radeon HD 6750M", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Ellesmere", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",8} } },
{ "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "Oland", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } },
{ "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "Tahiti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "Tonga", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "Turks", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
}
},
{ // ARM GPUs
@ -55,10 +64,12 @@ const Database::DatabaseEntry CopySingle = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",2} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",2} } },
}
},
{ // Intel GPUs
@ -83,6 +94,7 @@ const Database::DatabaseEntry CopySingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "GeForce GTX 1070", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "GeForce GTX 1080", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "GeForce GTX 480", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "GeForce GTX 670", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "GeForce GTX 680", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
@ -92,9 +104,10 @@ const Database::DatabaseEntry CopySingle = {
{ "GeForce GTX TITAN", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } },
{ "GeForce GTX TITAN Black", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",8} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "TITAN X (Pascal)", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
{ "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",1} } },
}
},
{ // Default
@ -112,18 +125,23 @@ const Database::DatabaseEntry CopyComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "ATI Radeon HD 6750M", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Ellesmere", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",4} } },
{ "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Oland", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "Tonga", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Turks", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",2} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
}
@ -150,6 +168,7 @@ const Database::DatabaseEntry CopyComplexSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 1070", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 1080", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 670", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 750", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
@ -157,14 +176,15 @@ const Database::DatabaseEntry CopyComplexSingle = {
{ "GeForce GTX 980", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN Black", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "TITAN X (Pascal)", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",4} } },
{ "Tesla K40m", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
}
},
}
@ -177,12 +197,13 @@ const Database::DatabaseEntry CopyDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Ellesmere", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",4} } },
{ "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Oland", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",8} } },
{ "Pitcairn", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Tonga", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",4} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",4} } },
}
},
{ // ARM GPUs
@ -193,10 +214,12 @@ const Database::DatabaseEntry CopyDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
}
},
{ // Intel accelerators
@ -209,6 +232,7 @@ const Database::DatabaseEntry CopyDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 1070", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "GeForce GTX 1080", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "GeForce GTX 480", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 670", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 680", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
@ -218,14 +242,15 @@ const Database::DatabaseEntry CopyDouble = {
{ "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "GeForce GTX TITAN Black", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",8} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "TITAN X (Pascal)", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
}
},
}
@ -238,6 +263,7 @@ const Database::DatabaseEntry CopyComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Ellesmere", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",8} } },
{ "Oland", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Pitcairn", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
@ -254,10 +280,12 @@ const Database::DatabaseEntry CopyComplexDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
}
},
{ // Intel accelerators
@ -270,6 +298,7 @@ const Database::DatabaseEntry CopyComplexDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 1070", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",1}, {"COPY_WPT",4} } },
{ "GeForce GTX 1080", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 670", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 680", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
@ -279,6 +308,7 @@ const Database::DatabaseEntry CopyComplexDouble = {
{ "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN Black", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "TITAN X (Pascal)", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },

View File

@ -0,0 +1,78 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// Tuning parameters for the diagonal matrix inversion kernels
//
// =================================================================================================
namespace clblast {
namespace database {
// =================================================================================================
const Database::DatabaseEntry InvertHalf = {
"Invert", Precision::kHalf, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"INTERNAL_BLOCK_SIZE",16} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry InvertSingle = {
"Invert", Precision::kSingle, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"INTERNAL_BLOCK_SIZE",16} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry InvertComplexSingle = {
"Invert", Precision::kComplexSingle, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"INTERNAL_BLOCK_SIZE",16} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry InvertDouble = {
"Invert", Precision::kDouble, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"INTERNAL_BLOCK_SIZE",16} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry InvertComplexDouble = {
"Invert", Precision::kComplexDouble, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"INTERNAL_BLOCK_SIZE",16} } },
}
},
}
};
// =================================================================================================
} // namespace database
} // namespace clblast

View File

@ -17,6 +17,12 @@ namespace database {
const Database::DatabaseEntry PadHalf = {
"Pad", Precision::kHalf, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "Ellesmere", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
@ -39,12 +45,15 @@ const Database::DatabaseEntry PadSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "ATI Radeon HD 6750M", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Ellesmere", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
{ "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
{ "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Pitcairn", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tonga", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
{ "Turks", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
}
},
{ // ARM GPUs
@ -55,8 +64,10 @@ const Database::DatabaseEntry PadSingle = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",4} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } },
}
@ -83,6 +94,7 @@ const Database::DatabaseEntry PadSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX 1070", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 1080", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 480", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
{ "GeForce GTX 670", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
{ "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
@ -92,9 +104,10 @@ const Database::DatabaseEntry PadSingle = {
{ "GeForce GTX TITAN", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN Black", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "TITAN X (Pascal)", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Tesla K40m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
}
},
{ // Default
@ -112,12 +125,15 @@ const Database::DatabaseEntry PadComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "ATI Radeon HD 6750M", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Ellesmere", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
{ "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tahiti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tonga", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Turks", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",4} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
}
},
{ // ARM GPUs
@ -128,10 +144,12 @@ const Database::DatabaseEntry PadComplexSingle = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } },
}
},
{ // Intel GPUs
@ -156,6 +174,7 @@ const Database::DatabaseEntry PadComplexSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 1070", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 1080", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX 670", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
@ -165,6 +184,7 @@ const Database::DatabaseEntry PadComplexSingle = {
{ "GeForce GTX TITAN", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN Black", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "TITAN X (Pascal)", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
@ -172,7 +192,7 @@ const Database::DatabaseEntry PadComplexSingle = {
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
}
},
}
@ -185,12 +205,13 @@ const Database::DatabaseEntry PadDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Ellesmere", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tonga", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // ARM GPUs
@ -201,8 +222,10 @@ const Database::DatabaseEntry PadDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
}
@ -217,6 +240,7 @@ const Database::DatabaseEntry PadDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 1070", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 1080", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 670", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX 680", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
@ -226,6 +250,7 @@ const Database::DatabaseEntry PadDouble = {
{ "GeForce GTX TITAN", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN Black", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "TITAN X (Pascal)", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
@ -246,6 +271,7 @@ const Database::DatabaseEntry PadComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Ellesmere", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Pitcairn", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
@ -262,10 +288,12 @@ const Database::DatabaseEntry PadComplexDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
}
},
{ // Intel accelerators
@ -278,6 +306,7 @@ const Database::DatabaseEntry PadComplexDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 1070", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
{ "GeForce GTX 1080", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 670", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 680", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
@ -287,6 +316,7 @@ const Database::DatabaseEntry PadComplexDouble = {
{ "GeForce GTX TITAN", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX TITAN Black", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "TITAN X (Pascal)", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tesla K40m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },


@ -17,6 +17,12 @@ namespace database {
const Database::DatabaseEntry PadtransposeHalf = {
"Padtranspose", Precision::kHalf, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "Ellesmere", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
@ -39,11 +45,14 @@ const Database::DatabaseEntry PadtransposeSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "ATI Radeon HD 6750M", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "Ellesmere", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Hawaii", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Tonga", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Turks", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
}
},
@ -55,8 +64,10 @@ const Database::DatabaseEntry PadtransposeSingle = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
}
@ -83,6 +94,7 @@ const Database::DatabaseEntry PadtransposeSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 1070", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 1080", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 670", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
@ -92,6 +104,7 @@ const Database::DatabaseEntry PadtransposeSingle = {
{ "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "GeForce GTX TITAN Black", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "TITAN X (Pascal)", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Tesla K20m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
@ -112,11 +125,14 @@ const Database::DatabaseEntry PadtransposeComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "ATI Radeon HD 6750M", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "Ellesmere", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Tonga", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Turks", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
}
},
@ -128,10 +144,12 @@ const Database::DatabaseEntry PadtransposeComplexSingle = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
}
},
{ // Intel GPUs
@ -156,6 +174,7 @@ const Database::DatabaseEntry PadtransposeComplexSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 1070", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 1080", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 670", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
@ -165,6 +184,7 @@ const Database::DatabaseEntry PadtransposeComplexSingle = {
{ "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN Black", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "TITAN X (Pascal)", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
{ "Tesla K20m", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
@ -185,6 +205,7 @@ const Database::DatabaseEntry PadtransposeDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Ellesmere", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
@ -201,8 +222,10 @@ const Database::DatabaseEntry PadtransposeDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
}
@ -217,6 +240,7 @@ const Database::DatabaseEntry PadtransposeDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 1070", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 1080", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 670", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
@ -226,6 +250,7 @@ const Database::DatabaseEntry PadtransposeDouble = {
{ "GeForce GTX TITAN", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN Black", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "TITAN X (Pascal)", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
{ "Tesla K20m", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
@ -233,7 +258,7 @@ const Database::DatabaseEntry PadtransposeDouble = {
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
}
},
}
@ -246,6 +271,7 @@ const Database::DatabaseEntry PadtransposeComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Ellesmere", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
@ -262,10 +288,12 @@ const Database::DatabaseEntry PadtransposeComplexDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
}
},
{ // Intel accelerators
@ -278,6 +306,7 @@ const Database::DatabaseEntry PadtransposeComplexDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 1070", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 1080", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 670", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
@ -287,6 +316,7 @@ const Database::DatabaseEntry PadtransposeComplexDouble = {
{ "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN Black", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "TITAN X (Pascal)", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
{ "Tesla K20m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },


@ -17,6 +17,12 @@ namespace database {
const Database::DatabaseEntry TransposeHalf = {
"Transpose", Precision::kHalf, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "Ellesmere", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
@ -26,7 +32,7 @@ const Database::DatabaseEntry TransposeHalf = {
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
}
},
}
@ -39,12 +45,15 @@ const Database::DatabaseEntry TransposeSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
{ "ATI Radeon HD 6750M", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Ellesmere", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Hawaii", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
{ "Oland", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Pitcairn", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Tonga", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Turks", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
}
},
{ // ARM GPUs
@ -55,8 +64,10 @@ const Database::DatabaseEntry TransposeSingle = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",16} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",16} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
}
@ -83,6 +94,7 @@ const Database::DatabaseEntry TransposeSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "GeForce GTX 1070", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "GeForce GTX 1080", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX 670", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
@ -92,6 +104,7 @@ const Database::DatabaseEntry TransposeSingle = {
{ "GeForce GTX TITAN", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "GeForce GTX TITAN Black", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "GeForce GTX TITAN X", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "TITAN X (Pascal)", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Tesla K20m", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Tesla K40m", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
@ -112,12 +125,15 @@ const Database::DatabaseEntry TransposeComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "ATI Radeon HD 6750M", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Ellesmere", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Hawaii", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Oland", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Pitcairn", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Tonga", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Turks", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
}
},
{ // ARM GPUs
@ -128,8 +144,10 @@ const Database::DatabaseEntry TransposeComplexSingle = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
}
@ -150,6 +168,7 @@ const Database::DatabaseEntry TransposeComplexSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "GeForce GTX 1070", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "GeForce GTX 1080", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 670", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
@ -159,6 +178,7 @@ const Database::DatabaseEntry TransposeComplexSingle = {
{ "GeForce GTX TITAN", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN Black", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "TITAN X (Pascal)", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
@ -166,7 +186,7 @@ const Database::DatabaseEntry TransposeComplexSingle = {
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
}
},
}
@ -179,6 +199,7 @@ const Database::DatabaseEntry TransposeDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Ellesmere", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Hawaii", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Oland", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
@ -195,10 +216,12 @@ const Database::DatabaseEntry TransposeDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",16} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
}
},
{ // Intel accelerators
@ -211,6 +234,7 @@ const Database::DatabaseEntry TransposeDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "GeForce GTX 1070", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "GeForce GTX 1080", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX 670", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
@ -220,6 +244,7 @@ const Database::DatabaseEntry TransposeDouble = {
{ "GeForce GTX TITAN", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX TITAN Black", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "TITAN X (Pascal)", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
@ -240,6 +265,7 @@ const Database::DatabaseEntry TransposeComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Ellesmere", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Hawaii", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Oland", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
@ -256,16 +282,19 @@ const Database::DatabaseEntry TransposeComplexDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "GeForce GTX 1070", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 1080", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 670", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
@ -275,6 +304,7 @@ const Database::DatabaseEntry TransposeComplexDouble = {
{ "GeForce GTX TITAN", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN Black", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "TITAN X (Pascal)", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },


@ -17,6 +17,12 @@ namespace database {
const Database::DatabaseEntry XaxpyHalf = {
"Xaxpy", Precision::kHalf, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "Ellesmere", { {"VW",4}, {"WGS",128}, {"WPT",4} } },
{ "default", { {"VW",4}, {"WGS",128}, {"WPT",4} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
@ -26,7 +32,7 @@ const Database::DatabaseEntry XaxpyHalf = {
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW",8}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",8}, {"WGS",256}, {"WPT",4} } },
}
},
}
@ -39,12 +45,15 @@ const Database::DatabaseEntry XaxpySingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "ATI Radeon HD 6750M", { {"VW",1}, {"WGS",256}, {"WPT",2} } },
{ "Ellesmere", { {"VW",1}, {"WGS",64}, {"WPT",4} } },
{ "Hawaii", { {"VW",2}, {"WGS",64}, {"WPT",2} } },
{ "Oland", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Pitcairn", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "Tahiti", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "Tonga", { {"VW",1}, {"WGS",256}, {"WPT",8} } },
{ "default", { {"VW",2}, {"WGS",64}, {"WPT",2} } },
{ "Turks", { {"VW",2}, {"WGS",256}, {"WPT",1} } },
{ "default", { {"VW",2}, {"WGS",256}, {"WPT",1} } },
}
},
{ // ARM GPUs
@ -55,10 +64,12 @@ const Database::DatabaseEntry XaxpySingle = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW",8}, {"WGS",512}, {"WPT",1} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",2}, {"WGS",256}, {"WPT",1} } },
{ "default", { {"VW",8}, {"WGS",512}, {"WPT",1} } },
}
},
{ // Intel GPUs
@ -83,6 +94,7 @@ const Database::DatabaseEntry XaxpySingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 1070", { {"VW",1}, {"WGS",64}, {"WPT",4} } },
{ "GeForce GTX 1080", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX 480", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "GeForce GTX 670", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
@ -92,9 +104,10 @@ const Database::DatabaseEntry XaxpySingle = {
{ "GeForce GTX TITAN", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX TITAN Black", { {"VW",4}, {"WGS",128}, {"WPT",4} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "TITAN X (Pascal)", { {"VW",4}, {"WGS",128}, {"WPT",1} } },
{ "Tesla K20m", { {"VW",4}, {"WGS",128}, {"WPT",1} } },
{ "Tesla K40m", { {"VW",4}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",4}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
}
},
{ // Default
@ -112,11 +125,14 @@ const Database::DatabaseEntry XaxpyComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",2}, {"WGS",64}, {"WPT",8} } },
{ "ATI Radeon HD 6750M", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Ellesmere", { {"VW",2}, {"WGS",256}, {"WPT",1} } },
{ "Hawaii", { {"VW",1}, {"WGS",128}, {"WPT",2} } },
{ "Oland", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Pitcairn", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Tonga", { {"VW",1}, {"WGS",256}, {"WPT",8} } },
{ "Turks", { {"VW",2}, {"WGS",256}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
}
},
@ -128,8 +144,10 @@ const Database::DatabaseEntry XaxpyComplexSingle = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW",4}, {"WGS",1024}, {"WPT",1} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS",1024}, {"WPT",2} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW",4}, {"WGS",1024}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
{ "default", { {"VW",8}, {"WGS",1024}, {"WPT",1} } },
}
@ -156,6 +174,7 @@ const Database::DatabaseEntry XaxpyComplexSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
{ "GeForce GTX 1070", { {"VW",1}, {"WGS",64}, {"WPT",2} } },
{ "GeForce GTX 1080", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 480", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX 670", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
@ -165,6 +184,7 @@ const Database::DatabaseEntry XaxpyComplexSingle = {
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX TITAN Black", { {"VW",1}, {"WGS",128}, {"WPT",2} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
{ "TITAN X (Pascal)", { {"VW",2}, {"WGS",512}, {"WPT",1} } },
{ "Tesla K20m", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Tesla K40m", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
@ -185,6 +205,7 @@ const Database::DatabaseEntry XaxpyDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "Ellesmere", { {"VW",2}, {"WGS",64}, {"WPT",4} } },
{ "Hawaii", { {"VW",1}, {"WGS",64}, {"WPT",2} } },
{ "Oland", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
@ -201,10 +222,12 @@ const Database::DatabaseEntry XaxpyDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW",4}, {"WGS",64}, {"WPT",1} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",8}, {"WGS",64}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW",8}, {"WGS",256}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",8}, {"WGS",2048}, {"WPT",1} } },
{ "default", { {"VW",8}, {"WGS",512}, {"WPT",1} } },
{ "default", { {"VW",8}, {"WGS",64}, {"WPT",1} } },
}
},
{ // Intel accelerators
@ -217,6 +240,7 @@ const Database::DatabaseEntry XaxpyDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 1070", { {"VW",1}, {"WGS",64}, {"WPT",8} } },
{ "GeForce GTX 1080", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "GeForce GTX 480", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "GeForce GTX 670", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
@ -226,14 +250,15 @@ const Database::DatabaseEntry XaxpyDouble = {
{ "GeForce GTX TITAN", { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
{ "GeForce GTX TITAN Black", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
{ "TITAN X (Pascal)", { {"VW",2}, {"WGS",512}, {"WPT",1} } },
{ "Tesla K20m", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "Tesla K40m", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",2} } },
{ "default", { {"VW",2}, {"WGS",256}, {"WPT",1} } },
}
},
}
@ -246,6 +271,7 @@ const Database::DatabaseEntry XaxpyComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Ellesmere", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Hawaii", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "Oland", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
@ -262,8 +288,10 @@ const Database::DatabaseEntry XaxpyComplexDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW",4}, {"WGS",1024}, {"WPT",1} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",8}, {"WGS",128}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",8}, {"WGS",512}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW",8}, {"WGS",1024}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "default", { {"VW",4}, {"WGS",1024}, {"WPT",1} } },
}
@ -278,6 +306,7 @@ const Database::DatabaseEntry XaxpyComplexDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 1070", { {"VW",1}, {"WGS",64}, {"WPT",2} } },
{ "GeForce GTX 1080", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX 480", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "GeForce GTX 670", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
@ -287,6 +316,7 @@ const Database::DatabaseEntry XaxpyComplexDouble = {
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS",64}, {"WPT",4} } },
{ "GeForce GTX TITAN Black", { {"VW",1}, {"WGS",128}, {"WPT",4} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "TITAN X (Pascal)", { {"VW",1}, {"WGS",256}, {"WPT",2} } },
{ "Tesla K20m", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Tesla K40m", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
@ -294,7 +324,7 @@ const Database::DatabaseEntry XaxpyComplexDouble = {
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
}
},
}
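For the Xaxpy entries above, VW is the vector width, WGS the work-group size, and WPT the number of (vectorized) operations per thread, so each work-item handles VW*WPT scalars. A minimal sketch of the resulting launch size under that assumption (XaxpyGlobalSize is a hypothetical name):

#include <cstddef>

// Work-items to launch for an AXPY over 'n' elements, assuming each thread
// processes VW*WPT scalars; rounded up to a whole number of work-groups.
size_t XaxpyGlobalSize(const size_t n, const size_t vw,
                       const size_t wgs, const size_t wpt) {
  const size_t per_thread = vw * wpt;                  // scalars per work-item
  const size_t threads = (n + per_thread - 1) / per_thread;
  return ((threads + wgs - 1) / wgs) * wgs;            // pad to full groups
}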


@ -17,6 +17,12 @@ namespace database {
const Database::DatabaseEntry XdotHalf = {
"Xdot", Precision::kHalf, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "Ellesmere", { {"WGS1",256}, {"WGS2",64} } },
{ "default", { {"WGS1",256}, {"WGS2",64} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",32} } },
@ -39,17 +45,22 @@ const Database::DatabaseEntry XdotSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WGS2",32} } },
{ "ATI Radeon HD 6750M", { {"WGS1",256}, {"WGS2",32} } },
{ "Ellesmere", { {"WGS1",128}, {"WGS2",32} } },
{ "Oland", { {"WGS1",256}, {"WGS2",32} } },
{ "Pitcairn", { {"WGS1",128}, {"WGS2",32} } },
{ "Tahiti", { {"WGS1",128}, {"WGS2",32} } },
{ "Tonga", { {"WGS1",64}, {"WGS2",32} } },
{ "Turks", { {"WGS1",128}, {"WGS2",64} } },
{ "default", { {"WGS1",128}, {"WGS2",32} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",32}, {"WGS2",32} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",1024}, {"WGS2",32} } },
{ "default", { {"WGS1",1024}, {"WGS2",32} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",64}, {"WGS2",32} } },
{ "default", { {"WGS1",64}, {"WGS2",32} } },
}
},
{ // Intel GPUs
@ -67,6 +78,7 @@ const Database::DatabaseEntry XdotSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",128}, {"WGS2",32} } },
{ "GeForce GTX 1070", { {"WGS1",128}, {"WGS2",1024} } },
{ "GeForce GTX 1080", { {"WGS1",512}, {"WGS2",64} } },
{ "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } },
{ "GeForce GTX 670", { {"WGS1",512}, {"WGS2",1024} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",128} } },
@ -75,13 +87,14 @@ const Database::DatabaseEntry XdotSingle = {
{ "GeForce GTX 980", { {"WGS1",256}, {"WGS2",32} } },
{ "GeForce GTX TITAN Black", { {"WGS1",512}, {"WGS2",64} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
{ "TITAN X (Pascal)", { {"WGS1",1024}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",1024}, {"WGS2",32} } },
{ "default", { {"WGS1",256}, {"WGS2",256} } },
{ "default", { {"WGS1",256}, {"WGS2",64} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",256}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",32} } },
}
},
}
@ -94,17 +107,22 @@ const Database::DatabaseEntry XdotComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",32} } },
{ "ATI Radeon HD 6750M", { {"WGS1",256}, {"WGS2",256} } },
{ "Ellesmere", { {"WGS1",256}, {"WGS2",32} } },
{ "Oland", { {"WGS1",128}, {"WGS2",32} } },
{ "Pitcairn", { {"WGS1",256}, {"WGS2",32} } },
{ "Tahiti", { {"WGS1",64}, {"WGS2",32} } },
{ "Tonga", { {"WGS1",256}, {"WGS2",64} } },
{ "default", { {"WGS1",256}, {"WGS2",64} } },
{ "Turks", { {"WGS1",128}, {"WGS2",32} } },
{ "default", { {"WGS1",256}, {"WGS2",32} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",128}, {"WGS2",64} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",1024}, {"WGS2",32} } },
{ "default", { {"WGS1",1024}, {"WGS2",32} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",256}, {"WGS2",32} } },
{ "default", { {"WGS1",256}, {"WGS2",32} } },
}
},
{ // Intel GPUs
@ -122,6 +140,7 @@ const Database::DatabaseEntry XdotComplexSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX 1070", { {"WGS1",128}, {"WGS2",32} } },
{ "GeForce GTX 1080", { {"WGS1",128}, {"WGS2",64} } },
{ "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } },
{ "GeForce GTX 670", { {"WGS1",256}, {"WGS2",32} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } },
@ -130,13 +149,14 @@ const Database::DatabaseEntry XdotComplexSingle = {
{ "GeForce GTX 980", { {"WGS1",256}, {"WGS2",64} } },
{ "GeForce GTX TITAN Black", { {"WGS1",128}, {"WGS2",64} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
{ "TITAN X (Pascal)", { {"WGS1",256}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } },
{ "default", { {"WGS1",512}, {"WGS2",64} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",256}, {"WGS2",64} } },
{ "default", { {"WGS1",256}, {"WGS2",32} } },
}
},
}
@ -149,6 +169,7 @@ const Database::DatabaseEntry XdotDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",128} } },
{ "Ellesmere", { {"WGS1",128}, {"WGS2",64} } },
{ "Oland", { {"WGS1",256}, {"WGS2",32} } },
{ "Pitcairn", { {"WGS1",128}, {"WGS2",32} } },
{ "Tahiti", { {"WGS1",256}, {"WGS2",32} } },
@ -158,14 +179,17 @@ const Database::DatabaseEntry XdotDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",64}, {"WGS2",128} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",64} } },
{ "default", { {"WGS1",512}, {"WGS2",64} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",256}, {"WGS2",32} } },
{ "default", { {"WGS1",256}, {"WGS2",64} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",128}, {"WGS2",32} } },
{ "GeForce GTX 1070", { {"WGS1",128}, {"WGS2",512} } },
{ "GeForce GTX 1080", { {"WGS1",128}, {"WGS2",128} } },
{ "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } },
{ "GeForce GTX 670", { {"WGS1",256}, {"WGS2",32} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } },
@ -174,8 +198,9 @@ const Database::DatabaseEntry XdotDouble = {
{ "GeForce GTX 980", { {"WGS1",128}, {"WGS2",32} } },
{ "GeForce GTX TITAN Black", { {"WGS1",128}, {"WGS2",64} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
{ "TITAN X (Pascal)", { {"WGS1",128}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",64} } },
{ "default", { {"WGS1",128}, {"WGS2",128} } },
}
},
{ // Default
@ -193,6 +218,7 @@ const Database::DatabaseEntry XdotComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",32} } },
{ "Ellesmere", { {"WGS1",256}, {"WGS2",32} } },
{ "Oland", { {"WGS1",256}, {"WGS2",32} } },
{ "Pitcairn", { {"WGS1",256}, {"WGS2",32} } },
{ "Tahiti", { {"WGS1",256}, {"WGS2",32} } },
@ -202,14 +228,17 @@ const Database::DatabaseEntry XdotComplexDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",32}, {"WGS2",128} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",1024}, {"WGS2",32} } },
{ "default", { {"WGS1",1024}, {"WGS2",32} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",32}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",32} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX 1070", { {"WGS1",128}, {"WGS2",64} } },
{ "GeForce GTX 1080", { {"WGS1",128}, {"WGS2",32} } },
{ "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } },
{ "GeForce GTX 670", { {"WGS1",512}, {"WGS2",128} } },
{ "GeForce GTX 680", { {"WGS1",256}, {"WGS2",64} } },
@ -218,13 +247,14 @@ const Database::DatabaseEntry XdotComplexDouble = {
{ "GeForce GTX 980", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX TITAN Black", { {"WGS1",128}, {"WGS2",32} } },
{ "GeForce GTX TITAN X", { {"WGS1",128}, {"WGS2",32} } },
{ "TITAN X (Pascal)", { {"WGS1",128}, {"WGS2",64} } },
{ "Tesla K20m", { {"WGS1",128}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",64} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",256}, {"WGS2",64} } },
{ "default", { {"WGS1",128}, {"WGS2",32} } },
}
},
}
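The WGS1/WGS2 pairs in the Xdot entries above are consistent with a two-stage reduction: a first kernel whose work-groups of WGS1 threads each emit one partial sum, and a second kernel in which a single group of WGS2 threads folds the partials into the final dot product. A hedged host-side outline under the further assumption that the first stage launches WGS2 work-groups (hypothetical names, not the CLBlast routine code):

#include <cstddef>

// Hypothetical two-stage launch sizes consistent with the WGS1/WGS2 pairs
// above (assumption: stage 1 runs WGS2 work-groups of WGS1 threads each).
struct DotLaunch {
  size_t stage1_global, stage1_local;  // partial sums: one per work-group
  size_t stage2_global, stage2_local;  // final reduction in a single group
};

DotLaunch XdotLaunch(const size_t wgs1, const size_t wgs2) {
  DotLaunch l;
  l.stage1_local  = wgs1;
  l.stage1_global = wgs1 * wgs2;  // WGS2 groups -> WGS2 partial results
  l.stage2_local  = wgs2;
  l.stage2_global = wgs2;         // one work-group reduces all partials
  return l;
}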


@ -17,6 +17,12 @@ namespace database {
const Database::DatabaseEntry XgemmHalf = {
"Xgemm", Precision::kHalf, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "Ellesmere", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
@ -38,12 +44,15 @@ const Database::DatabaseEntry XgemmSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",8} } },
{ "ATI Radeon HD 6750M", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",8}, {"VWN",8} } },
{ "Ellesmere", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "Hawaii", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } },
{ "Oland", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
{ "Tonga", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",8} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "Turks", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
}
},
{ // ARM GPUs
@ -54,10 +63,12 @@ const Database::DatabaseEntry XgemmSingle = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",8} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
}
},
{ // Intel GPUs
@ -82,6 +93,7 @@ const Database::DatabaseEntry XgemmSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
{ "GeForce GTX 1070", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
{ "GeForce GTX 1080", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } },
{ "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX 670", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
@ -91,9 +103,10 @@ const Database::DatabaseEntry XgemmSingle = {
{ "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX TITAN Black", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } },
{ "TITAN X (Pascal)", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
{ "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
}
},
{ // Default
@ -111,12 +124,15 @@ const Database::DatabaseEntry XgemmComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
{ "ATI Radeon HD 6750M", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
{ "Ellesmere", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "Hawaii", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Oland", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
{ "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } },
{ "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
{ "Tonga", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",2} } },
{ "Turks", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
}
},
{ // ARM GPUs
@ -127,10 +143,12 @@ const Database::DatabaseEntry XgemmComplexSingle = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",2} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
}
},
{ // Intel GPUs
@ -155,6 +173,7 @@ const Database::DatabaseEntry XgemmComplexSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "GeForce GTX 1070", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
{ "GeForce GTX 1080", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX 670", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX 680", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
@ -164,6 +183,7 @@ const Database::DatabaseEntry XgemmComplexSingle = {
{ "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX TITAN Black", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
{ "TITAN X (Pascal)", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "Tesla K40m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
@ -184,6 +204,7 @@ const Database::DatabaseEntry XgemmDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
{ "Ellesmere", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
{ "Hawaii", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "Oland", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
@ -200,10 +221,12 @@ const Database::DatabaseEntry XgemmDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",8} } },
{ "default", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
}
},
{ // Intel accelerators
@ -216,6 +239,7 @@ const Database::DatabaseEntry XgemmDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX 1070", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
{ "GeForce GTX 1080", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "GeForce GTX 670", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
@ -225,14 +249,15 @@ const Database::DatabaseEntry XgemmDouble = {
{ "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX TITAN Black", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",16}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "TITAN X (Pascal)", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K40m", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
}
},
}
@ -245,12 +270,13 @@ const Database::DatabaseEntry XgemmComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "Ellesmere", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Hawaii", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "Oland", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tonga", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
{ // ARM GPUs
@ -261,10 +287,12 @@ const Database::DatabaseEntry XgemmComplexDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",8} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
}
},
{ // Intel accelerators
@ -277,6 +305,7 @@ const Database::DatabaseEntry XgemmComplexDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX 1070", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
{ "GeForce GTX 1080", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX 670", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",2} } },
{ "GeForce GTX 680", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
@ -285,14 +314,15 @@ const Database::DatabaseEntry XgemmComplexDouble = {
{ "GeForce GTX 980", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX TITAN Black", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX TITAN X", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "TITAN X (Pascal)", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
}
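Note: each database entry above follows the same pattern: parameters are grouped per device type and vendor, then per device name, with a per-vendor "default" row for untuned devices and a final vendor-independent default as a last resort. A minimal C++ sketch of that two-level fallback (illustrative only; the names and types here are hypothetical, not CLBlast's internal API):

#include <cstddef>
#include <map>
#include <string>
#include <vector>

using Parameters = std::map<std::string, std::size_t>;
struct DeviceEntry { std::string name; Parameters parameters; };
struct VendorEntry { std::string type; std::string vendor; std::vector<DeviceEntry> devices; };

// Returns the tuned parameters for a device, falling back to the vendor's "default" row.
Parameters Lookup(const std::vector<VendorEntry> &entries, const std::string &type,
                  const std::string &vendor, const std::string &device) {
  for (const auto &v : entries) {
    if ((v.type == type || v.type == "all") && v.vendor == vendor) {
      for (const auto &d : v.devices) { if (d.name == device) { return d.parameters; } }
      for (const auto &d : v.devices) { if (d.name == "default") { return d.parameters; } }
    }
  }
  return Parameters{};  // in practice the vendor-independent "default" entry catches this case
}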

View File

@ -17,6 +17,12 @@ namespace database {
const Database::DatabaseEntry XgemmDirectHalf = {
"XgemmDirect", Precision::kHalf, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "Ellesmere", { {"KWID",8}, {"MDIMAD",32}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",32}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
{ "default", { {"KWID",8}, {"MDIMAD",32}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",32}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",8} } },
@ -25,7 +31,7 @@ const Database::DatabaseEntry XgemmDirectHalf = {
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",8} } },
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
}
@ -38,8 +44,18 @@ const Database::DatabaseEntry XgemmDirectSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
{ "ATI Radeon HD 6750M", { {"KWID",8}, {"MDIMAD",8}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",0}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
{ "Ellesmere", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",32}, {"NDIMCD",32}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, {"WGD",32} } },
{ "Tonga", { {"KWID",16}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",32}, {"NDIMCD",8}, {"PADA",0}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "Turks", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",0}, {"PADB",0}, {"VWMD",1}, {"VWND",8}, {"WGD",64} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",0}, {"PADB",0}, {"VWMD",2}, {"VWND",2}, {"WGD",64} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } },
}
},
{ // Intel GPUs
@ -51,9 +67,11 @@ const Database::DatabaseEntry XgemmDirectSingle = {
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 1080", { {"KWID",16}, {"MDIMAD",16}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
{ "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
{ "GeForce GTX TITAN Black", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
{ "TITAN X (Pascal)", { {"KWID",8}, {"MDIMAD",32}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
}
},
{ // Default
@ -71,10 +89,19 @@ const Database::DatabaseEntry XgemmDirectComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "ATI Radeon HD 6750M", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",8} } },
{ "Tonga", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
{ "Turks", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",16} } },
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",0}, {"PADB",0}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, {"WGD",32} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
@ -84,14 +111,16 @@ const Database::DatabaseEntry XgemmDirectComplexSingle = {
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 1080", { {"KWID",8}, {"MDIMAD",8}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
{ "GeForce GTX 750 Ti", { {"KWID",16}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, {"WGD",16} } },
{ "GeForce GTX TITAN Black", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "TITAN X (Pascal)", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } },
}
},
}
@ -104,20 +133,30 @@ const Database::DatabaseEntry XgemmDirectDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "Ellesmere", { {"KWID",8}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, {"WGD",32} } },
{ "Tonga", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 1080", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } },
{ "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } },
{ "GeForce GTX TITAN Black", { {"KWID",8}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",0}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "TITAN X (Pascal)", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",16} } },
}
},
}
@ -130,20 +169,30 @@ const Database::DatabaseEntry XgemmDirectComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "Ellesmere", { {"KWID",16}, {"MDIMAD",32}, {"MDIMCD",32}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",0}, {"PADB",0}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
{ "Tonga", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",32}, {"NDIMCD",8}, {"PADA",0}, {"PADB",0}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"KWID",8}, {"MDIMAD",16}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",0}, {"PADB",0}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",16} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 1080", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",32}, {"MDIMCD",32}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
{ "GeForce GTX TITAN Black", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",8} } },
{ "TITAN X (Pascal)", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } },
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } },
}
},
}
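For context on how such compile-time parameters are consumed: they are typically turned into preprocessor defines prepended to the OpenCL kernel source before compilation, so each value above (KWID, WGD, and so on) becomes a #define. A hedged C++ sketch of that step (the function name is illustrative, not CLBlast's actual code):

#include <cstddef>
#include <map>
#include <sstream>
#include <string>

// Serializes a parameter map into a block of preprocessor defines for the kernel compiler.
std::string AsDefines(const std::map<std::string, std::size_t> &parameters) {
  std::ostringstream result;
  for (const auto &parameter : parameters) {
    result << "#define " << parameter.first << " " << parameter.second << "\n";
  }
  return result.str();  // e.g. "#define KWID 2\n#define WGD 32\n..."
}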

View File

@ -17,6 +17,12 @@ namespace database {
const Database::DatabaseEntry XgemvHalf = {
"Xgemv", Precision::kHalf, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "Ellesmere", { {"WGS1",256}, {"WPT1",1} } },
{ "default", { {"WGS1",256}, {"WPT1",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",64}, {"WPT1",1} } },
@ -39,18 +45,23 @@ const Database::DatabaseEntry XgemvSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WPT1",1} } },
{ "ATI Radeon HD 6750M", { {"WGS1",32}, {"WPT1",1} } },
{ "Ellesmere", { {"WGS1",256}, {"WPT1",1} } },
{ "Hawaii", { {"WGS1",128}, {"WPT1",1} } },
{ "Oland", { {"WGS1",128}, {"WPT1",1} } },
{ "Pitcairn", { {"WGS1",256}, {"WPT1",1} } },
{ "Tahiti", { {"WGS1",256}, {"WPT1",1} } },
{ "Tonga", { {"WGS1",128}, {"WPT1",2} } },
{ "Turks", { {"WGS1",32}, {"WPT1",1} } },
{ "default", { {"WGS1",128}, {"WPT1",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",128}, {"WPT1",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",64}, {"WPT1",4} } },
{ "default", { {"WGS1",64}, {"WPT1",4} } },
}
},
@ -62,7 +73,7 @@ const Database::DatabaseEntry XgemvSingle = {
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"WGS1",256}, {"WPT1",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",32}, {"WPT1",1} } },
{ "Iris", { {"WGS1",64}, {"WPT1",2} } },
{ "Iris Pro", { {"WGS1",256}, {"WPT1",2} } },
{ "Iris Pro", { {"WGS1",128}, {"WPT1",1} } },
{ "default", { {"WGS1",128}, {"WPT1",1} } },
}
},
@ -76,6 +87,7 @@ const Database::DatabaseEntry XgemvSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX 1070", { {"WGS1",128}, {"WPT1",1} } },
{ "GeForce GTX 1080", { {"WGS1",32}, {"WPT1",1} } },
{ "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 670", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 680", { {"WGS1",256}, {"WPT1",1} } },
@ -85,6 +97,7 @@ const Database::DatabaseEntry XgemvSingle = {
{ "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX TITAN Black", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WPT1",1} } },
{ "TITAN X (Pascal)", { {"WGS1",32}, {"WPT1",1} } },
{ "Tesla K20m", { {"WGS1",128}, {"WPT1",1} } },
{ "Tesla K40m", { {"WGS1",256}, {"WPT1",1} } },
{ "default", { {"WGS1",256}, {"WPT1",1} } },
@ -105,19 +118,24 @@ const Database::DatabaseEntry XgemvComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1} } },
{ "ATI Radeon HD 6750M", { {"WGS1",64}, {"WPT1",1} } },
{ "Ellesmere", { {"WGS1",32}, {"WPT1",1} } },
{ "Hawaii", { {"WGS1",64}, {"WPT1",1} } },
{ "Oland", { {"WGS1",64}, {"WPT1",1} } },
{ "Pitcairn", { {"WGS1",64}, {"WPT1",1} } },
{ "Tahiti", { {"WGS1",64}, {"WPT1",1} } },
{ "Tonga", { {"WGS1",32}, {"WPT1",1} } },
{ "Turks", { {"WGS1",64}, {"WPT1",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",32}, {"WPT1",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",128}, {"WPT1",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",64}, {"WPT1",4} } },
{ "default", { {"WGS1",64}, {"WPT1",2} } },
}
},
{ // Intel GPUs
@ -142,6 +160,7 @@ const Database::DatabaseEntry XgemvComplexSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX 1070", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 1080", { {"WGS1",32}, {"WPT1",1} } },
{ "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 670", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 680", { {"WGS1",64}, {"WPT1",1} } },
@ -149,6 +168,7 @@ const Database::DatabaseEntry XgemvComplexSingle = {
{ "GeForce GTX 750 Ti", { {"WGS1",32}, {"WPT1",1} } },
{ "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX TITAN Black", { {"WGS1",32}, {"WPT1",1} } },
{ "TITAN X (Pascal)", { {"WGS1",32}, {"WPT1",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
}
},
@ -167,6 +187,7 @@ const Database::DatabaseEntry XgemvDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1} } },
{ "Ellesmere", { {"WGS1",32}, {"WPT1",1} } },
{ "Hawaii", { {"WGS1",128}, {"WPT1",1} } },
{ "Oland", { {"WGS1",256}, {"WPT1",1} } },
{ "Pitcairn", { {"WGS1",256}, {"WPT1",1} } },
@ -177,8 +198,10 @@ const Database::DatabaseEntry XgemvDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",64}, {"WPT1",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",64}, {"WPT1",4} } },
{ "default", { {"WGS1",64}, {"WPT1",4} } },
}
},
@ -192,6 +215,7 @@ const Database::DatabaseEntry XgemvDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",128}, {"WPT1",1} } },
{ "GeForce GTX 1070", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 1080", { {"WGS1",32}, {"WPT1",1} } },
{ "GeForce GTX 480", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX 670", { {"WGS1",128}, {"WPT1",1} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WPT1",1} } },
@ -201,6 +225,7 @@ const Database::DatabaseEntry XgemvDouble = {
{ "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX TITAN Black", { {"WGS1",32}, {"WPT1",1} } },
{ "GeForce GTX TITAN X", { {"WGS1",64}, {"WPT1",1} } },
{ "TITAN X (Pascal)", { {"WGS1",32}, {"WPT1",1} } },
{ "Tesla K20m", { {"WGS1",256}, {"WPT1",1} } },
{ "Tesla K40m", { {"WGS1",256}, {"WPT1",1} } },
{ "default", { {"WGS1",128}, {"WPT1",1} } },
@ -221,6 +246,7 @@ const Database::DatabaseEntry XgemvComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1} } },
{ "Ellesmere", { {"WGS1",32}, {"WPT1",1} } },
{ "Hawaii", { {"WGS1",64}, {"WPT1",1} } },
{ "Oland", { {"WGS1",256}, {"WPT1",1} } },
{ "Pitcairn", { {"WGS1",256}, {"WPT1",1} } },
@ -231,8 +257,10 @@ const Database::DatabaseEntry XgemvComplexDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",64}, {"WPT1",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",32}, {"WPT1",4} } },
{ "default", { {"WGS1",64}, {"WPT1",4} } },
}
},
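As an aside on the Xgemv parameter names: WGS1 sets the work-group size and WPT1 the amount of work per thread, so the launch geometry shrinks by a factor WPT1 and rounds up to a multiple of WGS1. A small, hypothetical sketch of that computation (not taken from CLBlast's sources):

#include <cstddef>

// Global work size for a kernel where each of WGS1 threads per group covers WPT1 rows.
std::size_t GlobalSize(std::size_t m, std::size_t wgs1, std::size_t wpt1) {
  const std::size_t threads = (m + wpt1 - 1) / wpt1;  // one thread per WPT1 rows
  return ((threads + wgs1 - 1) / wgs1) * wgs1;        // rounded up to the work-group size
}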

View File

@ -17,6 +17,12 @@ namespace database {
const Database::DatabaseEntry XgemvFastHalf = {
"XgemvFast", Precision::kHalf, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "Ellesmere", { {"VW2",1}, {"WGS2",32}, {"WPT2",1} } },
{ "default", { {"VW2",1}, {"WGS2",32}, {"WPT2",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW2",1}, {"WGS2",16}, {"WPT2",1} } },
@ -39,19 +45,24 @@ const Database::DatabaseEntry XgemvFastSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "ATI Radeon HD 6750M", { {"VW2",2}, {"WGS2",64}, {"WPT2",2} } },
{ "Ellesmere", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Hawaii", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Oland", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tahiti", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tonga", { {"VW2",1}, {"WGS2",16}, {"WPT2",4} } },
{ "Turks", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW2",1}, {"WGS2",32}, {"WPT2",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } },
{ "default", { {"VW2",4}, {"WGS2",64}, {"WPT2",4} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW2",2}, {"WGS2",16}, {"WPT2",4} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } },
}
},
{ // Intel GPUs
@ -62,7 +73,7 @@ const Database::DatabaseEntry XgemvFastSingle = {
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"VW2",1}, {"WGS2",64}, {"WPT2",2} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } },
{ "Iris", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } },
{ "Iris Pro", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } },
{ "Iris Pro", { {"VW2",4}, {"WGS2",64}, {"WPT2",4} } },
{ "default", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } },
}
},
@ -76,6 +87,7 @@ const Database::DatabaseEntry XgemvFastSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } },
{ "GeForce GTX 1070", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX 1080", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "GeForce GTX 480", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "GeForce GTX 670", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } },
{ "GeForce GTX 680", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
@ -85,6 +97,7 @@ const Database::DatabaseEntry XgemvFastSingle = {
{ "GeForce GTX TITAN", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX TITAN Black", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX TITAN X", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "TITAN X (Pascal)", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tesla K20m", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "Tesla K40m", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "default", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
@ -105,18 +118,23 @@ const Database::DatabaseEntry XgemvFastComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } },
{ "ATI Radeon HD 6750M", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "Ellesmere", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Hawaii", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Oland", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tahiti", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "Tonga", { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } },
{ "Turks", { {"VW2",1}, {"WGS2",16}, {"WPT2",1} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW2",2}, {"WGS2",64}, {"WPT2",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW2",4}, {"WGS2",64}, {"WPT2",4} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW2",4}, {"WGS2",16}, {"WPT2",4} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",2} } },
}
},
@ -163,6 +181,7 @@ const Database::DatabaseEntry XgemvFastDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "Ellesmere", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "Hawaii", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Oland", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
@ -173,8 +192,10 @@ const Database::DatabaseEntry XgemvFastDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW2",1}, {"WGS2",16}, {"WPT2",4} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } },
}
},
@ -188,6 +209,7 @@ const Database::DatabaseEntry XgemvFastDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX 1070", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX 1080", { {"VW2",1}, {"WGS2",32}, {"WPT2",2} } },
{ "GeForce GTX 480", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "GeForce GTX 670", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "GeForce GTX 680", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
@ -197,6 +219,7 @@ const Database::DatabaseEntry XgemvFastDouble = {
{ "GeForce GTX TITAN", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX TITAN Black", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX TITAN X", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "TITAN X (Pascal)", { {"VW2",1}, {"WGS2",32}, {"WPT2",1} } },
{ "Tesla K20m", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "Tesla K40m", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "default", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
@ -217,6 +240,7 @@ const Database::DatabaseEntry XgemvFastComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "Ellesmere", { {"VW2",1}, {"WGS2",16}, {"WPT2",1} } },
{ "Hawaii", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Oland", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
@ -227,9 +251,11 @@ const Database::DatabaseEntry XgemvFastComplexDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW2",4}, {"WGS2",32}, {"WPT2",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW2",2}, {"WGS2",64}, {"WPT2",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW2",4}, {"WGS2",64}, {"WPT2",4} } },
{ "default", { {"VW2",2}, {"WGS2",64}, {"WPT2",4} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW2",1}, {"WGS2",16}, {"WPT2",2} } },
{ "default", { {"VW2",4}, {"WGS2",64}, {"WPT2",4} } },
}
},
{ // Intel accelerators

View File

@ -17,6 +17,12 @@ namespace database {
const Database::DatabaseEntry XgemvFastRotHalf = {
"XgemvFastRot", Precision::kHalf, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "Ellesmere", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
{ "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",8}, {"WGS3",128}, {"WPT3",32} } },
@ -38,14 +44,19 @@ const Database::DatabaseEntry XgemvFastRotSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } },
{ "ATI Radeon HD 6750M", { {"VW3",8}, {"WGS3",128}, {"WPT3",16} } },
{ "Ellesmere", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
{ "Tonga", { {"VW3",8}, {"WGS3",128}, {"WPT3",32} } },
{ "Turks", { {"VW3",8}, {"WGS3",128}, {"WPT3",16} } },
{ "default", { {"VW3",8}, {"WGS3",128}, {"WPT3",32} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW3",8}, {"WGS3",16}, {"WPT3",8} } },
{ "default", { {"VW3",8}, {"WGS3",16}, {"WPT3",8} } },
{ "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
}
},
{ // Intel GPUs
@ -54,21 +65,23 @@ const Database::DatabaseEntry XgemvFastRotSingle = {
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW3",4}, {"WGS3",64}, {"WPT3",16} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"VW3",2}, {"WGS3",32}, {"WPT3",16} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",4}, {"WGS3",64}, {"WPT3",16} } },
{ "Iris Pro", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
{ "Iris Pro", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
{ "default", { {"VW3",4}, {"WGS3",64}, {"WPT3",16} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 1080", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
{ "GeForce GTX 750 Ti", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
{ "GeForce GTX TITAN", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
{ "GeForce GTX TITAN Black", { {"VW3",4}, {"WGS3",128}, {"WPT3",16} } },
{ "TITAN X (Pascal)", { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } },
{ "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW3",8}, {"WGS3",128}, {"WPT3",32} } },
{ "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
}
},
}
@ -81,14 +94,19 @@ const Database::DatabaseEntry XgemvFastRotComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } },
{ "ATI Radeon HD 6750M", { {"VW3",8}, {"WGS3",32}, {"WPT3",8} } },
{ "Ellesmere", { {"VW3",2}, {"WGS3",32}, {"WPT3",16} } },
{ "Tonga", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
{ "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
{ "Turks", { {"VW3",4}, {"WGS3",32}, {"WPT3",8} } },
{ "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
{ "default", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
{ "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
}
},
{ // Intel GPUs
@ -103,7 +121,7 @@ const Database::DatabaseEntry XgemvFastRotComplexSingle = {
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW3",4}, {"WGS3",64}, {"WPT3",16} } },
{ "default", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
}
},
}
@ -116,21 +134,26 @@ const Database::DatabaseEntry XgemvFastRotDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
{ "Ellesmere", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
{ "Tonga", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
{ "default", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW3",8}, {"WGS3",16}, {"WPT3",8} } },
{ "default", { {"VW3",8}, {"WGS3",16}, {"WPT3",8} } },
{ "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 1080", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
{ "GeForce GTX 750 Ti", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
{ "GeForce GTX TITAN", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
{ "GeForce GTX TITAN Black", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
{ "TITAN X (Pascal)", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
{ "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
}
},
@ -149,19 +172,22 @@ const Database::DatabaseEntry XgemvFastRotComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
{ "Ellesmere", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
{ "Tonga", { {"VW3",4}, {"WGS3",16}, {"WPT3",8} } },
{ "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",16} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"VW3",2}, {"WGS3",16}, {"WPT3",16} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"VW3",2}, {"WGS3",16}, {"WPT3",16} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } },
{ "default", { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } },
{ "default", { {"VW3",2}, {"WGS3",16}, {"WPT3",16} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",16} } },
{ "default", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
}
},
}

View File

@ -17,6 +17,12 @@ namespace database {
const Database::DatabaseEntry XgerHalf = {
"Xger", Precision::kHalf, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "Ellesmere", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } },
{ "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
@ -26,7 +32,7 @@ const Database::DatabaseEntry XgerHalf = {
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",4}, {"WGS2",8}, {"WPT",2} } },
{ "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } },
}
},
}
@ -39,12 +45,15 @@ const Database::DatabaseEntry XgerSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
{ "ATI Radeon HD 6750M", { {"WGS1",16}, {"WGS2",16}, {"WPT",4} } },
{ "Ellesmere", { {"WGS1",64}, {"WGS2",4}, {"WPT",2} } },
{ "Hawaii", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
{ "Oland", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
{ "Tonga", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
{ "default", { {"WGS1",32}, {"WGS2",8}, {"WPT",1} } },
{ "Turks", { {"WGS1",64}, {"WGS2",4}, {"WPT",2} } },
{ "default", { {"WGS1",16}, {"WGS2",16}, {"WPT",1} } },
}
},
{ // ARM GPUs
@ -55,7 +64,9 @@ const Database::DatabaseEntry XgerSingle = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",32}, {"WGS2",4}, {"WPT",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",128}, {"WGS2",2}, {"WPT",4} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",256}, {"WGS2",4}, {"WPT",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } },
{ "default", { {"WGS1",128}, {"WGS2",8}, {"WPT",4} } },
}
@ -75,6 +86,7 @@ const Database::DatabaseEntry XgerSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
{ "GeForce GTX 1070", { {"WGS1",512}, {"WGS2",1}, {"WPT",1} } },
{ "GeForce GTX 1080", { {"WGS1",16}, {"WGS2",4}, {"WPT",1} } },
{ "GeForce GTX 480", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } },
{ "GeForce GTX 670", { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } },
@ -82,6 +94,7 @@ const Database::DatabaseEntry XgerSingle = {
{ "GeForce GTX 750 Ti", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } },
{ "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "GeForce GTX TITAN Black", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "TITAN X (Pascal)", { {"WGS1",512}, {"WGS2",2}, {"WPT",1} } },
{ "default", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
}
},
@ -100,12 +113,15 @@ const Database::DatabaseEntry XgerComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
{ "ATI Radeon HD 6750M", { {"WGS1",16}, {"WGS2",16}, {"WPT",1} } },
{ "Ellesmere", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
{ "Hawaii", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } },
{ "Oland", { {"WGS1",4}, {"WGS2",8}, {"WPT",1} } },
{ "Pitcairn", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
{ "Tonga", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "Turks", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } },
{ "default", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
}
},
{ // ARM GPUs
@ -116,9 +132,11 @@ const Database::DatabaseEntry XgerComplexSingle = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",128}, {"WGS2",2}, {"WPT",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",256}, {"WGS2",2}, {"WPT",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } },
{ "default", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } },
{ "default", { {"WGS1",256}, {"WGS2",2}, {"WPT",4} } },
}
},
{ // Intel GPUs
@ -136,6 +154,7 @@ const Database::DatabaseEntry XgerComplexSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",64}, {"WGS2",4}, {"WPT",2} } },
{ "GeForce GTX 1070", { {"WGS1",16}, {"WGS2",64}, {"WPT",2} } },
{ "GeForce GTX 1080", { {"WGS1",32}, {"WGS2",2}, {"WPT",1} } },
{ "GeForce GTX 480", { {"WGS1",128}, {"WGS2",2}, {"WPT",2} } },
{ "GeForce GTX 670", { {"WGS1",16}, {"WGS2",32}, {"WPT",2} } },
{ "GeForce GTX 680", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
@ -143,6 +162,7 @@ const Database::DatabaseEntry XgerComplexSingle = {
{ "GeForce GTX 750 Ti", { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } },
{ "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
{ "GeForce GTX TITAN Black", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
{ "TITAN X (Pascal)", { {"WGS1",32}, {"WGS2",2}, {"WPT",1} } },
{ "default", { {"WGS1",128}, {"WGS2",2}, {"WPT",2} } },
}
},
@ -161,12 +181,13 @@ const Database::DatabaseEntry XgerDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
{ "Ellesmere", { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } },
{ "Hawaii", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "Oland", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
{ "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
{ "Tonga", { {"WGS1",8}, {"WGS2",16}, {"WPT",2} } },
{ "default", { {"WGS1",32}, {"WGS2",8}, {"WPT",1} } },
{ "default", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } },
}
},
{ // ARM GPUs
@ -177,15 +198,18 @@ const Database::DatabaseEntry XgerDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",16}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",256}, {"WGS2",4}, {"WPT",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",512}, {"WGS2",8}, {"WPT",2} } },
{ "default", { {"WGS1",512}, {"WGS2",8}, {"WPT",2} } },
{ "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",128}, {"WGS2",8}, {"WPT",2} } },
{ "GeForce GTX 1070", { {"WGS1",32}, {"WGS2",8}, {"WPT",1} } },
{ "GeForce GTX 1080", { {"WGS1",32}, {"WGS2",2}, {"WPT",1} } },
{ "GeForce GTX 480", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "GeForce GTX 670", { {"WGS1",32}, {"WGS2",32}, {"WPT",2} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } },
@ -193,7 +217,8 @@ const Database::DatabaseEntry XgerDouble = {
{ "GeForce GTX 750 Ti", { {"WGS1",32}, {"WGS2",16}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
{ "GeForce GTX TITAN Black", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
{ "TITAN X (Pascal)", { {"WGS1",32}, {"WGS2",2}, {"WPT",1} } },
{ "default", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
}
},
{ // Default
@ -211,6 +236,7 @@ const Database::DatabaseEntry XgerComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "Ellesmere", { {"WGS1",8}, {"WGS2",16}, {"WPT",1} } },
{ "Hawaii", { {"WGS1",128}, {"WGS2",1}, {"WPT",1} } },
{ "Oland", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
{ "Pitcairn", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
@ -227,7 +253,9 @@ const Database::DatabaseEntry XgerComplexDouble = {
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz", { {"WGS1",128}, {"WGS2",4}, {"WPT",4} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } },
{ "Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz", { {"WGS1",512}, {"WGS2",2}, {"WPT",2} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
{ "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
}
@ -236,6 +264,7 @@ const Database::DatabaseEntry XgerComplexDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GRID K520", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
{ "GeForce GTX 1070", { {"WGS1",8}, {"WGS2",128}, {"WPT",1} } },
{ "GeForce GTX 1080", { {"WGS1",8}, {"WGS2",4}, {"WPT",1} } },
{ "GeForce GTX 480", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
{ "GeForce GTX 670", { {"WGS1",8}, {"WGS2",16}, {"WPT",2} } },
{ "GeForce GTX 680", { {"WGS1",8}, {"WGS2",16}, {"WPT",1} } },
@ -243,6 +272,7 @@ const Database::DatabaseEntry XgerComplexDouble = {
{ "GeForce GTX 750 Ti", { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } },
{ "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "GeForce GTX TITAN Black", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
{ "TITAN X (Pascal)", { {"WGS1",4}, {"WGS2",8}, {"WPT",1} } },
{ "default", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
}
},

View File

@ -0,0 +1,78 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file populates the database with best-found tuning parameters for the 'Xtrsv' kernels.
//
// =================================================================================================
namespace clblast {
namespace database {
// =================================================================================================
const Database::DatabaseEntry XtrsvHalf = {
"Xtrsv", Precision::kHalf, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"TRSV_BLOCK_SIZE",32} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry XtrsvSingle = {
"Xtrsv", Precision::kSingle, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"TRSV_BLOCK_SIZE",32} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry XtrsvComplexSingle = {
"Xtrsv", Precision::kComplexSingle, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"TRSV_BLOCK_SIZE",32} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry XtrsvDouble = {
"Xtrsv", Precision::kDouble, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"TRSV_BLOCK_SIZE",32} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry XtrsvComplexDouble = {
"Xtrsv", Precision::kComplexDouble, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"TRSV_BLOCK_SIZE",32} } },
}
},
}
};
// =================================================================================================
} // namespace database
} // namespace clblast

View File

@ -31,9 +31,7 @@ R"(
// Enable support for double-precision
#if PRECISION == 64 || PRECISION == 6464
#if __OPENCL_VERSION__ <= CL_VERSION_1_1
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#endif
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#endif
// Half-precision
@ -71,7 +69,7 @@ R"(
// Complex single-precision
#elif PRECISION == 3232
typedef struct cfloat {float x; float y;} real;
typedef float2 real;
typedef struct cfloat2 {real x; real y;} real2;
typedef struct cfloat4 {real x; real y; real z; real w;} real4;
typedef struct cfloat8 {real s0; real s1; real s2; real s3;
@ -86,7 +84,7 @@ R"(
// Complex double-precision
#elif PRECISION == 6464
typedef struct cdouble {double x; double y;} real;
typedef double2 real;
typedef struct cdouble2 {real x; real y;} real2;
typedef struct cdouble4 {real x; real y; real z; real w;} real4;
typedef struct cdouble8 {real s0; real s1; real s2; real s3;
@ -162,6 +160,13 @@ R"(
#define AbsoluteValue(value) value = fabs(value)
#endif
// Negation (component-wise)
#if PRECISION == 3232 || PRECISION == 6464
#define Negate(value) value.x = -(value.x); value.y = -(value.y)
#else
#define Negate(value) value = -(value)
#endif
// Adds two complex variables
#if PRECISION == 3232 || PRECISION == 6464
#define Add(c, a, b) c.x = a.x + b.x; c.y = a.y + b.y
@ -169,6 +174,13 @@ R"(
#define Add(c, a, b) c = a + b
#endif
// Subtracts two complex variables
#if PRECISION == 3232 || PRECISION == 6464
#define Subtract(c, a, b) c.x = a.x - b.x; c.y = a.y - b.y
#else
#define Subtract(c, a, b) c = a - b
#endif
// Multiplies two complex variables (used in the defines below)
#if PRECISION == 3232 || PRECISION == 6464
#define MulReal(a, b) a.x*b.x - a.y*b.y
@ -193,6 +205,20 @@ R"(
#endif
#endif
// The scalar multiply-subtract function
#if PRECISION == 3232 || PRECISION == 6464
#define MultiplySubtract(c, a, b) c.x -= MulReal(a,b); c.y -= MulImag(a,b)
#else
#define MultiplySubtract(c, a, b) c -= a * b
#endif
// The scalar division function: full division
#if PRECISION == 3232 || PRECISION == 6464
#define DivideFull(c, a, b) singlereal num_x = (a.x * b.x) + (a.y * b.y); singlereal num_y = (a.y * b.x) - (a.x * b.y); singlereal denom = (b.x * b.x) + (b.y * b.y); c.x = num_x / denom; c.y = num_y / denom
#else
#define DivideFull(c, a, b) c = a / b
#endif
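// Worked example (illustration only): dividing a = 1 + 2i by b = 3 + 4i with DivideFull gives
// num_x = 1*3 + 2*4 = 11, num_y = 2*3 - 1*4 = 2 and denom = 3*3 + 4*4 = 25, hence
// c = 0.44 + 0.08i, which equals (a * conj(b)) / |b|^2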
// The scalar AXPBY function
#if PRECISION == 3232 || PRECISION == 6464
#define AXPBY(e, a, b, c, d) e.x = MulReal(a,b) + MulReal(c,d); e.y = MulImag(a,b) + MulImag(c,d)

View File

@ -9,7 +9,7 @@
//
// This file contains the Xaxpy kernel. It contains one fast vectorized version in case of unit
// strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't
// support vector data-types.
// support vector data-types. The general version has a batched implementation as well.
//
// This kernel uses the level-1 BLAS common tuning parameters.
//
@ -36,14 +36,31 @@ void Xaxpy(const int n, const real_arg arg_alpha,
}
}
// =================================================================================================
// Faster version of the kernel without offsets and strided accesses, but with an if-statement.
// Also assumes that 'n' is divisible by 'VW' and 'WPT'.
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
void XaxpyFaster(const int n, const real_arg arg_alpha,
const __global realV* restrict xgm,
__global realV* ygm) {
const real alpha = GetRealArg(arg_alpha);
if (get_global_id(0) < n / (VW)) {
#pragma unroll
for (int w=0; w<WPT; ++w) {
const int id = w*get_global_size(0) + get_global_id(0);
realV xvalue = xgm[id];
realV yvalue = ygm[id];
ygm[id] = MultiplyAddVector(yvalue, alpha, xvalue);
}
}
}
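// Note (host-side sketch): each work-item handles WPT vectors of width VW, so a global size of
// n / (VW * WPT) covers the full vector; e.g. n = 1024 with VW = 4 and WPT = 2 gives 128
// work-items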
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
// divisible by 'VW', 'WGS' and 'WPT'.
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
void XaxpyFast(const int n, const real_arg arg_alpha,
const __global realV* restrict xgm,
__global realV* ygm) {
void XaxpyFastest(const int n, const real_arg arg_alpha,
const __global realV* restrict xgm,
__global realV* ygm) {
const real alpha = GetRealArg(arg_alpha);
#pragma unroll
@ -57,6 +74,24 @@ void XaxpyFast(const int n, const real_arg arg_alpha,
// =================================================================================================
// Full version of the kernel with offsets and strided accesses: batched version
__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
void XaxpyBatched(const int n, const __constant real_arg* arg_alphas,
const __global real* restrict xgm, const __constant int* x_offsets, const int x_inc,
__global real* ygm, const __constant int* y_offsets, const int y_inc) {
const int batch = get_group_id(1);
const real alpha = GetRealArg(arg_alphas[batch]);
// Loops over the work that needs to be done (allows for an arbitrary number of threads)
#pragma unroll
for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
real xvalue = xgm[id*x_inc + x_offsets[batch]];
MultiplyAdd(ygm[id*y_inc + y_offsets[batch]], alpha, xvalue);
}
}
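// Note (launch sketch): the host is expected to enqueue this kernel with the batch count as the
// second NDRange dimension; work-group b then reads its alpha from arg_alphas[b] and its offsets
// from x_offsets[b] and y_offsets[b]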
// =================================================================================================
// End of the C++11 raw string literal
)"

View File

@ -0,0 +1,144 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains kernels to perform forward or backward substitution, as used in the TRSV routine.
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// =================================================================================================
#if defined(ROUTINE_TRSV)
__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void FillVector(const int n, const int inc, const int offset,
__global real* restrict dest, const real_arg arg_value) {
const real value = GetRealArg(arg_value);
const int tid = get_global_id(0);
if (tid < n) {
dest[tid*inc + offset] = value;
}
}
// =================================================================================================
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
// this kernel file is used outside of the CLBlast library.
#ifndef TRSV_BLOCK_SIZE
#define TRSV_BLOCK_SIZE 32 // The block size for forward or backward substitution
#endif
// =================================================================================================
__kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1)))
void trsv_forward(int n,
const __global real *A, const int a_offset, int a_ld,
__global real *b, const int b_offset, int b_inc,
__global real *x, const int x_offset, int x_inc,
const int is_transposed, const int is_unit_diagonal, const int do_conjugate) {
__local real alm[TRSV_BLOCK_SIZE][TRSV_BLOCK_SIZE];
__local real xlm[TRSV_BLOCK_SIZE];
const int tid = get_local_id(0);
// Pre-loads the data into local memory
if (tid < n) {
Subtract(xlm[tid], b[tid*b_inc + b_offset], x[tid*x_inc + x_offset]);
if (is_transposed == 0) {
for (int i = 0; i < n; ++i) {
alm[i][tid] = A[i + tid*a_ld + a_offset];
}
}
else {
for (int i = 0; i < n; ++i) {
alm[i][tid] = A[tid + i*a_ld + a_offset];
}
}
if (do_conjugate) {
for (int i = 0; i < n; ++i) {
COMPLEX_CONJUGATE(alm[i][tid]);
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Computes the result (single-threaded for now)
if (tid == 0) {
for (int i = 0; i < n; ++i) {
for (int j = 0; j < i; ++j) {
MultiplySubtract(xlm[i], alm[i][j], xlm[j]);
}
if (is_unit_diagonal == 0) { DivideFull(xlm[i], xlm[i], alm[i][i]); }
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Stores the results
if (tid < n) {
x[tid*x_inc + x_offset] = xlm[tid];
}
}
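// Note: the single-threaded loop above implements the forward-substitution recurrence
//   x[i] = (rhs[i] - sum_{j<i} A[i][j] * x[j]) / A[i][i], with rhs = b - x
// Worked example (values for illustration): for A = [[2, 0], [1, 4]] and rhs = [6, 9], it yields
// x[0] = 6/2 = 3 and x[1] = (9 - 1*3)/4 = 1.5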
__kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1)))
void trsv_backward(int n,
const __global real *A, const int a_offset, int a_ld,
__global real *b, const int b_offset, int b_inc,
__global real *x, const int x_offset, int x_inc,
const int is_transposed, const int is_unit_diagonal, const int do_conjugate) {
__local real alm[TRSV_BLOCK_SIZE][TRSV_BLOCK_SIZE];
__local real xlm[TRSV_BLOCK_SIZE];
const int tid = get_local_id(0);
// Pre-loads the data into local memory
if (tid < n) {
Subtract(xlm[tid], b[tid*b_inc + b_offset], x[tid*x_inc + x_offset]);
if (is_transposed == 0) {
for (int i = 0; i < n; ++i) {
alm[i][tid] = A[i + tid*a_ld + a_offset];
}
}
else {
for (int i = 0; i < n; ++i) {
alm[i][tid] = A[tid + i*a_ld + a_offset];
}
}
if (do_conjugate) {
for (int i = 0; i < n; ++i) {
COMPLEX_CONJUGATE(alm[i][tid]);
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Computes the result (single-threaded for now)
if (tid == 0) {
for (int i = n - 1; i >= 0; --i) {
for (int j = i + 1; j < n; ++j) {
MultiplySubtract(xlm[i], alm[i][j], xlm[j]);
}
if (is_unit_diagonal == 0) { DivideFull(xlm[i], xlm[i], alm[i][i]); }
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Stores the results
if (tid < n) {
x[tid*x_inc + x_offset] = xlm[tid];
}
}
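// Note: this mirrors trsv_forward, implementing the backward-substitution recurrence
//   x[i] = (rhs[i] - sum_{j>i} A[i][j] * x[j]) / A[i][i], evaluated from i = n-1 down to 0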
#endif
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View File

@ -24,16 +24,14 @@ R"(
// Copies a matrix from source to destination. The output is padded with zero values in case the
// destination matrix dimensions are larger than the source matrix dimensions. Additionally, the ld
// value and offset can be different.
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
void CopyPadMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest,
const real_arg arg_alpha,
const int do_conjugate) {
const real alpha = GetRealArg(arg_alpha);
inline void _CopyPadMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest,
const real alpha,
const int do_conjugate) {
// Loops over the work per thread in both dimensions
#pragma unroll
@ -60,22 +58,36 @@ void CopyPadMatrix(const int src_one, const int src_two,
}
}
// Interface to the above function
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
void CopyPadMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest,
const real_arg arg_alpha,
const int do_conjugate) {
const real alpha = GetRealArg(arg_alpha);
_CopyPadMatrix(src_one, src_two, src_ld, src_offset, src,
dest_one, dest_two, dest_ld, dest_offset, dest,
alpha, do_conjugate);
}
// =================================================================================================
// Same as above, but now un-pads a matrix. This kernel reads data from a padded source matrix, but
// writes only the actual data back to the destination matrix. Again, the ld value and offset can
// be different.
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
void CopyMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest,
const real_arg arg_alpha,
const int upper, const int lower,
const int diagonal_imag_zero) {
const real alpha = GetRealArg(arg_alpha);
inline void _CopyMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest,
const real alpha,
const int upper, const int lower,
const int diagonal_imag_zero) {
// Loops over the work per thread in both dimensions
#pragma unroll
@ -105,6 +117,62 @@ void CopyMatrix(const int src_one, const int src_two,
}
}
// Interface to the above function
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
void CopyMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest,
const real_arg arg_alpha,
const int upper, const int lower,
const int diagonal_imag_zero) {
const real alpha = GetRealArg(arg_alpha);
_CopyMatrix(src_one, src_two, src_ld, src_offset, src,
dest_one, dest_two, dest_ld, dest_offset, dest,
alpha, upper, lower, diagonal_imag_zero);
}
// =================================================================================================
#if defined(ROUTINE_GEMMBATCHED)
// Batched version of the above
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
void CopyPadMatrixBatched(const int src_one, const int src_two,
const int src_ld, const __constant int* src_offsets,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const __constant int* dest_offsets,
__global real* dest,
const int do_conjugate) {
const int batch = get_group_id(2);
const int src_offset = src_offsets[batch];
const int dest_offset = dest_offsets[batch];
real alpha; SetToOne(alpha);
_CopyPadMatrix(src_one, src_two, src_ld, src_offset, src,
dest_one, dest_two, dest_ld, dest_offset, dest,
alpha, do_conjugate);
}
// Batched version of the above
__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
void CopyMatrixBatched(const int src_one, const int src_two,
const int src_ld, const __constant int* src_offsets,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const __constant int* dest_offsets,
__global real* dest) {
const int batch = get_group_id(2);
const int src_offset = src_offsets[batch];
const int dest_offset = dest_offsets[batch];
real alpha; SetToOne(alpha);
_CopyMatrix(src_one, src_two, src_ld, src_offset, src,
dest_one, dest_two, dest_ld, dest_offset, dest,
alpha, 0, 0, 0);
}
#endif
// =================================================================================================
// End of the C++11 raw string literal

View File

@ -0,0 +1,431 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains kernels to invert square diagonal blocks of a matrix. These kernels are based
// on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular
// Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek,
// and Jack Dongarra.
//
// =================================================================================================
//
// Let A be a block_size*block_size lower triangular matrix, and B its inverse.
// Then the block decomposition
//
// [ A11 0 ] * [ B11 0 ] = [ I 0 ]
// [ A21 A22 ] [ B21 B22 ] [ 0 I ]
//
// yields
//
// A11*B11 = I ==> B11 = A11^{-1},
// A22*B22 = I ==> B22 = A22^{-1},
// A21*B11 + A22*B21 = 0 ==> B21 = -A22^{-1}*A21*B11 = -B22*A21*B11.
//
// The InvertDiagonalBlock kernel inverts A11 and A22.
// The TripleMatMul routines multiply:
// part 1: B21 = A21 * B11,
// part 2: B21 = -B22 * B21.
//
// At this level, the inner block is current_size = 16, with one 4 x 4 work-group per inner block.
// Each submatrix Aij and Bij is current_size x current_size. The submatrix dimension is multiplied
// by 2 at each level, so the next level is current_size*2 = 32. A 'page' is the next bigger block,
// here current_size*2 = 32, which contains
//   [ B11  0  ]
//   [ B21 B22 ]
// Outer blocks are block_size x block_size.
//
// A21 may have fewer than current_size rows, but is guaranteed to have current_size columns since
// A22 is on its right. This makes a single bounds check sufficient.
//
// B is stored in workspace that is a full multiple of block_size x block_size; no checks needed.
//
// The computation is split into part 1 and part 2 to synchronize all work-groups and to make sure
// that writes to B21 (or B12 in the upper-triangular case) are observed by all of them.
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// =================================================================================================
#if defined(ROUTINE_INVERT)
#define LOCALX 17 // 16 + 1 to avoid bank conflicts
#define LOCALY 16
// =================================================================================================
// Inverts a diagonal block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE elements in a larger matrix
__kernel __attribute__((reqd_work_group_size(INTERNAL_BLOCK_SIZE, 1, 1)))
void InvertDiagonalBlock(int n, __global const real* restrict src, const int src_offset, const int src_ld,
__global real* restrict dest, const int outer_block_size,
const int unit_diagonal, const int is_upper)
{
const int thread_index = get_local_id(0);
const int block_index = get_group_id(0);
// Sets the offset for this particular block in the source and destination matrices
const int src_block_offset = block_index * (INTERNAL_BLOCK_SIZE + src_ld * INTERNAL_BLOCK_SIZE) + src_offset;
const int num_inner_blocks = outer_block_size / INTERNAL_BLOCK_SIZE;
// Sets the destination offset: first go to the (block_index / num_inner_blocks)'th outer block of
// outer_block_size * outer_block_size elements, then to the (block_index % num_inner_blocks)'th
// inner INTERNAL_BLOCK_SIZE * INTERNAL_BLOCK_SIZE block on its diagonal
const int dest_block_offset = (block_index / num_inner_blocks) * outer_block_size * outer_block_size +
                              (block_index % num_inner_blocks) * (outer_block_size*INTERNAL_BLOCK_SIZE + INTERNAL_BLOCK_SIZE);
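// Example: with outer_block_size = 32, num_inner_blocks = 2; block_index = 3 then maps to outer
// block 1 and inner block 1, giving dest_block_offset = 1*32*32 + 1*(32*16 + 16) = 1552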
// Local memory to store the inverted block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE
__local real lm[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
// Loads the source lower triangle into local memory. Any values in the upper triangle or
// outside of the matrix are set to zero
#pragma unroll
for (int j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
const bool condition = (is_upper) ? (thread_index <= j && block_index*INTERNAL_BLOCK_SIZE + j < n) :
(thread_index >= j && block_index*INTERNAL_BLOCK_SIZE + thread_index < n);
if (condition) {
lm[thread_index][j] = src[j*src_ld + thread_index + src_block_offset];
}
else {
SetToZero(lm[thread_index][j]);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Inverts the diagonal
real inverted_diagonal;
SetToOne(inverted_diagonal);
if (unit_diagonal == 0) {
const real diagonal_value = lm[thread_index][thread_index];
if (!IsZero(diagonal_value)) { // Only for non-singular values and values inside the matrix
real constant_one;
SetToOne(constant_one);
DivideFull(inverted_diagonal, constant_one, diagonal_value);
}
}
lm[thread_index][thread_index] = inverted_diagonal;
barrier(CLK_LOCAL_MEM_FENCE);
// Upper-triangular
if (is_upper) {
// Computes the elements 0:j-1 of the j-th column
for (int j = 1; j < INTERNAL_BLOCK_SIZE; ++j) {
if (thread_index < j) {
real sum;
SetToZero(sum);
#pragma unroll
for (int k = 0; k < j; ++k) {
MultiplyAdd(sum, lm[thread_index][k], lm[k][j]);
}
real diagonal_value = lm[j][j];
Negate(diagonal_value);
Multiply(lm[thread_index][j], diagonal_value, sum);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
// Lower triangular
else {
// Computes the elements j+1:INTERNAL_BLOCK_SIZE-1 of the j-th column
for (int j = INTERNAL_BLOCK_SIZE - 2; j >= 0; --j) {
if (thread_index > j) {
real sum;
SetToZero(sum);
#pragma unroll
for (int k = j + 1; k < INTERNAL_BLOCK_SIZE; ++k) {
MultiplyAdd(sum, lm[thread_index][k], lm[k][j]);
}
real diagonal_value = lm[j][j];
Negate(diagonal_value);
Multiply(lm[thread_index][j], diagonal_value, sum);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
// Writes the result to global memory
#pragma unroll
for (int j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
dest[j*outer_block_size + thread_index + dest_block_offset] = lm[thread_index][j];
}
}
// =================================================================================================
// Triple matrix-multiplication kernel: C = A * B
inline void TripleMatMul(const int size, const bool upper, const int part, __local real* blm, int n,
__global const real* agm, __global const real* bgm, __global real* cgm,
const int lda, const int ldb, const int ldc,
int current_size, int num_pages, const int block_size) {
// Emulates a 3D grid: NX * (NY * num_pages)
const int by = get_group_id(1) / num_pages;
const int page = get_group_id(1) % num_pages;
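// Example: with num_pages = 4, get_group_id(1) = 9 decodes to by = 2 and page = 1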
const int lidx = get_local_id(0);
const int lidy = get_local_id(1);
const int ibx = get_group_id(0) * (get_local_size(0)*get_local_size(1));
const int iby = by*16;
const int id = lidx + lidy*get_local_size(0);
const int row = page*current_size*2 + current_size + ibx + id;
int col = page*current_size*2 + current_size;
// Sets the offsets for this specific thread
agm += ibx + id;
bgm += lidx + (iby + lidy)*ldb;
cgm += ibx + id + iby*ldc;
// Initializes the result registers
real cpm[16];
#pragma unroll
for (int j = 0; j < 16; ++j) {
SetToZero(cpm[j]);
}
// Computes an NT x 16 block of C; each thread computes one 1 x 16 row
for (int k = 0; k < current_size; k += 16) {
// Loads a 16 x 16 block of B into local memory using NX x 4 threads
#pragma unroll
for( int i=0; i < 16; i += (size/4) ) { // += get_local_size(0)
#pragma unroll
for( int j=0; j < 16; j += 4 ) { // += get_local_size(1)
blm[(lidx + i) * LOCALX + (lidy + j)] = bgm[k + i + j*ldb];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Upper triangular
if (upper) {
// Performs 16 x 16 multiply-add operations
#pragma unroll
for (int i = 0; i < 16; ++i) {
if (part == 2 || col++ < n) {
#pragma unroll
for (int j = 0; j < 16; ++j) {
MultiplyAdd(cpm[j], agm[(i + k) * lda], blm[i * LOCALX + j]);
}
}
}
}
// Lower triangular
else {
if (row < n) {
// Performs 16 x 16 multiply-add operations
#pragma unroll
for (int i = 0; i < 16; ++i) {
#pragma unroll
for (int j = 0; j < 16; ++j) {
MultiplyAdd(cpm[j], agm[(i + k) * lda], blm[i * LOCALX + j]);
}
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// Stores the NT x 16 results: each thread writes back one 1 x 16 row
#pragma unroll
for (int i = 0; i < 16; ++i) {
if (part == 2) { Negate(cpm[i]); }
cgm[0] = cpm[i];
cgm += ldc;
}
}
// =================================================================================================
// Triple matrix-multiplication kernel part 1: B12 = A12 * B22 (upper) or B21 = A21 * B11 (lower)
inline void TripleMatMulPart1(const int size, const bool upper, __local real* blm, int n,
__global const real* src, const int a_offset, const int lda,
__global real* dest, int current_size, int num_pages, const int block_size) {
// Emulates a 3D grid: NX * (NY * num_pages)
const int page = get_group_id(1) % num_pages;
// Computes the destination block offset:
// - go to the (page / pages_per_block) outer block_size * block_size block
// - then the (page % pages_per_block) inner (current_size*2) * (current_size*2) page inside that
const int pages_per_block = block_size / (current_size*2);
dest += (page / pages_per_block) * block_size * block_size +
(page % pages_per_block) * (current_size*2*block_size + current_size*2);
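// Example: with block_size = 64 and current_size = 16, pages_per_block = 2; page = 3 then selects
// outer block 1 and inner page 1, advancing dest by 1*64*64 + 1*(32*64 + 32) = 6176 elements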
// Using the GEMM notation: C = A*B
__global const real* agm;
__global const real* bgm;
__global real* cgm;
if (upper) { // upper triangular: B12 = A12 * B22
agm = src + a_offset + page*current_size*2*lda + page*current_size*2 + current_size*lda; // A12
bgm = dest + current_size*block_size + current_size; // B22
cgm = dest + current_size*block_size; // B12
}
else { // lower triangular: B21 = A21 * B11
agm = src + a_offset + page*current_size*2*lda + page*current_size*2 + current_size; // A21
bgm = dest; // B11
cgm = dest + current_size; // B21
}
// Runs the generic C = A * B matrix multiplication
const int ldb = block_size;
const int ldc = block_size;
TripleMatMul(size, upper, 1, blm, n, agm, bgm, cgm, lda, ldb, ldc, current_size, num_pages, block_size);
}
// Triple matrix-multiplication kernel part 2: B12 = -B11 * B12 (upper) or B21 = -B22 * B21 (lower)
inline void TripleMatMulPart2(const int size, const bool upper, __local real* blm, const int n,
__global real* dest, int current_size, int num_pages, const int block_size) {
// Emulates a 3D grid: NX * (NY * num_pages)
const int page = get_group_id(1) % num_pages;
// Computes the destination block offset:
// - go to the (page / pages_per_block) outer block_size * block_size block
// - then the (page % pages_per_block) inner (current_size*2) * (current_size*2) page inside that
const int pages_per_block = block_size / (current_size*2);
dest += (page / pages_per_block) * block_size * block_size +
(page % pages_per_block) * (current_size*2*block_size + current_size*2);
// Using the GEMM notation: C = A*B
__global const real* agm;
__global const real* bgm;
__global real* cgm;
if (upper) { // upper triangular: B12 = -B11 * B12
agm = dest; // B11
cgm = dest + current_size*block_size; // B12
bgm = cgm; // B12, okay to overwrite
}
else { // lower triangular: B21 = -B22 * B21
agm = dest + current_size*block_size + current_size; // B22
cgm = dest + current_size; // B21
bgm = cgm; // B21, okay to overwrite
}
// Runs the generic C = A * B matrix multiplication
const int lda = block_size;
const int ldb = block_size;
const int ldc = block_size;
TripleMatMul(size, upper, 2, blm, n, agm, bgm, cgm, lda, ldb, ldc, current_size, num_pages, block_size);
}
// =================================================================================================
// B21 = A21 * B11
__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart1(16, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
}
// B21 = -B22 * B21
__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
void TripleMatMul16Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart2(16, false, lm, n, dest, current_size, num_pages, block_size);
}
// B21 = A21 * B11
__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
void TripleMatMul32Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart1(32, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
}
// B21 = -B22 * B21
__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
void TripleMatMul32Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart2(32, false, lm, n, dest, current_size, num_pages, block_size);
}
// B21 = A21 * B11
__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
void TripleMatMul64Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart1(64, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
}
// B21 = -B22 * B21
__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart2(64, false, lm, n, dest, current_size, num_pages, block_size);
}
// =================================================================================================
// B12 = A12 * B22
__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart1(16, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
}
// B12 = -B11 * B12
__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
void TripleMatMul16Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart2(16, true, lm, n, dest, current_size, num_pages, block_size);
}
// B12 = A12 * B22
__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
void TripleMatMul32Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart1(32, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
}
// B12 = -B11 * B12
__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
void TripleMatMul32Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart2(32, true, lm, n, dest, current_size, num_pages, block_size);
}
// B12 = A12 * B22
__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
void TripleMatMul64Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart1(64, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size);
}
// B12 = -B11 * B12
__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
void TripleMatMul64Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
TripleMatMulPart2(64, true, lm, n, dest, current_size, num_pages, block_size);
}
#endif
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View File

@ -73,6 +73,22 @@ R"(
#define PADTRA_PAD 0 // Padding of the local memory to avoid bank-conflicts
#endif
// =================================================================================================
#if defined(ROUTINE_INVERT) || defined(ROUTINE_TRSM)
__kernel __attribute__((reqd_work_group_size(8, 8, 1)))
void FillMatrix(const int m, const int n, const int ld, const int offset,
__global real* restrict dest, const real_arg arg_value) {
const real value = GetRealArg(arg_value);
const int id_one = get_global_id(0);
const int id_two = get_global_id(1);
if (id_one < m && id_two < n) {
dest[id_two*ld + id_one + offset] = value;
}
}
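// Note (host-side launch sketch): to fill an m x n = 100 x 50 matrix, the global size is rounded
// up to multiples of the 8 x 8 work-group size, i.e. (104, 56); out-of-range work-items are
// masked off by the bounds check above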
#endif
// =================================================================================================
// End of the C++11 raw string literal

View File

@ -24,19 +24,15 @@ R"(
// Transposes a matrix from source to destination. The output is padded with zero values in case the
// destination matrix dimensions are larger than the transposed source matrix dimensions.
__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
void TransposePadMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest,
const real_arg arg_alpha,
const int do_conjugate) {
const real alpha = GetRealArg(arg_alpha);
// Local memory to store a tile of the matrix (for coalescing)
__local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
inline void _TransposePadMatrix(__local real* tile,
const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest,
const real alpha,
const int do_conjugate) {
// Loop over the work per thread
#pragma unroll
@ -56,7 +52,9 @@ void TransposePadMatrix(const int src_one, const int src_two,
if (id_src_two < src_two && id_src_one < src_one) {
value = src[id_src_two*src_ld + id_src_one + src_offset];
}
tile[get_local_id(1)*PADTRA_WPT + w_two][get_local_id(0)*PADTRA_WPT + w_one] = value;
const int tile_id0 = get_local_id(0)*PADTRA_WPT + w_one;
const int tile_id1 = get_local_id(1)*PADTRA_WPT + w_two;
tile[tile_id1 * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD) + tile_id0] = value;
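// Example: assuming PADTRA_TILE = 8, PADTRA_WPT = 2 and PADTRA_PAD = 1 (all tuner-dependent),
// the row stride is 2*8 + 1 = 17, so (tile_id1, tile_id0) = (3, 5) maps to element 3*17 + 5 = 56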
}
}
@ -75,7 +73,9 @@ void TransposePadMatrix(const int src_one, const int src_two,
// Stores the transposed value in the destination matrix
if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
const int tile_id0 = get_local_id(1)*PADTRA_WPT + w_one;
const int tile_id1 = get_local_id(0)*PADTRA_WPT + w_two;
real value = tile[tile_id1 * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD) + tile_id0];
if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
Multiply(dest[id_dest_two*dest_ld + id_dest_one + dest_offset], alpha, value);
}
@ -83,25 +83,38 @@ void TransposePadMatrix(const int src_one, const int src_two,
}
}
// Interface to the above function
__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
void TransposePadMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest,
const real_arg arg_alpha,
const int do_conjugate) {
const real alpha = GetRealArg(arg_alpha);
__local real tile[(PADTRA_WPT*PADTRA_TILE) * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD)];
_TransposePadMatrix(tile, src_one, src_two, src_ld, src_offset, src,
dest_one, dest_two, dest_ld, dest_offset, dest,
alpha, do_conjugate);
}
// =================================================================================================
// Transposes a matrix, while considering possible padding in the source matrix. Data is read from a
// padded source matrix, but only the actual data is written back to the transposed destination
// matrix. This kernel optionally checks for upper/lower triangular matrices.
__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
void TransposeMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest,
const real_arg arg_alpha,
const int upper, const int lower,
const int diagonal_imag_zero) {
const real alpha = GetRealArg(arg_alpha);
// Local memory to store a tile of the matrix (for coalescing)
__local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
inline void _TransposeMatrix(__local real* tile,
const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest,
const real alpha,
const int upper, const int lower,
const int diagonal_imag_zero) {
// Loop over the work per thread
#pragma unroll
@ -117,7 +130,9 @@ void TransposeMatrix(const int src_one, const int src_two,
// Loads data into the local memory if the thread IDs are within bounds of the source matrix.
if ((id_src_one < src_one) && (id_src_two < src_two)) {
real value = src[id_src_two*src_ld + id_src_one + src_offset];
tile[get_local_id(1)*PADTRA_WPT + w_two][get_local_id(0)*PADTRA_WPT + w_one] = value;
const int tile_id0 = get_local_id(0)*PADTRA_WPT + w_one;
const int tile_id1 = get_local_id(1)*PADTRA_WPT + w_two;
tile[tile_id1 * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD) + tile_id0] = value;
}
}
}
@ -145,7 +160,9 @@ void TransposeMatrix(const int src_one, const int src_two,
// Stores the transposed value in the destination matrix
if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
const int tile_id0 = get_local_id(1)*PADTRA_WPT + w_one;
const int tile_id1 = get_local_id(0)*PADTRA_WPT + w_two;
real value = tile[tile_id1 * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD) + tile_id0];
if (diagonal_imag_zero == 1 && id_dest_one == id_dest_two) { ImagToZero(value); }
Multiply(dest[id_dest_two*dest_ld + id_dest_one + dest_offset], alpha, value);
}
@ -154,6 +171,65 @@ void TransposeMatrix(const int src_one, const int src_two,
}
}
// Interface to the above function
__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
void TransposeMatrix(const int src_one, const int src_two,
const int src_ld, const int src_offset,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest,
const real_arg arg_alpha,
const int upper, const int lower,
const int diagonal_imag_zero) {
const real alpha = GetRealArg(arg_alpha);
__local real tile[(PADTRA_WPT*PADTRA_TILE) * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD)];
_TransposeMatrix(tile, src_one, src_two, src_ld, src_offset, src,
dest_one, dest_two, dest_ld, dest_offset, dest,
alpha, upper, lower, diagonal_imag_zero);
}
// =================================================================================================
#if defined(ROUTINE_GEMMBATCHED)
// Batched version of the above
__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
void TransposePadMatrixBatched(const int src_one, const int src_two,
const int src_ld, const __constant int* src_offsets,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const __constant int* dest_offsets,
__global real* dest,
const int do_conjugate) {
const int batch = get_group_id(2);
const int src_offset = src_offsets[batch];
const int dest_offset = dest_offsets[batch];
real alpha; SetToOne(alpha);
__local real tile[(PADTRA_WPT*PADTRA_TILE) * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD)];
_TransposePadMatrix(tile, src_one, src_two, src_ld, src_offset, src,
dest_one, dest_two, dest_ld, dest_offset, dest,
alpha, do_conjugate);
}
// Batched version of the above
__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
void TransposeMatrixBatched(const int src_one, const int src_two,
const int src_ld, const __constant int* src_offsets,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const __constant int* dest_offsets,
__global real* dest) {
const int batch = get_group_id(2);
const int src_offset = src_offsets[batch];
const int dest_offset = dest_offsets[batch];
real alpha; SetToOne(alpha);
__local real tile[(PADTRA_WPT*PADTRA_TILE) * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD)];
_TransposeMatrix(tile, src_one, src_two, src_ld, src_offset, src,
dest_one, dest_two, dest_ld, dest_offset, dest,
alpha, 0, 0, 0);
}
#endif
// =================================================================================================
// End of the C++11 raw string literal

View File

@ -0,0 +1,70 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains the batched version of the non-direct GEMM kernel. See part 1 for information
// about the non-batched version of the kernel.
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// =================================================================================================
// Main entry point of the kernel. This is the regular full version.
__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
void XgemmBatched(const int kSizeM, const int kSizeN, const int kSizeK,
const __constant real_arg* arg_alphas,
const __constant real_arg* arg_betas,
const __global realM* restrict agm, const int a_one, const int a_two,
const __global realN* restrict bgm, const int b_one, const int b_two,
__global realM* cgm, const int c_one, const int c_two) {
const int batch = get_group_id(2);
const real alpha = GetRealArg(arg_alphas[batch]);
const real beta = GetRealArg(arg_betas[batch]);
// Sets the offsets
const int a_offset = batch * a_one * a_two;
const int b_offset = batch * b_one * b_two;
const int c_offset = batch * c_one * c_two;
const __global realM* restrict agm_ = &agm[a_offset / VWM];
const __global realN* restrict bgm_ = &bgm[b_offset / VWN];
__global realM* restrict cgm_ = &cgm[c_offset / VWM];
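// Example: with batch = 2, a_one * a_two = 1024 and VWM = 4, a_offset is 2048 scalar elements,
// so agm_ starts at vector element 2048 / 4 = 512 of the realM array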
// Allocates workgroup-private memory (local memory)
#if SA == 1
__local realM alm[KWG * MWG/VWM];
#endif
#if SB == 1
__local realN blm[KWG * NWG/VWN];
#endif
// Computes the matrix-multiplication and stores the result in register memory
realM cpm[NWI][MWI/VWM];
#if SA == 1 && SB == 1
XgemmBody(kSizeM, kSizeN, kSizeK, agm_, bgm_, cgm_, cpm, alm, blm);
#elif SA == 1
XgemmBody(kSizeM, kSizeN, kSizeK, agm_, bgm_, cgm_, cpm, alm);
#elif SB == 1
XgemmBody(kSizeM, kSizeN, kSizeK, agm_, bgm_, cgm_, cpm, blm);
#else
XgemmBody(kSizeM, kSizeN, kSizeK, agm_, bgm_, cgm_, cpm);
#endif
// Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
StoreResults(cgm_, cpm, kSizeM, alpha, beta);
}
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View File

@ -0,0 +1,110 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains the batched version of the direct GEMM kernels. See part 1 for information
// about the non-batched version of the kernel.
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// =================================================================================================
// Direct version of the batched GEMM kernel with [A, B] = [non-transposed, non-transposed]
__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
__kernel void XgemmDirectBatchedNN(const int kSizeM, const int kSizeN, const int kSizeK,
const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas,
const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld,
const __global realND* restrict bgm, const __constant int* b_offsets, const int b_ld,
__global real* cgm, const __constant int* c_offsets, const int c_ld,
const int c_transpose, const int a_conjugate, const int b_conjugate) {
const int batch = get_group_id(2);
const real_arg arg_alpha = arg_alphas[batch];
const real_arg arg_beta = arg_betas[batch];
const int a_offset = a_offsets[batch];
const int b_offset = b_offsets[batch];
const int c_offset = c_offsets[batch];
__local real alm[WGD * (WGD + PADA)];
__local real blm[WGD * (WGD + PADB)];
XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta,
agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld,
alm, blm, 0, 0, c_transpose, a_conjugate, b_conjugate);
}
// Direct version of the batched GEMM kernel with [A, B] = [non-transposed, transposed]
__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
__kernel void XgemmDirectBatchedNT(const int kSizeM, const int kSizeN, const int kSizeK,
const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas,
const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld,
const __global realND* restrict bgm, const __constant int* b_offsets, const int b_ld,
__global real* cgm, const __constant int* c_offsets, const int c_ld,
const int c_transpose, const int a_conjugate, const int b_conjugate) {
const int batch = get_group_id(2);
const real_arg arg_alpha = arg_alphas[batch];
const real_arg arg_beta = arg_betas[batch];
const int a_offset = a_offsets[batch];
const int b_offset = b_offsets[batch];
const int c_offset = c_offsets[batch];
__local real alm[WGD * (WGD + PADA)];
__local real blm[WGD * (WGD + PADB)];
XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta,
agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld,
alm, blm, 0, 1, c_transpose, a_conjugate, b_conjugate);
}
// Direct version of the batched GEMM kernel with [A, B] = [transposed, non-transposed]
__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
__kernel void XgemmDirectBatchedTN(const int kSizeM, const int kSizeN, const int kSizeK,
const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas,
const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld,
const __global realND* restrict bgm, const __constant int* b_offsets, const int b_ld,
__global real* cgm, const __constant int* c_offsets, const int c_ld,
const int c_transpose, const int a_conjugate, const int b_conjugate) {
const int batch = get_group_id(2);
const real_arg arg_alpha = arg_alphas[batch];
const real_arg arg_beta = arg_betas[batch];
const int a_offset = a_offsets[batch];
const int b_offset = b_offsets[batch];
const int c_offset = c_offsets[batch];
__local real alm[WGD * (WGD + PADA)];
__local real blm[WGD * (WGD + PADB)];
XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta,
agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld,
alm, blm, 1, 0, c_transpose, a_conjugate, b_conjugate);
}
// Direct version of the batched GEMM kernel with [A, B] = [transposed, transposed]
__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
__kernel void XgemmDirectBatchedTT(const int kSizeM, const int kSizeN, const int kSizeK,
const __constant real_arg* arg_alphas, const __constant real_arg* arg_betas,
const __global realMD* restrict agm, const __constant int* a_offsets, const int a_ld,
const __global realND* restrict bgm, const __constant int* b_offsets, const int b_ld,
__global real* cgm, const __constant int* c_offsets, const int c_ld,
const int c_transpose, const int a_conjugate, const int b_conjugate) {
const int batch = get_group_id(2);
const real_arg arg_alpha = arg_alphas[batch];
const real_arg arg_beta = arg_betas[batch];
const int a_offset = a_offsets[batch];
const int b_offset = b_offsets[batch];
const int c_offset = c_offsets[batch];
__local real alm[WGD * (WGD + PADA)];
__local real blm[WGD * (WGD + PADB)];
XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta,
agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld,
alm, blm, 1, 1, c_transpose, a_conjugate, b_conjugate);
}
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View File

@ -42,7 +42,7 @@ inline void GlobalToLocalDirectA(const __global realMD* restrict agm, __local re
int idk = (a_transpose) ? kg + GetGroupID0()*WGD : kg + kwg;
// Loads the data from global memory into the local memory
const realMD avec = agm[idk*(a_ld/VWMD) + idm + a_offset];
const realMD avec = agm[idk*(a_ld/VWMD) + idm + (a_offset/VWMD)];
#if VWMD == 1
alm[kg*(WGD + PADA) + mg] = avec;
#elif VWMD == 2
@ -113,7 +113,7 @@ inline void GlobalToLocalDirectB(const __global realND* restrict bgm, __local re
int idk = (b_transpose) ? kg + GetGroupID1()*WGD : kg + kwg;
// Loads the data from global memory into the local memory
const realND bvec = bgm[idk*(b_ld/VWND) + idn + b_offset];
const realND bvec = bgm[idk*(b_ld/VWND) + idn + (b_offset/VWND)];
#if VWND == 1
blm[kg*(WGD + PADB) + ng] = bvec;
#elif VWND == 2

View File

@ -53,13 +53,13 @@ inline void XgemmDirect(const int kSizeM, const int kSizeN, const int kSizeK,
for (; kwg < (kSizeK/WGD) * WGD; kwg+=WGD) {
// Loads data: off-chip --> local (matrix A and B)
if (a_ld % VWMD == 0) {
if (a_ld % VWMD == 0 && a_offset % VWMD == 0) {
GlobalToLocalDirectA(agm, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate);
}
else {
GlobalToLocalScalarA(agms, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate);
}
if (b_ld % VWND == 0) {
if (b_ld % VWND == 0 && b_offset % VWND == 0) {
GlobalToLocalDirectB(bgm, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate);
}
else {

View File

@ -21,22 +21,75 @@
namespace clblast {
// =================================================================================================
// For each kernel, this map contains the list of routines in which it is used
const std::vector<std::string> Routine::routines_axpy = {"AXPY", "COPY", "SCAL", "SWAP"};
const std::vector<std::string> Routine::routines_dot = {"AMAX", "ASUM", "DOT", "DOTC", "DOTU", "MAX", "MIN", "NRM2", "SUM"};
const std::vector<std::string> Routine::routines_ger = {"GER", "GERC", "GERU", "HER", "HER2", "HPR", "HPR2", "SPR", "SPR2", "SYR", "SYR2"};
const std::vector<std::string> Routine::routines_gemv = {"GBMV", "GEMV", "HBMV", "HEMV", "HPMV", "SBMV", "SPMV", "SYMV", "TBMV", "TPMV", "TRMV", "TRSV"};
const std::vector<std::string> Routine::routines_gemm = {"GEMM", "HEMM", "SYMM", "TRMM"};
const std::vector<std::string> Routine::routines_gemm_syrk = {"GEMM", "HEMM", "HER2K", "HERK", "SYMM", "SYR2K", "SYRK", "TRMM", "TRSM"};
const std::vector<std::string> Routine::routines_trsm = {"TRSM"};
const std::unordered_map<std::string, const std::vector<std::string>> Routine::routines_by_kernel = {
{"Xaxpy", routines_axpy},
{"Xdot", routines_dot},
{"Xgemv", routines_gemv},
{"XgemvFast", routines_gemv},
{"XgemvFastRot", routines_gemv},
{"Xtrsv", routines_gemv},
{"Xger", routines_ger},
{"Copy", routines_gemm_syrk},
{"Pad", routines_gemm_syrk},
{"Transpose", routines_gemm_syrk},
{"Padtranspose", routines_gemm_syrk},
{"Xgemm", routines_gemm_syrk},
{"XgemmDirect", routines_gemm},
{"KernelSelection", routines_gemm},
{"Invert", routines_trsm},
};
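// Example look-up: routines_by_kernel.at("Xtrsv") yields the gemv routine list above, since the
// Xtrsv kernel shares its tuning database with the other level-2 matrix-vector routines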
// =================================================================================================
// The constructor does all the heavy work; errors are returned as exceptions
Routine::Routine(Queue &queue, EventPointer event, const std::string &name,
const std::vector<std::string> &routines, const Precision precision,
const std::vector<const Database::DatabaseEntry*> &userDatabase,
const std::vector<std::string> &kernel_names, const Precision precision,
const std::vector<Database::DatabaseEntry> &userDatabase,
std::initializer_list<const char *> source):
precision_(precision),
routine_name_(name),
kernel_names_(kernel_names),
queue_(queue),
event_(event),
context_(queue_.GetContext()),
device_(queue_.GetDevice()),
device_name_(device_.Name()),
db_(queue_, routines, precision_, userDatabase) {
db_(kernel_names) {
InitDatabase(userDatabase);
InitProgram(source);
}
void Routine::InitDatabase(const std::vector<Database::DatabaseEntry> &userDatabase) {
for (const auto &kernel_name : kernel_names_) {
// Queries the cache to see whether or not the kernel parameter database is already there
bool has_db;
db_(kernel_name) = DatabaseCache::Instance().Get(DatabaseKeyRef{ precision_, device_name_, kernel_name },
&has_db);
if (has_db) { continue; }
// Builds the parameter database for this device and routine set and stores it in the cache
db_(kernel_name) = Database(device_, kernel_name, precision_, userDatabase);
DatabaseCache::Instance().Store(DatabaseKey{ precision_, device_name_, kernel_name },
Database{ db_(kernel_name) });
}
}
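// Note: the Get-then-Store pattern above builds each (precision, device, kernel) parameter
// database at most once; subsequent Routine constructions are served from the cache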
void Routine::InitProgram(std::initializer_list<const char *> source) {
// Queries the cache to see whether or not the program (context-specific) is already there
if (ProgramIsInCache(context_, precision_, routine_name_)) { return; }
bool has_program;
program_ = ProgramCache::Instance().Get(ProgramKeyRef{ context_(), precision_, routine_name_ },
&has_program);
if (has_program) { return; }
// Sets the build options from an environment variable (if set)
auto options = std::vector<std::string>();
@ -47,33 +100,36 @@ Routine::Routine(Queue &queue, EventPointer event, const std::string &name,
// Queries the cache to see whether or not the binary (device-specific) is already there. If it
// is, a program is created and stored in the cache
if (BinaryIsInCache(device_name_, precision_, routine_name_)) {
auto& binary = GetBinaryFromCache(device_name_, precision_, routine_name_);
auto program = Program(device_, context_, binary);
program.Build(device_, options);
StoreProgramToCache(program, context_, precision_, routine_name_);
bool has_binary;
auto binary = BinaryCache::Instance().Get(BinaryKeyRef{ precision_, routine_name_, device_name_ },
&has_binary);
if (has_binary) {
program_ = Program(device_, context_, binary);
program_.Build(device_, options);
ProgramCache::Instance().Store(ProgramKey{ context_(), precision_, routine_name_ },
Program{ program_ });
return;
}
// Otherwise, the kernel will be compiled and the program will be built. Both the binary and
// the program will then be added to the cache.
// Inspects whether or not cl_khr_fp64 is supported in case of double precision
const auto extensions = device_.Capabilities();
if (precision_ == Precision::kDouble || precision_ == Precision::kComplexDouble) {
if (extensions.find(kKhronosDoublePrecision) == std::string::npos) {
throw RuntimeErrorCode(StatusCode::kNoDoublePrecision);
}
if ((precision_ == Precision::kDouble && !PrecisionSupported<double>(device_)) ||
(precision_ == Precision::kComplexDouble && !PrecisionSupported<double2>(device_))) {
throw RuntimeErrorCode(StatusCode::kNoDoublePrecision);
}
// As above, but for cl_khr_fp16 (half precision)
if (precision_ == Precision::kHalf) {
if (extensions.find(kKhronosHalfPrecision) == std::string::npos) {
throw RuntimeErrorCode(StatusCode::kNoHalfPrecision);
}
if (precision_ == Precision::kHalf && !PrecisionSupported<half>(device_)) {
throw RuntimeErrorCode(StatusCode::kNoHalfPrecision);
}
// Collects the parameters for this device in the form of defines, and adds the precision
auto source_string = db_.GetDefines();
auto source_string = std::string{""};
for (const auto &kernel_name : kernel_names_) {
source_string += db_(kernel_name).GetDefines();
}
source_string += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";
// Adds the name of the routine as a define
@ -114,21 +170,23 @@ Routine::Routine(Queue &queue, EventPointer event, const std::string &name,
#endif
// Compiles the kernel
auto program = Program(context_, source_string);
program_ = Program(context_, source_string);
try {
program.Build(device_, options);
program_.Build(device_, options);
} catch (const CLError &e) {
if (e.status() == CL_BUILD_PROGRAM_FAILURE) {
fprintf(stdout, "OpenCL compiler error/warning: %s\n",
program.GetBuildInfo(device_).c_str());
program_.GetBuildInfo(device_).c_str());
}
throw;
}
// Stores the compiled binary and program in the cache
const auto binary = program.GetIR();
StoreBinaryToCache(binary, device_name_, precision_, routine_name_);
StoreProgramToCache(program, context_, precision_, routine_name_);
BinaryCache::Instance().Store(BinaryKey{ precision_, routine_name_, device_name_ },
program_.GetIR());
ProgramCache::Instance().Store(ProgramKey{ context_(), precision_, routine_name_ },
Program{ program_ });
// Prints the elapsed compilation time in case of debugging in verbose mode
#ifdef VERBOSE
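
For reference, BinaryCache, ProgramCache, and DatabaseCache above all share one shape: a process-wide singleton map, queried with Get(key, &hit) and populated with Store(key, value). The following is a minimal sketch of that pattern only, with simplified key handling; the real caches in src/cache.hpp key on combinations of precision, routine or kernel name, and device or context, and store binaries, Program objects, or Database objects.

#include <map>
#include <mutex>

// Minimal sketch of the Get/Store cache pattern (not the actual CLBlast class)
template <typename Key, typename Value>
class Cache {
 public:
  static Cache &Instance() { static Cache instance; return instance; }

  // Returns a copy of the cached value and reports whether it was a hit
  Value Get(const Key &key, bool *in_cache) const {
    std::lock_guard<std::mutex> lock(mutex_);
    const auto iter = contents_.find(key);
    if (iter == contents_.end()) { *in_cache = false; return Value(); }
    *in_cache = true;
    return iter->second;
  }

  // Stores a value under the given key (first writer wins)
  void Store(const Key &key, Value &&value) {
    std::lock_guard<std::mutex> lock(mutex_);
    contents_.emplace(key, std::move(value));
  }

 private:
  mutable std::mutex mutex_;
  std::map<Key, Value> contents_;
};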

View File

@ -18,6 +18,7 @@
#include <string>
#include <vector>
#include <unordered_map>
#include "utilities/utilities.hpp"
#include "cache.hpp"
@ -35,18 +36,39 @@ class Routine {
// Base class constructor. The user database optionally overrides the built-in database.
// All heavy preparation work is done inside this constructor.
// NOTE: the caller must provide the same userDatabase for each combination of device, precision,
// and routine list; otherwise the caching logic will break.
explicit Routine(Queue &queue, EventPointer event, const std::string &name,
const std::vector<std::string> &routines, const Precision precision,
const std::vector<const Database::DatabaseEntry*> &userDatabase,
const std::vector<Database::DatabaseEntry> &userDatabase,
std::initializer_list<const char *> source);
// Lists of kernel names used by each routine (kernel-routine look-ups)
static const std::vector<std::string> routines_axpy;
static const std::vector<std::string> routines_dot;
static const std::vector<std::string> routines_ger;
static const std::vector<std::string> routines_gemv;
static const std::vector<std::string> routines_gemm;
static const std::vector<std::string> routines_gemm_syrk;
static const std::vector<std::string> routines_trsm;
static const std::unordered_map<std::string, const std::vector<std::string>> routines_by_kernel;
private:
// Initializes program_, fetching the cached program or building a new one
void InitProgram(std::initializer_list<const char *> source);
// Initializes db_, fetching the cached database or building a new one
void InitDatabase(const std::vector<Database::DatabaseEntry> &userDatabase);
protected:
// Non-static variable for the precision
const Precision precision_;
// The routine's name
// The routine's name and the corresponding kernels
const std::string routine_name_;
const std::vector<std::string> kernel_names_;
// The OpenCL objects, accessible only from derived classes
Queue queue_;
@ -57,8 +79,11 @@ class Routine {
// OpenCL device properties
const std::string device_name_;
// Compiled program (either retrieved from the cache or compiled in the slow path)
Program program_;
// Connection to the databases with all the device-specific parameters
const Database db_;
Databases db_;
};
// =================================================================================================
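
To make the new db_ member above concrete: Databases is, in essence, a per-kernel collection of parameter databases, written and read per kernel via operator() and queried per parameter via operator[]. The sketch below captures that interface under the simplifying assumption that each kernel's database is a plain string-to-value map; the real class stores full Database objects and searches them in order.

#include <cstddef>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

// Rough sketch of a per-kernel database collection (simplified)
class DatabasesSketch {
 public:
  using Parameters = std::unordered_map<std::string, size_t>;

  explicit DatabasesSketch(const std::vector<std::string> &kernel_names)
      : kernel_names_(kernel_names) {}

  // Accesses the parameter set of one kernel, e.g. db_("Xgemv")
  Parameters &operator()(const std::string &kernel_name) {
    return databases_[kernel_name];
  }

  // Retrieves a single parameter, searching all kernels' databases
  size_t operator[](const std::string &key) const {
    for (const auto &database : databases_) {
      const auto iter = database.second.find(key);
      if (iter != database.second.end()) { return iter->second; }
    }
    throw std::runtime_error("Database parameter not found: " + key);
  }

 private:
  std::vector<std::string> kernel_names_;
  std::unordered_map<std::string, Parameters> databases_;
};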

View File

@ -19,8 +19,8 @@
#include <string>
#include <vector>
#include "clblast.h"
#include "clpp11.hpp"
#include "clblast.h"
#include "database/database.hpp"
namespace clblast {
@ -33,11 +33,52 @@ void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
// =================================================================================================
// Sets all elements of a matrix to a constant value
template <typename T>
void FillMatrix(Queue &queue, const Device &device,
const Program &program, const Databases &,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t m, const size_t n, const size_t ld, const size_t offset,
const Buffer<T> &dest,
const T constant_value) {
auto kernel = Kernel(program, "FillMatrix");
kernel.SetArgument(0, static_cast<int>(m));
kernel.SetArgument(1, static_cast<int>(n));
kernel.SetArgument(2, static_cast<int>(ld));
kernel.SetArgument(3, static_cast<int>(offset));
kernel.SetArgument(4, dest());
kernel.SetArgument(5, GetRealArg(constant_value));
auto local = std::vector<size_t>{8, 8};
auto global = std::vector<size_t>{Ceil(m, 8), Ceil(n, 8)};
RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
// Sets all elements of a vector to a constant value
template <typename T>
void FillVector(Queue &queue, const Device &device,
const Program &program, const Databases &,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t n, const size_t inc, const size_t offset,
const Buffer<T> &dest,
const T constant_value) {
auto kernel = Kernel(program, "FillVector");
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, static_cast<int>(inc));
kernel.SetArgument(2, static_cast<int>(offset));
kernel.SetArgument(3, dest());
kernel.SetArgument(4, GetRealArg(constant_value));
auto local = std::vector<size_t>{64};
auto global = std::vector<size_t>{Ceil(n, 64)};
RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
// =================================================================================================
// Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able
// to write to symmetric and triangular matrices through optional arguments.
template <typename T>
void PadCopyTransposeMatrix(Queue &queue, const Device &device,
const Database &db,
const Databases &db,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t src_one, const size_t src_two,
const size_t src_ld, const size_t src_offset,
@ -155,6 +196,70 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device,
}
}
// Batched version of the above
template <typename T>
void PadCopyTransposeMatrixBatched(Queue &queue, const Device &device,
const Databases &db,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t src_one, const size_t src_two,
const size_t src_ld, const Buffer<int> &src_offsets,
const Buffer<T> &src,
const size_t dest_one, const size_t dest_two,
const size_t dest_ld, const Buffer<int> &dest_offsets,
const Buffer<T> &dest,
const Program &program, const bool do_pad,
const bool do_transpose, const bool do_conjugate,
const size_t batch_count) {
// Determines the right kernel
auto kernel_name = std::string{};
if (do_transpose) {
kernel_name = (do_pad) ? "TransposePadMatrixBatched" : "TransposeMatrixBatched";
}
else {
kernel_name = (do_pad) ? "CopyPadMatrixBatched" : "CopyMatrixBatched";
}
// Retrieves the kernel from the compiled binary
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(src_one));
kernel.SetArgument(1, static_cast<int>(src_two));
kernel.SetArgument(2, static_cast<int>(src_ld));
kernel.SetArgument(3, src_offsets());
kernel.SetArgument(4, src());
kernel.SetArgument(5, static_cast<int>(dest_one));
kernel.SetArgument(6, static_cast<int>(dest_two));
kernel.SetArgument(7, static_cast<int>(dest_ld));
kernel.SetArgument(8, dest_offsets());
kernel.SetArgument(9, dest());
if (do_pad) {
kernel.SetArgument(10, static_cast<int>(do_conjugate));
}
// Launches the kernel. Global and local thread sizes are based on parameters
// in the database.
if (do_transpose) {
const auto global = std::vector<size_t>{
Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]),
Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"]),
batch_count
};
const auto local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"], 1};
RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
else {
const auto global = std::vector<size_t>{
Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]),
Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"]),
batch_count
};
const auto local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"], 1};
RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
}
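
As a worked example of the launch-size computations above (and of the rounding in FillMatrix/FillVector): assuming Ceil(x, y) rounds x up to the next multiple of y and CeilDiv(x, y) divides rounding up, as defined in utilities.hpp, a 1000x1000 destination with the hypothetical parameter values PADTRA_WPT=4 and PADTRA_TILE=16 over a batch of 8 yields a 256x256x8 global grid of 16x16x1 work-groups, one z-slice per batch entry.

#include <cstddef>
#include <iostream>
#include <vector>

// Assumed semantics of the rounding helpers used above
size_t CeilDiv(const size_t x, const size_t y) { return (x + y - 1) / y; }
size_t Ceil(const size_t x, const size_t y) { return CeilDiv(x, y) * y; }

int main() {
  const size_t wpt = 4, tile = 16;  // hypothetical tuning values
  const size_t dest_one = 1000, dest_two = 1000, batch_count = 8;
  const std::vector<size_t> global{Ceil(CeilDiv(dest_one, wpt), tile),  // 256
                                   Ceil(CeilDiv(dest_two, wpt), tile),  // 256
                                   batch_count};                        // 8
  const std::vector<size_t> local{tile, tile, 1};
  std::cout << global[0] << " x " << global[1] << " x " << global[2] << std::endl;
  return 0;
}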
// =================================================================================================
} // namespace clblast

View File

@ -43,9 +43,8 @@ void Xamax<T>::DoAmax(const size_t n,
TestVectorIndex(1, imax_buffer, imax_offset);
// Retrieves the Xamax kernels from the compiled binary
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xamax");
auto kernel2 = Kernel(program, "XamaxEpilogue");
auto kernel1 = Kernel(program_, "Xamax");
auto kernel2 = Kernel(program_, "XamaxEpilogue");
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];

View File

@ -43,9 +43,8 @@ void Xasum<T>::DoAsum(const size_t n,
TestVectorScalar(1, asum_buffer, asum_offset);
// Retrieves the Xasum kernels from the compiled binary
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xasum");
auto kernel2 = Kernel(program, "XasumEpilogue");
auto kernel1 = Kernel(program_, "Xasum");
auto kernel2 = Kernel(program_, "XasumEpilogue");
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];

View File

@ -44,19 +44,21 @@ void Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
TestVectorY(n, y_buffer, y_offset, y_inc);
// Determines whether or not the fast-version can be used
bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
(y_offset == 0) && (y_inc == 1) &&
IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]);
const auto use_faster_kernel = (x_offset == 0) && (x_inc == 1) &&
(y_offset == 0) && (y_inc == 1) &&
IsMultiple(n, db_["WPT"]*db_["VW"]);
const auto use_fastest_kernel = use_faster_kernel &&
IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]);
// If possible, run one of the faster versions of the kernel
auto kernel_name = (use_fast_kernel) ? "XaxpyFast" : "Xaxpy";
const auto kernel_name = (use_fastest_kernel) ? "XaxpyFastest" :
(use_faster_kernel) ? "XaxpyFaster" : "Xaxpy";
// Retrieves the Xaxpy kernel from the compiled binary
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
auto kernel = Kernel(program_, kernel_name);
// Sets the kernel arguments
if (use_fast_kernel) {
if (use_faster_kernel || use_fastest_kernel) {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, GetRealArg(alpha));
kernel.SetArgument(2, x_buffer());
@ -74,13 +76,18 @@ void Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
}
// Launches the kernel
if (use_fast_kernel) {
if (use_fastest_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
else if (use_faster_kernel) {
auto global = std::vector<size_t>{Ceil(CeilDiv(n, db_["WPT"]*db_["VW"]), db_["WGS"])};
auto local = std::vector<size_t>{db_["WGS"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
const auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
RunKernel(kernel, queue_, device_, global, local, event_);
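
The three kernel tiers above differ only in their divisibility requirements on n (on top of zero offsets and unit increments): XaxpyFaster needs n to be a multiple of WPT*VW, and XaxpyFastest additionally a multiple of WGS*WPT*VW. A small self-contained check of that selection logic, with hypothetical tuning values WGS=64, WPT=4, VW=2:

#include <cstddef>
#include <iostream>

bool IsMultiple(const size_t a, const size_t b) { return (a % b) == 0; }

int main() {
  const size_t WGS = 64, WPT = 4, VW = 2;  // hypothetical tuning values
  for (const size_t n : {1024, 1000, 999}) {
    const bool faster = IsMultiple(n, WPT * VW);                   // n % 8 == 0
    const bool fastest = faster && IsMultiple(n, WGS * WPT * VW);  // n % 512 == 0
    const auto name = fastest ? "XaxpyFastest" : faster ? "XaxpyFaster" : "Xaxpy";
    std::cout << "n=" << n << " -> " << name << std::endl;
  }
  return 0;  // prints: XaxpyFastest, XaxpyFaster, Xaxpy
}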

View File

@ -52,8 +52,7 @@ void Xcopy<T>::DoCopy(const size_t n,
auto kernel_name = (use_fast_kernel) ? "XcopyFast" : "Xcopy";
// Retrieves the Xcopy kernel from the compiled binary
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
auto kernel = Kernel(program_, kernel_name);
// Sets the kernel arguments
if (use_fast_kernel) {

View File

@ -46,9 +46,8 @@ void Xdot<T>::DoDot(const size_t n,
TestVectorScalar(1, dot_buffer, dot_offset);
// Retrieves the Xdot kernels from the compiled binary
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xdot");
auto kernel2 = Kernel(program, "XdotEpilogue");
auto kernel1 = Kernel(program_, "Xdot");
auto kernel2 = Kernel(program_, "XdotEpilogue");
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];

View File

@ -43,9 +43,8 @@ void Xnrm2<T>::DoNrm2(const size_t n,
TestVectorScalar(1, nrm2_buffer, nrm2_offset);
// Retrieves the Xnrm2 kernels from the compiled binary
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xnrm2");
auto kernel2 = Kernel(program, "Xnrm2Epilogue");
auto kernel1 = Kernel(program_, "Xnrm2");
auto kernel2 = Kernel(program_, "Xnrm2Epilogue");
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];

View File

@ -49,8 +49,7 @@ void Xscal<T>::DoScal(const size_t n, const T alpha,
auto kernel_name = (use_fast_kernel) ? "XscalFast" : "Xscal";
// Retrieves the Xscal kernel from the compiled binary
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
auto kernel = Kernel(program_, kernel_name);
// Sets the kernel arguments
if (use_fast_kernel) {

View File

@ -52,8 +52,7 @@ void Xswap<T>::DoSwap(const size_t n,
auto kernel_name = (use_fast_kernel) ? "XswapFast" : "Xswap";
// Retrieves the Xswap kernel from the compiled binary
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
auto kernel = Kernel(program_, kernel_name);
// Sets the kernel arguments
if (use_fast_kernel) {

View File

@ -22,9 +22,10 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xgemv<T>::Xgemv(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Pad", "Xgemv", "XgemvFast", "XgemvFastRot"}, PrecisionValue<T>(), {}, {
Routine(queue, event, name, {"Xgemv", "XgemvFast", "XgemvFastRot", "Xtrsv"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level2/xgemv.opencl"
#include "../../kernels/level2/xgemv_fast.opencl"
#include "../../kernels/level2/xtrsv.opencl"
}) {
}
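
A note on the constructor above: each #include'd .opencl file contributes one C++11 raw string literal, and the Routine base class concatenates the resulting initializer_list<const char *> into a single OpenCL source string. A sketch of that mechanism with a made-up kernel fragment (the FillVector signature here is illustrative, not the actual CLBlast kernel):

#include <initializer_list>
#include <iostream>
#include <string>

// A kernel source fragment as a raw string literal, as the .opencl files provide it
const char *kFragment = R"(
__kernel void FillVector(const int n, const int inc, const int offset,
                         __global float *dest, const float value) {
  const int id = get_global_id(0);
  if (id < n) { dest[id*inc + offset] = value; }
}
)";

// Concatenates all fragments into one source string (sketch)
std::string Concatenate(std::initializer_list<const char *> source) {
  auto result = std::string{};
  for (const char *fragment : source) { result += fragment; }
  return result;
}

int main() {
  std::cout << Concatenate({kFragment}) << std::endl;
  return 0;
}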
@ -69,14 +70,14 @@ void Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
if (m == 0 || n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Computes whether or not the matrix has an alternative (row-major) layout
auto a_altlayout = (layout == Layout::kRowMajor);
const auto a_altlayout = (layout == Layout::kRowMajor);
auto a_one = (a_altlayout) ? n : m;
auto a_two = (a_altlayout) ? m : n;
const auto a_two = (a_altlayout) ? m : n;
// Swap m and n if the matrix is transposed
auto a_transposed = (a_transpose != Transpose::kNo);
auto m_real = (a_transposed) ? n : m;
auto n_real = (a_transposed) ? m : n;
const auto a_transposed = (a_transpose != Transpose::kNo);
const auto m_real = (a_transposed) ? n : m;
const auto n_real = (a_transposed) ? m : n;
// Special adjustments for banded matrices
if (kl != 0 || ku != 0) {
@ -84,10 +85,10 @@ void Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
}
// Determines whether the kernel needs to perform rotated access ('^' is the XOR operator)
auto a_rotated = a_transposed ^ a_altlayout;
const auto a_rotated = a_transposed ^ a_altlayout;
// In case of complex data-types, the transpose can also become a conjugate transpose
auto a_conjugate = (a_transpose == Transpose::kConjugate);
const auto a_conjugate = (a_transpose == Transpose::kConjugate);
// Tests the matrix and the vectors for validity
if (packed) { TestMatrixAP(n, a_buffer, a_offset); }
@ -106,8 +107,8 @@ void Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
IsMultiple(a_ld, db_["VW3"]);
// If possible, run the fast version (rotated or non-rotated) of the kernel
auto kernel_name = "Xgemv";
auto m_ceiled = Ceil(m_real, db_["WGS1"]*db_["WPT1"]);
auto kernel_name = std::string{"Xgemv"};
const auto m_ceiled = Ceil(m_real, db_["WGS1"]*db_["WPT1"]);
auto global_size = m_ceiled / db_["WPT1"];
auto local_size = db_["WGS1"];
if (fast_kernel) {
@ -122,8 +123,7 @@ void Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
}
// Retrieves the Xgemv kernel from the compiled binary
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
auto kernel = Kernel(program_, kernel_name);
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(m_real));

View File

@ -53,8 +53,7 @@ void Xger<T>::DoGer(const Layout layout,
TestVectorY(n, y_buffer, y_offset, y_inc);
// Retrieves the kernel from the compiled binary
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, "Xger");
auto kernel = Kernel(program_, "Xger");
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(a_one));

View File

@ -67,8 +67,7 @@ void Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
const auto matching_alpha = GetAlpha(alpha);
// Retrieves the kernel from the compiled binary
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, "Xher");
auto kernel = Kernel(program_, "Xher");
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n));

View File

@ -54,8 +54,7 @@ void Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
TestVectorY(n, y_buffer, y_offset, y_inc);
// Retrieves the kernel from the compiled binary
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, "Xher2");
auto kernel = Kernel(program_, "Xher2");
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n));

View File

@ -52,9 +52,9 @@ void Xtbmv<T>::DoTbmv(const Layout layout, const Triangle triangle,
auto fast_kernels = false;
try {
MatVec(layout, a_transpose,
n, n, static_cast<T>(1),
n, n, ConstantOne<T>(),
a_buffer, a_offset, a_ld,
scratch_buffer, x_offset, x_inc, static_cast<T>(0),
scratch_buffer, x_offset, x_inc, ConstantZero<T>(),
x_buffer, x_offset, x_inc,
fast_kernels, fast_kernels,
parameter, false, k, 0);

View File

@ -52,9 +52,9 @@ void Xtpmv<T>::DoTpmv(const Layout layout, const Triangle triangle,
auto fast_kernels = false;
try {
MatVec(layout, a_transpose,
n, n, static_cast<T>(1),
n, n, ConstantOne<T>(),
ap_buffer, ap_offset, n,
scratch_buffer, x_offset, x_inc, static_cast<T>(0),
scratch_buffer, x_offset, x_inc, ConstantZero<T>(),
x_buffer, x_offset, x_inc,
fast_kernels, fast_kernels,
parameter, true, 0, 0);

View File

@ -52,9 +52,9 @@ void Xtrmv<T>::DoTrmv(const Layout layout, const Triangle triangle,
auto fast_kernels = false;
try {
MatVec(layout, a_transpose,
n, n, static_cast<T>(1),
n, n, ConstantOne<T>(),
a_buffer, a_offset, a_ld,
scratch_buffer, x_offset, x_inc, static_cast<T>(0),
scratch_buffer, x_offset, x_inc, ConstantZero<T>(),
x_buffer, x_offset, x_inc,
fast_kernels, fast_kernels,
parameter, false, 0, 0);

View File

@ -0,0 +1,161 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xtrsv class (see the header for information about the class).
//
// =================================================================================================
#include "routines/level2/xtrsv.hpp"
#include <string>
#include <vector>
namespace clblast {
// =================================================================================================
// Constructor: forwards to base class constructor
template <typename T>
Xtrsv<T>::Xtrsv(Queue &queue, EventPointer event, const std::string &name):
Xgemv<T>(queue, event, name) {
}
// =================================================================================================
template <typename T>
void Xtrsv<T>::Substitution(const Layout layout, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
if (n > db_["TRSV_BLOCK_SIZE"]) { throw BLASError(StatusCode::kUnexpectedError); }
// Translates CLBlast arguments to 0/1 integers for the OpenCL kernel
const auto is_unit_diagonal = (diagonal == Diagonal::kNonUnit) ? 0 : 1;
const auto is_transposed = ((a_transpose == Transpose::kNo && layout == Layout::kColMajor) ||
(a_transpose != Transpose::kNo && layout != Layout::kColMajor)) ? 0 : 1;
const auto do_conjugate = (a_transpose == Transpose::kConjugate) ? 1 : 0;
// The data is either in the upper or lower triangle
const auto is_upper = ((triangle == Triangle::kUpper && a_transpose == Transpose::kNo) ||
(triangle == Triangle::kLower && a_transpose != Transpose::kNo));
// Retrieves the kernel from the compiled binary
const auto kernel_name = (is_upper) ? "trsv_backward" : "trsv_forward";
auto kernel = Kernel(program_, kernel_name);
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, a_buffer());
kernel.SetArgument(2, static_cast<int>(a_offset));
kernel.SetArgument(3, static_cast<int>(a_ld));
kernel.SetArgument(4, b_buffer());
kernel.SetArgument(5, static_cast<int>(b_offset));
kernel.SetArgument(6, static_cast<int>(b_inc));
kernel.SetArgument(7, x_buffer());
kernel.SetArgument(8, static_cast<int>(x_offset));
kernel.SetArgument(9, static_cast<int>(x_inc));
kernel.SetArgument(10, static_cast<int>(is_transposed));
kernel.SetArgument(11, static_cast<int>(is_unit_diagonal));
kernel.SetArgument(12, static_cast<int>(do_conjugate));
// Launches the kernel
const auto local = std::vector<size_t>{db_["TRSV_BLOCK_SIZE"]};
const auto global = std::vector<size_t>{1};
auto event = Event();
RunKernel(kernel, queue_, device_, global, local, event.pointer());
event.WaitForCompletion();
}
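
The kernel choice above follows from one observation: an upper-triangular system is solved backwards (last unknown first), but transposing flips the triangle, so a transposed upper-triangular system is solved forwards. A compact restatement of just that selection, with enums simplified from clblast.h:

#include <iostream>
#include <string>

enum class Triangle { kUpper, kLower };
enum class Transpose { kNo, kYes, kConjugate };

// Mirrors the is_upper selection in Xtrsv::Substitution (sketch)
std::string SubstitutionKernel(const Triangle triangle, const Transpose a_transpose) {
  const auto is_upper = ((triangle == Triangle::kUpper && a_transpose == Transpose::kNo) ||
                         (triangle == Triangle::kLower && a_transpose != Transpose::kNo));
  return is_upper ? "trsv_backward" : "trsv_forward";
}

int main() {
  std::cout << SubstitutionKernel(Triangle::kUpper, Transpose::kNo) << std::endl;   // backward
  std::cout << SubstitutionKernel(Triangle::kUpper, Transpose::kYes) << std::endl;  // forward
  return 0;
}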
// =================================================================================================
// The main routine
template <typename T>
void Xtrsv<T>::DoTrsv(const Layout layout, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc) {
// Makes sure all dimensions are larger than zero
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the matrix and vector
TestMatrixA(n, n, a_buffer, a_offset, a_ld);
TestVectorX(n, b_buffer, b_offset, b_inc);
// Creates a copy of B to avoid overwriting input while computing output
// TODO: Give x a zero offset and unit increment by creating custom copy-to and copy-from kernels
const auto x_offset = b_offset;
const auto x_inc = b_inc;
const auto x_size = n*x_inc + x_offset;
auto x_buffer = Buffer<T>(context_, x_size);
b_buffer.CopyTo(queue_, x_size, x_buffer);
// Fills the output buffer with zeros
auto eventWaitList = std::vector<Event>();
auto fill_vector_event = Event();
FillVector(queue_, device_, program_, db_, fill_vector_event.pointer(), eventWaitList,
n, x_inc, x_offset, x_buffer, ConstantZero<T>());
fill_vector_event.WaitForCompletion();
// Derives properties based on the arguments
const auto is_upper = ((triangle == Triangle::kUpper && a_transpose == Transpose::kNo) ||
(triangle == Triangle::kLower && a_transpose != Transpose::kNo));
const auto is_transposed = ((layout == Layout::kColMajor && a_transpose == Transpose::kNo) ||
(layout != Layout::kColMajor && a_transpose != Transpose::kNo));
// Loops over the blocks
auto col = n; // the initial column position
for (auto i = size_t{0}; i < n; i += db_["TRSV_BLOCK_SIZE"]) {
const auto block_size = std::min(db_["TRSV_BLOCK_SIZE"], n - i);
// Sets the next column position
col = (is_upper) ? col - block_size : i;
// Sets the offsets for upper or lower triangular
const auto extra_offset_a = (is_transposed) ?
(is_upper ? col + (col+block_size)*a_ld : col) :
(is_upper ? col+block_size + col*a_ld : col*a_ld);
const auto extra_offset_x = (is_upper) ? (col+block_size)*x_inc : 0;
const auto extra_offset_b = col*x_inc;
// Runs the GEMV routine to compute x' = A * x
if (i > 0) {
const auto gemv_m = (a_transpose == Transpose::kNo) ? block_size : i;
const auto gemv_n = (a_transpose == Transpose::kNo) ? i : block_size;
DoGemv(layout, a_transpose, gemv_m, gemv_n, ConstantOne<T>(),
a_buffer, a_offset + extra_offset_a, a_ld,
x_buffer, x_offset + extra_offset_x, x_inc, ConstantOne<T>(),
x_buffer, x_offset + extra_offset_b, x_inc);
}
// Runs the triangular substitution for the block size
Substitution(layout, triangle, a_transpose, diagonal, block_size,
a_buffer, a_offset + col + col*a_ld, a_ld,
b_buffer, b_offset + col*b_inc, b_inc,
x_buffer, x_offset + col*x_inc, x_inc);
}
// Retrieves the results
x_buffer.CopyTo(queue_, x_size, b_buffer);
}
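
To illustrate the loop above: in the upper-triangular case the blocks are processed from the bottom-right corner of the matrix upwards, and each iteration after the first runs a GEMV update against the already-solved unknowns before the small substitution. A sketch of just the column bookkeeping, with hypothetical n=10 and TRSV_BLOCK_SIZE=4:

#include <algorithm>
#include <cstddef>
#include <iostream>

int main() {
  const size_t n = 10, block = 4;  // hypothetical sizes
  auto col = n;  // the initial column position
  for (auto i = size_t{0}; i < n; i += block) {
    const auto block_size = std::min(block, n - i);
    col = col - block_size;  // upper-triangular: walk towards the top-left
    std::cout << "substitute rows [" << col << ", " << col + block_size << ")"
              << ", GEMV over " << i << " solved unknowns" << std::endl;
  }
  return 0;  // blocks [6,10), [2,6), [0,2) with GEMV over 0, 4, 8 unknowns
}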
// =================================================================================================
// Compiles the templated class
template class Xtrsv<half>;
template class Xtrsv<float>;
template class Xtrsv<double>;
template class Xtrsv<float2>;
template class Xtrsv<double2>;
// =================================================================================================
} // namespace clblast

View File

@ -0,0 +1,60 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xtrsv routine. It uses a blocked algorithm: small triangular forward
// and backward substitutions on the diagonal parts of the matrix are combined with larger GEMV
// computations on the remainder of the matrix.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XTRSV_H_
#define CLBLAST_ROUTINES_XTRSV_H_
#include "routines/level2/xgemv.hpp"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class Xtrsv: public Xgemv<T> {
public:
// Uses the generic matrix-vector routine
using Xgemv<T>::queue_;
using Xgemv<T>::context_;
using Xgemv<T>::device_;
using Xgemv<T>::db_;
using Xgemv<T>::program_;
using Xgemv<T>::DoGemv;
// Constructor
Xtrsv(Queue &queue, EventPointer event, const std::string &name = "TRSV");
// Templated-precision implementation of the routine
void DoTrsv(const Layout layout, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
// Performs forward or backward substitution on a small triangular matrix
void Substitution(const Layout layout, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc,
const Buffer<T> &x_buffer, const size_t offset_x, const size_t x_inc);
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XTRSV_H_
#endif

View File

@ -33,10 +33,11 @@ Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):
#include "../../kernels/level3/convert_symmetric.opencl"
#include "../../kernels/level3/convert_triangular.opencl"
#include "../../kernels/level3/convert_hermitian.opencl"
, // separated in multiple parts to prevent C1091 in MSVC 2013
#include "../../kernels/level3/xgemm_direct_part1.opencl"
#include "../../kernels/level3/xgemm_direct_part2.opencl"
#include "../../kernels/level3/xgemm_direct_part3.opencl"
, // separated in two parts to prevent C1091 in MSVC 2013
, // separated in multiple parts to prevent C1091 in MSVC 2013
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
#include "../../kernels/level3/xgemm_part3.opencl"
@ -103,19 +104,19 @@ void Xgemm<T>::DoGemm(const Layout layout,
// Selects which version of GEMM to run
const auto do_gemm_direct = (m * n * k < db_["XGEMM_MIN_INDIRECT_SIZE"]);
if (do_gemm_direct) { // for small sizes (single kernel)
return GemmDirect(m, n, k, alpha,
a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta,
c_buffer, c_offset, c_ld,
a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate);
GemmDirect(m, n, k, alpha,
a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta,
c_buffer, c_offset, c_ld,
a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate);
}
else { // for larger sizes (pre/post-processing plus a very fast kernel)
return GemmIndirect(m, n, k, alpha,
a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta,
c_buffer, c_offset, c_ld,
a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate,
a_one, a_two, a_want_rotated,
b_one, b_two, b_want_rotated,
c_one, c_two, c_want_rotated);
GemmIndirect(m, n, k, alpha,
a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta,
c_buffer, c_offset, c_ld,
a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate,
a_one, a_two, a_want_rotated,
b_one, b_two, b_want_rotated,
c_one, c_two, c_want_rotated);
}
}
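
The selection above is a single flop-volume threshold on m*n*k. Assuming a hypothetical database value XGEMM_MIN_INDIRECT_SIZE of 512^3 (the real value is a per-device tuning parameter), the switch behaves as follows:

#include <cstddef>
#include <iostream>

int main() {
  const size_t min_indirect_size = 512 * 512 * 512;  // hypothetical threshold
  for (const size_t s : {64, 256, 512, 1024}) {
    const bool direct = (s * s * s < min_indirect_size);
    std::cout << "m=n=k=" << s << ": "
              << (direct ? "GemmDirect" : "GemmIndirect") << std::endl;
  }
  return 0;
}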
@ -126,16 +127,16 @@ void Xgemm<T>::DoGemm(const Layout layout,
// overhead of these extra kernels might not be ideal for certain devices/arguments.
template <typename T>
void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
const bool a_conjugate, const bool b_conjugate,
const size_t a_one, const size_t a_two, const bool a_want_rotated,
const size_t b_one, const size_t b_two, const bool b_want_rotated,
const size_t c_one, const size_t c_two, const bool c_want_rotated) {
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
const bool a_conjugate, const bool b_conjugate,
const size_t a_one, const size_t a_two, const bool a_want_rotated,
const size_t b_one, const size_t b_two, const bool b_want_rotated,
const size_t c_one, const size_t c_two, const bool c_want_rotated) {
// Calculates the ceiled versions of m, n, and k
const auto m_ceiled = Ceil(m, db_["MWG"]);
const auto n_ceiled = Ceil(n, db_["NWG"]);
@ -150,9 +151,6 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
const auto c_one_i = (c_want_rotated) ? n_ceiled : m_ceiled;
const auto c_two_i = (c_want_rotated) ? m_ceiled : n_ceiled;
// Loads the program from the database
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
// Determines whether or not temporary matrices are needed
auto a_no_temp = a_one == a_one_i && a_two == a_two_i && a_ld == a_one && a_offset == 0 &&
a_do_transpose == false && a_conjugate == false;
@ -178,7 +176,7 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
a_one, a_two, a_ld, a_offset, a_buffer,
a_one_i, a_two_i, a_one_i, 0, a_temp,
ConstantOne<T>(), program,
ConstantOne<T>(), program_,
true, a_do_transpose, a_conjugate);
eventWaitList.push_back(eventProcessA);
}
@ -189,7 +187,7 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList,
b_one, b_two, b_ld, b_offset, b_buffer,
b_one_i, b_two_i, b_one_i, 0, b_temp,
ConstantOne<T>(), program,
ConstantOne<T>(), program_,
true, b_do_transpose, b_conjugate);
eventWaitList.push_back(eventProcessB);
}
@ -200,13 +198,13 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
c_one, c_two, c_ld, c_offset, c_buffer,
c_one_i, c_two_i, c_one_i, 0, c_temp,
ConstantOne<T>(), program,
ConstantOne<T>(), program_,
true, c_do_transpose, false);
eventWaitList.push_back(eventProcessC);
}
// Retrieves the Xgemm kernel from the compiled binary
auto kernel = Kernel(program, "Xgemm");
auto kernel = Kernel(program_, "Xgemm");
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(m_ceiled));
@ -236,7 +234,7 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
c_one_i, c_two_i, c_one_i, 0, c_temp,
c_one, c_two, c_ld, c_offset, c_buffer,
ConstantOne<T>(), program,
ConstantOne<T>(), program_,
false, c_do_transpose, false);
}
}
@ -247,21 +245,18 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
// The direct version of GEMM, requiring just one kernel and no pre- or post-processing kernels.
template <typename T>
void Xgemm<T>::GemmDirect(const size_t m, const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
const bool a_conjugate, const bool b_conjugate) {
// Loads the program from the database
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
const bool a_conjugate, const bool b_conjugate) {
// Retrieves the proper XgemmDirect kernel from the compiled binary
const auto name = (a_do_transpose) ? (b_do_transpose ? "XgemmDirectTT" : "XgemmDirectTN") :
(b_do_transpose ? "XgemmDirectNT" : "XgemmDirectNN");
auto kernel = Kernel(program, name);
auto kernel = Kernel(program_, name);
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(m));

View File

@ -58,8 +58,7 @@ void Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle trian
// Creates a general matrix from the Hermitian matrix to be able to run the regular Xgemm
// routine afterwards
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
auto kernel = Kernel(program_, kernel_name);
// Sets the arguments for the hermitian-to-squared kernel
kernel.SetArgument(0, static_cast<int>(k));

View File

@ -30,6 +30,7 @@ class Xhemm: public Xgemm<T> {
using Xgemm<T>::queue_;
using Xgemm<T>::context_;
using Xgemm<T>::device_;
using Xgemm<T>::program_;
using Xgemm<T>::db_;
using Xgemm<T>::DoGemm;

View File

@ -81,9 +81,6 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr
// Decides which kernel to run: the upper-triangular or lower-triangular version
auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
// Loads the program from the database
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
// Determines whether or not temporary matrices are needed
auto a1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
ab_rotated == false && ab_conjugate == false;
@ -116,7 +113,7 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA1.pointer(), emptyEventList,
ab_one, ab_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a1_temp,
ConstantOne<T>(), program,
ConstantOne<T>(), program_,
true, ab_rotated, ab_conjugate);
eventWaitList.push_back(eventProcessA1);
}
@ -125,7 +122,7 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA2.pointer(), emptyEventList,
ab_one, ab_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a2_temp,
ConstantOne<T>(), program,
ConstantOne<T>(), program_,
true, ab_rotated, !ab_conjugate);
eventWaitList.push_back(eventProcessA2);
}
@ -134,7 +131,7 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB1.pointer(), emptyEventList,
ab_one, ab_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b1_temp,
ConstantOne<T>(), program,
ConstantOne<T>(), program_,
true, ab_rotated, ab_conjugate);
eventWaitList.push_back(eventProcessB1);
}
@ -143,7 +140,7 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB2.pointer(), emptyEventList,
ab_one, ab_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b2_temp,
ConstantOne<T>(), program,
ConstantOne<T>(), program_,
true, ab_rotated, !ab_conjugate);
eventWaitList.push_back(eventProcessB2);
}
@ -154,12 +151,12 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
ConstantOne<T>(), program,
ConstantOne<T>(), program_,
true, c_rotated, false);
eventWaitList.push_back(eventProcessC);
// Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
auto kernel = Kernel(program, kernel_name);
auto kernel = Kernel(program_, kernel_name);
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n_ceiled));
@ -201,7 +198,7 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr
PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
ConstantOne<T>(), program,
ConstantOne<T>(), program_,
false, c_rotated, false, upper, lower, true);
}

View File

@ -79,9 +79,6 @@ void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Tran
// Decides which kernel to run: the upper-triangular or lower-triangular version
auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
// Loads the program from the database
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
// Determines whether or not temporary matrices are needed
auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
a_rotated == false && a_conjugate == false;
@ -109,7 +106,7 @@ void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Tran
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
a_one, a_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
ConstantOne<T>(), program,
ConstantOne<T>(), program_,
true, a_rotated, a_conjugate);
eventWaitList.push_back(eventProcessA);
}
@ -118,7 +115,7 @@ void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Tran
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList,
a_one, a_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
ConstantOne<T>(), program,
ConstantOne<T>(), program_,
true, a_rotated, b_conjugate);
eventWaitList.push_back(eventProcessB);
}
@ -129,12 +126,12 @@ void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Tran
PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
ConstantOne<T>(), program,
ConstantOne<T>(), program_,
true, c_rotated, false);
eventWaitList.push_back(eventProcessC);
// Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
auto kernel = Kernel(program, kernel_name);
auto kernel = Kernel(program_, kernel_name);
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n_ceiled));
@ -163,7 +160,7 @@ void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Tran
PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
ConstantOne<T>(), program,
ConstantOne<T>(), program_,
false, c_rotated, false, upper, lower, true);
}

View File

@ -30,12 +30,12 @@ Xsymm<T>::Xsymm(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
void Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle triangle,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
if ((m == 0) || (n == 0)) { throw BLASError(StatusCode::kInvalidDimension); }
@ -58,8 +58,7 @@ void Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle trian
// Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm
// routine afterwards
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
auto kernel = Kernel(program_, kernel_name);
// Sets the arguments for the symmetric-to-squared kernel
kernel.SetArgument(0, static_cast<int>(k));

Some files were not shown because too many files have changed in this diff