mirror of
https://github.com/CNugteren/CLBlast.git
synced 2024-08-27 23:37:02 +02:00
Merge pull request #31 from CNugteren/development
Update to version 0.6.0
This commit is contained in:
commit
d190becd89
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -2,3 +2,4 @@ build
|
||||||
stash
|
stash
|
||||||
.*
|
.*
|
||||||
*.pyc
|
*.pyc
|
||||||
|
*.db
|
68
.travis.yml
68
.travis.yml
|
@ -1,29 +1,69 @@
|
||||||
language: cpp
|
language: cpp
|
||||||
|
sudo: required
|
||||||
|
dist: trusty
|
||||||
|
|
||||||
compiler:
|
compiler:
|
||||||
- gcc
|
- gcc
|
||||||
- clang
|
- clang
|
||||||
|
|
||||||
|
addons:
|
||||||
|
apt:
|
||||||
|
sources:
|
||||||
|
# kubuntu-backports contains newer versions of cmake to install
|
||||||
|
- kubuntu-backports
|
||||||
|
packages:
|
||||||
|
- cmake
|
||||||
|
|
||||||
|
env:
|
||||||
|
global:
|
||||||
|
- CLBLAST_ROOT=${TRAVIS_BUILD_DIR}/bin/make/release
|
||||||
|
- OPENCL_REGISTRY=https://www.khronos.org/registry/cl
|
||||||
|
- OPENCL_ROOT=${TRAVIS_BUILD_DIR}/bin/opencl
|
||||||
|
|
||||||
before_install:
|
before_install:
|
||||||
- sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
|
- cmake --version;
|
||||||
- sudo add-apt-repository -y ppa:kalakris/cmake
|
- ${CC} --version;
|
||||||
- sudo apt-get update -qq
|
- ${CXX} --version;
|
||||||
- sudo apt-get install -qq gcc-4.8 g++-4.8 clang
|
|
||||||
- sudo apt-get install -qq fglrx=2:8.960-0ubuntu1 opencl-headers
|
|
||||||
- sudo apt-get install -qq cmake
|
|
||||||
install:
|
install:
|
||||||
- if [ "$CXX" = "g++" ]; then export CXX="g++-4.8" CC="gcc-4.8"; fi
|
# The following linux logic is necessary because of Travis's move to the GCE platform, which does not
|
||||||
|
# currently contain packages for fglrx: https://github.com/travis-ci/travis-ci/issues/5221
|
||||||
|
# We build our own linkable .so file
|
||||||
|
- if [ ${TRAVIS_OS_NAME} == "linux" ]; then
|
||||||
|
mkdir -p ${OPENCL_ROOT};
|
||||||
|
pushd ${OPENCL_ROOT};
|
||||||
|
travis_retry git clone --depth 1 https://github.com/KhronosGroup/OpenCL-ICD-Loader.git;
|
||||||
|
mv ./OpenCL-ICD-Loader/* .;
|
||||||
|
travis_retry git clone --depth 1 https://github.com/KhronosGroup/OpenCL-Headers.git inc/CL;
|
||||||
|
pushd inc/CL;
|
||||||
|
travis_retry wget -w 1 -np -nd -nv -A h,hpp ${OPENCL_REGISTRY}/api/2.1/cl.hpp;
|
||||||
|
popd;
|
||||||
|
mkdir -p lib;
|
||||||
|
pushd lib;
|
||||||
|
cmake -G "Unix Makefiles" ..;
|
||||||
|
make;
|
||||||
|
cp ./bin/libOpenCL.so .;
|
||||||
|
popd;
|
||||||
|
pushd inc/CL;
|
||||||
|
travis_retry git fetch origin opencl12:opencl12;
|
||||||
|
git checkout opencl12;
|
||||||
|
popd;
|
||||||
|
mv inc/ include/;
|
||||||
|
popd;
|
||||||
|
fi
|
||||||
|
|
||||||
before_script:
|
before_script:
|
||||||
- mkdir install
|
- mkdir -p ${CLBLAST_ROOT}
|
||||||
- export PATH=`pwd`/install/bin:${PATH}
|
- pushd ${CLBLAST_ROOT}
|
||||||
- export LD_LIBRARY_PATH=`pwd`/install/lib64:`pwd`/install/lib:${LD_LIBRARY_PATH}
|
- cmake -DOPENCL_ROOT=${OPENCL_ROOT} ${TRAVIS_BUILD_DIR}
|
||||||
- mkdir build
|
|
||||||
- cd build
|
|
||||||
- cmake -DCMAKE_INSTALL_PREFIX:PATH=../install ..
|
|
||||||
script:
|
script:
|
||||||
- make
|
- make
|
||||||
- make install
|
|
||||||
branches:
|
branches:
|
||||||
only:
|
only:
|
||||||
- master
|
- master
|
||||||
- development
|
- development
|
||||||
|
|
||||||
notifications:
|
notifications:
|
||||||
email: false
|
email: false
|
||||||
|
|
17
CHANGELOG
17
CHANGELOG
|
@ -1,4 +1,21 @@
|
||||||
|
|
||||||
|
Version 0.6.0
|
||||||
|
- Added support for MSVC (Visual Studio) 2015
|
||||||
|
- Added tuned parameters for various devices (see README)
|
||||||
|
- Now automatically generates C++ code from JSON tuning results
|
||||||
|
- Added level-2 routines:
|
||||||
|
* SGER/DGER
|
||||||
|
* CGERU/ZGERU
|
||||||
|
* CGERC/ZGERC
|
||||||
|
* CHER/ZHER
|
||||||
|
* CHPR/ZHPR
|
||||||
|
* CHER2/ZHER2
|
||||||
|
* CHPR2/ZHPR2
|
||||||
|
* CSYR/ZSYR
|
||||||
|
* CSPR/ZSPR
|
||||||
|
* CSYR2/ZSYR2
|
||||||
|
* CSPR2/ZSPR2
|
||||||
|
|
||||||
Version 0.5.0
|
Version 0.5.0
|
||||||
- Improved structure and performance of level-2 routines (xSYMV/xHEMV)
|
- Improved structure and performance of level-2 routines (xSYMV/xHEMV)
|
||||||
- Reduced compilation time of level-3 OpenCL kernels
|
- Reduced compilation time of level-3 OpenCL kernels
|
||||||
|
|
|
@ -13,7 +13,7 @@
|
||||||
cmake_minimum_required(VERSION 2.8.10)
|
cmake_minimum_required(VERSION 2.8.10)
|
||||||
project("clblast" C CXX)
|
project("clblast" C CXX)
|
||||||
set(clblast_VERSION_MAJOR 0)
|
set(clblast_VERSION_MAJOR 0)
|
||||||
set(clblast_VERSION_MINOR 5)
|
set(clblast_VERSION_MINOR 6)
|
||||||
set(clblast_VERSION_PATCH 0)
|
set(clblast_VERSION_PATCH 0)
|
||||||
|
|
||||||
# Options and their default values
|
# Options and their default values
|
||||||
|
@ -55,16 +55,21 @@ elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
# C++ compiler settings
|
# C++ compiler settings
|
||||||
set(FLAGS "-O3 -std=c++11")
|
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
|
||||||
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
|
set(FLAGS "/Ox")
|
||||||
set(FLAGS "${FLAGS} -Wall -Wno-comment -Wno-return-type -Wno-switch -Wno-missing-noreturn")
|
set(FLAGS "${FLAGS} /wd4715")
|
||||||
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9.0)
|
else ()
|
||||||
set(FLAGS "${FLAGS} -Wno-attributes -Wno-unused-variable")
|
set(FLAGS "-O3 -std=c++11")
|
||||||
|
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
|
||||||
|
set(FLAGS "${FLAGS} -Wall -Wno-comment -Wno-return-type -Wno-switch -Wno-missing-noreturn")
|
||||||
|
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9.0)
|
||||||
|
set(FLAGS "${FLAGS} -Wno-attributes -Wno-unused-variable")
|
||||||
|
endif()
|
||||||
|
elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
|
||||||
|
set(FLAGS "${FLAGS} -Weverything -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded")
|
||||||
|
set(FLAGS "${FLAGS} -Wno-missing-prototypes -Wno-float-equal -Wno-switch-enum -Wno-switch")
|
||||||
|
set(FLAGS "${FLAGS} -Wno-exit-time-destructors -Wno-global-constructors -Wno-missing-noreturn")
|
||||||
endif()
|
endif()
|
||||||
elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
|
|
||||||
set(FLAGS "${FLAGS} -Weverything -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded")
|
|
||||||
set(FLAGS "${FLAGS} -Wno-missing-prototypes -Wno-float-equal -Wno-switch-enum -Wno-switch")
|
|
||||||
set(FLAGS "${FLAGS} -Wno-exit-time-destructors -Wno-global-constructors -Wno-missing-noreturn")
|
|
||||||
endif()
|
endif()
|
||||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}")
|
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}")
|
||||||
|
|
||||||
|
@ -102,14 +107,15 @@ include_directories(${clblast_SOURCE_DIR}/include ${OPENCL_INCLUDE_DIRS})
|
||||||
# ==================================================================================================
|
# ==================================================================================================
|
||||||
|
|
||||||
# Sets the supported routines and the used kernels. New routines and kernels should be added here.
|
# Sets the supported routines and the used kernels. New routines and kernels should be added here.
|
||||||
set(KERNELS copy pad transpose padtranspose xaxpy xdot xgemv xgemm)
|
set(KERNELS copy pad transpose padtranspose xaxpy xdot xger xgemm xgemv)
|
||||||
set(SAMPLE_PROGRAMS_CPP sgemm)
|
set(SAMPLE_PROGRAMS_CPP sgemm)
|
||||||
set(SAMPLE_PROGRAMS_C sgemm)
|
set(SAMPLE_PROGRAMS_C sgemm)
|
||||||
set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc)
|
set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc)
|
||||||
set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv)
|
set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv
|
||||||
|
xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
|
||||||
set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm)
|
set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm)
|
||||||
set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES})
|
set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES})
|
||||||
set(PRECISIONS 32 3232 64 6464)
|
set(PRECISIONS 32 64 3232 6464)
|
||||||
|
|
||||||
# ==================================================================================================
|
# ==================================================================================================
|
||||||
|
|
||||||
|
|
92
README.md
92
README.md
|
@ -6,7 +6,7 @@ CLBlast: The tuned OpenCL BLAS library
|
||||||
|
|
||||||
CLBlast is a modern, lightweight, performant and tunable OpenCL BLAS library written in C++11. It is designed to leverage the full performance potential of a wide variety of OpenCL devices from different vendors, including desktop and laptop GPUs, embedded GPUs, and other accelerators. CLBlast implements BLAS routines: basic linear algebra subprograms operating on vectors and matrices.
|
CLBlast is a modern, lightweight, performant and tunable OpenCL BLAS library written in C++11. It is designed to leverage the full performance potential of a wide variety of OpenCL devices from different vendors, including desktop and laptop GPUs, embedded GPUs, and other accelerators. CLBlast implements BLAS routines: basic linear algebra subprograms operating on vectors and matrices.
|
||||||
|
|
||||||
__Note that the CLBlast library is actively being developed, and is not mature enough for production environments__. This preview-version doesn't support the less commonly used routines yet: they will be added in due time. It also lacks extensive tuning on some common OpenCL platforms: __out-of-the-box performance on some devices might be poor__. See below for more details.
|
__Note that the CLBlast library is actively being developed, and might not be mature enough for production environments__. This preview-version doesn't support the less commonly used routines yet: they will be added in due time. It also lacks extensive tuning on some common OpenCL platforms: __out-of-the-box performance on some devices might be poor__. See below for more details (and how to tune yourself).
|
||||||
|
|
||||||
|
|
||||||
Why CLBlast and not clBLAS or cuBLAS?
|
Why CLBlast and not clBLAS or cuBLAS?
|
||||||
|
@ -17,6 +17,9 @@ Use CLBlast instead of clBLAS:
|
||||||
* When you care about achieving maximum performance.
|
* When you care about achieving maximum performance.
|
||||||
* When you want to be able to inspect the BLAS kernels or easily customize them to your needs.
|
* When you want to be able to inspect the BLAS kernels or easily customize them to your needs.
|
||||||
* When you run on exotic OpenCL devices which you need to tune yourself.
|
* When you run on exotic OpenCL devices which you need to tune yourself.
|
||||||
|
* When you are still running on OpenCL 1.1 hardware.
|
||||||
|
* When you value an organized and modern C++ codebase.
|
||||||
|
* When you target Intel CPUs and GPUs or embedded devices
|
||||||
|
|
||||||
Use CLBlast instead of cuBLAS:
|
Use CLBlast instead of cuBLAS:
|
||||||
|
|
||||||
|
@ -41,10 +44,13 @@ The pre-requisites for compilation of CLBlast are:
|
||||||
- Clang 3.3 or newer
|
- Clang 3.3 or newer
|
||||||
- AppleClang 5.0 or newer
|
- AppleClang 5.0 or newer
|
||||||
- ICC 14.0 or newer
|
- ICC 14.0 or newer
|
||||||
|
- MSVC (Visual Studio) 2015 or newer
|
||||||
* An OpenCL 1.1 or newer library, for example:
|
* An OpenCL 1.1 or newer library, for example:
|
||||||
- Apple OpenCL
|
- Apple OpenCL
|
||||||
- NVIDIA CUDA SDK
|
- NVIDIA CUDA SDK
|
||||||
- AMD APP SDK
|
- AMD APP SDK
|
||||||
|
- Intel OpenCL
|
||||||
|
- Beignet
|
||||||
|
|
||||||
An example of an out-of-source build (starting from the root of the CLBlast folder):
|
An example of an out-of-source build (starting from the root of the CLBlast folder):
|
||||||
|
|
||||||
|
@ -79,13 +85,27 @@ Using the tuners (optional)
|
||||||
The CLBlast library will be tuned in the future for the most commonly used OpenCL devices. This pre-release of CLBlast is only tuned for a limited number of devices, in particular those with the following `CL_DEVICE_NAME` values:
|
The CLBlast library will be tuned in the future for the most commonly used OpenCL devices. This pre-release of CLBlast is only tuned for a limited number of devices, in particular those with the following `CL_DEVICE_NAME` values:
|
||||||
|
|
||||||
* NVIDIA GPUs:
|
* NVIDIA GPUs:
|
||||||
- GeForce GTX480
|
- GeForce GTX 480
|
||||||
|
- GeForce GTX 680
|
||||||
|
- GeForce GTX 750 Ti
|
||||||
|
- GeForce GTX 980
|
||||||
|
- GeForce GTX Titan
|
||||||
|
- GeForce GTX Titan X
|
||||||
- Tesla K20m
|
- Tesla K20m
|
||||||
- Tesla K40m
|
- Tesla K40m
|
||||||
* AMD GPUs:
|
* AMD GPUs:
|
||||||
- Tahiti
|
- Tahiti
|
||||||
|
- R9 M370X
|
||||||
* Intel GPUs:
|
* Intel GPUs:
|
||||||
- Iris
|
- Iris
|
||||||
|
- Iris Pro
|
||||||
|
* Intel CPUs:
|
||||||
|
- Core i5-6200U
|
||||||
|
- Core i7-3770K
|
||||||
|
- Core i7-5930K
|
||||||
|
* Other devices:
|
||||||
|
- ARM Mali-T628 GPU
|
||||||
|
- Intel MIC
|
||||||
|
|
||||||
If your device is not (yet) among this list or if you want to tune CLBlast for specific parameters (e.g. rectangular matrix sizes), you should compile the library with the optional tuners:
|
If your device is not (yet) among this list or if you want to tune CLBlast for specific parameters (e.g. rectangular matrix sizes), you should compile the library with the optional tuners:
|
||||||
|
|
||||||
|
@ -93,9 +113,19 @@ If your device is not (yet) among this list or if you want to tune CLBlast for s
|
||||||
|
|
||||||
Note that CLBlast's tuners are based on the CLTune auto-tuning library, which has to be installed separately (version 1.7.0 or higher). CLTune is available from GitHub.
|
Note that CLBlast's tuners are based on the CLTune auto-tuning library, which has to be installed separately (version 1.7.0 or higher). CLTune is available from GitHub.
|
||||||
|
|
||||||
Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels corresponds to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance.
|
Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels corresponds to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance. Running `make alltuners` runs all tuners for all precisions in one go. You can set the default device and platform for `alltuners` by setting the `DEFAULT_DEVICE` and `DEFAULT_PLATFORM` environmental variables before running CMake.
|
||||||
|
|
||||||
The tuner will output a C++ database compatible line with the results, which can be added to `include/internal/database/xxxxx.h` in the appropriate section. Or, if tuning parameters already exist for your device but you believe they can be improved, this is also the place where they can be modified. If you want the found parameters to be included in future releases of CLBlast, please post the JSON output in the corresponding issue on GitHub or [email the main author](http://www.cedricnugteren.nl).
|
The tuners output a JSON-file with the results. The best results need to be added to `include/internal/database/xxxxx.h` in the appropriate section. However, this can be done automatically based on the JSON-data using a Python script in `scripts/database/database.py`. If you want the found parameters to be included in future releases of CLBlast, please attach the JSON files to the corresponding issue on GitHub or [email the main author](http://www.cedricnugteren.nl).
|
||||||
|
|
||||||
|
In summary, tuning the entire library for your device can be done as follows (starting from the root of the CLBlast folder):
|
||||||
|
|
||||||
|
mkdir build
|
||||||
|
cd build
|
||||||
|
cmake -DTUNERS=ON ..
|
||||||
|
make
|
||||||
|
make alltuners
|
||||||
|
python ../scripts/database/database.py . ..
|
||||||
|
make
|
||||||
|
|
||||||
|
|
||||||
Compiling the tests (optional)
|
Compiling the tests (optional)
|
||||||
|
@ -127,10 +157,11 @@ These graphs can be generated automatically on your own device. First, compile C
|
||||||
|
|
||||||
Rscript path/to/test/performance/graphs/xgemm.r 0 1
|
Rscript path/to/test/performance/graphs/xgemm.r 0 1
|
||||||
|
|
||||||
|
|
||||||
Supported routines
|
Supported routines
|
||||||
-------------
|
-------------
|
||||||
|
|
||||||
CLBlast is in active development but already supports the majority of BLAS routines. The currently supported routines are marked with '✔' in the following tables:
|
CLBlast is in active development but already supports almost all the BLAS routines. The currently supported routines are marked with '✔' in the following tables. Empty boxes represent routines that still need to be implemented in a future release, whereas routines marked with '-' are not part of BLAS at all.
|
||||||
|
|
||||||
| Level-1 | S | D | C | Z | Notes |
|
| Level-1 | S | D | C | Z | Notes |
|
||||||
| ---------|---|---|---|---|---------|
|
| ---------|---|---|---|---|---------|
|
||||||
|
@ -149,7 +180,6 @@ CLBlast is in active development but already supports the majority of BLAS routi
|
||||||
| xASUM | | | - | - | +SC +DZ |
|
| xASUM | | | - | - | +SC +DZ |
|
||||||
| IxAMAX | | | | | |
|
| IxAMAX | | | | | |
|
||||||
|
|
||||||
|
|
||||||
| Level-2 | S | D | C | Z | Notes |
|
| Level-2 | S | D | C | Z | Notes |
|
||||||
| ---------|---|---|---|---|---------|
|
| ---------|---|---|---|---|---------|
|
||||||
| xGEMV | ✔ | ✔ | ✔ | ✔ | |
|
| xGEMV | ✔ | ✔ | ✔ | ✔ | |
|
||||||
|
@ -166,17 +196,17 @@ CLBlast is in active development but already supports the majority of BLAS routi
|
||||||
| xTRSV | | | | | |
|
| xTRSV | | | | | |
|
||||||
| xTBSV | | | | | |
|
| xTBSV | | | | | |
|
||||||
| xTPSV | | | | | |
|
| xTPSV | | | | | |
|
||||||
| xGER | | | - | - | |
|
| xGER | ✔ | ✔ | - | - | |
|
||||||
| xGERU | - | - | | | |
|
| xGERU | - | - | ✔ | ✔ | |
|
||||||
| xGERC | - | - | | | |
|
| xGERC | - | - | ✔ | ✔ | |
|
||||||
| xHER | - | - | | | |
|
| xHER | - | - | ✔ | ✔ | |
|
||||||
| xHPR | - | - | | | |
|
| xHPR | - | - | ✔ | ✔ | |
|
||||||
| xHER2 | - | - | | | |
|
| xHER2 | - | - | ✔ | ✔ | |
|
||||||
| xHPR2 | - | - | | | |
|
| xHPR2 | - | - | ✔ | ✔ | |
|
||||||
| xSYR | | | - | - | |
|
| xSYR | ✔ | ✔ | - | - | |
|
||||||
| xSPR | | | - | - | |
|
| xSPR | ✔ | ✔ | - | - | |
|
||||||
| xSYR2 | | | - | - | |
|
| xSYR2 | ✔ | ✔ | - | - | |
|
||||||
| xSPR2 | | | - | - | |
|
| xSPR2 | ✔ | ✔ | - | - | |
|
||||||
|
|
||||||
| Level-3 | S | D | C | Z | Notes |
|
| Level-3 | S | D | C | Z | Notes |
|
||||||
| ---------|---|---|---|---|---------|
|
| ---------|---|---|---|---|---------|
|
||||||
|
@ -200,6 +230,12 @@ The contributing authors so far are:
|
||||||
|
|
||||||
* [Cedric Nugteren](http://www.cedricnugteren.nl)
|
* [Cedric Nugteren](http://www.cedricnugteren.nl)
|
||||||
|
|
||||||
|
Tuning and testing on a variety of OpenCL devices was made possible by:
|
||||||
|
|
||||||
|
* [TU/e ES research group](http://www.es.ele.tue.nl/)
|
||||||
|
* [ASCI DAS4 and DAS5](http://www.cs.vu.nl/das4/)
|
||||||
|
* [Dividiti](http://www.dividiti.com)
|
||||||
|
* [SURFsara HPC center](http://www.surfsara.com)
|
||||||
|
|
||||||
Support us
|
Support us
|
||||||
-------------
|
-------------
|
||||||
|
@ -210,20 +246,8 @@ This project started in March 2015 as an evenings and weekends free-time project
|
||||||
To-do list before release of version 1.0
|
To-do list before release of version 1.0
|
||||||
-------------
|
-------------
|
||||||
|
|
||||||
- Increase the functionality:
|
- Support all routines supported by clBLAS
|
||||||
* Support all routines supported by clBLAS
|
- Allow the user control over events and synchronization
|
||||||
* Allow the user control over events and synchronization
|
- Add half-precision routines (e.g. HGEMM)
|
||||||
* Add half-precision routines (e.g. HGEMM)
|
- Enable correctness and performance testing against a CPU-based BLAS library
|
||||||
- Improve host performance:
|
- Test in multi-threaded environments
|
||||||
* Allow initialization to pre-compile kernels and store to disk
|
|
||||||
- Improve device performance:
|
|
||||||
* Tune for a wider range of devices
|
|
||||||
* Allow users to define custom tuned parameters
|
|
||||||
- Improve the tuning
|
|
||||||
* Make the tuners upload their data to a central server
|
|
||||||
- Improve the performance comparisons:
|
|
||||||
* Enable comparison against optionally: ViennaCL, cuBLAS, MAGMA OpenCL
|
|
||||||
- Further reduce the likelihood of crashes:
|
|
||||||
* Add checks for proper command-line arguments in the tuner, tester and client
|
|
||||||
* Add checks for valid database parameters
|
|
||||||
* Test in multi-threaded environments
|
|
||||||
|
|
|
@ -34,6 +34,7 @@ set(OPENCL_HINTS
|
||||||
set(OPENCL_PATHS
|
set(OPENCL_PATHS
|
||||||
/usr/local/cuda
|
/usr/local/cuda
|
||||||
/opt/cuda
|
/opt/cuda
|
||||||
|
/opt/intel/opencl
|
||||||
/usr
|
/usr
|
||||||
/usr/local
|
/usr/local
|
||||||
)
|
)
|
||||||
|
@ -52,7 +53,7 @@ mark_as_advanced(OPENCL_INCLUDE_DIRS)
|
||||||
find_library(OPENCL_LIBRARIES
|
find_library(OPENCL_LIBRARIES
|
||||||
NAMES OpenCL
|
NAMES OpenCL
|
||||||
HINTS ${OPENCL_HINTS}
|
HINTS ${OPENCL_HINTS}
|
||||||
PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 OpenCL/common/lib/x64
|
PATH_SUFFIXES lib lib64 lib/x86_64 lib/x86_64/sdk lib/x64 lib/x86 lib/Win32 OpenCL/common/lib/x64
|
||||||
PATHS ${OPENCL_PATHS}
|
PATHS ${OPENCL_PATHS}
|
||||||
DOC "OpenCL library"
|
DOC "OpenCL library"
|
||||||
)
|
)
|
||||||
|
|
|
@ -45,7 +45,7 @@ mark_as_advanced(CLBLAS_INCLUDE_DIRS)
|
||||||
find_library(CLBLAS_LIBRARIES
|
find_library(CLBLAS_LIBRARIES
|
||||||
NAMES clBLAS
|
NAMES clBLAS
|
||||||
HINTS ${CLBLAS_HINTS}
|
HINTS ${CLBLAS_HINTS}
|
||||||
PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32
|
PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 lib/import lib64/import
|
||||||
PATHS ${CLBLAS_PATHS}
|
PATHS ${CLBLAS_PATHS}
|
||||||
DOC "clBLAS library"
|
DOC "clBLAS library"
|
||||||
)
|
)
|
||||||
|
|
BIN
doc/performance/Intel_IrisPro/SAXPY.pdf
Normal file
BIN
doc/performance/Intel_IrisPro/SAXPY.pdf
Normal file
Binary file not shown.
BIN
doc/performance/Intel_IrisPro/SGEMM.pdf
Normal file
BIN
doc/performance/Intel_IrisPro/SGEMM.pdf
Normal file
Binary file not shown.
BIN
doc/performance/Intel_IrisPro/SGEMV.pdf
Normal file
BIN
doc/performance/Intel_IrisPro/SGEMV.pdf
Normal file
Binary file not shown.
BIN
doc/performance/Intel_IrisPro/SSYMM.pdf
Normal file
BIN
doc/performance/Intel_IrisPro/SSYMM.pdf
Normal file
Binary file not shown.
BIN
doc/performance/Intel_IrisPro/SSYRK.pdf
Normal file
BIN
doc/performance/Intel_IrisPro/SSYRK.pdf
Normal file
Binary file not shown.
BIN
doc/performance/Radeon_M370X/SAXPY.pdf
Normal file
BIN
doc/performance/Radeon_M370X/SAXPY.pdf
Normal file
Binary file not shown.
BIN
doc/performance/Radeon_M370X/SGEMM.pdf
Normal file
BIN
doc/performance/Radeon_M370X/SGEMM.pdf
Normal file
Binary file not shown.
BIN
doc/performance/Radeon_M370X/SGEMV.pdf
Normal file
BIN
doc/performance/Radeon_M370X/SGEMV.pdf
Normal file
Binary file not shown.
BIN
doc/performance/Radeon_M370X/SSYMM.pdf
Normal file
BIN
doc/performance/Radeon_M370X/SSYMM.pdf
Normal file
Binary file not shown.
|
@ -76,7 +76,7 @@ class Event {
|
||||||
explicit Event(const cl_event event): event_(event) { }
|
explicit Event(const cl_event event): event_(event) { }
|
||||||
|
|
||||||
// Regular constructor
|
// Regular constructor
|
||||||
explicit Event() { }
|
explicit Event(): event_(nullptr) { }
|
||||||
|
|
||||||
// Retrieves the elapsed time of the last recorded event. Note that no error checking is done on
|
// Retrieves the elapsed time of the last recorded event. Note that no error checking is done on
|
||||||
// the 'clGetEventProfilingInfo' function, since there is a bug in Apple's OpenCL implementation:
|
// the 'clGetEventProfilingInfo' function, since there is a bug in Apple's OpenCL implementation:
|
||||||
|
@ -119,6 +119,13 @@ class Platform {
|
||||||
platform_ = platforms[platform_id];
|
platform_ = platforms[platform_id];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns the number of devices on this platform
|
||||||
|
size_t NumDevices() const {
|
||||||
|
auto result = cl_uint{0};
|
||||||
|
CheckError(clGetDeviceIDs(platform_, CL_DEVICE_TYPE_ALL, 0, nullptr, &result));
|
||||||
|
return static_cast<size_t>(result);
|
||||||
|
}
|
||||||
|
|
||||||
// Accessor to the private data-member
|
// Accessor to the private data-member
|
||||||
const cl_platform_id& operator()() const { return platform_; }
|
const cl_platform_id& operator()() const { return platform_; }
|
||||||
private:
|
private:
|
||||||
|
@ -136,11 +143,11 @@ class Device {
|
||||||
|
|
||||||
// Initialize the device. Note that this constructor can throw exceptions!
|
// Initialize the device. Note that this constructor can throw exceptions!
|
||||||
explicit Device(const Platform &platform, const size_t device_id) {
|
explicit Device(const Platform &platform, const size_t device_id) {
|
||||||
auto num_devices = cl_uint{0};
|
auto num_devices = platform.NumDevices();
|
||||||
CheckError(clGetDeviceIDs(platform(), CL_DEVICE_TYPE_ALL, 0, nullptr, &num_devices));
|
|
||||||
if (num_devices == 0) { Error("no devices found"); }
|
if (num_devices == 0) { Error("no devices found"); }
|
||||||
auto devices = std::vector<cl_device_id>(num_devices);
|
auto devices = std::vector<cl_device_id>(num_devices);
|
||||||
CheckError(clGetDeviceIDs(platform(), CL_DEVICE_TYPE_ALL, num_devices, devices.data(), nullptr));
|
CheckError(clGetDeviceIDs(platform(), CL_DEVICE_TYPE_ALL, static_cast<cl_uint>(num_devices),
|
||||||
|
devices.data(), nullptr));
|
||||||
if (device_id >= num_devices) { Error("invalid device ID "+std::to_string(device_id)); }
|
if (device_id >= num_devices) { Error("invalid device ID "+std::to_string(device_id)); }
|
||||||
device_ = devices[device_id];
|
device_ = devices[device_id];
|
||||||
}
|
}
|
||||||
|
@ -172,6 +179,7 @@ class Device {
|
||||||
size_t CoreClock() const { return GetInfo(CL_DEVICE_MAX_CLOCK_FREQUENCY); }
|
size_t CoreClock() const { return GetInfo(CL_DEVICE_MAX_CLOCK_FREQUENCY); }
|
||||||
size_t ComputeUnits() const { return GetInfo(CL_DEVICE_MAX_COMPUTE_UNITS); }
|
size_t ComputeUnits() const { return GetInfo(CL_DEVICE_MAX_COMPUTE_UNITS); }
|
||||||
size_t MemorySize() const { return GetInfo(CL_DEVICE_GLOBAL_MEM_SIZE); }
|
size_t MemorySize() const { return GetInfo(CL_DEVICE_GLOBAL_MEM_SIZE); }
|
||||||
|
size_t MaxAllocSize() const { return GetInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE); }
|
||||||
size_t MemoryClock() const { return 0; } // Not exposed in OpenCL
|
size_t MemoryClock() const { return 0; } // Not exposed in OpenCL
|
||||||
size_t MemoryBusWidth() const { return 0; } // Not exposed in OpenCL
|
size_t MemoryBusWidth() const { return 0; } // Not exposed in OpenCL
|
||||||
|
|
||||||
|
@ -225,7 +233,7 @@ class Device {
|
||||||
auto result = std::string{};
|
auto result = std::string{};
|
||||||
result.resize(bytes);
|
result.resize(bytes);
|
||||||
CheckError(clGetDeviceInfo(device_, info, bytes, &result[0], nullptr));
|
CheckError(clGetDeviceInfo(device_, info, bytes, &result[0], nullptr));
|
||||||
return std::string{result.c_str()};
|
return std::string{result.c_str()}; // Removes any trailing '\0'-characters
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -342,7 +350,12 @@ class Queue {
|
||||||
queue_(new cl_command_queue, [](cl_command_queue* s) { CheckError(clReleaseCommandQueue(*s));
|
queue_(new cl_command_queue, [](cl_command_queue* s) { CheckError(clReleaseCommandQueue(*s));
|
||||||
delete s; }) {
|
delete s; }) {
|
||||||
auto status = CL_SUCCESS;
|
auto status = CL_SUCCESS;
|
||||||
*queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
|
#ifdef CL_VERSION_2_0
|
||||||
|
cl_queue_properties properties[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
|
||||||
|
*queue_ = clCreateCommandQueueWithProperties(context(), device(), properties, &status);
|
||||||
|
#else
|
||||||
|
*queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
|
||||||
|
#endif
|
||||||
CheckError(status);
|
CheckError(status);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -408,7 +421,7 @@ class BufferHost {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
// Enumeration of buffer access types
|
// Enumeration of buffer access types
|
||||||
enum class BufferAccess { kReadOnly, kWriteOnly, kReadWrite };
|
enum class BufferAccess { kReadOnly, kWriteOnly, kReadWrite, kNotOwned };
|
||||||
|
|
||||||
// C++11 version of 'cl_mem'
|
// C++11 version of 'cl_mem'
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -418,13 +431,17 @@ class Buffer {
|
||||||
// Constructor based on the regular OpenCL data-type: memory management is handled elsewhere
|
// Constructor based on the regular OpenCL data-type: memory management is handled elsewhere
|
||||||
explicit Buffer(const cl_mem buffer):
|
explicit Buffer(const cl_mem buffer):
|
||||||
buffer_(new cl_mem),
|
buffer_(new cl_mem),
|
||||||
access_(BufferAccess::kReadWrite) {
|
access_(BufferAccess::kNotOwned) {
|
||||||
*buffer_ = buffer;
|
*buffer_ = buffer;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Regular constructor with memory management
|
// Regular constructor with memory management. If this class does not own the buffer object, then
|
||||||
|
// the memory will not be freed automatically afterwards.
|
||||||
explicit Buffer(const Context &context, const BufferAccess access, const size_t size):
|
explicit Buffer(const Context &context, const BufferAccess access, const size_t size):
|
||||||
buffer_(new cl_mem, [](cl_mem* m) { CheckError(clReleaseMemObject(*m)); delete m; }),
|
buffer_(new cl_mem, [access](cl_mem* m) {
|
||||||
|
if (access != BufferAccess::kNotOwned) { CheckError(clReleaseMemObject(*m)); }
|
||||||
|
delete m;
|
||||||
|
}),
|
||||||
access_(access) {
|
access_(access) {
|
||||||
auto flags = cl_mem_flags{CL_MEM_READ_WRITE};
|
auto flags = cl_mem_flags{CL_MEM_READ_WRITE};
|
||||||
if (access_ == BufferAccess::kReadOnly) { flags = CL_MEM_READ_ONLY; }
|
if (access_ == BufferAccess::kReadOnly) { flags = CL_MEM_READ_ONLY; }
|
||||||
|
@ -439,57 +456,74 @@ class Buffer {
|
||||||
Buffer<T>(context, BufferAccess::kReadWrite, size) {
|
Buffer<T>(context, BufferAccess::kReadWrite, size) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Constructs a new buffer based on an existing host-container
|
||||||
|
template <typename Iterator>
|
||||||
|
explicit Buffer(const Context &context, const Queue &queue, Iterator start, Iterator end):
|
||||||
|
Buffer(context, BufferAccess::kReadWrite, static_cast<size_t>(end - start)) {
|
||||||
|
auto size = static_cast<size_t>(end - start);
|
||||||
|
auto pointer = &*start;
|
||||||
|
CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, 0, size*sizeof(T), pointer, 0,
|
||||||
|
nullptr, nullptr));
|
||||||
|
queue.Finish();
|
||||||
|
}
|
||||||
|
|
||||||
// Copies from device to host: reading the device buffer a-synchronously
|
// Copies from device to host: reading the device buffer a-synchronously
|
||||||
void ReadAsync(const Queue &queue, const size_t size, T* host) {
|
void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) {
|
||||||
if (access_ == BufferAccess::kWriteOnly) { Error("reading from a write-only buffer"); }
|
if (access_ == BufferAccess::kWriteOnly) { Error("reading from a write-only buffer"); }
|
||||||
CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, 0, size*sizeof(T), host, 0,
|
CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
|
||||||
nullptr, nullptr));
|
host, 0, nullptr, nullptr));
|
||||||
}
|
}
|
||||||
void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host) {
|
void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host,
|
||||||
|
const size_t offset = 0) {
|
||||||
if (host.size() < size) { Error("target host buffer is too small"); }
|
if (host.size() < size) { Error("target host buffer is too small"); }
|
||||||
ReadAsync(queue, size, host.data());
|
ReadAsync(queue, size, host.data(), offset);
|
||||||
}
|
}
|
||||||
void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host) {
|
void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host,
|
||||||
|
const size_t offset = 0) {
|
||||||
if (host.size() < size) { Error("target host buffer is too small"); }
|
if (host.size() < size) { Error("target host buffer is too small"); }
|
||||||
ReadAsync(queue, size, host.data());
|
ReadAsync(queue, size, host.data(), offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Copies from device to host: reading the device buffer
|
// Copies from device to host: reading the device buffer
|
||||||
void Read(const Queue &queue, const size_t size, T* host) {
|
void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) {
|
||||||
ReadAsync(queue, size, host);
|
ReadAsync(queue, size, host, offset);
|
||||||
queue.Finish();
|
queue.Finish();
|
||||||
}
|
}
|
||||||
void Read(const Queue &queue, const size_t size, std::vector<T> &host) {
|
void Read(const Queue &queue, const size_t size, std::vector<T> &host, const size_t offset = 0) {
|
||||||
Read(queue, size, host.data());
|
Read(queue, size, host.data(), offset);
|
||||||
}
|
}
|
||||||
void Read(const Queue &queue, const size_t size, BufferHost<T> &host) {
|
void Read(const Queue &queue, const size_t size, BufferHost<T> &host, const size_t offset = 0) {
|
||||||
Read(queue, size, host.data());
|
Read(queue, size, host.data(), offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Copies from host to device: writing the device buffer a-synchronously
|
// Copies from host to device: writing the device buffer a-synchronously
|
||||||
void WriteAsync(const Queue &queue, const size_t size, const T* host) {
|
void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
|
||||||
if (access_ == BufferAccess::kReadOnly) { Error("writing to a read-only buffer"); }
|
if (access_ == BufferAccess::kReadOnly) { Error("writing to a read-only buffer"); }
|
||||||
if (GetSize() < size*sizeof(T)) { Error("target device buffer is too small"); }
|
if (GetSize() < (offset+size)*sizeof(T)) { Error("target device buffer is too small"); }
|
||||||
CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, 0, size*sizeof(T), host, 0,
|
CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
|
||||||
nullptr, nullptr));
|
host, 0, nullptr, nullptr));
|
||||||
}
|
}
|
||||||
void WriteAsync(const Queue &queue, const size_t size, const std::vector<T> &host) {
|
void WriteAsync(const Queue &queue, const size_t size, const std::vector<T> &host,
|
||||||
WriteAsync(queue, size, host.data());
|
const size_t offset = 0) {
|
||||||
|
WriteAsync(queue, size, host.data(), offset);
|
||||||
}
|
}
|
||||||
void WriteAsync(const Queue &queue, const size_t size, const BufferHost<T> &host) {
|
void WriteAsync(const Queue &queue, const size_t size, const BufferHost<T> &host,
|
||||||
WriteAsync(queue, size, host.data());
|
const size_t offset = 0) {
|
||||||
|
WriteAsync(queue, size, host.data(), offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Copies from host to device: writing the device buffer
|
// Copies from host to device: writing the device buffer
|
||||||
void Write(const Queue &queue, const size_t size, const T* host) {
|
void Write(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
|
||||||
WriteAsync(queue, size, host);
|
WriteAsync(queue, size, host, offset);
|
||||||
queue.Finish();
|
queue.Finish();
|
||||||
}
|
}
|
||||||
void Write(const Queue &queue, const size_t size, const std::vector<T> &host) {
|
void Write(const Queue &queue, const size_t size, const std::vector<T> &host,
|
||||||
Write(queue, size, host.data());
|
const size_t offset = 0) {
|
||||||
|
Write(queue, size, host.data(), offset);
|
||||||
}
|
}
|
||||||
void Write(const Queue &queue, const size_t size, const BufferHost<T> &host) {
|
void Write(const Queue &queue, const size_t size, const BufferHost<T> &host,
|
||||||
Write(queue, size, host.data());
|
const size_t offset = 0) {
|
||||||
|
Write(queue, size, host.data(), offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Copies the contents of this buffer into another device buffer
|
// Copies the contents of this buffer into another device buffer
|
||||||
|
@ -573,6 +607,13 @@ class Kernel {
|
||||||
0, nullptr, &(event())));
|
0, nullptr, &(event())));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// As above, but with the default local workgroup size
|
||||||
|
void Launch(const Queue &queue, const std::vector<size_t> &global, Event &event) {
|
||||||
|
CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
|
||||||
|
nullptr, global.data(), nullptr,
|
||||||
|
0, nullptr, &(event())));
|
||||||
|
}
|
||||||
|
|
||||||
// Accessor to the private data-member
|
// Accessor to the private data-member
|
||||||
const cl_kernel& operator()() const { return *kernel_; }
|
const cl_kernel& operator()() const { return *kernel_; }
|
||||||
private:
|
private:
|
||||||
|
|
|
@ -56,24 +56,26 @@ class Database {
|
||||||
static constexpr auto kDeviceTypeAll = "default";
|
static constexpr auto kDeviceTypeAll = "default";
|
||||||
|
|
||||||
// The OpenCL device vendors
|
// The OpenCL device vendors
|
||||||
static constexpr auto kDeviceVendorNVIDIA = "NVIDIA Corporation";
|
|
||||||
static constexpr auto kDeviceVendorAMD = "Advanced Micro Devices, Inc.";
|
|
||||||
static constexpr auto kDeviceVendorIntel = "Intel";
|
|
||||||
static constexpr auto kDeviceVendorAll = "default";
|
static constexpr auto kDeviceVendorAll = "default";
|
||||||
|
|
||||||
// The OpenCL device names
|
// Alternative names for some OpenCL vendors
|
||||||
static constexpr auto kDefaultDevice = "default";
|
const std::unordered_map<std::string,std::string> kVendorNames {
|
||||||
|
{"Intel(R) Corporation", "Intel"},
|
||||||
|
{"GenuineIntel", "Intel"},
|
||||||
|
{"Advanced Micro Devices, Inc.", "AMD"},
|
||||||
|
{"NVIDIA Corporation", "NVIDIA"},
|
||||||
|
};
|
||||||
|
|
||||||
// The database consists of separate database entries, stored together in a vector
|
// The database consists of separate database entries, stored together in a vector
|
||||||
static const DatabaseEntry XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble;
|
static const DatabaseEntry XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble;
|
||||||
static const DatabaseEntry XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble;
|
static const DatabaseEntry XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble;
|
||||||
static const DatabaseEntry XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble;
|
static const DatabaseEntry XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble;
|
||||||
|
static const DatabaseEntry XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble;
|
||||||
static const DatabaseEntry XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble;
|
static const DatabaseEntry XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble;
|
||||||
static const DatabaseEntry CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble;
|
static const DatabaseEntry CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble;
|
||||||
static const DatabaseEntry PadSingle, PadDouble, PadComplexSingle, PadComplexDouble;
|
static const DatabaseEntry PadSingle, PadDouble, PadComplexSingle, PadComplexDouble;
|
||||||
static const DatabaseEntry TraSingle, TraDouble, TraComplexSingle, TraComplexDouble;
|
static const DatabaseEntry TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble;
|
||||||
static const DatabaseEntry PadTraSingle, PadTraDouble, PadTraComplexSingle, PadTraComplexDouble;
|
static const DatabaseEntry PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble;
|
||||||
static const std::vector<DatabaseEntry> database;
|
static const std::vector<DatabaseEntry> database;
|
||||||
|
|
||||||
// The constructor
|
// The constructor
|
||||||
|
|
|
@ -5,9 +5,9 @@
|
||||||
// width of 100 characters per line.
|
// width of 100 characters per line.
|
||||||
//
|
//
|
||||||
// Author(s):
|
// Author(s):
|
||||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
// Database generator <database.py>
|
||||||
//
|
//
|
||||||
// This file populates the database with best-found tuning parameters for the Copy kernels.
|
// This file populates the database with best-found tuning parameters for the 'Copy' kernels.
|
||||||
//
|
//
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
|
@ -16,54 +16,56 @@ namespace clblast {
|
||||||
|
|
||||||
const Database::DatabaseEntry Database::CopySingle = {
|
const Database::DatabaseEntry Database::CopySingle = {
|
||||||
"Copy", Precision::kSingle, {
|
"Copy", Precision::kSingle, {
|
||||||
{ // NVIDIA GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "GeForce GTX 480", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",2} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
|
||||||
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_WPT",2}, {"COPY_VW",4} } },
|
{ "Tahiti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
|
||||||
{ "Tesla K40m", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_WPT",4}, {"COPY_VW",4} } },
|
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // ARM GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorAMD, {
|
kDeviceTypeGPU, "ARM", {
|
||||||
{ "Tahiti", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",4}, {"COPY_VW",2} } },
|
{ "Mali-T628", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } },
|
||||||
|
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel CPUs
|
||||||
|
kDeviceTypeCPU, "Intel", {
|
||||||
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",2} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
|
||||||
|
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Intel GPUs
|
{ // Intel GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorIntel, {
|
kDeviceTypeGPU, "Intel", {
|
||||||
{ "Iris", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",4} } },
|
{ "Iris", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
|
||||||
|
{ "Iris Pro", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
|
||||||
|
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Intel accelerators
|
||||||
kDeviceTypeAll, kDeviceVendorAll, {
|
kDeviceTypeAccelerator, "Intel", {
|
||||||
{ kDefaultDevice, { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
|
{ "Intel(R) Many Integrated Core Acceleration Card", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
|
||||||
|
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
const Database::DatabaseEntry Database::CopyDouble = {
|
|
||||||
"Copy", Precision::kDouble, {
|
|
||||||
{ // NVIDIA GPUs
|
{ // NVIDIA GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
{ "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
|
{ "GeForce GTX 480", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
|
||||||
{ "Tesla K20m", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",2} } },
|
{ "GeForce GTX 680", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
|
||||||
{ "Tesla K40m", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",2} } },
|
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
|
||||||
}
|
{ "GeForce GTX 980", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
||||||
},
|
{ "GeForce GTX TITAN", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } },
|
||||||
{ // AMD GPUs
|
{ "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
|
||||||
kDeviceTypeGPU, kDeviceVendorAMD, {
|
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
|
||||||
{ "Tahiti", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",2}, {"COPY_VW",4} } },
|
{ "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } },
|
||||||
}
|
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
||||||
},
|
|
||||||
{ // Intel GPUs
|
|
||||||
kDeviceTypeGPU, kDeviceVendorIntel, {
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
kDeviceTypeAll, kDeviceVendorAll, {
|
kDeviceTypeAll, "default", {
|
||||||
{ kDefaultDevice, { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
|
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
@ -73,26 +75,100 @@ const Database::DatabaseEntry Database::CopyDouble = {
|
||||||
|
|
||||||
const Database::DatabaseEntry Database::CopyComplexSingle = {
|
const Database::DatabaseEntry Database::CopyComplexSingle = {
|
||||||
"Copy", Precision::kComplexSingle, {
|
"Copy", Precision::kComplexSingle, {
|
||||||
{ // NVIDIA GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_WPT",1}, {"COPY_VW",1} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
||||||
{ "Tesla K20m", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",2}, {"COPY_VW",1} } },
|
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
|
||||||
{ "Tesla K40m", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
|
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // Intel CPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorAMD, {
|
kDeviceTypeCPU, "Intel", {
|
||||||
{ "Tahiti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
|
||||||
|
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Intel GPUs
|
{ // Intel GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorIntel, {
|
kDeviceTypeGPU, "Intel", {
|
||||||
{ "Iris", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
|
{ "Iris", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
|
||||||
|
{ "Iris Pro", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",4} } },
|
||||||
|
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel accelerators
|
||||||
|
kDeviceTypeAccelerator, "Intel", {
|
||||||
|
{ "Intel(R) Many Integrated Core Acceleration Card", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
|
||||||
|
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // NVIDIA GPUs
|
||||||
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
|
{ "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
||||||
|
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
||||||
|
{ "GeForce GTX 980", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
||||||
|
{ "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
||||||
|
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",4} } },
|
||||||
|
{ "Tesla K40m", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
||||||
|
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
kDeviceTypeAll, kDeviceVendorAll, {
|
kDeviceTypeAll, "default", {
|
||||||
{ kDefaultDevice, { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
|
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
const Database::DatabaseEntry Database::CopyDouble = {
|
||||||
|
"Copy", Precision::kDouble, {
|
||||||
|
{ // AMD GPUs
|
||||||
|
kDeviceTypeGPU, "AMD", {
|
||||||
|
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
||||||
|
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
|
||||||
|
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // ARM GPUs
|
||||||
|
kDeviceTypeGPU, "ARM", {
|
||||||
|
{ "Mali-T628", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",2} } },
|
||||||
|
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",2} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel CPUs
|
||||||
|
kDeviceTypeCPU, "Intel", {
|
||||||
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } },
|
||||||
|
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel accelerators
|
||||||
|
kDeviceTypeAccelerator, "Intel", {
|
||||||
|
{ "Intel(R) Many Integrated Core Acceleration Card", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
|
||||||
|
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // NVIDIA GPUs
|
||||||
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
|
{ "GeForce GTX 480", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
|
||||||
|
{ "GeForce GTX 680", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
|
||||||
|
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
|
||||||
|
{ "GeForce GTX 980", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
|
||||||
|
{ "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",2} } },
|
||||||
|
{ "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
||||||
|
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
|
||||||
|
{ "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
|
||||||
|
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Default
|
||||||
|
kDeviceTypeAll, "default", {
|
||||||
|
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
@ -102,25 +178,49 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
|
||||||
|
|
||||||
const Database::DatabaseEntry Database::CopyComplexDouble = {
|
const Database::DatabaseEntry Database::CopyComplexDouble = {
|
||||||
"Copy", Precision::kComplexDouble, {
|
"Copy", Precision::kComplexDouble, {
|
||||||
{ // NVIDIA GPUs
|
|
||||||
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
|
|
||||||
{ "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
|
|
||||||
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_WPT",1}, {"COPY_VW",1} } },
|
|
||||||
{ "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorAMD, {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_WPT",4}, {"COPY_VW",2} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
||||||
|
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
||||||
|
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Intel GPUs
|
{ // ARM GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorIntel, {
|
kDeviceTypeGPU, "ARM", {
|
||||||
|
{ "Mali-T628", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
|
||||||
|
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel CPUs
|
||||||
|
kDeviceTypeCPU, "Intel", {
|
||||||
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",8}, {"COPY_WPT",1} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
|
||||||
|
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel accelerators
|
||||||
|
kDeviceTypeAccelerator, "Intel", {
|
||||||
|
{ "Intel(R) Many Integrated Core Acceleration Card", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
|
||||||
|
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // NVIDIA GPUs
|
||||||
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
|
{ "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
||||||
|
{ "GeForce GTX 680", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
||||||
|
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
||||||
|
{ "GeForce GTX 980", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
||||||
|
{ "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
||||||
|
{ "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
||||||
|
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
|
||||||
|
{ "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
||||||
|
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
kDeviceTypeAll, kDeviceVendorAll, {
|
kDeviceTypeAll, "default", {
|
||||||
{ kDefaultDevice, { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
|
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,9 +5,9 @@
|
||||||
// width of 100 characters per line.
|
// width of 100 characters per line.
|
||||||
//
|
//
|
||||||
// Author(s):
|
// Author(s):
|
||||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
// Database generator <database.py>
|
||||||
//
|
//
|
||||||
// This file populates the database with best-found tuning parameters for the Pad kernels.
|
// This file populates the database with best-found tuning parameters for the 'Pad' kernels.
|
||||||
//
|
//
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
|
@ -16,54 +16,56 @@ namespace clblast {
|
||||||
|
|
||||||
const Database::DatabaseEntry Database::PadSingle = {
|
const Database::DatabaseEntry Database::PadSingle = {
|
||||||
"Pad", Precision::kSingle, {
|
"Pad", Precision::kSingle, {
|
||||||
{ // NVIDIA GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "GeForce GTX 480", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
{ "Tesla K20m", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
|
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
|
||||||
{ "Tesla K40m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
|
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // ARM GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorAMD, {
|
kDeviceTypeGPU, "ARM", {
|
||||||
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
|
{ "Mali-T628", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
|
||||||
|
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel CPUs
|
||||||
|
kDeviceTypeCPU, "Intel", {
|
||||||
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",4} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
|
||||||
|
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Intel GPUs
|
{ // Intel GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorIntel, {
|
kDeviceTypeGPU, "Intel", {
|
||||||
{ "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
|
{ "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
|
||||||
|
{ "Iris Pro", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
|
||||||
|
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Intel accelerators
|
||||||
kDeviceTypeAll, kDeviceVendorAll, {
|
kDeviceTypeAccelerator, "Intel", {
|
||||||
{ kDefaultDevice, { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
{ "Intel(R) Many Integrated Core Acceleration Card", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
|
||||||
|
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
const Database::DatabaseEntry Database::PadDouble = {
|
|
||||||
"Pad", Precision::kDouble, {
|
|
||||||
{ // NVIDIA GPUs
|
{ // NVIDIA GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
{ "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
{ "GeForce GTX 480", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
|
||||||
{ "Tesla K20m", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
{ "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
|
||||||
{ "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
{ "GeForce GTX 750 Ti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
|
||||||
}
|
{ "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
},
|
{ "GeForce GTX TITAN", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
|
||||||
{ // AMD GPUs
|
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
kDeviceTypeGPU, kDeviceVendorAMD, {
|
{ "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
|
||||||
{ "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
|
{ "Tesla K40m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
}
|
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
},
|
|
||||||
{ // Intel GPUs
|
|
||||||
kDeviceTypeGPU, kDeviceVendorIntel, {
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
kDeviceTypeAll, kDeviceVendorAll, {
|
kDeviceTypeAll, "default", {
|
||||||
{ kDefaultDevice, { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
@ -73,26 +75,108 @@ const Database::DatabaseEntry Database::PadDouble = {
|
||||||
|
|
||||||
const Database::DatabaseEntry Database::PadComplexSingle = {
|
const Database::DatabaseEntry Database::PadComplexSingle = {
|
||||||
"Pad", Precision::kComplexSingle, {
|
"Pad", Precision::kComplexSingle, {
|
||||||
{ // NVIDIA GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
{ "Tesla K20m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
|
{ "Tahiti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
{ "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // ARM GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorAMD, {
|
kDeviceTypeGPU, "ARM", {
|
||||||
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
{ "Mali-T628", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
|
||||||
|
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel CPUs
|
||||||
|
kDeviceTypeCPU, "Intel", {
|
||||||
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
|
||||||
|
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Intel GPUs
|
{ // Intel GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorIntel, {
|
kDeviceTypeGPU, "Intel", {
|
||||||
{ "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
{ "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
|
||||||
|
{ "Iris Pro", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
|
||||||
|
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel accelerators
|
||||||
|
kDeviceTypeAccelerator, "Intel", {
|
||||||
|
{ "Intel(R) Many Integrated Core Acceleration Card", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
|
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // NVIDIA GPUs
|
||||||
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
|
{ "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
|
||||||
|
{ "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
|
||||||
|
{ "GeForce GTX 750 Ti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
|
{ "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
|
{ "GeForce GTX TITAN", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
|
||||||
|
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
|
{ "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
|
||||||
|
{ "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
|
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
kDeviceTypeAll, kDeviceVendorAll, {
|
kDeviceTypeAll, "default", {
|
||||||
{ kDefaultDevice, { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
const Database::DatabaseEntry Database::PadDouble = {
|
||||||
|
"Pad", Precision::kDouble, {
|
||||||
|
{ // AMD GPUs
|
||||||
|
kDeviceTypeGPU, "AMD", {
|
||||||
|
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
|
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
|
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // ARM GPUs
|
||||||
|
kDeviceTypeGPU, "ARM", {
|
||||||
|
{ "Mali-T628", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } },
|
||||||
|
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel CPUs
|
||||||
|
kDeviceTypeCPU, "Intel", {
|
||||||
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
|
||||||
|
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel accelerators
|
||||||
|
kDeviceTypeAccelerator, "Intel", {
|
||||||
|
{ "Intel(R) Many Integrated Core Acceleration Card", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
|
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // NVIDIA GPUs
|
||||||
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
|
{ "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
|
{ "GeForce GTX 680", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
|
||||||
|
{ "GeForce GTX 750 Ti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
|
{ "GeForce GTX 980", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
|
{ "GeForce GTX TITAN", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
|
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
|
{ "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
|
{ "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
|
||||||
|
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Default
|
||||||
|
kDeviceTypeAll, "default", {
|
||||||
|
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
@ -102,25 +186,49 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
|
||||||
|
|
||||||
const Database::DatabaseEntry Database::PadComplexDouble = {
|
const Database::DatabaseEntry Database::PadComplexDouble = {
|
||||||
"Pad", Precision::kComplexDouble, {
|
"Pad", Precision::kComplexDouble, {
|
||||||
{ // NVIDIA GPUs
|
|
||||||
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
|
|
||||||
{ "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
|
||||||
{ "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
|
||||||
{ "Tesla K40m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorAMD, {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
|
{ "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
|
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Intel GPUs
|
{ // ARM GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorIntel, {
|
kDeviceTypeGPU, "ARM", {
|
||||||
|
{ "Mali-T628", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
|
||||||
|
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel CPUs
|
||||||
|
kDeviceTypeCPU, "Intel", {
|
||||||
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
|
||||||
|
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel accelerators
|
||||||
|
kDeviceTypeAccelerator, "Intel", {
|
||||||
|
{ "Intel(R) Many Integrated Core Acceleration Card", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
|
||||||
|
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // NVIDIA GPUs
|
||||||
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
|
{ "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
|
{ "GeForce GTX 680", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
|
{ "GeForce GTX 750 Ti", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
|
{ "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
|
{ "GeForce GTX TITAN", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
|
||||||
|
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
|
{ "Tesla K20m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
|
||||||
|
{ "Tesla K40m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
|
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
kDeviceTypeAll, kDeviceVendorAll, {
|
kDeviceTypeAll, "default", {
|
||||||
{ kDefaultDevice, { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,37 +5,67 @@
|
||||||
// width of 100 characters per line.
|
// width of 100 characters per line.
|
||||||
//
|
//
|
||||||
// Author(s):
|
// Author(s):
|
||||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
// Database generator <database.py>
|
||||||
//
|
//
|
||||||
// This file populates the database with best-found tuning parameters for the PadTranspose kernels.
|
// This file populates the database with best-found tuning parameters for the 'Padtranspose' kernels.
|
||||||
//
|
//
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
const Database::DatabaseEntry Database::PadTraSingle = {
|
const Database::DatabaseEntry Database::PadtransposeSingle = {
|
||||||
"PadTranspose", Precision::kSingle, {
|
"Padtranspose", Precision::kSingle, {
|
||||||
{ // NVIDIA GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "GeForce GTX 480", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
|
||||||
{ "Tesla K20m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
|
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
|
||||||
{ "Tesla K40m", { {"PADTRA_TILE",32}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
|
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // ARM GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorAMD, {
|
kDeviceTypeGPU, "ARM", {
|
||||||
{ "Tahiti", { {"PADTRA_TILE",16}, {"PADTRA_WPT",4}, {"PADTRA_PAD",0} } },
|
{ "Mali-T628", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
|
||||||
|
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel CPUs
|
||||||
|
kDeviceTypeCPU, "Intel", {
|
||||||
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
|
||||||
|
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Intel GPUs
|
{ // Intel GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorIntel, {
|
kDeviceTypeGPU, "Intel", {
|
||||||
{ "Iris", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",0} } },
|
{ "Iris", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
|
||||||
|
{ "Iris Pro", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
|
||||||
|
{ "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel accelerators
|
||||||
|
kDeviceTypeAccelerator, "Intel", {
|
||||||
|
{ "Intel(R) Many Integrated Core Acceleration Card", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
|
||||||
|
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // NVIDIA GPUs
|
||||||
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
|
{ "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
|
||||||
|
{ "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
|
||||||
|
{ "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
|
||||||
|
{ "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
|
||||||
|
{ "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
|
||||||
|
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
|
||||||
|
{ "Tesla K20m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
|
||||||
|
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
|
||||||
|
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
kDeviceTypeAll, kDeviceVendorAll, {
|
kDeviceTypeAll, "default", {
|
||||||
{ kDefaultDevice, { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } },
|
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
@ -43,27 +73,58 @@ const Database::DatabaseEntry Database::PadTraSingle = {
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
const Database::DatabaseEntry Database::PadTraDouble = {
|
const Database::DatabaseEntry Database::PadtransposeComplexSingle = {
|
||||||
"PadTranspose", Precision::kDouble, {
|
"Padtranspose", Precision::kComplexSingle, {
|
||||||
{ // NVIDIA GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "GeForce GTX 480", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
|
||||||
{ "Tesla K20m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
|
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
|
||||||
{ "Tesla K40m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
|
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // ARM GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorAMD, {
|
kDeviceTypeGPU, "ARM", {
|
||||||
{ "Tahiti", { {"PADTRA_TILE",8}, {"PADTRA_WPT",4}, {"PADTRA_PAD",0} } },
|
{ "Mali-T628", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
|
||||||
|
{ "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel CPUs
|
||||||
|
kDeviceTypeCPU, "Intel", {
|
||||||
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
|
||||||
|
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Intel GPUs
|
{ // Intel GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorIntel, {
|
kDeviceTypeGPU, "Intel", {
|
||||||
|
{ "Iris", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
|
||||||
|
{ "Iris Pro", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
|
||||||
|
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel accelerators
|
||||||
|
kDeviceTypeAccelerator, "Intel", {
|
||||||
|
{ "Intel(R) Many Integrated Core Acceleration Card", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
|
||||||
|
{ "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // NVIDIA GPUs
|
||||||
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
|
{ "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
|
||||||
|
{ "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
|
||||||
|
{ "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
|
||||||
|
{ "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
|
||||||
|
{ "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
|
||||||
|
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
|
||||||
|
{ "Tesla K20m", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
|
||||||
|
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
|
||||||
|
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
kDeviceTypeAll, kDeviceVendorAll, {
|
kDeviceTypeAll, "default", {
|
||||||
{ kDefaultDevice, { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } },
|
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
@ -71,28 +132,51 @@ const Database::DatabaseEntry Database::PadTraDouble = {
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
const Database::DatabaseEntry Database::PadTraComplexSingle = {
|
const Database::DatabaseEntry Database::PadtransposeDouble = {
|
||||||
"PadTranspose", Precision::kComplexSingle, {
|
"Padtranspose", Precision::kDouble, {
|
||||||
{ // NVIDIA GPUs
|
|
||||||
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
|
|
||||||
{ "GeForce GTX 480", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
|
|
||||||
{ "Tesla K20m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
|
|
||||||
{ "Tesla K40m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } },
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorAMD, {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "Tahiti", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",0} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
|
||||||
|
{ "Tahiti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
|
||||||
|
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Intel GPUs
|
{ // ARM GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorIntel, {
|
kDeviceTypeGPU, "ARM", {
|
||||||
{ "Iris", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",0} } },
|
{ "Mali-T628", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
|
||||||
|
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel CPUs
|
||||||
|
kDeviceTypeCPU, "Intel", {
|
||||||
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
|
||||||
|
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel accelerators
|
||||||
|
kDeviceTypeAccelerator, "Intel", {
|
||||||
|
{ "Intel(R) Many Integrated Core Acceleration Card", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
|
||||||
|
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // NVIDIA GPUs
|
||||||
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
|
{ "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
|
||||||
|
{ "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
|
||||||
|
{ "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
|
||||||
|
{ "GeForce GTX 980", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
|
||||||
|
{ "GeForce GTX TITAN", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
|
||||||
|
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
|
||||||
|
{ "Tesla K20m", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
|
||||||
|
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
|
||||||
|
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
kDeviceTypeAll, kDeviceVendorAll, {
|
kDeviceTypeAll, "default", {
|
||||||
{ kDefaultDevice, { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } },
|
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
@ -100,27 +184,51 @@ const Database::DatabaseEntry Database::PadTraComplexSingle = {
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
const Database::DatabaseEntry Database::PadTraComplexDouble = {
|
const Database::DatabaseEntry Database::PadtransposeComplexDouble = {
|
||||||
"PadTranspose", Precision::kComplexDouble, {
|
"Padtranspose", Precision::kComplexDouble, {
|
||||||
{ // NVIDIA GPUs
|
|
||||||
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
|
|
||||||
{ "GeForce GTX 480", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
|
|
||||||
{ "Tesla K20m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
|
|
||||||
{ "Tesla K40m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorAMD, {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "Tahiti", { {"PADTRA_TILE",8}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
|
||||||
|
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
|
||||||
|
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Intel GPUs
|
{ // ARM GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorIntel, {
|
kDeviceTypeGPU, "ARM", {
|
||||||
|
{ "Mali-T628", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
|
||||||
|
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel CPUs
|
||||||
|
kDeviceTypeCPU, "Intel", {
|
||||||
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
|
||||||
|
{ "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel accelerators
|
||||||
|
kDeviceTypeAccelerator, "Intel", {
|
||||||
|
{ "Intel(R) Many Integrated Core Acceleration Card", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
|
||||||
|
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // NVIDIA GPUs
|
||||||
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
|
{ "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
|
||||||
|
{ "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
|
||||||
|
{ "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
|
||||||
|
{ "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
|
||||||
|
{ "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
|
||||||
|
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
|
||||||
|
{ "Tesla K20m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
|
||||||
|
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
|
||||||
|
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
kDeviceTypeAll, kDeviceVendorAll, {
|
kDeviceTypeAll, "default", {
|
||||||
{ kDefaultDevice, { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } },
|
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,37 +5,67 @@
|
||||||
// width of 100 characters per line.
|
// width of 100 characters per line.
|
||||||
//
|
//
|
||||||
// Author(s):
|
// Author(s):
|
||||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
// Database generator <database.py>
|
||||||
//
|
//
|
||||||
// This file populates the database with best-found tuning parameters for the Transpose kernels.
|
// This file populates the database with best-found tuning parameters for the 'Transpose' kernels.
|
||||||
//
|
//
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
const Database::DatabaseEntry Database::TraSingle = {
|
const Database::DatabaseEntry Database::TransposeSingle = {
|
||||||
"Transpose", Precision::kSingle, {
|
"Transpose", Precision::kSingle, {
|
||||||
{ // NVIDIA GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
|
||||||
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
|
{ "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
|
||||||
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
|
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // ARM GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorAMD, {
|
kDeviceTypeGPU, "ARM", {
|
||||||
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_WPT",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1} } },
|
{ "Mali-T628", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
|
||||||
|
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel CPUs
|
||||||
|
kDeviceTypeCPU, "Intel", {
|
||||||
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
|
||||||
|
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Intel GPUs
|
{ // Intel GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorIntel, {
|
kDeviceTypeGPU, "Intel", {
|
||||||
{ "Iris", { {"TRA_DIM",8}, {"TRA_WPT",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
|
{ "Iris", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
|
||||||
|
{ "Iris Pro", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
|
||||||
|
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel accelerators
|
||||||
|
kDeviceTypeAccelerator, "Intel", {
|
||||||
|
{ "Intel(R) Many Integrated Core Acceleration Card", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
|
||||||
|
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // NVIDIA GPUs
|
||||||
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
|
{ "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
|
||||||
|
{ "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
|
||||||
|
{ "GeForce GTX 750 Ti", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
|
||||||
|
{ "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
|
||||||
|
{ "GeForce GTX TITAN", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
|
||||||
|
{ "GeForce GTX TITAN X", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
|
||||||
|
{ "Tesla K20m", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
|
||||||
|
{ "Tesla K40m", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
|
||||||
|
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
kDeviceTypeAll, kDeviceVendorAll, {
|
kDeviceTypeAll, "default", {
|
||||||
{ kDefaultDevice, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
|
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
@ -43,56 +73,52 @@ const Database::DatabaseEntry Database::TraSingle = {
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
const Database::DatabaseEntry Database::TraDouble = {
|
const Database::DatabaseEntry Database::TransposeComplexSingle = {
|
||||||
"Transpose", Precision::kDouble, {
|
|
||||||
{ // NVIDIA GPUs
|
|
||||||
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
|
|
||||||
{ "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
|
|
||||||
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
|
|
||||||
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{ // AMD GPUs
|
|
||||||
kDeviceTypeGPU, kDeviceVendorAMD, {
|
|
||||||
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1} } },
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{ // Intel GPUs
|
|
||||||
kDeviceTypeGPU, kDeviceVendorIntel, {
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{ // Default
|
|
||||||
kDeviceTypeAll, kDeviceVendorAll, {
|
|
||||||
{ kDefaultDevice, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
|
|
||||||
}
|
|
||||||
},
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
const Database::DatabaseEntry Database::TraComplexSingle = {
|
|
||||||
"Transpose", Precision::kComplexSingle, {
|
"Transpose", Precision::kComplexSingle, {
|
||||||
{ // NVIDIA GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
|
||||||
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
|
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
|
||||||
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
|
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // ARM GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorAMD, {
|
kDeviceTypeGPU, "ARM", {
|
||||||
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1} } },
|
{ "Mali-T628", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
|
||||||
|
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel CPUs
|
||||||
|
kDeviceTypeCPU, "Intel", {
|
||||||
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
|
||||||
|
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Intel GPUs
|
{ // Intel GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorIntel, {
|
kDeviceTypeGPU, "Intel", {
|
||||||
{ "Iris", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
|
{ "Iris", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
|
||||||
|
{ "Iris Pro", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
|
||||||
|
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // NVIDIA GPUs
|
||||||
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
|
{ "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
|
||||||
|
{ "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
|
||||||
|
{ "GeForce GTX 750 Ti", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
|
||||||
|
{ "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
|
||||||
|
{ "GeForce GTX TITAN", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
|
||||||
|
{ "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
|
||||||
|
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
|
||||||
|
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
|
||||||
|
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
kDeviceTypeAll, kDeviceVendorAll, {
|
kDeviceTypeAll, "default", {
|
||||||
{ kDefaultDevice, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
|
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
@ -100,27 +126,97 @@ const Database::DatabaseEntry Database::TraComplexSingle = {
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
const Database::DatabaseEntry Database::TraComplexDouble = {
|
const Database::DatabaseEntry Database::TransposeDouble = {
|
||||||
"Transpose", Precision::kComplexDouble, {
|
"Transpose", Precision::kDouble, {
|
||||||
{ // NVIDIA GPUs
|
|
||||||
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
|
|
||||||
{ "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
|
|
||||||
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
|
|
||||||
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorAMD, {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
|
||||||
|
{ "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
|
||||||
|
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Intel GPUs
|
{ // ARM GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorIntel, {
|
kDeviceTypeGPU, "ARM", {
|
||||||
|
{ "Mali-T628", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
|
||||||
|
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel CPUs
|
||||||
|
kDeviceTypeCPU, "Intel", {
|
||||||
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
|
||||||
|
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel accelerators
|
||||||
|
kDeviceTypeAccelerator, "Intel", {
|
||||||
|
{ "Intel(R) Many Integrated Core Acceleration Card", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
|
||||||
|
{ "default", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // NVIDIA GPUs
|
||||||
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
|
{ "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
|
||||||
|
{ "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
|
||||||
|
{ "GeForce GTX 750 Ti", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
|
||||||
|
{ "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
|
||||||
|
{ "GeForce GTX TITAN", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
|
||||||
|
{ "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
|
||||||
|
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
|
||||||
|
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
|
||||||
|
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
kDeviceTypeAll, kDeviceVendorAll, {
|
kDeviceTypeAll, "default", {
|
||||||
{ kDefaultDevice, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
|
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
const Database::DatabaseEntry Database::TransposeComplexDouble = {
|
||||||
|
"Transpose", Precision::kComplexDouble, {
|
||||||
|
{ // AMD GPUs
|
||||||
|
kDeviceTypeGPU, "AMD", {
|
||||||
|
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
|
||||||
|
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
|
||||||
|
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // ARM GPUs
|
||||||
|
kDeviceTypeGPU, "ARM", {
|
||||||
|
{ "Mali-T628", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
|
||||||
|
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel CPUs
|
||||||
|
kDeviceTypeCPU, "Intel", {
|
||||||
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
|
||||||
|
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // NVIDIA GPUs
|
||||||
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
|
{ "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
|
||||||
|
{ "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
|
||||||
|
{ "GeForce GTX 750 Ti", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
|
||||||
|
{ "GeForce GTX 980", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
|
||||||
|
{ "GeForce GTX TITAN", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
|
||||||
|
{ "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
|
||||||
|
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
|
||||||
|
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
|
||||||
|
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Default
|
||||||
|
kDeviceTypeAll, "default", {
|
||||||
|
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,9 +5,9 @@
|
||||||
// width of 100 characters per line.
|
// width of 100 characters per line.
|
||||||
//
|
//
|
||||||
// Author(s):
|
// Author(s):
|
||||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
// Database generator <database.py>
|
||||||
//
|
//
|
||||||
// This file populates the database with best-found tuning parameters for the Xaxpy kernels.
|
// This file populates the database with best-found tuning parameters for the 'Xaxpy' kernels.
|
||||||
//
|
//
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
|
@ -16,26 +16,115 @@ namespace clblast {
|
||||||
|
|
||||||
const Database::DatabaseEntry Database::XaxpySingle = {
|
const Database::DatabaseEntry Database::XaxpySingle = {
|
||||||
"Xaxpy", Precision::kSingle, {
|
"Xaxpy", Precision::kSingle, {
|
||||||
{ // NVIDIA GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "GeForce GTX 480", { {"WGS",128}, {"WPT",1}, {"VW",2} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
|
||||||
{ "Tesla K20m", { {"WGS",128}, {"WPT",2}, {"VW",2} } },
|
{ "Tahiti", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
|
||||||
{ "Tesla K40m", { {"WGS",128}, {"WPT",1}, {"VW",4} } },
|
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // ARM GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorAMD, {
|
kDeviceTypeGPU, "ARM", {
|
||||||
{ "Tahiti", { {"WGS",64}, {"WPT",1}, {"VW",2} } },
|
{ "Mali-T628", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
|
||||||
|
{ "default", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel CPUs
|
||||||
|
kDeviceTypeCPU, "Intel", {
|
||||||
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
|
||||||
|
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Intel GPUs
|
{ // Intel GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorIntel, {
|
kDeviceTypeGPU, "Intel", {
|
||||||
{ "Iris", { {"WGS",512}, {"WPT",1}, {"VW",1} } },
|
{ "Iris", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
|
||||||
|
{ "Iris Pro", { {"VW",1}, {"WGS",128}, {"WPT",2} } },
|
||||||
|
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel accelerators
|
||||||
|
kDeviceTypeAccelerator, "Intel", {
|
||||||
|
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW",2}, {"WGS",1024}, {"WPT",2} } },
|
||||||
|
{ "default", { {"VW",2}, {"WGS",1024}, {"WPT",2} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // NVIDIA GPUs
|
||||||
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
|
{ "GeForce GTX 480", { {"VW",4}, {"WGS",64}, {"WPT",1} } },
|
||||||
|
{ "GeForce GTX 680", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
|
||||||
|
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
|
||||||
|
{ "GeForce GTX 980", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
|
||||||
|
{ "GeForce GTX TITAN", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
|
||||||
|
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
|
||||||
|
{ "Tesla K20m", { {"VW",4}, {"WGS",128}, {"WPT",1} } },
|
||||||
|
{ "Tesla K40m", { {"VW",4}, {"WGS",128}, {"WPT",1} } },
|
||||||
|
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
kDeviceTypeAll, kDeviceVendorAll, {
|
kDeviceTypeAll, "default", {
|
||||||
{ kDefaultDevice, { {"WGS",128}, {"WPT",1}, {"VW",1} } },
|
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
const Database::DatabaseEntry Database::XaxpyComplexSingle = {
|
||||||
|
"Xaxpy", Precision::kComplexSingle, {
|
||||||
|
{ // AMD GPUs
|
||||||
|
kDeviceTypeGPU, "AMD", {
|
||||||
|
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",2}, {"WGS",64}, {"WPT",8} } },
|
||||||
|
{ "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
|
||||||
|
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // ARM GPUs
|
||||||
|
kDeviceTypeGPU, "ARM", {
|
||||||
|
{ "Mali-T628", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
|
||||||
|
{ "default", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel CPUs
|
||||||
|
kDeviceTypeCPU, "Intel", {
|
||||||
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS",1024}, {"WPT",2} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
|
||||||
|
{ "default", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel GPUs
|
||||||
|
kDeviceTypeGPU, "Intel", {
|
||||||
|
{ "Iris", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
|
||||||
|
{ "Iris Pro", { {"VW",1}, {"WGS",256}, {"WPT",8} } },
|
||||||
|
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel accelerators
|
||||||
|
kDeviceTypeAccelerator, "Intel", {
|
||||||
|
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
|
||||||
|
{ "default", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // NVIDIA GPUs
|
||||||
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
|
{ "GeForce GTX 480", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
|
||||||
|
{ "GeForce GTX 680", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
|
||||||
|
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
|
||||||
|
{ "GeForce GTX 980", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
|
||||||
|
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
|
||||||
|
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
|
||||||
|
{ "Tesla K20m", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
|
||||||
|
{ "Tesla K40m", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
|
||||||
|
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Default
|
||||||
|
kDeviceTypeAll, "default", {
|
||||||
|
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
@ -45,53 +134,49 @@ const Database::DatabaseEntry Database::XaxpySingle = {
|
||||||
|
|
||||||
const Database::DatabaseEntry Database::XaxpyDouble = {
|
const Database::DatabaseEntry Database::XaxpyDouble = {
|
||||||
"Xaxpy", Precision::kDouble, {
|
"Xaxpy", Precision::kDouble, {
|
||||||
{ // NVIDIA GPUs
|
|
||||||
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
|
|
||||||
{ "GeForce GTX 480", { {"WGS",128}, {"WPT",1}, {"VW",1} } },
|
|
||||||
{ "Tesla K20m", { {"WGS",512}, {"WPT",1}, {"VW",2} } },
|
|
||||||
{ "Tesla K40m", { {"WGS",64}, {"WPT",1}, {"VW",2} } },
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorAMD, {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "Tahiti", { {"WGS",256}, {"WPT",1}, {"VW",1} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
|
||||||
|
{ "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
|
||||||
|
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Intel GPUs
|
{ // ARM GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorIntel, {
|
kDeviceTypeGPU, "ARM", {
|
||||||
|
{ "Mali-T628", { {"VW",2}, {"WGS",128}, {"WPT",2} } },
|
||||||
|
{ "default", { {"VW",2}, {"WGS",128}, {"WPT",2} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel CPUs
|
||||||
|
kDeviceTypeCPU, "Intel", {
|
||||||
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",8}, {"WGS",64}, {"WPT",1} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",8}, {"WGS",2048}, {"WPT",1} } },
|
||||||
|
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel accelerators
|
||||||
|
kDeviceTypeAccelerator, "Intel", {
|
||||||
|
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW",2}, {"WGS",512}, {"WPT",1} } },
|
||||||
|
{ "default", { {"VW",2}, {"WGS",512}, {"WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // NVIDIA GPUs
|
||||||
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
|
{ "GeForce GTX 480", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
|
||||||
|
{ "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
|
||||||
|
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
|
||||||
|
{ "GeForce GTX 980", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
|
||||||
|
{ "GeForce GTX TITAN", { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
|
||||||
|
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
|
||||||
|
{ "Tesla K20m", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
|
||||||
|
{ "Tesla K40m", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
|
||||||
|
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
kDeviceTypeAll, kDeviceVendorAll, {
|
kDeviceTypeAll, "default", {
|
||||||
{ kDefaultDevice, { {"WGS",128}, {"WPT",1}, {"VW",1} } },
|
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
|
||||||
}
|
|
||||||
},
|
|
||||||
}
|
|
||||||
};
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
const Database::DatabaseEntry Database::XaxpyComplexSingle = {
|
|
||||||
"Xaxpy", Precision::kComplexSingle, {
|
|
||||||
{ // NVIDIA GPUs
|
|
||||||
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
|
|
||||||
{ "GeForce GTX 480", { {"WGS",256}, {"WPT",1}, {"VW",1} } },
|
|
||||||
{ "Tesla K20m", { {"WGS",128}, {"WPT",1}, {"VW",1} } },
|
|
||||||
{ "Tesla K40m", { {"WGS",128}, {"WPT",2}, {"VW",1} } },
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{ // AMD GPUs
|
|
||||||
kDeviceTypeGPU, kDeviceVendorAMD, {
|
|
||||||
{ "Tahiti", { {"WGS",64}, {"WPT",1}, {"VW",1} } },
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{ // Intel GPUs
|
|
||||||
kDeviceTypeGPU, kDeviceVendorIntel, {
|
|
||||||
{ "Iris", { {"WGS",256}, {"WPT",1}, {"VW",1} } },
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{ // Default
|
|
||||||
kDeviceTypeAll, kDeviceVendorAll, {
|
|
||||||
{ kDefaultDevice, { {"WGS",128}, {"WPT",1}, {"VW",1} } },
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
@ -101,25 +186,49 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = {
|
||||||
|
|
||||||
const Database::DatabaseEntry Database::XaxpyComplexDouble = {
|
const Database::DatabaseEntry Database::XaxpyComplexDouble = {
|
||||||
"Xaxpy", Precision::kComplexDouble, {
|
"Xaxpy", Precision::kComplexDouble, {
|
||||||
{ // NVIDIA GPUs
|
|
||||||
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
|
|
||||||
{ "GeForce GTX 480", { {"WGS",128}, {"WPT",2}, {"VW",1} } },
|
|
||||||
{ "Tesla K20m", { {"WGS",256}, {"WPT",1}, {"VW",1} } },
|
|
||||||
{ "Tesla K40m", { {"WGS",64}, {"WPT",2}, {"VW",1} } },
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorAMD, {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "Tahiti", { {"WGS",64}, {"WPT",1}, {"VW",1} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
|
||||||
|
{ "Tahiti", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
|
||||||
|
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Intel GPUs
|
{ // ARM GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorIntel, {
|
kDeviceTypeGPU, "ARM", {
|
||||||
|
{ "Mali-T628", { {"VW",1}, {"WGS",64}, {"WPT",8} } },
|
||||||
|
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",8} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel CPUs
|
||||||
|
kDeviceTypeCPU, "Intel", {
|
||||||
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",8}, {"WGS",128}, {"WPT",1} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",8}, {"WGS",512}, {"WPT",1} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
|
||||||
|
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel accelerators
|
||||||
|
kDeviceTypeAccelerator, "Intel", {
|
||||||
|
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
|
||||||
|
{ "default", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // NVIDIA GPUs
|
||||||
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
|
{ "GeForce GTX 480", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
|
||||||
|
{ "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
|
||||||
|
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",256}, {"WPT",2} } },
|
||||||
|
{ "GeForce GTX 980", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
|
||||||
|
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS",64}, {"WPT",4} } },
|
||||||
|
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
|
||||||
|
{ "Tesla K20m", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
|
||||||
|
{ "Tesla K40m", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
|
||||||
|
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
kDeviceTypeAll, kDeviceVendorAll, {
|
kDeviceTypeAll, "default", {
|
||||||
{ kDefaultDevice, { {"WGS",128}, {"WPT",1}, {"VW",1} } },
|
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,9 +5,9 @@
|
||||||
// width of 100 characters per line.
|
// width of 100 characters per line.
|
||||||
//
|
//
|
||||||
// Author(s):
|
// Author(s):
|
||||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
// Database generator <database.py>
|
||||||
//
|
//
|
||||||
// This file populates the database with best-found tuning parameters for the Xdot kernels.
|
// This file populates the database with best-found tuning parameters for the 'Xdot' kernels.
|
||||||
//
|
//
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
|
@ -16,22 +16,115 @@ namespace clblast {
|
||||||
|
|
||||||
const Database::DatabaseEntry Database::XdotSingle = {
|
const Database::DatabaseEntry Database::XdotSingle = {
|
||||||
"Xdot", Precision::kSingle, {
|
"Xdot", Precision::kSingle, {
|
||||||
{ // NVIDIA GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
|
kDeviceTypeGPU, "AMD", {
|
||||||
|
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
|
||||||
|
{ "Tahiti", { {"VW",1}, {"WGS1",256}, {"WGS2",256} } },
|
||||||
|
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // ARM GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorAMD, {
|
kDeviceTypeGPU, "ARM", {
|
||||||
|
{ "Mali-T628", { {"VW",1}, {"WGS1",128}, {"WGS2",256} } },
|
||||||
|
{ "default", { {"VW",1}, {"WGS1",128}, {"WGS2",256} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel CPUs
|
||||||
|
kDeviceTypeCPU, "Intel", {
|
||||||
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
|
||||||
|
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Intel GPUs
|
{ // Intel GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorIntel, {
|
kDeviceTypeGPU, "Intel", {
|
||||||
{ "Iris", { {"WGS1",512}, {"WGS2",512} } },
|
{ "Iris", { {"VW",1}, {"WGS1",512}, {"WGS2",32} } },
|
||||||
|
{ "Iris Pro", { {"VW",1}, {"WGS1",128}, {"WGS2",512} } },
|
||||||
|
{ "default", { {"VW",1}, {"WGS1",128}, {"WGS2",32} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel accelerators
|
||||||
|
kDeviceTypeAccelerator, "Intel", {
|
||||||
|
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
|
||||||
|
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // NVIDIA GPUs
|
||||||
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
|
{ "GeForce GTX 480", { {"VW",1}, {"WGS1",256}, {"WGS2",128} } },
|
||||||
|
{ "GeForce GTX 680", { {"VW",1}, {"WGS1",128}, {"WGS2",128} } },
|
||||||
|
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
|
||||||
|
{ "GeForce GTX 980", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
|
||||||
|
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
|
||||||
|
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
|
||||||
|
{ "Tesla K20m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
|
||||||
|
{ "Tesla K40m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
|
||||||
|
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
kDeviceTypeAll, kDeviceVendorAll, {
|
kDeviceTypeAll, "default", {
|
||||||
{ kDefaultDevice, { {"WGS1",64}, {"WGS2",64} } },
|
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
const Database::DatabaseEntry Database::XdotComplexSingle = {
|
||||||
|
"Xdot", Precision::kComplexSingle, {
|
||||||
|
{ // AMD GPUs
|
||||||
|
kDeviceTypeGPU, "AMD", {
|
||||||
|
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
|
||||||
|
{ "Tahiti", { {"VW",1}, {"WGS1",64}, {"WGS2",256} } },
|
||||||
|
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // ARM GPUs
|
||||||
|
kDeviceTypeGPU, "ARM", {
|
||||||
|
{ "Mali-T628", { {"VW",1}, {"WGS1",128}, {"WGS2",512} } },
|
||||||
|
{ "default", { {"VW",1}, {"WGS1",128}, {"WGS2",512} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel CPUs
|
||||||
|
kDeviceTypeCPU, "Intel", {
|
||||||
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
|
||||||
|
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel GPUs
|
||||||
|
kDeviceTypeGPU, "Intel", {
|
||||||
|
{ "Iris", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
|
||||||
|
{ "Iris Pro", { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } },
|
||||||
|
{ "default", { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel accelerators
|
||||||
|
kDeviceTypeAccelerator, "Intel", {
|
||||||
|
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
|
||||||
|
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // NVIDIA GPUs
|
||||||
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
|
{ "GeForce GTX 480", { {"VW",1}, {"WGS1",512}, {"WGS2",512} } },
|
||||||
|
{ "GeForce GTX 680", { {"VW",1}, {"WGS1",256}, {"WGS2",32} } },
|
||||||
|
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS1",128}, {"WGS2",32} } },
|
||||||
|
{ "GeForce GTX 980", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
|
||||||
|
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
|
||||||
|
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
|
||||||
|
{ "Tesla K20m", { {"VW",1}, {"WGS1",256}, {"WGS2",512} } },
|
||||||
|
{ "Tesla K40m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
|
||||||
|
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Default
|
||||||
|
kDeviceTypeAll, "default", {
|
||||||
|
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
@ -41,45 +134,49 @@ const Database::DatabaseEntry Database::XdotSingle = {
|
||||||
|
|
||||||
const Database::DatabaseEntry Database::XdotDouble = {
|
const Database::DatabaseEntry Database::XdotDouble = {
|
||||||
"Xdot", Precision::kDouble, {
|
"Xdot", Precision::kDouble, {
|
||||||
{ // NVIDIA GPUs
|
|
||||||
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorAMD, {
|
kDeviceTypeGPU, "AMD", {
|
||||||
|
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
|
||||||
|
{ "Tahiti", { {"VW",1}, {"WGS1",64}, {"WGS2",256} } },
|
||||||
|
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Intel GPUs
|
{ // ARM GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorIntel, {
|
kDeviceTypeGPU, "ARM", {
|
||||||
|
{ "Mali-T628", { {"VW",1}, {"WGS1",64}, {"WGS2",512} } },
|
||||||
|
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",512} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel CPUs
|
||||||
|
kDeviceTypeCPU, "Intel", {
|
||||||
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS1",512}, {"WGS2",512} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS1",1024}, {"WGS2",512} } },
|
||||||
|
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",512} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel accelerators
|
||||||
|
kDeviceTypeAccelerator, "Intel", {
|
||||||
|
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
|
||||||
|
{ "default", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // NVIDIA GPUs
|
||||||
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
|
{ "GeForce GTX 480", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
|
||||||
|
{ "GeForce GTX 680", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
|
||||||
|
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
|
||||||
|
{ "GeForce GTX 980", { {"VW",1}, {"WGS1",32}, {"WGS2",512} } },
|
||||||
|
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
|
||||||
|
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS1",128}, {"WGS2",128} } },
|
||||||
|
{ "Tesla K20m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
|
||||||
|
{ "Tesla K40m", { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } },
|
||||||
|
{ "default", { {"VW",1}, {"WGS1",32}, {"WGS2",128} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
kDeviceTypeAll, kDeviceVendorAll, {
|
kDeviceTypeAll, "default", {
|
||||||
{ kDefaultDevice, { {"WGS1",64}, {"WGS2",64} } },
|
{ "default", { {"VW",1}, {"WGS1",32}, {"WGS2",128} } },
|
||||||
}
|
|
||||||
},
|
|
||||||
}
|
|
||||||
};
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
const Database::DatabaseEntry Database::XdotComplexSingle = {
|
|
||||||
"Xdot", Precision::kComplexSingle, {
|
|
||||||
{ // NVIDIA GPUs
|
|
||||||
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{ // AMD GPUs
|
|
||||||
kDeviceTypeGPU, kDeviceVendorAMD, {
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{ // Intel GPUs
|
|
||||||
kDeviceTypeGPU, kDeviceVendorIntel, {
|
|
||||||
{ "Iris", { {"WGS1",512}, {"WGS2",512} } },
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{ // Default
|
|
||||||
kDeviceTypeAll, kDeviceVendorAll, {
|
|
||||||
{ kDefaultDevice, { {"WGS1",64}, {"WGS2",64} } },
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
@ -89,21 +186,49 @@ const Database::DatabaseEntry Database::XdotComplexSingle = {
|
||||||
|
|
||||||
const Database::DatabaseEntry Database::XdotComplexDouble = {
|
const Database::DatabaseEntry Database::XdotComplexDouble = {
|
||||||
"Xdot", Precision::kComplexDouble, {
|
"Xdot", Precision::kComplexDouble, {
|
||||||
{ // NVIDIA GPUs
|
|
||||||
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorAMD, {
|
kDeviceTypeGPU, "AMD", {
|
||||||
|
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
|
||||||
|
{ "Tahiti", { {"VW",1}, {"WGS1",64}, {"WGS2",256} } },
|
||||||
|
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Intel GPUs
|
{ // ARM GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorIntel, {
|
kDeviceTypeGPU, "ARM", {
|
||||||
|
{ "Mali-T628", { {"VW",1}, {"WGS1",32}, {"WGS2",64} } },
|
||||||
|
{ "default", { {"VW",1}, {"WGS1",32}, {"WGS2",64} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel CPUs
|
||||||
|
kDeviceTypeCPU, "Intel", {
|
||||||
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
|
||||||
|
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel accelerators
|
||||||
|
kDeviceTypeAccelerator, "Intel", {
|
||||||
|
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS1",32}, {"WGS2",1024} } },
|
||||||
|
{ "default", { {"VW",1}, {"WGS1",32}, {"WGS2",1024} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // NVIDIA GPUs
|
||||||
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
|
{ "GeForce GTX 480", { {"VW",1}, {"WGS1",512}, {"WGS2",512} } },
|
||||||
|
{ "GeForce GTX 680", { {"VW",1}, {"WGS1",256}, {"WGS2",64} } },
|
||||||
|
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS1",32}, {"WGS2",64} } },
|
||||||
|
{ "GeForce GTX 980", { {"VW",1}, {"WGS1",32}, {"WGS2",128} } },
|
||||||
|
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS1",128}, {"WGS2",512} } },
|
||||||
|
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS1",128}, {"WGS2",128} } },
|
||||||
|
{ "Tesla K20m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
|
||||||
|
{ "Tesla K40m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
|
||||||
|
{ "default", { {"VW",1}, {"WGS1",32}, {"WGS2",64} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
kDeviceTypeAll, kDeviceVendorAll, {
|
kDeviceTypeAll, "default", {
|
||||||
{ kDefaultDevice, { {"WGS1",64}, {"WGS2",64} } },
|
{ "default", { {"VW",1}, {"WGS1",32}, {"WGS2",32} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,9 +5,9 @@
|
||||||
// width of 100 characters per line.
|
// width of 100 characters per line.
|
||||||
//
|
//
|
||||||
// Author(s):
|
// Author(s):
|
||||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
// Database generator <database.py>
|
||||||
//
|
//
|
||||||
// This file populates the database with best-found tuning parameters for the Xgemm kernels.
|
// This file populates the database with best-found tuning parameters for the 'Xgemm' kernels.
|
||||||
//
|
//
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
|
@ -16,56 +16,56 @@ namespace clblast {
|
||||||
|
|
||||||
const Database::DatabaseEntry Database::XgemmSingle = {
|
const Database::DatabaseEntry Database::XgemmSingle = {
|
||||||
"Xgemm", Precision::kSingle, {
|
"Xgemm", Precision::kSingle, {
|
||||||
{ // NVIDIA GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "GeForce GTX 480", { {"MWG",128}, {"NWG",64}, {"KWG",32}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",2}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",1} } },
|
||||||
{ "Tesla K20m", { {"MWG",128}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",2}, {"VWM",4}, {"VWN",1}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
|
{ "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
|
||||||
{ "Tesla K40m", { {"MWG",128}, {"NWG",128}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",2}, {"VWN",1}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
|
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
|
||||||
{ kDefaultDevice, { {"MWG",128}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",2}, {"VWN",1}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // ARM GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorAMD, {
|
kDeviceTypeGPU, "ARM", {
|
||||||
{ "Tahiti", { {"MWG",128}, {"NWG",128}, {"KWG",32}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",32}, {"NDIMB",8}, {"KWI",2}, {"VWM",4}, {"VWN",4}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } },
|
{ "Mali-T628", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",8}, {"VWN",1} } },
|
||||||
|
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",8}, {"VWN",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel CPUs
|
||||||
|
kDeviceTypeCPU, "Intel", {
|
||||||
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
|
||||||
|
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Intel GPUs
|
{ // Intel GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorIntel, {
|
kDeviceTypeGPU, "Intel", {
|
||||||
{ "Iris", { {"MWG",64}, {"NWG",64}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",8}, {"KWI",8}, {"VWM",4}, {"VWN",4}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",0} } },
|
{ "Iris", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",1} } },
|
||||||
|
{ "Iris Pro", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
|
||||||
|
{ "default", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Intel accelerators
|
||||||
kDeviceTypeAll, kDeviceVendorAll, {
|
kDeviceTypeAccelerator, "Intel", {
|
||||||
{ kDefaultDevice, { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
|
{ "Intel(R) Many Integrated Core Acceleration Card", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
|
||||||
|
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
const Database::DatabaseEntry Database::XgemmDouble = {
|
|
||||||
"Xgemm", Precision::kDouble, {
|
|
||||||
{ // NVIDIA GPUs
|
{ // NVIDIA GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
{ "GeForce GTX 480", { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",2}, {"VWM",1}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
|
{ "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
|
||||||
{ "Tesla K20m", { {"MWG",64}, {"NWG",128}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",32}, {"MDIMA",32}, {"NDIMB",32}, {"KWI",8}, {"VWM",2}, {"VWN",4}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } },
|
{ "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
|
||||||
{ "Tesla K40m", { {"MWG",64}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
|
{ "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",4} } },
|
||||||
{ kDefaultDevice, { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } },
|
{ "GeForce GTX 980", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",8} } },
|
||||||
}
|
{ "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
|
||||||
},
|
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } },
|
||||||
{ // AMD GPUs
|
{ "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
|
||||||
kDeviceTypeGPU, kDeviceVendorAMD, {
|
{ "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
|
||||||
{ "Tahiti", { {"MWG",128}, {"NWG",64}, {"KWG",16}, {"MDIMC",32}, {"NDIMC",8}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",1}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",0}, {"SB",0} } },
|
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
|
||||||
}
|
|
||||||
},
|
|
||||||
{ // Intel GPUs
|
|
||||||
kDeviceTypeGPU, kDeviceVendorIntel, {
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
kDeviceTypeAll, kDeviceVendorAll, {
|
kDeviceTypeAll, "default", {
|
||||||
{ kDefaultDevice, { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
|
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
@ -75,27 +75,108 @@ const Database::DatabaseEntry Database::XgemmDouble = {
|
||||||
|
|
||||||
const Database::DatabaseEntry Database::XgemmComplexSingle = {
|
const Database::DatabaseEntry Database::XgemmComplexSingle = {
|
||||||
"Xgemm", Precision::kComplexSingle, {
|
"Xgemm", Precision::kComplexSingle, {
|
||||||
{ // NVIDIA GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "GeForce GTX 480", { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
|
||||||
{ "Tesla K20m", { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",8}, {"MDIMA",8}, {"NDIMB",8}, {"KWI",8}, {"VWM",2}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",0} } },
|
{ "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
|
||||||
{ "Tesla K40m", { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",32}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",1}, {"VWN",1}, {"STRM",0}, {"STRN",1}, {"SA",1}, {"SB",1} } },
|
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
|
||||||
{ kDefaultDevice, { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } },
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // ARM GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorAMD, {
|
kDeviceTypeGPU, "ARM", {
|
||||||
{ "Tahiti", { {"MWG",16}, {"NWG",64}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",8}, {"NDIMB",16}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",0} } },
|
{ "Mali-T628", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",8}, {"VWN",1} } },
|
||||||
|
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",8}, {"VWN",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel CPUs
|
||||||
|
kDeviceTypeCPU, "Intel", {
|
||||||
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
|
||||||
|
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Intel GPUs
|
{ // Intel GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorIntel, {
|
kDeviceTypeGPU, "Intel", {
|
||||||
{ "Iris", { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
|
{ "Iris", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
||||||
|
{ "Iris Pro", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
|
||||||
|
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel accelerators
|
||||||
|
kDeviceTypeAccelerator, "Intel", {
|
||||||
|
{ "Intel(R) Many Integrated Core Acceleration Card", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
|
||||||
|
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // NVIDIA GPUs
|
||||||
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
|
{ "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
|
||||||
|
{ "GeForce GTX 680", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
|
||||||
|
{ "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
|
||||||
|
{ "GeForce GTX 980", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
|
||||||
|
{ "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
||||||
|
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
|
||||||
|
{ "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
|
||||||
|
{ "Tesla K40m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
||||||
|
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
kDeviceTypeAll, kDeviceVendorAll, {
|
kDeviceTypeAll, "default", {
|
||||||
{ kDefaultDevice, { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
|
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
const Database::DatabaseEntry Database::XgemmDouble = {
|
||||||
|
"Xgemm", Precision::kDouble, {
|
||||||
|
{ // AMD GPUs
|
||||||
|
kDeviceTypeGPU, "AMD", {
|
||||||
|
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
|
||||||
|
{ "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
|
||||||
|
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // ARM GPUs
|
||||||
|
kDeviceTypeGPU, "ARM", {
|
||||||
|
{ "Mali-T628", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",8}, {"VWN",2} } },
|
||||||
|
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",8}, {"VWN",2} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel CPUs
|
||||||
|
kDeviceTypeCPU, "Intel", {
|
||||||
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",8} } },
|
||||||
|
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel accelerators
|
||||||
|
kDeviceTypeAccelerator, "Intel", {
|
||||||
|
{ "Intel(R) Many Integrated Core Acceleration Card", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
|
||||||
|
{ "default", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // NVIDIA GPUs
|
||||||
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
|
{ "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
|
||||||
|
{ "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
|
||||||
|
{ "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
|
||||||
|
{ "GeForce GTX 980", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
|
||||||
|
{ "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
|
||||||
|
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
||||||
|
{ "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
||||||
|
{ "Tesla K40m", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
|
||||||
|
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Default
|
||||||
|
kDeviceTypeAll, "default", {
|
||||||
|
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
@ -105,29 +186,52 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
|
||||||
|
|
||||||
const Database::DatabaseEntry Database::XgemmComplexDouble = {
|
const Database::DatabaseEntry Database::XgemmComplexDouble = {
|
||||||
"Xgemm", Precision::kComplexDouble, {
|
"Xgemm", Precision::kComplexDouble, {
|
||||||
{ // NVIDIA GPUs
|
|
||||||
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
|
|
||||||
{ "GeForce GTX 480", { {"MWG",16}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",8}, {"KWI",2}, {"VWM",1}, {"VWN",4}, {"STRM",1}, {"STRN",0}, {"SA",0}, {"SB",0} } },
|
|
||||||
{ "Tesla K20m", { {"MWG",16}, {"NWG",128}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",32}, {"MDIMA",8}, {"NDIMB",32}, {"KWI",2}, {"VWM",1}, {"VWN",4}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",0} } },
|
|
||||||
{ "Tesla K40m", { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",32}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",8}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",1} } },
|
|
||||||
{ kDefaultDevice, { {"MWG",16}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",8}, {"KWI",2}, {"VWM",1}, {"VWN",4}, {"STRM",1}, {"STRN",0}, {"SA",0}, {"SB",0} } },
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorAMD, {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "Tahiti", { {"MWG",128}, {"NWG",32}, {"KWG",16}, {"MDIMC",32}, {"NDIMC",8}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",2}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
|
||||||
|
{ "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
||||||
|
{ "default", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Intel GPUs
|
{ // ARM GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorIntel, {
|
kDeviceTypeGPU, "ARM", {
|
||||||
|
{ "Mali-T628", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",8}, {"VWN",1} } },
|
||||||
|
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",8}, {"VWN",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel CPUs
|
||||||
|
kDeviceTypeCPU, "Intel", {
|
||||||
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",8} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
|
||||||
|
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel accelerators
|
||||||
|
kDeviceTypeAccelerator, "Intel", {
|
||||||
|
{ "Intel(R) Many Integrated Core Acceleration Card", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
||||||
|
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // NVIDIA GPUs
|
||||||
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
|
{ "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
||||||
|
{ "GeForce GTX 680", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
||||||
|
{ "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
|
||||||
|
{ "GeForce GTX 980", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
|
||||||
|
{ "GeForce GTX TITAN X", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
||||||
|
{ "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
||||||
|
{ "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
||||||
|
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
kDeviceTypeAll, kDeviceVendorAll, {
|
kDeviceTypeAll, "default", {
|
||||||
{ kDefaultDevice, { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
|
{ "default", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
} // namespace clblast
|
} // namespace clblast
|
||||||
|
|
|
@ -5,9 +5,9 @@
|
||||||
// width of 100 characters per line.
|
// width of 100 characters per line.
|
||||||
//
|
//
|
||||||
// Author(s):
|
// Author(s):
|
||||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
// Database generator <database.py>
|
||||||
//
|
//
|
||||||
// This file populates the database with best-found tuning parameters for the Xgemv kernels.
|
// This file populates the database with best-found tuning parameters for the 'Xgemv' kernels.
|
||||||
//
|
//
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
|
@ -16,26 +16,97 @@ namespace clblast {
|
||||||
|
|
||||||
const Database::DatabaseEntry Database::XgemvSingle = {
|
const Database::DatabaseEntry Database::XgemvSingle = {
|
||||||
"Xgemv", Precision::kSingle, {
|
"Xgemv", Precision::kSingle, {
|
||||||
{ // NVIDIA GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
{ "Tesla K20m", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
|
{ "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
{ "Tesla K40m", { {"WGS1",256}, {"WPT1",1}, {"WGS2",256}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",4} } },
|
{ "default", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // AMD GPUs
|
{ // Intel CPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorAMD, {
|
kDeviceTypeCPU, "Intel", {
|
||||||
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",2}, {"WGS3",64}, {"WPT3",4} } },
|
||||||
|
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Intel GPUs
|
{ // Intel GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorIntel, {
|
kDeviceTypeGPU, "Intel", {
|
||||||
{ "Iris", { {"WGS1",256}, {"WPT1",2}, {"WGS2",64}, {"WPT2",4}, {"VW2",4}, {"WGS3",256}, {"WPT3",2}, {"VW3",8} } },
|
{ "Iris", { {"WGS1",64}, {"WPT1",2}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",64}, {"WPT3",8} } },
|
||||||
|
{ "Iris Pro", { {"WGS1",256}, {"WPT1",2}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } },
|
||||||
|
{ "default", { {"WGS1",64}, {"WPT1",2}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel accelerators
|
||||||
|
kDeviceTypeAccelerator, "Intel", {
|
||||||
|
{ "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
|
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // NVIDIA GPUs
|
||||||
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
|
{ "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
|
||||||
|
{ "GeForce GTX 680", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",128}, {"WPT3",2} } },
|
||||||
|
{ "GeForce GTX 750 Ti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",4}, {"WGS3",128}, {"WPT3",4} } },
|
||||||
|
{ "GeForce GTX 980", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
|
||||||
|
{ "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
|
||||||
|
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
|
||||||
|
{ "Tesla K20m", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
|
||||||
|
{ "Tesla K40m", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
|
||||||
|
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
kDeviceTypeAll, kDeviceVendorAll, {
|
kDeviceTypeAll, "default", {
|
||||||
{ kDefaultDevice, { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
|
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
const Database::DatabaseEntry Database::XgemvComplexSingle = {
|
||||||
|
"Xgemv", Precision::kComplexSingle, {
|
||||||
|
{ // AMD GPUs
|
||||||
|
kDeviceTypeGPU, "AMD", {
|
||||||
|
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",256}, {"WPT2",2}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
|
||||||
|
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
|
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel CPUs
|
||||||
|
kDeviceTypeCPU, "Intel", {
|
||||||
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4}, {"VW2",4}, {"WGS2",64}, {"WPT2",4}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
|
||||||
|
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel GPUs
|
||||||
|
kDeviceTypeGPU, "Intel", {
|
||||||
|
{ "Iris", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
|
{ "Iris Pro", { {"WGS1",64}, {"WPT1",1}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
|
||||||
|
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel accelerators
|
||||||
|
kDeviceTypeAccelerator, "Intel", {
|
||||||
|
{ "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
|
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // NVIDIA GPUs
|
||||||
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
|
{ "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
|
{ "GeForce GTX 680", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
|
{ "GeForce GTX 750 Ti", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
|
||||||
|
{ "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } },
|
||||||
|
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Default
|
||||||
|
kDeviceTypeAll, "default", {
|
||||||
|
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
@ -45,53 +116,42 @@ const Database::DatabaseEntry Database::XgemvSingle = {
|
||||||
|
|
||||||
const Database::DatabaseEntry Database::XgemvDouble = {
|
const Database::DatabaseEntry Database::XgemvDouble = {
|
||||||
"Xgemv", Precision::kDouble, {
|
"Xgemv", Precision::kDouble, {
|
||||||
{ // NVIDIA GPUs
|
|
||||||
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
|
|
||||||
{ "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
|
|
||||||
{ "Tesla K20m", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
|
|
||||||
{ "Tesla K40m", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorAMD, {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
|
||||||
|
{ "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
|
||||||
|
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Intel GPUs
|
{ // Intel CPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorIntel, {
|
kDeviceTypeCPU, "Intel", {
|
||||||
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",2}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",2} } },
|
||||||
|
{ "default", { {"WGS1",64}, {"WPT1",2}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel accelerators
|
||||||
|
kDeviceTypeAccelerator, "Intel", {
|
||||||
|
{ "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
|
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // NVIDIA GPUs
|
||||||
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
|
{ "GeForce GTX 480", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
|
{ "GeForce GTX 680", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",128}, {"WPT3",2} } },
|
||||||
|
{ "GeForce GTX 750 Ti", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",2}, {"WGS3",256}, {"WPT3",2} } },
|
||||||
|
{ "GeForce GTX 980", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
|
{ "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
|
||||||
|
{ "GeForce GTX TITAN X", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
|
||||||
|
{ "Tesla K20m", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
|
{ "Tesla K40m", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
|
||||||
|
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
kDeviceTypeAll, kDeviceVendorAll, {
|
kDeviceTypeAll, "default", {
|
||||||
{ kDefaultDevice, { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
|
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
}
|
|
||||||
},
|
|
||||||
}
|
|
||||||
};
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
const Database::DatabaseEntry Database::XgemvComplexSingle = {
|
|
||||||
"Xgemv", Precision::kComplexSingle, {
|
|
||||||
{ // NVIDIA GPUs
|
|
||||||
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
|
|
||||||
{ "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
|
|
||||||
{ "Tesla K20m", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
|
|
||||||
{ "Tesla K40m", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{ // AMD GPUs
|
|
||||||
kDeviceTypeGPU, kDeviceVendorAMD, {
|
|
||||||
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{ // Intel GPUs
|
|
||||||
kDeviceTypeGPU, kDeviceVendorIntel, {
|
|
||||||
{ "Iris", { {"WGS1",256}, {"WPT1",1}, {"WGS2",64}, {"WPT2",4}, {"VW2",2}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{ // Default
|
|
||||||
kDeviceTypeAll, kDeviceVendorAll, {
|
|
||||||
{ kDefaultDevice, { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
@ -101,25 +161,35 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = {
|
||||||
|
|
||||||
const Database::DatabaseEntry Database::XgemvComplexDouble = {
|
const Database::DatabaseEntry Database::XgemvComplexDouble = {
|
||||||
"Xgemv", Precision::kComplexDouble, {
|
"Xgemv", Precision::kComplexDouble, {
|
||||||
{ // NVIDIA GPUs
|
|
||||||
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
|
|
||||||
{ "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
|
|
||||||
{ "Tesla K20m", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
|
|
||||||
{ "Tesla K40m", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorAMD, {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
|
||||||
|
{ "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
|
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Intel GPUs
|
{ // Intel CPUs
|
||||||
kDeviceTypeGPU, kDeviceVendorIntel, {
|
kDeviceTypeCPU, "Intel", {
|
||||||
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4}, {"VW2",4}, {"WGS2",64}, {"WPT2",4}, {"VW3",2}, {"WGS3",256}, {"WPT3",2} } },
|
||||||
|
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel accelerators
|
||||||
|
kDeviceTypeAccelerator, "Intel", {
|
||||||
|
{ "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
|
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // NVIDIA GPUs
|
||||||
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
|
{ "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
|
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
kDeviceTypeAll, kDeviceVendorAll, {
|
kDeviceTypeAll, "default", {
|
||||||
{ kDefaultDevice, { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
|
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
188
include/internal/database/xger.h
Normal file
188
include/internal/database/xger.h
Normal file
|
@ -0,0 +1,188 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Database generator <database.py>
|
||||||
|
//
|
||||||
|
// This file populates the database with best-found tuning parameters for the 'Xger' kernels.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
const Database::DatabaseEntry Database::XgerSingle = {
|
||||||
|
"Xger", Precision::kSingle, {
|
||||||
|
{ // AMD GPUs
|
||||||
|
kDeviceTypeGPU, "AMD", {
|
||||||
|
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
|
||||||
|
{ "Tahiti", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
|
||||||
|
{ "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // ARM GPUs
|
||||||
|
kDeviceTypeGPU, "ARM", {
|
||||||
|
{ "Mali-T628", { {"WGS1",64}, {"WGS2",4}, {"WPT",4} } },
|
||||||
|
{ "default", { {"WGS1",64}, {"WGS2",4}, {"WPT",4} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel CPUs
|
||||||
|
kDeviceTypeCPU, "Intel", {
|
||||||
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",128}, {"WGS2",2}, {"WPT",4} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } },
|
||||||
|
{ "default", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel GPUs
|
||||||
|
kDeviceTypeGPU, "Intel", {
|
||||||
|
{ "Iris Pro", { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } },
|
||||||
|
{ "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // NVIDIA GPUs
|
||||||
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
|
{ "GeForce GTX 480", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } },
|
||||||
|
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } },
|
||||||
|
{ "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
|
||||||
|
{ "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Default
|
||||||
|
kDeviceTypeAll, "default", {
|
||||||
|
{ "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
const Database::DatabaseEntry Database::XgerComplexSingle = {
|
||||||
|
"Xger", Precision::kComplexSingle, {
|
||||||
|
{ // AMD GPUs
|
||||||
|
kDeviceTypeGPU, "AMD", {
|
||||||
|
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
|
||||||
|
{ "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
|
||||||
|
{ "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // ARM GPUs
|
||||||
|
kDeviceTypeGPU, "ARM", {
|
||||||
|
{ "Mali-T628", { {"WGS1",128}, {"WGS2",1}, {"WPT",1} } },
|
||||||
|
{ "default", { {"WGS1",128}, {"WGS2",1}, {"WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel CPUs
|
||||||
|
kDeviceTypeCPU, "Intel", {
|
||||||
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } },
|
||||||
|
{ "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel GPUs
|
||||||
|
kDeviceTypeGPU, "Intel", {
|
||||||
|
{ "Iris Pro", { {"WGS1",16}, {"WGS2",2}, {"WPT",4} } },
|
||||||
|
{ "default", { {"WGS1",16}, {"WGS2",2}, {"WPT",4} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // NVIDIA GPUs
|
||||||
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
|
{ "GeForce GTX 480", { {"WGS1",128}, {"WGS2",2}, {"WPT",2} } },
|
||||||
|
{ "GeForce GTX 680", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
|
||||||
|
{ "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
|
||||||
|
{ "default", { {"WGS1",16}, {"WGS2",2}, {"WPT",2} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Default
|
||||||
|
kDeviceTypeAll, "default", {
|
||||||
|
{ "default", { {"WGS1",16}, {"WGS2",1}, {"WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
const Database::DatabaseEntry Database::XgerDouble = {
|
||||||
|
"Xger", Precision::kDouble, {
|
||||||
|
{ // AMD GPUs
|
||||||
|
kDeviceTypeGPU, "AMD", {
|
||||||
|
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
|
||||||
|
{ "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
|
||||||
|
{ "default", { {"WGS1",32}, {"WGS2",2}, {"WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // ARM GPUs
|
||||||
|
kDeviceTypeGPU, "ARM", {
|
||||||
|
{ "Mali-T628", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
|
||||||
|
{ "default", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel CPUs
|
||||||
|
kDeviceTypeCPU, "Intel", {
|
||||||
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",16}, {"WPT",1} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",512}, {"WGS2",8}, {"WPT",2} } },
|
||||||
|
{ "default", { {"WGS1",512}, {"WGS2",8}, {"WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // NVIDIA GPUs
|
||||||
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
|
{ "GeForce GTX 480", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
|
||||||
|
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } },
|
||||||
|
{ "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
|
||||||
|
{ "default", { {"WGS1",16}, {"WGS2",4}, {"WPT",2} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Default
|
||||||
|
kDeviceTypeAll, "default", {
|
||||||
|
{ "default", { {"WGS1",16}, {"WGS2",2}, {"WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
const Database::DatabaseEntry Database::XgerComplexDouble = {
|
||||||
|
"Xger", Precision::kComplexDouble, {
|
||||||
|
{ // AMD GPUs
|
||||||
|
kDeviceTypeGPU, "AMD", {
|
||||||
|
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
|
||||||
|
{ "Tahiti", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
|
||||||
|
{ "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // ARM GPUs
|
||||||
|
kDeviceTypeGPU, "ARM", {
|
||||||
|
{ "Mali-T628", { {"WGS1",64}, {"WGS2",2}, {"WPT",4} } },
|
||||||
|
{ "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",4} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Intel CPUs
|
||||||
|
kDeviceTypeCPU, "Intel", {
|
||||||
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } },
|
||||||
|
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
|
||||||
|
{ "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // NVIDIA GPUs
|
||||||
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
|
{ "GeForce GTX 480", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
|
||||||
|
{ "GeForce GTX 680", { {"WGS1",8}, {"WGS2",16}, {"WPT",1} } },
|
||||||
|
{ "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
|
||||||
|
{ "default", { {"WGS1",8}, {"WGS2",2}, {"WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ // Default
|
||||||
|
kDeviceTypeAll, "default", {
|
||||||
|
{ "default", { {"WGS1",8}, {"WGS2",1}, {"WPT",1} } },
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
58
include/internal/routines/level2/xger.h
Normal file
58
include/internal/routines/level2/xger.h
Normal file
|
@ -0,0 +1,58 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xger routine. The precision is implemented using a template argument.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#ifndef CLBLAST_ROUTINES_XGER_H_
|
||||||
|
#define CLBLAST_ROUTINES_XGER_H_
|
||||||
|
|
||||||
|
#include "internal/routine.h"
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// See comment at top of file for a description of the class
|
||||||
|
template <typename T>
|
||||||
|
class Xger: public Routine<T> {
|
||||||
|
public:
|
||||||
|
|
||||||
|
// Members and methods from the base class
|
||||||
|
using Routine<T>::db_;
|
||||||
|
using Routine<T>::source_string_;
|
||||||
|
using Routine<T>::queue_;
|
||||||
|
using Routine<T>::GetProgramFromCache;
|
||||||
|
using Routine<T>::TestVectorX;
|
||||||
|
using Routine<T>::TestVectorY;
|
||||||
|
using Routine<T>::TestMatrixA;
|
||||||
|
using Routine<T>::RunKernel;
|
||||||
|
using Routine<T>::ErrorIn;
|
||||||
|
|
||||||
|
// Constructor
|
||||||
|
Xger(Queue &queue, Event &event, const std::string &name = "GER");
|
||||||
|
|
||||||
|
// Templated-precision implementation of the routine
|
||||||
|
StatusCode DoGer(const Layout layout,
|
||||||
|
const size_t m, const size_t n,
|
||||||
|
const T alpha,
|
||||||
|
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
|
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
|
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
|
||||||
|
|
||||||
|
private:
|
||||||
|
// Static variable to get the precision
|
||||||
|
const static Precision precision_;
|
||||||
|
};
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
||||||
|
|
||||||
|
// CLBLAST_ROUTINES_XGER_H_
|
||||||
|
#endif
|
46
include/internal/routines/level2/xgerc.h
Normal file
46
include/internal/routines/level2/xgerc.h
Normal file
|
@ -0,0 +1,46 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xgerc routine. The precision is implemented using a template argument.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#ifndef CLBLAST_ROUTINES_XGERC_H_
|
||||||
|
#define CLBLAST_ROUTINES_XGERC_H_
|
||||||
|
|
||||||
|
#include "internal/routines/level2/xger.h"
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// See comment at top of file for a description of the class
|
||||||
|
template <typename T>
|
||||||
|
class Xgerc: public Xger<T> {
|
||||||
|
public:
|
||||||
|
|
||||||
|
// Uses the regular Xger routine
|
||||||
|
using Xger<T>::DoGer;
|
||||||
|
|
||||||
|
// Constructor
|
||||||
|
Xgerc(Queue &queue, Event &event, const std::string &name = "GERC");
|
||||||
|
|
||||||
|
// Templated-precision implementation of the routine
|
||||||
|
StatusCode DoGerc(const Layout layout,
|
||||||
|
const size_t m, const size_t n,
|
||||||
|
const T alpha,
|
||||||
|
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
|
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
|
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
|
||||||
|
};
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
||||||
|
|
||||||
|
// CLBLAST_ROUTINES_XGERC_H_
|
||||||
|
#endif
|
46
include/internal/routines/level2/xgeru.h
Normal file
46
include/internal/routines/level2/xgeru.h
Normal file
|
@ -0,0 +1,46 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xgeru routine. The precision is implemented using a template argument.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#ifndef CLBLAST_ROUTINES_XGERU_H_
|
||||||
|
#define CLBLAST_ROUTINES_XGERU_H_
|
||||||
|
|
||||||
|
#include "internal/routines/level2/xger.h"
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// See comment at top of file for a description of the class
|
||||||
|
template <typename T>
|
||||||
|
class Xgeru: public Xger<T> {
|
||||||
|
public:
|
||||||
|
|
||||||
|
// Uses the regular Xger routine
|
||||||
|
using Xger<T>::DoGer;
|
||||||
|
|
||||||
|
// Constructor
|
||||||
|
Xgeru(Queue &queue, Event &event, const std::string &name = "GERU");
|
||||||
|
|
||||||
|
// Templated-precision implementation of the routine
|
||||||
|
StatusCode DoGeru(const Layout layout,
|
||||||
|
const size_t m, const size_t n,
|
||||||
|
const T alpha,
|
||||||
|
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
|
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
|
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
|
||||||
|
};
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
||||||
|
|
||||||
|
// CLBLAST_ROUTINES_XGERU_H_
|
||||||
|
#endif
|
61
include/internal/routines/level2/xher.h
Normal file
61
include/internal/routines/level2/xher.h
Normal file
|
@ -0,0 +1,61 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xher routine. The precision is implemented using a template argument.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#ifndef CLBLAST_ROUTINES_XHER_H_
|
||||||
|
#define CLBLAST_ROUTINES_XHER_H_
|
||||||
|
|
||||||
|
#include "internal/routine.h"
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// See comment at top of file for a description of the class
|
||||||
|
template <typename T, typename U>
|
||||||
|
class Xher: public Routine<T> {
|
||||||
|
public:
|
||||||
|
|
||||||
|
// Members and methods from the base class
|
||||||
|
using Routine<T>::db_;
|
||||||
|
using Routine<T>::source_string_;
|
||||||
|
using Routine<T>::queue_;
|
||||||
|
using Routine<T>::GetProgramFromCache;
|
||||||
|
using Routine<T>::TestVectorX;
|
||||||
|
using Routine<T>::TestMatrixA;
|
||||||
|
using Routine<T>::TestMatrixAP;
|
||||||
|
using Routine<T>::RunKernel;
|
||||||
|
using Routine<T>::ErrorIn;
|
||||||
|
|
||||||
|
// Constructor
|
||||||
|
Xher(Queue &queue, Event &event, const std::string &name = "HER");
|
||||||
|
|
||||||
|
// Translates alpha of type 'U' into type 'T'
|
||||||
|
T GetAlpha(const U alpha);
|
||||||
|
|
||||||
|
// Templated-precision implementation of the routine
|
||||||
|
StatusCode DoHer(const Layout layout, const Triangle triangle,
|
||||||
|
const size_t n,
|
||||||
|
const U alpha,
|
||||||
|
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
|
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
|
const bool packed = false);
|
||||||
|
|
||||||
|
private:
|
||||||
|
// Static variable to get the precision
|
||||||
|
const static Precision precision_;
|
||||||
|
};
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
||||||
|
|
||||||
|
// CLBLAST_ROUTINES_XHER_H_
|
||||||
|
#endif
|
60
include/internal/routines/level2/xher2.h
Normal file
60
include/internal/routines/level2/xher2.h
Normal file
|
@ -0,0 +1,60 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xher2 routine. The precision is implemented using a template argument.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#ifndef CLBLAST_ROUTINES_XHER2_H_
|
||||||
|
#define CLBLAST_ROUTINES_XHER2_H_
|
||||||
|
|
||||||
|
#include "internal/routine.h"
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// See comment at top of file for a description of the class
|
||||||
|
template <typename T>
|
||||||
|
class Xher2: public Routine<T> {
|
||||||
|
public:
|
||||||
|
|
||||||
|
// Members and methods from the base class
|
||||||
|
using Routine<T>::db_;
|
||||||
|
using Routine<T>::source_string_;
|
||||||
|
using Routine<T>::queue_;
|
||||||
|
using Routine<T>::GetProgramFromCache;
|
||||||
|
using Routine<T>::TestVectorX;
|
||||||
|
using Routine<T>::TestVectorY;
|
||||||
|
using Routine<T>::TestMatrixA;
|
||||||
|
using Routine<T>::TestMatrixAP;
|
||||||
|
using Routine<T>::RunKernel;
|
||||||
|
using Routine<T>::ErrorIn;
|
||||||
|
|
||||||
|
// Constructor
|
||||||
|
Xher2(Queue &queue, Event &event, const std::string &name = "HER2");
|
||||||
|
|
||||||
|
// Templated-precision implementation of the routine
|
||||||
|
StatusCode DoHer2(const Layout layout, const Triangle triangle,
|
||||||
|
const size_t n,
|
||||||
|
const T alpha,
|
||||||
|
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
|
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
|
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
|
const bool packed = false);
|
||||||
|
|
||||||
|
private:
|
||||||
|
// Static variable to get the precision
|
||||||
|
const static Precision precision_;
|
||||||
|
};
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
||||||
|
|
||||||
|
// CLBLAST_ROUTINES_XHER2_H_
|
||||||
|
#endif
|
45
include/internal/routines/level2/xhpr.h
Normal file
45
include/internal/routines/level2/xhpr.h
Normal file
|
@ -0,0 +1,45 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xhpr routine. The precision is implemented using a template argument.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#ifndef CLBLAST_ROUTINES_XHPR_H_
|
||||||
|
#define CLBLAST_ROUTINES_XHPR_H_
|
||||||
|
|
||||||
|
#include "internal/routines/level2/xher.h"
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// See comment at top of file for a description of the class
|
||||||
|
template <typename T, typename U>
|
||||||
|
class Xhpr: public Xher<T,U> {
|
||||||
|
public:
|
||||||
|
|
||||||
|
// Uses the regular Xher routine
|
||||||
|
using Xher<T,U>::DoHer;
|
||||||
|
|
||||||
|
// Constructor
|
||||||
|
Xhpr(Queue &queue, Event &event, const std::string &name = "HPR");
|
||||||
|
|
||||||
|
// Templated-precision implementation of the routine
|
||||||
|
StatusCode DoHpr(const Layout layout, const Triangle triangle,
|
||||||
|
const size_t n,
|
||||||
|
const U alpha,
|
||||||
|
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
|
const Buffer<T> &ap_buffer, const size_t ap_offset);
|
||||||
|
};
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
||||||
|
|
||||||
|
// CLBLAST_ROUTINES_XHPR_H_
|
||||||
|
#endif
|
46
include/internal/routines/level2/xhpr2.h
Normal file
46
include/internal/routines/level2/xhpr2.h
Normal file
|
@ -0,0 +1,46 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xhpr2 routine. The precision is implemented using a template argument.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#ifndef CLBLAST_ROUTINES_XHPR2_H_
|
||||||
|
#define CLBLAST_ROUTINES_XHPR2_H_
|
||||||
|
|
||||||
|
#include "internal/routines/level2/xher2.h"
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// See comment at top of file for a description of the class
|
||||||
|
template <typename T>
|
||||||
|
class Xhpr2: public Xher2<T> {
|
||||||
|
public:
|
||||||
|
|
||||||
|
// Uses the regular Xher2 routine
|
||||||
|
using Xher2<T>::DoHer2;
|
||||||
|
|
||||||
|
// Constructor
|
||||||
|
Xhpr2(Queue &queue, Event &event, const std::string &name = "HPR2");
|
||||||
|
|
||||||
|
// Templated-precision implementation of the routine
|
||||||
|
StatusCode DoHpr2(const Layout layout, const Triangle triangle,
|
||||||
|
const size_t n,
|
||||||
|
const T alpha,
|
||||||
|
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
|
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
|
const Buffer<T> &ap_buffer, const size_t ap_offset);
|
||||||
|
};
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
||||||
|
|
||||||
|
// CLBLAST_ROUTINES_XHPR2_H_
|
||||||
|
#endif
|
45
include/internal/routines/level2/xspr.h
Normal file
45
include/internal/routines/level2/xspr.h
Normal file
|
@ -0,0 +1,45 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xspr routine. The precision is implemented using a template argument.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#ifndef CLBLAST_ROUTINES_XSPR_H_
|
||||||
|
#define CLBLAST_ROUTINES_XSPR_H_
|
||||||
|
|
||||||
|
#include "internal/routines/level2/xher.h"
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// See comment at top of file for a description of the class
|
||||||
|
template <typename T>
|
||||||
|
class Xspr: public Xher<T,T> {
|
||||||
|
public:
|
||||||
|
|
||||||
|
// Uses the regular Xher routine
|
||||||
|
using Xher<T,T>::DoHer;
|
||||||
|
|
||||||
|
// Constructor
|
||||||
|
Xspr(Queue &queue, Event &event, const std::string &name = "SPR");
|
||||||
|
|
||||||
|
// Templated-precision implementation of the routine
|
||||||
|
StatusCode DoSpr(const Layout layout, const Triangle triangle,
|
||||||
|
const size_t n,
|
||||||
|
const T alpha,
|
||||||
|
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
|
const Buffer<T> &ap_buffer, const size_t ap_offset);
|
||||||
|
};
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
||||||
|
|
||||||
|
// CLBLAST_ROUTINES_XSPR_H_
|
||||||
|
#endif
|
46
include/internal/routines/level2/xspr2.h
Normal file
46
include/internal/routines/level2/xspr2.h
Normal file
|
@ -0,0 +1,46 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xspr2 routine. The precision is implemented using a template argument.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#ifndef CLBLAST_ROUTINES_XSPR2_H_
|
||||||
|
#define CLBLAST_ROUTINES_XSPR2_H_
|
||||||
|
|
||||||
|
#include "internal/routines/level2/xher2.h"
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// See comment at top of file for a description of the class
|
||||||
|
template <typename T>
|
||||||
|
class Xspr2: public Xher2<T> {
|
||||||
|
public:
|
||||||
|
|
||||||
|
// Uses the regular Xher2 routine
|
||||||
|
using Xher2<T>::DoHer2;
|
||||||
|
|
||||||
|
// Constructor
|
||||||
|
Xspr2(Queue &queue, Event &event, const std::string &name = "SPR2");
|
||||||
|
|
||||||
|
// Templated-precision implementation of the routine
|
||||||
|
StatusCode DoSpr2(const Layout layout, const Triangle triangle,
|
||||||
|
const size_t n,
|
||||||
|
const T alpha,
|
||||||
|
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
|
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
|
const Buffer<T> &ap_buffer, const size_t ap_offset);
|
||||||
|
};
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
||||||
|
|
||||||
|
// CLBLAST_ROUTINES_XSPR2_H_
|
||||||
|
#endif
|
45
include/internal/routines/level2/xsyr.h
Normal file
45
include/internal/routines/level2/xsyr.h
Normal file
|
@ -0,0 +1,45 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xsyr routine. The precision is implemented using a template argument.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#ifndef CLBLAST_ROUTINES_XSYR_H_
|
||||||
|
#define CLBLAST_ROUTINES_XSYR_H_
|
||||||
|
|
||||||
|
#include "internal/routines/level2/xher.h"
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// See comment at top of file for a description of the class
|
||||||
|
template <typename T>
|
||||||
|
class Xsyr: public Xher<T,T> {
|
||||||
|
public:
|
||||||
|
|
||||||
|
// Uses the regular Xher routine
|
||||||
|
using Xher<T,T>::DoHer;
|
||||||
|
|
||||||
|
// Constructor
|
||||||
|
Xsyr(Queue &queue, Event &event, const std::string &name = "SYR");
|
||||||
|
|
||||||
|
// Templated-precision implementation of the routine
|
||||||
|
StatusCode DoSyr(const Layout layout, const Triangle triangle,
|
||||||
|
const size_t n,
|
||||||
|
const T alpha,
|
||||||
|
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
|
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
|
||||||
|
};
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
||||||
|
|
||||||
|
// CLBLAST_ROUTINES_XSYR_H_
|
||||||
|
#endif
|
46
include/internal/routines/level2/xsyr2.h
Normal file
46
include/internal/routines/level2/xsyr2.h
Normal file
|
@ -0,0 +1,46 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xsyr2 routine. The precision is implemented using a template argument.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#ifndef CLBLAST_ROUTINES_XSYR2_H_
|
||||||
|
#define CLBLAST_ROUTINES_XSYR2_H_
|
||||||
|
|
||||||
|
#include "internal/routines/level2/xher2.h"
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// See comment at top of file for a description of the class
|
||||||
|
template <typename T>
|
||||||
|
class Xsyr2: public Xher2<T> {
|
||||||
|
public:
|
||||||
|
|
||||||
|
// Uses the regular Xher2 routine
|
||||||
|
using Xher2<T>::DoHer2;
|
||||||
|
|
||||||
|
// Constructor
|
||||||
|
Xsyr2(Queue &queue, Event &event, const std::string &name = "SYR2");
|
||||||
|
|
||||||
|
// Templated-precision implementation of the routine
|
||||||
|
StatusCode DoSyr2(const Layout layout, const Triangle triangle,
|
||||||
|
const size_t n,
|
||||||
|
const T alpha,
|
||||||
|
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
|
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
|
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
|
||||||
|
};
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
||||||
|
|
||||||
|
// CLBLAST_ROUTINES_XSYR2_H_
|
||||||
|
#endif
|
|
@ -127,9 +127,11 @@ void Tuner(int argc, char* argv[]) {
|
||||||
{"precision", precision_string}
|
{"precision", precision_string}
|
||||||
};
|
};
|
||||||
for (auto &o: C::GetOptions()) {
|
for (auto &o: C::GetOptions()) {
|
||||||
if (o == kArgM) { metadata.push_back({"arg_m", std::to_string(args.m)}); }
|
if (o == kArgM) { metadata.push_back({"arg_m", std::to_string(args.m)}); }
|
||||||
if (o == kArgN) { metadata.push_back({"arg_n", std::to_string(args.n)}); }
|
if (o == kArgN) { metadata.push_back({"arg_n", std::to_string(args.n)}); }
|
||||||
if (o == kArgK) { metadata.push_back({"arg_k", std::to_string(args.k)}); }
|
if (o == kArgK) { metadata.push_back({"arg_k", std::to_string(args.k)}); }
|
||||||
|
if (o == kArgAlpha) { metadata.push_back({"arg_alpha", ToString(args.alpha)}); }
|
||||||
|
if (o == kArgBeta) { metadata.push_back({"arg_beta", ToString(args.beta)}); }
|
||||||
}
|
}
|
||||||
tuner.PrintJSON("clblast_"+C::KernelFamily()+"_"+precision_string+".json", metadata);
|
tuner.PrintJSON("clblast_"+C::KernelFamily()+"_"+precision_string+".json", metadata);
|
||||||
}
|
}
|
||||||
|
|
|
@ -125,7 +125,7 @@ struct Arguments {
|
||||||
// Tuner-specific arguments
|
// Tuner-specific arguments
|
||||||
double fraction = 1.0;
|
double fraction = 1.0;
|
||||||
// Client-specific arguments
|
// Client-specific arguments
|
||||||
bool compare_clblas = 1;
|
int compare_clblas = 1;
|
||||||
size_t step = 1;
|
size_t step = 1;
|
||||||
size_t num_steps = 0;
|
size_t num_steps = 0;
|
||||||
size_t num_runs = 10;
|
size_t num_runs = 10;
|
||||||
|
@ -171,7 +171,8 @@ T GetArgument(const int argc, char *argv[], std::string &help,
|
||||||
const std::string &option, const T default_value);
|
const std::string &option, const T default_value);
|
||||||
|
|
||||||
// Returns the precision only
|
// Returns the precision only
|
||||||
Precision GetPrecision(const int argc, char *argv[]);
|
Precision GetPrecision(const int argc, char *argv[],
|
||||||
|
const Precision default_precision = Precision::kSingle);
|
||||||
|
|
||||||
// As in "GetArgument", but now only checks whether an argument is given or not
|
// As in "GetArgument", but now only checks whether an argument is given or not
|
||||||
bool CheckArgument(const int argc, char *argv[], std::string &help, const std::string &option);
|
bool CheckArgument(const int argc, char *argv[], std::string &help, const std::string &option);
|
||||||
|
|
|
@ -15,12 +15,36 @@ import os.path
|
||||||
import glob
|
import glob
|
||||||
import re
|
import re
|
||||||
import json
|
import json
|
||||||
|
try:
|
||||||
|
from urllib.request import urlopen # Python 3
|
||||||
|
except ImportError:
|
||||||
|
from urllib2 import urlopen # Python 2
|
||||||
|
|
||||||
# Additional modules
|
# Additional modules
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
|
# Server storing a copy of the database
|
||||||
|
DATABASE_SERVER_URL = "http://www.cedricnugteren.nl/tuning/clblast.db"
|
||||||
|
|
||||||
# Constants
|
# Constants
|
||||||
ATTRIBUTES = ["device", "type", "vendor", "precision", "kernel_family", "arg_m", "arg_n", "arg_k"]
|
VENDOR_DEFAULT = "default"
|
||||||
|
DEVICETYPE_DEFAULT = "All"
|
||||||
|
DEVICENAME_DEFAULT = "default"
|
||||||
|
|
||||||
|
# Attributes
|
||||||
|
DEVICETYPE_ATTRIBUTES = ["device_vendor", "device_type"]
|
||||||
|
DEVICE_ATTRIBUTES = ["device", "device_core_clock", "device_compute_units"]
|
||||||
|
KERNEL_ATTRIBUTES = ["precision", "kernel_family",
|
||||||
|
"arg_m", "arg_n", "arg_k", "arg_alpha", "arg_beta"]
|
||||||
|
ATTRIBUTES = DEVICE_ATTRIBUTES + DEVICETYPE_ATTRIBUTES + KERNEL_ATTRIBUTES
|
||||||
|
|
||||||
|
# OpenCL vendor names and their short name
|
||||||
|
VENDOR_NAMES = { "device_vendor": {
|
||||||
|
"GenuineIntel": "Intel",
|
||||||
|
"Intel(R) Corporation": "Intel",
|
||||||
|
"Advanced Micro Devices, Inc.": "AMD",
|
||||||
|
"NVIDIA Corporation": "NVIDIA",
|
||||||
|
}}
|
||||||
|
|
||||||
# Pandas options
|
# Pandas options
|
||||||
pd.set_option('display.width', 1000)
|
pd.set_option('display.width', 1000)
|
||||||
|
@ -29,6 +53,14 @@ pd.set_option('display.width', 1000)
|
||||||
# Database operations
|
# Database operations
|
||||||
# ==================================================================================================
|
# ==================================================================================================
|
||||||
|
|
||||||
|
# Downloads the database from DATABASE_SERVER_URL and saves it to disk as 'filename'. The complete
# response is read into memory first, so no file is created or truncated when the download fails.
# NOTE(review): the server URL uses plain http and this file's LoadDatabase reads databases with
# pd.read_pickle; if the downloaded file is loaded that way its contents must be trusted - confirm.
def DownloadDatabase(filename):
    # Python 2's urlopen result is not a context manager, so 'closing' is used to release it
    from contextlib import closing
    print("## Downloading database from '"+DATABASE_SERVER_URL+"'...")
    with closing(urlopen(DATABASE_SERVER_URL)) as response:
        contents = response.read()
    # Writes to the requested 'filename' (previously the undefined name 'file_db' was used)
    with open(filename, 'wb') as output:
        output.write(contents)
|
||||||
|
|
||||||
# Reads the pickled database stored at 'filename' and returns the resulting object.
# NOTE(review): unpickling can execute arbitrary code, so only trusted files should be loaded.
def LoadDatabase(filename):
    database = pd.read_pickle(filename)
    return database
|
||||||
|
@ -60,15 +92,58 @@ def ConcatenateData(df1, df2):
|
||||||
# Returns the rows of 'df' with every repeated row dropped (the first occurrence is kept)
def RemoveDuplicates(df):
    unique_rows = df.drop_duplicates()
    return unique_rows
|
||||||
|
|
||||||
# Bests
|
# Returns the rows of 'df' whose "device" column differs from 'devicename'
def RemoveEntriesByDevice(df, devicename):
    other_devices = df["device"] != devicename
    return df[other_devices]
|
||||||
|
|
||||||
|
# Returns the rows of 'df' whose column 'field' equals 'value'
def GetEntriesByField(df, field, value):
    matches = df[field] == value
    return df[matches]
|
||||||
|
|
||||||
|
# Works around vendors being reported under several different names: returns 'df' with the
# replacements described by VENDOR_NAMES applied
def SanitizeVendorNames(df):
    return df.replace(VENDOR_NAMES)
|
||||||
|
|
||||||
|
# Retrieves the results with the lowest execution times: for every unique combination of the
# ATTRIBUTES columns and the "kernel" column, the first row with the smallest "time" is kept.
# Returns a new dataframe with a fresh 0..n-1 index (an empty dataframe if there are no groups).
def GetBestResults(df):
    fastest_rows = []
    for _, dfgroup in df.groupby(ATTRIBUTES+["kernel"]):
        besttime = dfgroup["time"].min()
        # Positional selection (iloc) yields exactly one row even if index labels are repeated;
        # the double brackets keep it a one-row dataframe so that the column dtypes are preserved
        fastest_rows.append(dfgroup[dfgroup["time"] == besttime].iloc[[0]])
    if not fastest_rows:
        return pd.DataFrame()
    # A single concatenation replaces the per-row DataFrame.append calls, which newer pandas
    # versions no longer provide and which copied the whole result on every iteration
    return pd.concat(fastest_rows, ignore_index=True)
|
||||||
|
|
||||||
|
# Sets defaults for devices of the same type/vendor based on the smallest values of all known
|
||||||
|
# entries. The average might be better for performance but some parameters might not be supported
|
||||||
|
# on other devices.
|
||||||
|
def CalculateDefaults(df):
|
||||||
|
dfdefault = pd.DataFrame()
|
||||||
|
|
||||||
|
# Defaults per type/vendor
|
||||||
|
groups = df.groupby(DEVICETYPE_ATTRIBUTES+KERNEL_ATTRIBUTES+["kernel"])
|
||||||
|
for name, dfgroup in groups:
|
||||||
|
default_values = dfgroup.min(axis=0)
|
||||||
|
default_values["device"] = DEVICENAME_DEFAULT
|
||||||
|
default_values["device_compute_units"] = 0
|
||||||
|
default_values["device_core_clock"] = 0
|
||||||
|
default_values["time"] = 0.0
|
||||||
|
dfdefault = dfdefault.append(default_values, ignore_index=True)
|
||||||
|
|
||||||
|
# Defaults in general
|
||||||
|
groups = df.groupby(KERNEL_ATTRIBUTES+["kernel"])
|
||||||
|
for name, dfgroup in groups:
|
||||||
|
default_values = dfgroup.min(axis=0)
|
||||||
|
default_values["device_vendor"] = VENDOR_DEFAULT
|
||||||
|
default_values["device_type"] = DEVICETYPE_DEFAULT
|
||||||
|
default_values["device"] = DEVICENAME_DEFAULT
|
||||||
|
default_values["device_compute_units"] = 0
|
||||||
|
default_values["device_core_clock"] = 0
|
||||||
|
default_values["time"] = 0.0
|
||||||
|
dfdefault = dfdefault.append(default_values, ignore_index=True)
|
||||||
|
|
||||||
|
# Database with both types of defaults only
|
||||||
|
return dfdefault
|
||||||
|
|
||||||
# ==================================================================================================
|
# ==================================================================================================
|
||||||
# C++ header generation
|
# C++ header generation
|
||||||
# ==================================================================================================
|
# ==================================================================================================
|
||||||
|
@ -110,27 +185,28 @@ def GetPrecision(family, precision):
|
||||||
|
|
||||||
# The C++ device type and vendor
|
# The C++ device type and vendor
|
||||||
def GetDeviceVendor(vendor, devtype):
|
def GetDeviceVendor(vendor, devtype):
|
||||||
return(" { // %s %ss\n kDeviceType%s, kDeviceVendor%s, {\n"
|
if vendor == VENDOR_DEFAULT and devtype == DEVICETYPE_DEFAULT:
|
||||||
% (vendor, devtype, devtype, vendor))
|
return(" { // Default\n kDeviceType%s, \"%s\", {\n" % (devtype, vendor))
|
||||||
|
return(" { // %s %ss\n kDeviceType%s, \"%s\", {\n" % (vendor, devtype, devtype[0].upper() + devtype[1:], vendor))
|
||||||
|
|
||||||
# Prints the data to a C++ database
|
# Prints the data to a C++ database
|
||||||
def PrintData(df):
|
def PrintData(df, outputdir):
|
||||||
|
|
||||||
# Iterates over the kernel families: creates a new file per family
|
# Iterates over the kernel families: creates a new file per family
|
||||||
for family, dffamily in df.groupby(["kernel_family"]):
|
for family, dffamily in df.groupby(["kernel_family"]):
|
||||||
dffamily = dffamily.dropna(axis=1, how='all')
|
dffamily = dffamily.dropna(axis=1, how='all')
|
||||||
f = open(family+'.h', 'w+')
|
f = open(os.path.join(outputdir, family+'.h'), 'w+')
|
||||||
f.write(GetHeader(family))
|
f.write(GetHeader(family))
|
||||||
|
|
||||||
# Loops over the different entries for this family and prints their headers
|
# Loops over the different entries for this family and prints their headers
|
||||||
for precision, dfprecision in dffamily.groupby(["precision"]):
|
for precision, dfprecision in dffamily.groupby(["precision"]):
|
||||||
f.write(GetPrecision(family, precision))
|
f.write(GetPrecision(family, precision))
|
||||||
for vendor, dfvendor in dfprecision.groupby(["vendor"]):
|
for vendor, dfvendor in dfprecision.groupby(["device_vendor"]):
|
||||||
for devtype, dfdevtype in dfvendor.groupby(["type"]):
|
for devtype, dfdevtype in dfvendor.groupby(["device_type"]):
|
||||||
f.write(GetDeviceVendor(vendor, devtype))
|
f.write(GetDeviceVendor(vendor, devtype))
|
||||||
for device, dfdevice in dfdevtype.groupby(["device"]):
|
for device, dfdevice in dfdevtype.groupby(["device"]):
|
||||||
devicename = "\"%s\"," % device
|
devicename = "\"%s\"," % device
|
||||||
f.write(" { %-20s { " % devicename)
|
f.write(" { %-50s { " % devicename)
|
||||||
|
|
||||||
# Collects the parameters for this case and prints them
|
# Collects the parameters for this case and prints them
|
||||||
parameters = []
|
parameters = []
|
||||||
|
@ -152,57 +228,70 @@ def PrintData(df):
|
||||||
|
|
||||||
# Checks for the number of command-line arguments
|
# Checks for the number of command-line arguments
|
||||||
if len(sys.argv) != 3:
|
if len(sys.argv) != 3:
|
||||||
print "[ERROR] Usage: database.py <folder_with_json_files> <root_of_clblast>"
|
print("[ERROR] Usage: database.py <folder_with_json_files> <root_of_clblast>")
|
||||||
sys.exit()
|
sys.exit()
|
||||||
|
|
||||||
# Parses the command-line arguments
|
# Parses the command-line arguments
|
||||||
path_json = sys.argv[1]
|
path_json = sys.argv[1]
|
||||||
path_clblast = sys.argv[2]
|
path_clblast = sys.argv[2]
|
||||||
file_db = path_clblast+"/src/database.db"
|
file_db = os.path.join(path_clblast, "scripts", "database", "database.db")
|
||||||
glob_json = path_json+"/*.json"
|
glob_json = os.path.join(path_json, "*.json")
|
||||||
|
|
||||||
# Checks whether the command-line arguments are valid; exits otherwise
|
# Checks whether the command-line arguments are valid; exits otherwise
|
||||||
clblast_h = path_clblast+"/include/clblast.h" # Not used but just for validation
|
clblast_h = os.path.join(path_clblast, "include", "clblast.h") # Not used but just for validation
|
||||||
if not os.path.isfile(clblast_h):
|
if not os.path.isfile(clblast_h):
|
||||||
print "[ERROR] The path '"+path_clblast+"' does not point to the root of the CLBlast library"
|
print("[ERROR] The path '"+path_clblast+"' does not point to the root of the CLBlast library")
|
||||||
sys.exit()
|
sys.exit()
|
||||||
if len(glob.glob(glob_json)) < 1:
|
if len(glob.glob(glob_json)) < 1:
|
||||||
print "[ERROR] The path '"+path_json+"' does not contain any JSON files"
|
print("## The path '"+path_json+"' does not contain any JSON files")
|
||||||
sys.exit()
|
|
||||||
|
|
||||||
# ==================================================================================================
|
# ==================================================================================================
|
||||||
# The main body of the script
|
# The main body of the script
|
||||||
# ==================================================================================================
|
# ==================================================================================================
|
||||||
|
|
||||||
# Loads the database if it exists. If not, a new database is initialized
|
# Downloads the database if a local copy is not present
|
||||||
db_exists = os.path.isfile(file_db)
|
db_exists = os.path.isfile(file_db)
|
||||||
database = LoadDatabase(file_db) if db_exists else pd.DataFrame()
|
if not db_exists:
|
||||||
|
DownloadDatabase(file_db)
|
||||||
|
|
||||||
|
# Loads the database from disk
|
||||||
|
print("## Loading the database from disk...")
|
||||||
|
database = LoadDatabase(file_db)
|
||||||
|
|
||||||
# Loops over all JSON files in the supplied folder
|
# Loops over all JSON files in the supplied folder
|
||||||
for file_json in glob.glob(glob_json):
|
for file_json in glob.glob(glob_json):
|
||||||
|
|
||||||
# Loads the newly imported data
|
# Loads the newly imported data
|
||||||
print "## Processing '"+file_json+"'",
|
sys.stdout.write("## Processing '"+file_json+"' ")
|
||||||
imported_data = ImportDataFromFile(file_json)
|
imported_data = ImportDataFromFile(file_json)
|
||||||
|
imported_data = SanitizeVendorNames(imported_data)
|
||||||
|
|
||||||
# Adds the new data to the database
|
# Adds the new data to the database
|
||||||
old_size = len(database.index)
|
old_size = len(database.index)
|
||||||
database = ConcatenateData(database, imported_data)
|
database = ConcatenateData(database, imported_data)
|
||||||
database = RemoveDuplicates(database)
|
database = RemoveDuplicates(database)
|
||||||
new_size = len(database.index)
|
new_size = len(database.index)
|
||||||
print "with "+str(new_size-old_size)+" new items"
|
print("with "+str(new_size-old_size)+" new items")
|
||||||
|
|
||||||
# Stores the new database back to disk
|
|
||||||
SaveDatabase(database, file_db)
|
# Stores the modified database back to disk
|
||||||
|
if len(glob.glob(glob_json)) >= 1:
|
||||||
|
print("## Storing the database to disk...")
|
||||||
|
SaveDatabase(database, file_db)
|
||||||
|
|
||||||
# Retrieves the best performing results
|
# Retrieves the best performing results
|
||||||
|
print("## Calculating the best results per device/kernel...")
|
||||||
bests = GetBestResults(database)
|
bests = GetBestResults(database)
|
||||||
|
|
||||||
# TODO: Determines the defaults for other vendors and per vendor
|
# Determines the defaults for other vendors and per vendor
|
||||||
#defaults = CalculateDefaults(bests)
|
defaults = CalculateDefaults(bests)
|
||||||
#bests = ConcatenateData(bests, defaults)
|
bests = ConcatenateData(bests, defaults)
|
||||||
|
|
||||||
# Outputs the data as a C++ database
|
# Outputs the data as a C++ database
|
||||||
PrintData(bests)
|
path_cpp_database = os.path.join(path_clblast, "include", "internal", "database")
|
||||||
|
print("## Producing a C++ database in '"+path_cpp_database+"'...")
|
||||||
|
PrintData(bests, path_cpp_database)
|
||||||
|
|
||||||
|
print("## All done")
|
||||||
|
|
||||||
# ==================================================================================================
|
# ==================================================================================================
|
|
@ -78,17 +78,17 @@ routines = [
|
||||||
Routine(False, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], False, "Solves a banded triangular system of equations"),
|
Routine(False, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], False, "Solves a banded triangular system of equations"),
|
||||||
Routine(False, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], False, "Solves a packed triangular system of equations"),
|
Routine(False, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], False, "Solves a packed triangular system of equations"),
|
||||||
# Level 2: matrix update
|
# Level 2: matrix update
|
||||||
Routine(False, "2b", "ger", T, [S,D], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 matrix update"),
|
Routine(True, "2b", "ger", T, [S,D], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 matrix update"),
|
||||||
Routine(False, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 complex matrix update"),
|
Routine(True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 complex matrix update"),
|
||||||
Routine(False, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 complex conjugated matrix update"),
|
Routine(True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 complex conjugated matrix update"),
|
||||||
Routine(False, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], False, "Hermitian rank-1 matrix update"),
|
Routine(True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], False, "Hermitian rank-1 matrix update"),
|
||||||
Routine(False, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], False, "Hermitian packed rank-1 matrix update"),
|
Routine(True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], False, "Hermitian packed rank-1 matrix update"),
|
||||||
Routine(False, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], False, "Hermitian rank-2 matrix update"),
|
Routine(True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], False, "Hermitian rank-2 matrix update"),
|
||||||
Routine(False, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], False, "Hermitian packed rank-2 matrix update"),
|
Routine(True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], False, "Hermitian packed rank-2 matrix update"),
|
||||||
Routine(False, "2b", "syr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], False, "Symmetric rank-1 matrix update"),
|
Routine(True, "2b", "syr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], False, "Symmetric rank-1 matrix update"),
|
||||||
Routine(False, "2b", "spr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], False, "Symmetric packed rank-1 matrix update"),
|
Routine(True, "2b", "spr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], False, "Symmetric packed rank-1 matrix update"),
|
||||||
Routine(False, "2b", "syr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], False, "Symmetric rank-2 matrix update"),
|
Routine(True, "2b", "syr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], False, "Symmetric rank-2 matrix update"),
|
||||||
Routine(False, "2b", "spr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], False, "Symmetric packed rank-2 matrix update"),
|
Routine(True, "2b", "spr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], False, "Symmetric packed rank-2 matrix update"),
|
||||||
],
|
],
|
||||||
[ # Level 3: matrix-matrix
|
[ # Level 3: matrix-matrix
|
||||||
Routine(True, "3", "gemm", T, [S,D,C,Z], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], False, "General matrix-matrix multiplication"),
|
Routine(True, "3", "gemm", T, [S,D,C,Z], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], False, "General matrix-matrix multiplication"),
|
||||||
|
@ -103,7 +103,17 @@ routines = [
|
||||||
]]
|
]]
|
||||||
|
|
||||||
# ==================================================================================================
|
# ==================================================================================================
|
||||||
|
# Translates an option name to a CLBlast data-type
|
||||||
|
def PrecisionToFullName(x):
|
||||||
|
return {
|
||||||
|
'H': "Half",
|
||||||
|
'S': "Single",
|
||||||
|
'D': "Double",
|
||||||
|
'C': "ComplexSingle",
|
||||||
|
'Z': "ComplexDouble",
|
||||||
|
}[x]
|
||||||
|
|
||||||
|
# ==================================================================================================
|
||||||
# Separators for the BLAS levels
|
# Separators for the BLAS levels
|
||||||
separators = ["""
|
separators = ["""
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -237,7 +247,7 @@ files = [
|
||||||
path_clblast+"/src/clblast_c.cc",
|
path_clblast+"/src/clblast_c.cc",
|
||||||
path_clblast+"/test/wrapper_clblas.h",
|
path_clblast+"/test/wrapper_clblas.h",
|
||||||
]
|
]
|
||||||
header_lines = [84, 52, 80, 24, 22]
|
header_lines = [84, 63, 80, 24, 22]
|
||||||
footer_lines = [6, 3, 5, 2, 6]
|
footer_lines = [6, 3, 5, 2, 6]
|
||||||
|
|
||||||
# Checks whether the command-line arguments are valid; exits otherwise
|
# Checks whether the command-line arguments are valid; exits otherwise
|
||||||
|
@ -315,16 +325,10 @@ for level in [1,2,3]:
|
||||||
body += "using double2 = clblast::double2;\n\n"
|
body += "using double2 = clblast::double2;\n\n"
|
||||||
body += "// Main function (not within the clblast namespace)\n"
|
body += "// Main function (not within the clblast namespace)\n"
|
||||||
body += "int main(int argc, char *argv[]) {\n"
|
body += "int main(int argc, char *argv[]) {\n"
|
||||||
body += " switch(clblast::GetPrecision(argc, argv)) {\n"
|
default = PrecisionToFullName(routine.flavours[0].name)
|
||||||
|
body += " switch(clblast::GetPrecision(argc, argv, clblast::Precision::k"+default+")) {\n"
|
||||||
for precision in ["H","S","D","C","Z"]:
|
for precision in ["H","S","D","C","Z"]:
|
||||||
enum = {
|
body += " case clblast::Precision::k"+PrecisionToFullName(precision)+":"
|
||||||
'H': "Half",
|
|
||||||
'S': "Single",
|
|
||||||
'D': "Double",
|
|
||||||
'C': "ComplexSingle",
|
|
||||||
'Z': "ComplexDouble",
|
|
||||||
}[precision]
|
|
||||||
body += " case clblast::Precision::k"+enum+":"
|
|
||||||
found = False
|
found = False
|
||||||
for flavour in routine.flavours:
|
for flavour in routine.flavours:
|
||||||
if flavour.name == precision:
|
if flavour.name == precision:
|
||||||
|
|
285
src/clblast.cc
285
src/clblast.cc
|
@ -38,6 +38,17 @@
|
||||||
#include "internal/routines/level2/xtrmv.h"
|
#include "internal/routines/level2/xtrmv.h"
|
||||||
#include "internal/routines/level2/xtbmv.h"
|
#include "internal/routines/level2/xtbmv.h"
|
||||||
#include "internal/routines/level2/xtpmv.h"
|
#include "internal/routines/level2/xtpmv.h"
|
||||||
|
#include "internal/routines/level2/xger.h"
|
||||||
|
#include "internal/routines/level2/xgeru.h"
|
||||||
|
#include "internal/routines/level2/xgerc.h"
|
||||||
|
#include "internal/routines/level2/xher.h"
|
||||||
|
#include "internal/routines/level2/xhpr.h"
|
||||||
|
#include "internal/routines/level2/xher2.h"
|
||||||
|
#include "internal/routines/level2/xhpr2.h"
|
||||||
|
#include "internal/routines/level2/xsyr.h"
|
||||||
|
#include "internal/routines/level2/xspr.h"
|
||||||
|
#include "internal/routines/level2/xsyr2.h"
|
||||||
|
#include "internal/routines/level2/xspr2.h"
|
||||||
|
|
||||||
// BLAS level-3 includes
|
// BLAS level-3 includes
|
||||||
#include "internal/routines/level3/xgemm.h"
|
#include "internal/routines/level3/xgemm.h"
|
||||||
|
@ -835,14 +846,24 @@ template StatusCode Tpsv<double2>(const Layout, const Triangle, const Transpose,
|
||||||
|
|
||||||
// General rank-1 matrix update: SGER/DGER
|
// General rank-1 matrix update: SGER/DGER
|
||||||
template <typename T>
|
template <typename T>
|
||||||
StatusCode Ger(const Layout,
|
StatusCode Ger(const Layout layout,
|
||||||
const size_t, const size_t,
|
const size_t m, const size_t n,
|
||||||
const T,
|
const T alpha,
|
||||||
const cl_mem, const size_t, const size_t,
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
const cl_mem, const size_t, const size_t,
|
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
cl_mem, const size_t, const size_t,
|
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
cl_command_queue*, cl_event*) {
|
cl_command_queue* queue, cl_event* event) {
|
||||||
return StatusCode::kNotImplemented;
|
auto queue_cpp = Queue(*queue);
|
||||||
|
auto event_cpp = Event(*event);
|
||||||
|
auto routine = Xger<T>(queue_cpp, event_cpp);
|
||||||
|
auto status = routine.SetUp();
|
||||||
|
if (status != StatusCode::kSuccess) { return status; }
|
||||||
|
return routine.DoGer(layout,
|
||||||
|
m, n,
|
||||||
|
alpha,
|
||||||
|
Buffer<T>(x_buffer), x_offset, x_inc,
|
||||||
|
Buffer<T>(y_buffer), y_offset, y_inc,
|
||||||
|
Buffer<T>(a_buffer), a_offset, a_ld);
|
||||||
}
|
}
|
||||||
template StatusCode Ger<float>(const Layout,
|
template StatusCode Ger<float>(const Layout,
|
||||||
const size_t, const size_t,
|
const size_t, const size_t,
|
||||||
|
@ -861,14 +882,24 @@ template StatusCode Ger<double>(const Layout,
|
||||||
|
|
||||||
// General rank-1 complex matrix update: CGERU/ZGERU
|
// General rank-1 complex matrix update: CGERU/ZGERU
|
||||||
template <typename T>
|
template <typename T>
|
||||||
StatusCode Geru(const Layout,
|
StatusCode Geru(const Layout layout,
|
||||||
const size_t, const size_t,
|
const size_t m, const size_t n,
|
||||||
const T,
|
const T alpha,
|
||||||
const cl_mem, const size_t, const size_t,
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
const cl_mem, const size_t, const size_t,
|
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
cl_mem, const size_t, const size_t,
|
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
cl_command_queue*, cl_event*) {
|
cl_command_queue* queue, cl_event* event) {
|
||||||
return StatusCode::kNotImplemented;
|
auto queue_cpp = Queue(*queue);
|
||||||
|
auto event_cpp = Event(*event);
|
||||||
|
auto routine = Xgeru<T>(queue_cpp, event_cpp);
|
||||||
|
auto status = routine.SetUp();
|
||||||
|
if (status != StatusCode::kSuccess) { return status; }
|
||||||
|
return routine.DoGeru(layout,
|
||||||
|
m, n,
|
||||||
|
alpha,
|
||||||
|
Buffer<T>(x_buffer), x_offset, x_inc,
|
||||||
|
Buffer<T>(y_buffer), y_offset, y_inc,
|
||||||
|
Buffer<T>(a_buffer), a_offset, a_ld);
|
||||||
}
|
}
|
||||||
template StatusCode Geru<float2>(const Layout,
|
template StatusCode Geru<float2>(const Layout,
|
||||||
const size_t, const size_t,
|
const size_t, const size_t,
|
||||||
|
@ -887,14 +918,24 @@ template StatusCode Geru<double2>(const Layout,
|
||||||
|
|
||||||
// General rank-1 complex conjugated matrix update: CGERC/ZGERC
|
// General rank-1 complex conjugated matrix update: CGERC/ZGERC
|
||||||
template <typename T>
|
template <typename T>
|
||||||
StatusCode Gerc(const Layout,
|
StatusCode Gerc(const Layout layout,
|
||||||
const size_t, const size_t,
|
const size_t m, const size_t n,
|
||||||
const T,
|
const T alpha,
|
||||||
const cl_mem, const size_t, const size_t,
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
const cl_mem, const size_t, const size_t,
|
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
cl_mem, const size_t, const size_t,
|
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
cl_command_queue*, cl_event*) {
|
cl_command_queue* queue, cl_event* event) {
|
||||||
return StatusCode::kNotImplemented;
|
auto queue_cpp = Queue(*queue);
|
||||||
|
auto event_cpp = Event(*event);
|
||||||
|
auto routine = Xgerc<T>(queue_cpp, event_cpp);
|
||||||
|
auto status = routine.SetUp();
|
||||||
|
if (status != StatusCode::kSuccess) { return status; }
|
||||||
|
return routine.DoGerc(layout,
|
||||||
|
m, n,
|
||||||
|
alpha,
|
||||||
|
Buffer<T>(x_buffer), x_offset, x_inc,
|
||||||
|
Buffer<T>(y_buffer), y_offset, y_inc,
|
||||||
|
Buffer<T>(a_buffer), a_offset, a_ld);
|
||||||
}
|
}
|
||||||
template StatusCode Gerc<float2>(const Layout,
|
template StatusCode Gerc<float2>(const Layout,
|
||||||
const size_t, const size_t,
|
const size_t, const size_t,
|
||||||
|
@ -913,13 +954,22 @@ template StatusCode Gerc<double2>(const Layout,
|
||||||
|
|
||||||
// Hermitian rank-1 matrix update: CHER/ZHER
|
// Hermitian rank-1 matrix update: CHER/ZHER
|
||||||
template <typename T>
|
template <typename T>
|
||||||
StatusCode Her(const Layout, const Triangle,
|
StatusCode Her(const Layout layout, const Triangle triangle,
|
||||||
const size_t,
|
const size_t n,
|
||||||
const T,
|
const T alpha,
|
||||||
const cl_mem, const size_t, const size_t,
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
cl_mem, const size_t, const size_t,
|
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
cl_command_queue*, cl_event*) {
|
cl_command_queue* queue, cl_event* event) {
|
||||||
return StatusCode::kNotImplemented;
|
auto queue_cpp = Queue(*queue);
|
||||||
|
auto event_cpp = Event(*event);
|
||||||
|
auto routine = Xher<std::complex<T>,T>(queue_cpp, event_cpp);
|
||||||
|
auto status = routine.SetUp();
|
||||||
|
if (status != StatusCode::kSuccess) { return status; }
|
||||||
|
return routine.DoHer(layout, triangle,
|
||||||
|
n,
|
||||||
|
alpha,
|
||||||
|
Buffer<std::complex<T>>(x_buffer), x_offset, x_inc,
|
||||||
|
Buffer<std::complex<T>>(a_buffer), a_offset, a_ld);
|
||||||
}
|
}
|
||||||
template StatusCode Her<float>(const Layout, const Triangle,
|
template StatusCode Her<float>(const Layout, const Triangle,
|
||||||
const size_t,
|
const size_t,
|
||||||
|
@ -936,13 +986,22 @@ template StatusCode Her<double>(const Layout, const Triangle,
|
||||||
|
|
||||||
// Hermitian packed rank-1 matrix update: CHPR/ZHPR
|
// Hermitian packed rank-1 matrix update: CHPR/ZHPR
|
||||||
template <typename T>
|
template <typename T>
|
||||||
StatusCode Hpr(const Layout, const Triangle,
|
StatusCode Hpr(const Layout layout, const Triangle triangle,
|
||||||
const size_t,
|
const size_t n,
|
||||||
const T,
|
const T alpha,
|
||||||
const cl_mem, const size_t, const size_t,
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
cl_mem, const size_t,
|
cl_mem ap_buffer, const size_t ap_offset,
|
||||||
cl_command_queue*, cl_event*) {
|
cl_command_queue* queue, cl_event* event) {
|
||||||
return StatusCode::kNotImplemented;
|
auto queue_cpp = Queue(*queue);
|
||||||
|
auto event_cpp = Event(*event);
|
||||||
|
auto routine = Xhpr<std::complex<T>,T>(queue_cpp, event_cpp);
|
||||||
|
auto status = routine.SetUp();
|
||||||
|
if (status != StatusCode::kSuccess) { return status; }
|
||||||
|
return routine.DoHpr(layout, triangle,
|
||||||
|
n,
|
||||||
|
alpha,
|
||||||
|
Buffer<std::complex<T>>(x_buffer), x_offset, x_inc,
|
||||||
|
Buffer<std::complex<T>>(ap_buffer), ap_offset);
|
||||||
}
|
}
|
||||||
template StatusCode Hpr<float>(const Layout, const Triangle,
|
template StatusCode Hpr<float>(const Layout, const Triangle,
|
||||||
const size_t,
|
const size_t,
|
||||||
|
@ -959,14 +1018,24 @@ template StatusCode Hpr<double>(const Layout, const Triangle,
|
||||||
|
|
||||||
// Hermitian rank-2 matrix update: CHER2/ZHER2
|
// Hermitian rank-2 matrix update: CHER2/ZHER2
|
||||||
template <typename T>
|
template <typename T>
|
||||||
StatusCode Her2(const Layout, const Triangle,
|
StatusCode Her2(const Layout layout, const Triangle triangle,
|
||||||
const size_t,
|
const size_t n,
|
||||||
const T,
|
const T alpha,
|
||||||
const cl_mem, const size_t, const size_t,
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
const cl_mem, const size_t, const size_t,
|
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
cl_mem, const size_t, const size_t,
|
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
cl_command_queue*, cl_event*) {
|
cl_command_queue* queue, cl_event* event) {
|
||||||
return StatusCode::kNotImplemented;
|
auto queue_cpp = Queue(*queue);
|
||||||
|
auto event_cpp = Event(*event);
|
||||||
|
auto routine = Xher2<T>(queue_cpp, event_cpp);
|
||||||
|
auto status = routine.SetUp();
|
||||||
|
if (status != StatusCode::kSuccess) { return status; }
|
||||||
|
return routine.DoHer2(layout, triangle,
|
||||||
|
n,
|
||||||
|
alpha,
|
||||||
|
Buffer<T>(x_buffer), x_offset, x_inc,
|
||||||
|
Buffer<T>(y_buffer), y_offset, y_inc,
|
||||||
|
Buffer<T>(a_buffer), a_offset, a_ld);
|
||||||
}
|
}
|
||||||
template StatusCode Her2<float2>(const Layout, const Triangle,
|
template StatusCode Her2<float2>(const Layout, const Triangle,
|
||||||
const size_t,
|
const size_t,
|
||||||
|
@ -985,14 +1054,24 @@ template StatusCode Her2<double2>(const Layout, const Triangle,
|
||||||
|
|
||||||
// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2
|
// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2
|
||||||
template <typename T>
|
template <typename T>
|
||||||
StatusCode Hpr2(const Layout, const Triangle,
|
StatusCode Hpr2(const Layout layout, const Triangle triangle,
|
||||||
const size_t,
|
const size_t n,
|
||||||
const T,
|
const T alpha,
|
||||||
const cl_mem, const size_t, const size_t,
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
const cl_mem, const size_t, const size_t,
|
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
cl_mem, const size_t,
|
cl_mem ap_buffer, const size_t ap_offset,
|
||||||
cl_command_queue*, cl_event*) {
|
cl_command_queue* queue, cl_event* event) {
|
||||||
return StatusCode::kNotImplemented;
|
auto queue_cpp = Queue(*queue);
|
||||||
|
auto event_cpp = Event(*event);
|
||||||
|
auto routine = Xhpr2<T>(queue_cpp, event_cpp);
|
||||||
|
auto status = routine.SetUp();
|
||||||
|
if (status != StatusCode::kSuccess) { return status; }
|
||||||
|
return routine.DoHpr2(layout, triangle,
|
||||||
|
n,
|
||||||
|
alpha,
|
||||||
|
Buffer<T>(x_buffer), x_offset, x_inc,
|
||||||
|
Buffer<T>(y_buffer), y_offset, y_inc,
|
||||||
|
Buffer<T>(ap_buffer), ap_offset);
|
||||||
}
|
}
|
||||||
template StatusCode Hpr2<float2>(const Layout, const Triangle,
|
template StatusCode Hpr2<float2>(const Layout, const Triangle,
|
||||||
const size_t,
|
const size_t,
|
||||||
|
@ -1011,13 +1090,22 @@ template StatusCode Hpr2<double2>(const Layout, const Triangle,
|
||||||
|
|
||||||
// Symmetric rank-1 matrix update: SSYR/DSYR
|
// Symmetric rank-1 matrix update: SSYR/DSYR
|
||||||
template <typename T>
|
template <typename T>
|
||||||
StatusCode Syr(const Layout, const Triangle,
|
StatusCode Syr(const Layout layout, const Triangle triangle,
|
||||||
const size_t,
|
const size_t n,
|
||||||
const T,
|
const T alpha,
|
||||||
const cl_mem, const size_t, const size_t,
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
cl_mem, const size_t, const size_t,
|
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
cl_command_queue*, cl_event*) {
|
cl_command_queue* queue, cl_event* event) {
|
||||||
return StatusCode::kNotImplemented;
|
auto queue_cpp = Queue(*queue);
|
||||||
|
auto event_cpp = Event(*event);
|
||||||
|
auto routine = Xsyr<T>(queue_cpp, event_cpp);
|
||||||
|
auto status = routine.SetUp();
|
||||||
|
if (status != StatusCode::kSuccess) { return status; }
|
||||||
|
return routine.DoSyr(layout, triangle,
|
||||||
|
n,
|
||||||
|
alpha,
|
||||||
|
Buffer<T>(x_buffer), x_offset, x_inc,
|
||||||
|
Buffer<T>(a_buffer), a_offset, a_ld);
|
||||||
}
|
}
|
||||||
template StatusCode Syr<float>(const Layout, const Triangle,
|
template StatusCode Syr<float>(const Layout, const Triangle,
|
||||||
const size_t,
|
const size_t,
|
||||||
|
@ -1034,13 +1122,22 @@ template StatusCode Syr<double>(const Layout, const Triangle,
|
||||||
|
|
||||||
// Symmetric packed rank-1 matrix update: SSPR/DSPR
|
// Symmetric packed rank-1 matrix update: SSPR/DSPR
|
||||||
template <typename T>
|
template <typename T>
|
||||||
StatusCode Spr(const Layout, const Triangle,
|
StatusCode Spr(const Layout layout, const Triangle triangle,
|
||||||
const size_t,
|
const size_t n,
|
||||||
const T,
|
const T alpha,
|
||||||
const cl_mem, const size_t, const size_t,
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
cl_mem, const size_t,
|
cl_mem ap_buffer, const size_t ap_offset,
|
||||||
cl_command_queue*, cl_event*) {
|
cl_command_queue* queue, cl_event* event) {
|
||||||
return StatusCode::kNotImplemented;
|
auto queue_cpp = Queue(*queue);
|
||||||
|
auto event_cpp = Event(*event);
|
||||||
|
auto routine = Xspr<T>(queue_cpp, event_cpp);
|
||||||
|
auto status = routine.SetUp();
|
||||||
|
if (status != StatusCode::kSuccess) { return status; }
|
||||||
|
return routine.DoSpr(layout, triangle,
|
||||||
|
n,
|
||||||
|
alpha,
|
||||||
|
Buffer<T>(x_buffer), x_offset, x_inc,
|
||||||
|
Buffer<T>(ap_buffer), ap_offset);
|
||||||
}
|
}
|
||||||
template StatusCode Spr<float>(const Layout, const Triangle,
|
template StatusCode Spr<float>(const Layout, const Triangle,
|
||||||
const size_t,
|
const size_t,
|
||||||
|
@ -1057,14 +1154,24 @@ template StatusCode Spr<double>(const Layout, const Triangle,
|
||||||
|
|
||||||
// Symmetric rank-2 matrix update: SSYR2/DSYR2
|
// Symmetric rank-2 matrix update: SSYR2/DSYR2
|
||||||
template <typename T>
|
template <typename T>
|
||||||
StatusCode Syr2(const Layout, const Triangle,
|
StatusCode Syr2(const Layout layout, const Triangle triangle,
|
||||||
const size_t,
|
const size_t n,
|
||||||
const T,
|
const T alpha,
|
||||||
const cl_mem, const size_t, const size_t,
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
const cl_mem, const size_t, const size_t,
|
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
cl_mem, const size_t, const size_t,
|
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
cl_command_queue*, cl_event*) {
|
cl_command_queue* queue, cl_event* event) {
|
||||||
return StatusCode::kNotImplemented;
|
auto queue_cpp = Queue(*queue);
|
||||||
|
auto event_cpp = Event(*event);
|
||||||
|
auto routine = Xsyr2<T>(queue_cpp, event_cpp);
|
||||||
|
auto status = routine.SetUp();
|
||||||
|
if (status != StatusCode::kSuccess) { return status; }
|
||||||
|
return routine.DoSyr2(layout, triangle,
|
||||||
|
n,
|
||||||
|
alpha,
|
||||||
|
Buffer<T>(x_buffer), x_offset, x_inc,
|
||||||
|
Buffer<T>(y_buffer), y_offset, y_inc,
|
||||||
|
Buffer<T>(a_buffer), a_offset, a_ld);
|
||||||
}
|
}
|
||||||
template StatusCode Syr2<float>(const Layout, const Triangle,
|
template StatusCode Syr2<float>(const Layout, const Triangle,
|
||||||
const size_t,
|
const size_t,
|
||||||
|
@ -1083,14 +1190,24 @@ template StatusCode Syr2<double>(const Layout, const Triangle,
|
||||||
|
|
||||||
// Symmetric packed rank-2 matrix update: SSPR2/DSPR2
|
// Symmetric packed rank-2 matrix update: SSPR2/DSPR2
|
||||||
template <typename T>
|
template <typename T>
|
||||||
StatusCode Spr2(const Layout, const Triangle,
|
StatusCode Spr2(const Layout layout, const Triangle triangle,
|
||||||
const size_t,
|
const size_t n,
|
||||||
const T,
|
const T alpha,
|
||||||
const cl_mem, const size_t, const size_t,
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
const cl_mem, const size_t, const size_t,
|
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
cl_mem, const size_t,
|
cl_mem ap_buffer, const size_t ap_offset,
|
||||||
cl_command_queue*, cl_event*) {
|
cl_command_queue* queue, cl_event* event) {
|
||||||
return StatusCode::kNotImplemented;
|
auto queue_cpp = Queue(*queue);
|
||||||
|
auto event_cpp = Event(*event);
|
||||||
|
auto routine = Xspr2<T>(queue_cpp, event_cpp);
|
||||||
|
auto status = routine.SetUp();
|
||||||
|
if (status != StatusCode::kSuccess) { return status; }
|
||||||
|
return routine.DoSpr2(layout, triangle,
|
||||||
|
n,
|
||||||
|
alpha,
|
||||||
|
Buffer<T>(x_buffer), x_offset, x_inc,
|
||||||
|
Buffer<T>(y_buffer), y_offset, y_inc,
|
||||||
|
Buffer<T>(ap_buffer), ap_offset);
|
||||||
}
|
}
|
||||||
template StatusCode Spr2<float>(const Layout, const Triangle,
|
template StatusCode Spr2<float>(const Layout, const Triangle,
|
||||||
const size_t,
|
const size_t,
|
||||||
|
|
|
@ -15,6 +15,7 @@
|
||||||
#include "internal/database/xaxpy.h"
|
#include "internal/database/xaxpy.h"
|
||||||
#include "internal/database/xdot.h"
|
#include "internal/database/xdot.h"
|
||||||
#include "internal/database/xgemv.h"
|
#include "internal/database/xgemv.h"
|
||||||
|
#include "internal/database/xger.h"
|
||||||
#include "internal/database/xgemm.h"
|
#include "internal/database/xgemm.h"
|
||||||
#include "internal/database/copy.h"
|
#include "internal/database/copy.h"
|
||||||
#include "internal/database/pad.h"
|
#include "internal/database/pad.h"
|
||||||
|
@ -31,11 +32,12 @@ const std::vector<Database::DatabaseEntry> Database::database = {
|
||||||
XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble,
|
XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble,
|
||||||
XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble,
|
XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble,
|
||||||
XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble,
|
XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble,
|
||||||
|
XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble,
|
||||||
XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble,
|
XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble,
|
||||||
CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble,
|
CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble,
|
||||||
PadSingle, PadDouble, PadComplexSingle, PadComplexDouble,
|
PadSingle, PadDouble, PadComplexSingle, PadComplexDouble,
|
||||||
TraSingle, TraDouble, TraComplexSingle, TraComplexDouble,
|
TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble,
|
||||||
PadTraSingle, PadTraDouble, PadTraComplexSingle, PadTraComplexDouble
|
PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble
|
||||||
};
|
};
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -77,19 +79,29 @@ Database::Parameters Database::Search(const std::string &this_kernel,
|
||||||
const std::string &this_vendor,
|
const std::string &this_vendor,
|
||||||
const std::string &this_device,
|
const std::string &this_device,
|
||||||
const Precision this_precision) const {
|
const Precision this_precision) const {
|
||||||
|
// Set the short vendor name
|
||||||
|
auto this_short_vendor = this_vendor;
|
||||||
|
for (auto &combination : kVendorNames) {
|
||||||
|
if (this_vendor == combination.first) {
|
||||||
|
this_short_vendor = combination.second;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Selects the right kernel
|
||||||
for (auto &db: database) {
|
for (auto &db: database) {
|
||||||
if (db.kernel == this_kernel && db.precision == this_precision) {
|
if (db.kernel == this_kernel && db.precision == this_precision) {
|
||||||
|
|
||||||
// Searches for the right vendor and device type, or selects the default if unavailable. This
|
// Searches for the right vendor and device type, or selects the default if unavailable. This
|
||||||
// assumes that the default vendor / device type is last in the database.
|
// assumes that the default vendor / device type is last in the database.
|
||||||
for (auto &vendor: db.vendors) {
|
for (auto &vendor: db.vendors) {
|
||||||
if ((vendor.name == this_vendor || vendor.name == kDeviceVendorAll) &&
|
if ((vendor.name == this_short_vendor || vendor.name == kDeviceVendorAll) &&
|
||||||
(vendor.type == this_type || vendor.type == kDeviceTypeAll)) {
|
(vendor.type == this_type || vendor.type == kDeviceTypeAll)) {
|
||||||
|
|
||||||
// Searches for the right device. If the current device is unavailable, selects the vendor
|
// Searches for the right device. If the current device is unavailable, selects the vendor
|
||||||
// default parameters. This assumes the default is last in the database.
|
// default parameters. This assumes the default is last in the database.
|
||||||
for (auto &device: vendor.devices) {
|
for (auto &device: vendor.devices) {
|
||||||
if (device.name == this_device || device.name == kDefaultDevice) {
|
|
||||||
|
if (device.name == this_device || device.name == "default") {
|
||||||
|
|
||||||
// Sets the parameters accordingly
|
// Sets the parameters accordingly
|
||||||
return device.parameters;
|
return device.parameters;
|
||||||
|
|
158
src/kernels/level2/level2.opencl
Normal file
158
src/kernels/level2/level2.opencl
Normal file
|
@ -0,0 +1,158 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file contains common functions for matrix update kernels (Xger, Xher).
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
|
||||||
|
// literal). Comment-out this line for syntax-highlighting when developing.
|
||||||
|
R"(
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
|
||||||
|
// this kernel file is used outside of the CLBlast library.
|
||||||
|
|
||||||
|
#ifndef WGS1
|
||||||
|
#define WGS1 8 // The local work-group size in first dimension
|
||||||
|
#endif
|
||||||
|
#ifndef WGS2
|
||||||
|
#define WGS2 8 // The local work-group size in second dimension
|
||||||
|
#endif
|
||||||
|
#ifndef WPT
|
||||||
|
#define WPT 1 // The amount of work-per-thread in both dimensions
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Returns an element from a vector
|
||||||
|
inline real LoadVector(const int id, const int max,
|
||||||
|
__global real* gm, const int offset, const int inc,
|
||||||
|
const int do_conjugate) {
|
||||||
|
if (id < max) {
|
||||||
|
real result = gm[id*inc + offset];
|
||||||
|
if (do_conjugate) {
|
||||||
|
#if defined(ROUTINE_GERC) || defined(ROUTINE_HER) || defined(ROUTINE_HPR) || defined(ROUTINE_HER2) || defined(ROUTINE_HPR2)
|
||||||
|
COMPLEX_CONJUGATE(result);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
real default_result;
|
||||||
|
SetToZero(default_result);
|
||||||
|
return default_result;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Performs the rank-1 matrix update
|
||||||
|
inline void MatrixUpdate(const int id1, const int id2, const int max1, const int max2,
|
||||||
|
__global real* agm, const int a_offset, const int a_ld,
|
||||||
|
const real alpha, const real xvalue, const real yvalue,
|
||||||
|
const int is_upper) {
|
||||||
|
|
||||||
|
// Bounds of a regular matrix
|
||||||
|
if (id1 < max1 && id2 < max2) {
|
||||||
|
|
||||||
|
#if defined(ROUTINE_SPR) || defined(ROUTINE_HPR)
|
||||||
|
int a_index;
|
||||||
|
if (is_upper) {
|
||||||
|
a_index = (id1 <= id2) ? ((id2+1)*id2)/2 + id1 : ((id1+1)*id1)/2 + id2;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
a_index = (id1 >= id2) ? ((2*a_ld-(id2+1))*id2)/2 + id1 : ((2*a_ld-(id1+1))*id1)/2 + id2;
|
||||||
|
}
|
||||||
|
a_index += a_offset;
|
||||||
|
#else
|
||||||
|
const int a_index = id2*a_ld + id1 + a_offset;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Loads the current value of the A matrix
|
||||||
|
const real avalue = agm[a_index];
|
||||||
|
|
||||||
|
// Computes result = alpha * x[i] * y[j] + a[i][j]
|
||||||
|
#if PRECISION == 3232 || PRECISION == 6464
|
||||||
|
real ax;
|
||||||
|
ax.x = MulReal(alpha, xvalue);
|
||||||
|
ax.y = MulImag(alpha, xvalue);
|
||||||
|
real result;
|
||||||
|
result.x = MulReal(ax, yvalue) + avalue.x;
|
||||||
|
result.y = MulImag(ax, yvalue) + avalue.y;
|
||||||
|
#else
|
||||||
|
real result = alpha * xvalue * yvalue + avalue;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// For Hermitian matrices
|
||||||
|
#if defined(ROUTINE_HER) || defined(ROUTINE_HPR)
|
||||||
|
if (id1 == id2) { result.y = ZERO; }
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Stores the final result
|
||||||
|
agm[a_index] = result;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Performs the rank-2 matrix update
|
||||||
|
inline void MatrixUpdate2(const int id1, const int id2, const int max1, const int max2,
|
||||||
|
__global real* agm, const int a_offset, const int a_ld,
|
||||||
|
const real alpha1, const real xvalue, const real yvalue,
|
||||||
|
const real alpha2, const real xtvalue, const real ytvalue,
|
||||||
|
const int is_upper) {
|
||||||
|
|
||||||
|
// Bounds of a regular matrix
|
||||||
|
if (id1 < max1 && id2 < max2) {
|
||||||
|
|
||||||
|
#if defined(ROUTINE_SPR2) || defined(ROUTINE_HPR2)
|
||||||
|
int a_index;
|
||||||
|
if (is_upper) {
|
||||||
|
a_index = (id1 <= id2) ? ((id2+1)*id2)/2 + id1 : ((id1+1)*id1)/2 + id2;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
a_index = (id1 >= id2) ? ((2*a_ld-(id2+1))*id2)/2 + id1 : ((2*a_ld-(id1+1))*id1)/2 + id2;
|
||||||
|
}
|
||||||
|
a_index += a_offset;
|
||||||
|
#else
|
||||||
|
const int a_index = id2*a_ld + id1 + a_offset;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Loads the current value of the A matrix
|
||||||
|
const real avalue = agm[a_index];
|
||||||
|
|
||||||
|
// Computes result = alpha * x[i] * y[j] + alpha * x[j] * y[i] + a[i][j]
|
||||||
|
#if PRECISION == 3232 || PRECISION == 6464
|
||||||
|
real ax;
|
||||||
|
ax.x = MulReal(alpha2, xvalue);
|
||||||
|
ax.y = MulImag(alpha2, xvalue);
|
||||||
|
real atx;
|
||||||
|
atx.x = MulReal(alpha1, xtvalue);
|
||||||
|
atx.y = MulImag(alpha1, xtvalue);
|
||||||
|
real result;
|
||||||
|
result.x = MulReal(ax, yvalue) + MulReal(atx, ytvalue) + avalue.x;
|
||||||
|
result.y = MulImag(ax, yvalue) + MulImag(atx, ytvalue) + avalue.y;
|
||||||
|
#else
|
||||||
|
real result = alpha1 * xvalue * yvalue + alpha2 * xtvalue * ytvalue + avalue;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// For Hermitian matrices
|
||||||
|
#if defined(ROUTINE_HER2) || defined(ROUTINE_HPR2)
|
||||||
|
if (id1 == id2) { result.y = ZERO; }
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Stores the final result
|
||||||
|
agm[a_index] = result;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// End of the C++11 raw string literal
|
||||||
|
)"
|
||||||
|
|
||||||
|
// =================================================================================================
|
|
@ -7,7 +7,7 @@
|
||||||
// Author(s):
|
// Author(s):
|
||||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
//
|
//
|
||||||
// This file contains the Xgemv kernel for matrix-vector multiplication.
|
// This file contains the Xgemv kernel (generic version) for matrix-vector multiplication.
|
||||||
//
|
//
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
|
@ -27,56 +27,11 @@ R"(
|
||||||
#ifndef WPT1
|
#ifndef WPT1
|
||||||
#define WPT1 1 // The amount of work-per-thread
|
#define WPT1 1 // The amount of work-per-thread
|
||||||
#endif
|
#endif
|
||||||
|
#ifndef UNROLL1
|
||||||
// 2: For the fast version
|
#define UNROLL1 32 // Unroll factor (must be a divider of WGS1)
|
||||||
#ifndef WGS2
|
|
||||||
#define WGS2 64 // The local work-group size
|
|
||||||
#endif
|
|
||||||
#ifndef WPT2
|
|
||||||
#define WPT2 1 // The amount of work-per-thread
|
|
||||||
#endif
|
|
||||||
#ifndef VW2
|
|
||||||
#define VW2 1 // Vector width of matrix A loads
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// 3: For the fast rotated version
|
// 2 and 3: For the fast versions, see 'xgemv_fast.opencl'
|
||||||
#ifndef WGS3
|
|
||||||
#define WGS3 64 // The local work-group size
|
|
||||||
#endif
|
|
||||||
#ifndef WPT3
|
|
||||||
#define WPT3 1 // The amount of work-per-thread
|
|
||||||
#endif
|
|
||||||
#ifndef VW3
|
|
||||||
#define VW3 1 // Vector width of matrix A loads
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
// Data-widths for the 'fast' kernel
|
|
||||||
#if VW2 == 1
|
|
||||||
typedef real realVF;
|
|
||||||
#elif VW2 == 2
|
|
||||||
typedef real2 realVF;
|
|
||||||
#elif VW2 == 4
|
|
||||||
typedef real4 realVF;
|
|
||||||
#elif VW2 == 8
|
|
||||||
typedef real8 realVF;
|
|
||||||
#elif VW2 == 16
|
|
||||||
typedef real16 realVF;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Data-widths for the 'fast' kernel with rotated matrix
|
|
||||||
#if VW3 == 1
|
|
||||||
typedef real realVFR;
|
|
||||||
#elif VW3 == 2
|
|
||||||
typedef real2 realVFR;
|
|
||||||
#elif VW3 == 4
|
|
||||||
typedef real4 realVFR;
|
|
||||||
#elif VW3 == 8
|
|
||||||
typedef real8 realVFR;
|
|
||||||
#elif VW3 == 16
|
|
||||||
typedef real16 realVFR;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
|
@ -252,18 +207,6 @@ inline real LoadMatrixA(const __global real* restrict agm, const int x, const in
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Loads a vector input value (1/2)
|
|
||||||
inline realVF LoadMatrixAVF(const __global realVF* restrict agm, const int x, const int y,
|
|
||||||
const int a_ld) {
|
|
||||||
return agm[a_ld*y + x];
|
|
||||||
}
|
|
||||||
|
|
||||||
// Loads a vector input value (2/2): as before, but different data-type
|
|
||||||
inline realVFR LoadMatrixAVFR(const __global realVFR* restrict agm, const int x, const int y,
|
|
||||||
const int a_ld) {
|
|
||||||
return agm[a_ld*y + x];
|
|
||||||
}
|
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
// Full version of the kernel
|
// Full version of the kernel
|
||||||
|
@ -301,28 +244,31 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
// Loops over the work per thread, and checks whether in bounds
|
// Loops over the work per thread, and checks whether in bounds
|
||||||
#pragma unroll
|
|
||||||
for (int w=0; w<WPT1; ++w) {
|
for (int w=0; w<WPT1; ++w) {
|
||||||
const int gid = w*get_global_size(0) + get_global_id(0);
|
const int gid = w*get_global_size(0) + get_global_id(0);
|
||||||
if (gid < m) {
|
if (gid < m) {
|
||||||
|
|
||||||
// The multiply-add function for the main part (divisible by WGS1)
|
// The multiply-add function for the main part (divisible by WGS1)
|
||||||
if (a_rotated == 0) { // Not rotated
|
if (a_rotated == 0) { // Not rotated
|
||||||
#pragma unroll
|
for (int kloop=0; kloop<WGS1; kloop+=UNROLL1) {
|
||||||
for (int kloop=0; kloop<WGS1; ++kloop) {
|
#pragma unroll
|
||||||
const int k = kwg + kloop;
|
for (int kunroll=0; kunroll<UNROLL1; ++kunroll) {
|
||||||
real value = LoadMatrixA(agm, gid, k, a_ld, a_offset, parameter, kl, ku);
|
const int k = kwg + kloop + kunroll;
|
||||||
if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
|
real value = LoadMatrixA(agm, gid, k, a_ld, a_offset, parameter, kl, ku);
|
||||||
MultiplyAdd(acc[w], xlm[kloop], value);
|
if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
|
||||||
|
MultiplyAdd(acc[w], xlm[kloop + kunroll], value);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else { // Transposed
|
else { // Transposed
|
||||||
#pragma unroll
|
for (int kloop=0; kloop<WGS1; kloop+=UNROLL1) {
|
||||||
for (int kloop=0; kloop<WGS1; ++kloop) {
|
#pragma unroll
|
||||||
const int k = kwg + kloop;
|
for (int kunroll=0; kunroll<UNROLL1; ++kunroll) {
|
||||||
real value = LoadMatrixA(agm, k, gid, a_ld, a_offset, parameter, kl, ku);
|
const int k = kwg + kloop + kunroll;
|
||||||
if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
|
real value = LoadMatrixA(agm, k, gid, a_ld, a_offset, parameter, kl, ku);
|
||||||
MultiplyAdd(acc[w], xlm[kloop], value);
|
if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
|
||||||
|
MultiplyAdd(acc[w], xlm[kloop + kunroll], value);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -365,200 +311,6 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
// Faster version of the kernel, assuming that:
|
|
||||||
// --> 'm' and 'n' are multiples of WGS2
|
|
||||||
// --> 'a_offset' is 0
|
|
||||||
// --> 'a_ld' is a multiple of VW2
|
|
||||||
// --> 'a_rotated' is 0
|
|
||||||
// --> 'do_conjugate' is 0
|
|
||||||
__attribute__((reqd_work_group_size(WGS2, 1, 1)))
|
|
||||||
__kernel void XgemvFast(const int m, const int n, const real alpha, const real beta,
|
|
||||||
const int a_rotated,
|
|
||||||
const __global realVF* restrict agm, const int a_offset, const int a_ld,
|
|
||||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
|
||||||
__global real* ygm, const int y_offset, const int y_inc,
|
|
||||||
const int do_conjugate, const int parameter,
|
|
||||||
const int kl, const int ku) {
|
|
||||||
// Local memory for the vector X
|
|
||||||
__local real xlm[WGS2];
|
|
||||||
|
|
||||||
// Initializes the accumulation register
|
|
||||||
real acc[WPT2];
|
|
||||||
#pragma unroll
|
|
||||||
for (int w=0; w<WPT2; ++w) {
|
|
||||||
SetToZero(acc[w]);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Loops over work-group sized portions of the work
|
|
||||||
for (int kwg=0; kwg<n; kwg+=WGS2) {
|
|
||||||
|
|
||||||
// Loads the vector X into local memory
|
|
||||||
const int lid = get_local_id(0);
|
|
||||||
xlm[lid] = xgm[(kwg + lid)*x_inc + x_offset];
|
|
||||||
|
|
||||||
// Synchronizes all threads in a workgroup
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
|
|
||||||
// The multiply-add function (not rotated)
|
|
||||||
#pragma unroll
|
|
||||||
for (int kl=0; kl<WGS2; ++kl) {
|
|
||||||
const int k = kwg + kl;
|
|
||||||
#pragma unroll
|
|
||||||
for (int w=0; w<WPT2/VW2; ++w) {
|
|
||||||
const int gid = (WPT2/VW2)*get_global_id(0) + w;
|
|
||||||
realVF avec = LoadMatrixAVF(agm, gid, k, a_ld/VW2);
|
|
||||||
#if VW2 == 1
|
|
||||||
MultiplyAdd(acc[VW2*w+0], xlm[kl], avec);
|
|
||||||
#elif VW2 == 2
|
|
||||||
MultiplyAdd(acc[VW2*w+0], xlm[kl], avec.x);
|
|
||||||
MultiplyAdd(acc[VW2*w+1], xlm[kl], avec.y);
|
|
||||||
#elif VW2 == 4
|
|
||||||
MultiplyAdd(acc[VW2*w+0], xlm[kl], avec.x);
|
|
||||||
MultiplyAdd(acc[VW2*w+1], xlm[kl], avec.y);
|
|
||||||
MultiplyAdd(acc[VW2*w+2], xlm[kl], avec.z);
|
|
||||||
MultiplyAdd(acc[VW2*w+3], xlm[kl], avec.w);
|
|
||||||
#elif VW2 == 8
|
|
||||||
MultiplyAdd(acc[VW2*w+0], xlm[kl], avec.s0);
|
|
||||||
MultiplyAdd(acc[VW2*w+1], xlm[kl], avec.s1);
|
|
||||||
MultiplyAdd(acc[VW2*w+2], xlm[kl], avec.s2);
|
|
||||||
MultiplyAdd(acc[VW2*w+3], xlm[kl], avec.s3);
|
|
||||||
MultiplyAdd(acc[VW2*w+4], xlm[kl], avec.s4);
|
|
||||||
MultiplyAdd(acc[VW2*w+5], xlm[kl], avec.s5);
|
|
||||||
MultiplyAdd(acc[VW2*w+6], xlm[kl], avec.s6);
|
|
||||||
MultiplyAdd(acc[VW2*w+7], xlm[kl], avec.s7);
|
|
||||||
#elif VW2 == 16
|
|
||||||
MultiplyAdd(acc[VW2*w+0], xlm[kl], avec.s0);
|
|
||||||
MultiplyAdd(acc[VW2*w+1], xlm[kl], avec.s1);
|
|
||||||
MultiplyAdd(acc[VW2*w+2], xlm[kl], avec.s2);
|
|
||||||
MultiplyAdd(acc[VW2*w+3], xlm[kl], avec.s3);
|
|
||||||
MultiplyAdd(acc[VW2*w+4], xlm[kl], avec.s4);
|
|
||||||
MultiplyAdd(acc[VW2*w+5], xlm[kl], avec.s5);
|
|
||||||
MultiplyAdd(acc[VW2*w+6], xlm[kl], avec.s6);
|
|
||||||
MultiplyAdd(acc[VW2*w+7], xlm[kl], avec.s7);
|
|
||||||
MultiplyAdd(acc[VW2*w+8], xlm[kl], avec.s8);
|
|
||||||
MultiplyAdd(acc[VW2*w+9], xlm[kl], avec.s9);
|
|
||||||
MultiplyAdd(acc[VW2*w+10], xlm[kl], avec.sA);
|
|
||||||
MultiplyAdd(acc[VW2*w+11], xlm[kl], avec.sB);
|
|
||||||
MultiplyAdd(acc[VW2*w+12], xlm[kl], avec.sC);
|
|
||||||
MultiplyAdd(acc[VW2*w+13], xlm[kl], avec.sD);
|
|
||||||
MultiplyAdd(acc[VW2*w+14], xlm[kl], avec.sE);
|
|
||||||
MultiplyAdd(acc[VW2*w+15], xlm[kl], avec.sF);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Synchronizes all threads in a workgroup
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Stores the final result
|
|
||||||
#pragma unroll
|
|
||||||
for (int w=0; w<WPT2; ++w) {
|
|
||||||
const int gid = WPT2*get_global_id(0) + w;
|
|
||||||
real yval = ygm[gid*y_inc + y_offset];
|
|
||||||
AXPBY(ygm[gid*y_inc + y_offset], alpha, acc[w], beta, yval);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
// Faster version of the kernel, assuming that:
|
|
||||||
// --> 'm' and 'n' are multiples of WGS3
|
|
||||||
// --> 'a_offset' is 0
|
|
||||||
// --> 'a_ld' is a multiple of VW3
|
|
||||||
// --> 'a_rotated' is 1
|
|
||||||
// --> 'do_conjugate' is 0
|
|
||||||
__attribute__((reqd_work_group_size(WGS3, 1, 1)))
|
|
||||||
__kernel void XgemvFastRot(const int m, const int n, const real alpha, const real beta,
|
|
||||||
const int a_rotated,
|
|
||||||
const __global realVFR* restrict agm, const int a_offset, const int a_ld,
|
|
||||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
|
||||||
__global real* ygm, const int y_offset, const int y_inc,
|
|
||||||
const int do_conjugate, const int parameter,
|
|
||||||
const int kl, const int ku) {
|
|
||||||
// Local memory for the vector X
|
|
||||||
__local real xlm[WGS3];
|
|
||||||
|
|
||||||
// Initializes the accumulation register
|
|
||||||
real acc[WPT3];
|
|
||||||
#pragma unroll
|
|
||||||
for (int w=0; w<WPT3; ++w) {
|
|
||||||
SetToZero(acc[w]);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Loops over work-group sized portions of the work
|
|
||||||
for (int kwg=0; kwg<n; kwg+=WGS3) {
|
|
||||||
|
|
||||||
// Loads the vector X into local memory
|
|
||||||
const int lid = get_local_id(0);
|
|
||||||
xlm[lid] = xgm[(kwg + lid)*x_inc + x_offset];
|
|
||||||
|
|
||||||
// Synchronizes all threads in a workgroup
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
|
|
||||||
// The multiply-add function (rotated)
|
|
||||||
#pragma unroll
|
|
||||||
for (int kl=0; kl<WGS3/VW3; ++kl) {
|
|
||||||
const int k = (kwg/VW3) + kl;
|
|
||||||
#pragma unroll
|
|
||||||
for (int w=0; w<WPT3; ++w) {
|
|
||||||
const int gid = WPT3*get_global_id(0) + w;
|
|
||||||
realVFR avec = LoadMatrixAVFR(agm, k, gid, a_ld/VW3);
|
|
||||||
#if VW3 == 1
|
|
||||||
MultiplyAdd(acc[w], xlm[VW3*kl+0], avec);
|
|
||||||
#elif VW3 == 2
|
|
||||||
MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.x);
|
|
||||||
MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.y);
|
|
||||||
#elif VW3 == 4
|
|
||||||
MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.x);
|
|
||||||
MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.y);
|
|
||||||
MultiplyAdd(acc[w], xlm[VW3*kl+2], avec.z);
|
|
||||||
MultiplyAdd(acc[w], xlm[VW3*kl+3], avec.w);
|
|
||||||
#elif VW3 == 8
|
|
||||||
MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.s0);
|
|
||||||
MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.s1);
|
|
||||||
MultiplyAdd(acc[w], xlm[VW3*kl+2], avec.s2);
|
|
||||||
MultiplyAdd(acc[w], xlm[VW3*kl+3], avec.s3);
|
|
||||||
MultiplyAdd(acc[w], xlm[VW3*kl+4], avec.s4);
|
|
||||||
MultiplyAdd(acc[w], xlm[VW3*kl+5], avec.s5);
|
|
||||||
MultiplyAdd(acc[w], xlm[VW3*kl+6], avec.s6);
|
|
||||||
MultiplyAdd(acc[w], xlm[VW3*kl+7], avec.s7);
|
|
||||||
#elif VW3 == 16
|
|
||||||
MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.s0);
|
|
||||||
MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.s1);
|
|
||||||
MultiplyAdd(acc[w], xlm[VW3*kl+2], avec.s2);
|
|
||||||
MultiplyAdd(acc[w], xlm[VW3*kl+3], avec.s3);
|
|
||||||
MultiplyAdd(acc[w], xlm[VW3*kl+4], avec.s4);
|
|
||||||
MultiplyAdd(acc[w], xlm[VW3*kl+5], avec.s5);
|
|
||||||
MultiplyAdd(acc[w], xlm[VW3*kl+6], avec.s6);
|
|
||||||
MultiplyAdd(acc[w], xlm[VW3*kl+7], avec.s7);
|
|
||||||
MultiplyAdd(acc[w], xlm[VW3*kl+8], avec.s8);
|
|
||||||
MultiplyAdd(acc[w], xlm[VW3*kl+9], avec.s9);
|
|
||||||
MultiplyAdd(acc[w], xlm[VW3*kl+10], avec.sA);
|
|
||||||
MultiplyAdd(acc[w], xlm[VW3*kl+11], avec.sB);
|
|
||||||
MultiplyAdd(acc[w], xlm[VW3*kl+12], avec.sC);
|
|
||||||
MultiplyAdd(acc[w], xlm[VW3*kl+13], avec.sD);
|
|
||||||
MultiplyAdd(acc[w], xlm[VW3*kl+14], avec.sE);
|
|
||||||
MultiplyAdd(acc[w], xlm[VW3*kl+15], avec.sF);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Synchronizes all threads in a workgroup
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Stores the final result
|
|
||||||
#pragma unroll
|
|
||||||
for (int w=0; w<WPT3; ++w) {
|
|
||||||
const int gid = WPT3*get_global_id(0) + w;
|
|
||||||
real yval = ygm[gid*y_inc + y_offset];
|
|
||||||
AXPBY(ygm[gid*y_inc + y_offset], alpha, acc[w], beta, yval);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
// End of the C++11 raw string literal
|
// End of the C++11 raw string literal
|
||||||
)"
|
)"
|
||||||
|
|
||||||
|
|
288
src/kernels/level2/xgemv_fast.opencl
Normal file
288
src/kernels/level2/xgemv_fast.opencl
Normal file
|
@ -0,0 +1,288 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file contains the Xgemv kernel (fast versions) for matrix-vector multiplication.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
|
||||||
|
// literal). Comment-out this line for syntax-highlighting when developing.
|
||||||
|
R"(
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
|
||||||
|
// this kernel file is used outside of the CLBlast library.
|
||||||
|
|
||||||
|
// 1: For the full version, see 'xgemv.opencl'
|
||||||
|
|
||||||
|
// 2: For the fast version
|
||||||
|
#ifndef WGS2
|
||||||
|
#define WGS2 64 // The local work-group size
|
||||||
|
#endif
|
||||||
|
#ifndef WPT2
|
||||||
|
#define WPT2 1 // The amount of work-per-thread
|
||||||
|
#endif
|
||||||
|
#ifndef VW2
|
||||||
|
#define VW2 1 // Vector width of matrix A loads
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// 3: For the fast rotated version
|
||||||
|
#ifndef WGS3
|
||||||
|
#define WGS3 64 // The local work-group size
|
||||||
|
#endif
|
||||||
|
#ifndef WPT3
|
||||||
|
#define WPT3 1 // The amount of work-per-thread
|
||||||
|
#endif
|
||||||
|
#ifndef VW3
|
||||||
|
#define VW3 1 // Vector width of matrix A loads
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Data-widths for the 'fast' kernel
|
||||||
|
#if VW2 == 1
|
||||||
|
typedef real realVF;
|
||||||
|
#elif VW2 == 2
|
||||||
|
typedef real2 realVF;
|
||||||
|
#elif VW2 == 4
|
||||||
|
typedef real4 realVF;
|
||||||
|
#elif VW2 == 8
|
||||||
|
typedef real8 realVF;
|
||||||
|
#elif VW2 == 16
|
||||||
|
typedef real16 realVF;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Data-widths for the 'fast' kernel with rotated matrix
|
||||||
|
#if VW3 == 1
|
||||||
|
typedef real realVFR;
|
||||||
|
#elif VW3 == 2
|
||||||
|
typedef real2 realVFR;
|
||||||
|
#elif VW3 == 4
|
||||||
|
typedef real4 realVFR;
|
||||||
|
#elif VW3 == 8
|
||||||
|
typedef real8 realVFR;
|
||||||
|
#elif VW3 == 16
|
||||||
|
typedef real16 realVFR;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Reads one realVF element of matrix A (1/2). The element is taken from position 'y*a_ld + x' of
// 'agm', so all three values are expressed in units of realVF elements.
inline realVF LoadMatrixAVF(const __global realVF* restrict agm, const int x, const int y,
                            const int a_ld) {
  const int index = a_ld*y + x;
  return agm[index];
}
|
||||||
|
|
||||||
|
// Reads one realVFR element of matrix A (2/2). Identical to the realVF variant apart from the
// data-type: the element is taken from position 'y*a_ld + x' of 'agm'.
inline realVFR LoadMatrixAVFR(const __global realVFR* restrict agm, const int x, const int y,
                              const int a_ld) {
  const int index = a_ld*y + x;
  return agm[index];
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Faster version of the kernel (matrix A not rotated). No bounds-checking is performed, so the
// caller has to guarantee that:
// --> 'm' and 'n' are multiples of WGS2
// --> 'a_offset' is 0
// --> 'a_ld' is a multiple of VW2
// --> 'a_rotated' is 0
// --> 'do_conjugate' is 0
// The arguments 'm', 'a_rotated', 'a_offset', 'do_conjugate', 'parameter', 'kl' and 'ku' are not
// read by this kernel (presumably they are kept so that the argument list matches the regular
// Xgemv kernel -- confirm against the host code before removing any of them).
__attribute__((reqd_work_group_size(WGS2, 1, 1)))
__kernel void XgemvFast(const int m, const int n, const real alpha, const real beta,
                        const int a_rotated,
                        const __global realVF* restrict agm, const int a_offset, const int a_ld,
                        const __global real* restrict xgm, const int x_offset, const int x_inc,
                        __global real* ygm, const int y_offset, const int y_inc,
                        const int do_conjugate, const int parameter,
                        const int kl, const int ku) {
  // Local memory for the vector X
  __local real xlm[WGS2];

  // The local thread ID is the same for every iteration of the loop over 'n': look it up once
  const int lid = get_local_id(0);

  // Initializes the accumulation registers
  real acc[WPT2];
  #pragma unroll
  for (int w=0; w<WPT2; ++w) {
    SetToZero(acc[w]);
  }

  // Loops over work-group sized portions of the work
  for (int kwg=0; kwg<n; kwg+=WGS2) {

    // Loads the vector X into local memory (one element per thread)
    xlm[lid] = xgm[(kwg + lid)*x_inc + x_offset];

    // Synchronizes all threads in a workgroup: 'xlm' has to be complete before it is read
    barrier(CLK_LOCAL_MEM_FENCE);

    // The multiply-add function (not rotated). The loop counter is named 'kloop' rather than 'kl'
    // such that it does not shadow the kernel argument 'kl'.
    #pragma unroll
    for (int kloop=0; kloop<WGS2; ++kloop) {
      const int k = kwg + kloop;
      #pragma unroll
      for (int w=0; w<WPT2/VW2; ++w) {
        const int gid = (WPT2/VW2)*get_global_id(0) + w;
        realVF avec = LoadMatrixAVF(agm, gid, k, a_ld/VW2);
        #if VW2 == 1
          MultiplyAdd(acc[VW2*w+0], xlm[kloop], avec);
        #elif VW2 == 2
          MultiplyAdd(acc[VW2*w+0], xlm[kloop], avec.x);
          MultiplyAdd(acc[VW2*w+1], xlm[kloop], avec.y);
        #elif VW2 == 4
          MultiplyAdd(acc[VW2*w+0], xlm[kloop], avec.x);
          MultiplyAdd(acc[VW2*w+1], xlm[kloop], avec.y);
          MultiplyAdd(acc[VW2*w+2], xlm[kloop], avec.z);
          MultiplyAdd(acc[VW2*w+3], xlm[kloop], avec.w);
        #elif VW2 == 8
          MultiplyAdd(acc[VW2*w+0], xlm[kloop], avec.s0);
          MultiplyAdd(acc[VW2*w+1], xlm[kloop], avec.s1);
          MultiplyAdd(acc[VW2*w+2], xlm[kloop], avec.s2);
          MultiplyAdd(acc[VW2*w+3], xlm[kloop], avec.s3);
          MultiplyAdd(acc[VW2*w+4], xlm[kloop], avec.s4);
          MultiplyAdd(acc[VW2*w+5], xlm[kloop], avec.s5);
          MultiplyAdd(acc[VW2*w+6], xlm[kloop], avec.s6);
          MultiplyAdd(acc[VW2*w+7], xlm[kloop], avec.s7);
        #elif VW2 == 16
          MultiplyAdd(acc[VW2*w+0], xlm[kloop], avec.s0);
          MultiplyAdd(acc[VW2*w+1], xlm[kloop], avec.s1);
          MultiplyAdd(acc[VW2*w+2], xlm[kloop], avec.s2);
          MultiplyAdd(acc[VW2*w+3], xlm[kloop], avec.s3);
          MultiplyAdd(acc[VW2*w+4], xlm[kloop], avec.s4);
          MultiplyAdd(acc[VW2*w+5], xlm[kloop], avec.s5);
          MultiplyAdd(acc[VW2*w+6], xlm[kloop], avec.s6);
          MultiplyAdd(acc[VW2*w+7], xlm[kloop], avec.s7);
          MultiplyAdd(acc[VW2*w+8], xlm[kloop], avec.s8);
          MultiplyAdd(acc[VW2*w+9], xlm[kloop], avec.s9);
          MultiplyAdd(acc[VW2*w+10], xlm[kloop], avec.sA);
          MultiplyAdd(acc[VW2*w+11], xlm[kloop], avec.sB);
          MultiplyAdd(acc[VW2*w+12], xlm[kloop], avec.sC);
          MultiplyAdd(acc[VW2*w+13], xlm[kloop], avec.sD);
          MultiplyAdd(acc[VW2*w+14], xlm[kloop], avec.sE);
          MultiplyAdd(acc[VW2*w+15], xlm[kloop], avec.sF);
        #endif
      }
    }

    // Synchronizes all threads in a workgroup: 'xlm' is overwritten in the next iteration
    barrier(CLK_LOCAL_MEM_FENCE);
  }

  // Stores the final result: combines the accumulated value with the previous value of Y
  #pragma unroll
  for (int w=0; w<WPT2; ++w) {
    const int gid = WPT2*get_global_id(0) + w;
    real yval = ygm[gid*y_inc + y_offset];
    AXPBY(ygm[gid*y_inc + y_offset], alpha, acc[w], beta, yval);
  }
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Faster version of the kernel (matrix A rotated). No bounds-checking is performed, so the caller
// has to guarantee that:
// --> 'm' and 'n' are multiples of WGS3
// --> 'a_offset' is 0
// --> 'a_ld' is a multiple of VW3
// --> 'a_rotated' is 1
// --> 'do_conjugate' is 0
// The arguments 'm', 'a_rotated', 'a_offset', 'do_conjugate', 'parameter', 'kl' and 'ku' are not
// read by this kernel (presumably they are kept so that the argument list matches the regular
// Xgemv kernel -- confirm against the host code before removing any of them).
__attribute__((reqd_work_group_size(WGS3, 1, 1)))
__kernel void XgemvFastRot(const int m, const int n, const real alpha, const real beta,
                           const int a_rotated,
                           const __global realVFR* restrict agm, const int a_offset, const int a_ld,
                           const __global real* restrict xgm, const int x_offset, const int x_inc,
                           __global real* ygm, const int y_offset, const int y_inc,
                           const int do_conjugate, const int parameter,
                           const int kl, const int ku) {
  // Local memory for the vector X
  __local real xlm[WGS3];

  // The local thread ID is the same for every iteration of the loop over 'n': look it up once
  const int lid = get_local_id(0);

  // Initializes the accumulation registers
  real acc[WPT3];
  #pragma unroll
  for (int w=0; w<WPT3; ++w) {
    SetToZero(acc[w]);
  }

  // Loops over work-group sized portions of the work
  for (int kwg=0; kwg<n; kwg+=WGS3) {

    // Loads the vector X into local memory (one element per thread)
    xlm[lid] = xgm[(kwg + lid)*x_inc + x_offset];

    // Synchronizes all threads in a workgroup: 'xlm' has to be complete before it is read
    barrier(CLK_LOCAL_MEM_FENCE);

    // The multiply-add function (rotated). The loop counter is named 'kloop' rather than 'kl'
    // such that it does not shadow the kernel argument 'kl'. Each iteration consumes VW3
    // consecutive values of X.
    #pragma unroll
    for (int kloop=0; kloop<WGS3/VW3; ++kloop) {
      const int k = (kwg/VW3) + kloop;
      #pragma unroll
      for (int w=0; w<WPT3; ++w) {
        const int gid = WPT3*get_global_id(0) + w;
        realVFR avec = LoadMatrixAVFR(agm, k, gid, a_ld/VW3);
        #if VW3 == 1
          MultiplyAdd(acc[w], xlm[VW3*kloop+0], avec);
        #elif VW3 == 2
          MultiplyAdd(acc[w], xlm[VW3*kloop+0], avec.x);
          MultiplyAdd(acc[w], xlm[VW3*kloop+1], avec.y);
        #elif VW3 == 4
          MultiplyAdd(acc[w], xlm[VW3*kloop+0], avec.x);
          MultiplyAdd(acc[w], xlm[VW3*kloop+1], avec.y);
          MultiplyAdd(acc[w], xlm[VW3*kloop+2], avec.z);
          MultiplyAdd(acc[w], xlm[VW3*kloop+3], avec.w);
        #elif VW3 == 8
          MultiplyAdd(acc[w], xlm[VW3*kloop+0], avec.s0);
          MultiplyAdd(acc[w], xlm[VW3*kloop+1], avec.s1);
          MultiplyAdd(acc[w], xlm[VW3*kloop+2], avec.s2);
          MultiplyAdd(acc[w], xlm[VW3*kloop+3], avec.s3);
          MultiplyAdd(acc[w], xlm[VW3*kloop+4], avec.s4);
          MultiplyAdd(acc[w], xlm[VW3*kloop+5], avec.s5);
          MultiplyAdd(acc[w], xlm[VW3*kloop+6], avec.s6);
          MultiplyAdd(acc[w], xlm[VW3*kloop+7], avec.s7);
        #elif VW3 == 16
          MultiplyAdd(acc[w], xlm[VW3*kloop+0], avec.s0);
          MultiplyAdd(acc[w], xlm[VW3*kloop+1], avec.s1);
          MultiplyAdd(acc[w], xlm[VW3*kloop+2], avec.s2);
          MultiplyAdd(acc[w], xlm[VW3*kloop+3], avec.s3);
          MultiplyAdd(acc[w], xlm[VW3*kloop+4], avec.s4);
          MultiplyAdd(acc[w], xlm[VW3*kloop+5], avec.s5);
          MultiplyAdd(acc[w], xlm[VW3*kloop+6], avec.s6);
          MultiplyAdd(acc[w], xlm[VW3*kloop+7], avec.s7);
          MultiplyAdd(acc[w], xlm[VW3*kloop+8], avec.s8);
          MultiplyAdd(acc[w], xlm[VW3*kloop+9], avec.s9);
          MultiplyAdd(acc[w], xlm[VW3*kloop+10], avec.sA);
          MultiplyAdd(acc[w], xlm[VW3*kloop+11], avec.sB);
          MultiplyAdd(acc[w], xlm[VW3*kloop+12], avec.sC);
          MultiplyAdd(acc[w], xlm[VW3*kloop+13], avec.sD);
          MultiplyAdd(acc[w], xlm[VW3*kloop+14], avec.sE);
          MultiplyAdd(acc[w], xlm[VW3*kloop+15], avec.sF);
        #endif
      }
    }

    // Synchronizes all threads in a workgroup: 'xlm' is overwritten in the next iteration
    barrier(CLK_LOCAL_MEM_FENCE);
  }

  // Stores the final result: combines the accumulated value with the previous value of Y
  #pragma unroll
  for (int w=0; w<WPT3; ++w) {
    const int gid = WPT3*get_global_id(0) + w;
    real yval = ygm[gid*y_inc + y_offset];
    AXPBY(ygm[gid*y_inc + y_offset], alpha, acc[w], beta, yval);
  }
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// End of the C++11 raw string literal
|
||||||
|
)"
|
||||||
|
|
||||||
|
// =================================================================================================
|
106
src/kernels/level2/xger.opencl
Normal file
106
src/kernels/level2/xger.opencl
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file contains the Xger kernels for rank-1 matrix update.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
|
||||||
|
// literal). Comment-out this line for syntax-highlighting when developing.
|
||||||
|
R"(
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Rank-1 matrix update kernel, regular version (GER, GERU, GERC). Every thread first gathers the
// WPT elements of X and Y it needs into registers and then updates WPT*WPT elements of A.
__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
__kernel void Xger(const int max1, const int max2, const real alpha,
                   const __global real* restrict xgm, const int x_offset, const int x_inc,
                   const __global real* ygm, const int y_offset, const int y_inc,
                   __global real* restrict agm, const int a_offset, const int a_ld,
                   const int is_rowmajor) {

  // Per-thread register copies of the required parts of X and Y
  real xvalues[WPT];
  real yvalues[WPT];

  // Position of this thread in the global range and the extent of that range
  const size_t thread0 = get_global_id(0);
  const size_t thread1 = get_global_id(1);
  const size_t threads0 = get_global_size(0);
  const size_t threads1 = get_global_size(1);

  if (is_rowmajor) {

    // Row-major: X follows the second dimension (limit 'max2'), Y the first (limit 'max1')
    #pragma unroll
    for (int w=0; w<WPT; ++w) {
      const int xid = w*threads1 + thread1;
      const int yid = w*threads0 + thread0;
      xvalues[w] = LoadVector(xid, max2, xgm, x_offset, x_inc, false);
      yvalues[w] = LoadVector(yid, max1, ygm, y_offset, y_inc, true);
    }

    // Visits all WPT*WPT elements of A assigned to this thread
    #pragma unroll
    for (int w1=0; w1<WPT; ++w1) {
      const int id1 = w1*threads0 + thread0;
      #pragma unroll
      for (int w2=0; w2<WPT; ++w2) {
        const int id2 = w2*threads1 + thread1;

        // Reads the element of A, applies the update, and writes it back
        MatrixUpdate(id1, id2, max1, max2, agm, a_offset, a_ld,
                     alpha, xvalues[w2], yvalues[w1], false);
      }
    }
  }
  else {

    // Col-major: X follows the first dimension (limit 'max1'), Y the second (limit 'max2')
    #pragma unroll
    for (int w=0; w<WPT; ++w) {
      const int xid = w*threads0 + thread0;
      const int yid = w*threads1 + thread1;
      xvalues[w] = LoadVector(xid, max1, xgm, x_offset, x_inc, false);
      yvalues[w] = LoadVector(yid, max2, ygm, y_offset, y_inc, true);
    }

    // Visits all WPT*WPT elements of A assigned to this thread
    #pragma unroll
    for (int w1=0; w1<WPT; ++w1) {
      const int id1 = w1*threads0 + thread0;
      #pragma unroll
      for (int w2=0; w2<WPT; ++w2) {
        const int id2 = w2*threads1 + thread1;

        // Reads the element of A, applies the update, and writes it back
        MatrixUpdate(id1, id2, max1, max2, agm, a_offset, a_ld,
                     alpha, xvalues[w1], yvalues[w2], false);
      }
    }
  }
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// End of the C++11 raw string literal
|
||||||
|
)"
|
||||||
|
|
||||||
|
// =================================================================================================
|
73
src/kernels/level2/xher.opencl
Normal file
73
src/kernels/level2/xher.opencl
Normal file
|
@ -0,0 +1,73 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file contains the Xher kernels for rank-1 matrix update.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
|
||||||
|
// literal). Comment-out this line for syntax-highlighting when developing.
|
||||||
|
R"(
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Rank-1 matrix update kernel, symmetric version (HER, HPR, SYR, SPR). Only the elements of A
// inside the triangle selected by 'is_upper' are updated.
__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
__kernel void Xher(const int n, const real alpha,
                   const __global real* restrict xgm, const int x_offset, const int x_inc,
                   __global real* restrict agm, const int a_offset, const int a_ld,
                   const int is_upper, const int is_rowmajor) {

  // Per-thread register copies of X (second dimension) and X-transposed (first dimension)
  real xvalues[WPT];
  real xtvalues[WPT];

  // Position of this thread in the global range and the extent of that range
  const size_t thread0 = get_global_id(0);
  const size_t thread1 = get_global_id(1);
  const size_t threads0 = get_global_size(0);
  const size_t threads1 = get_global_size(1);

  // Gathers both views of the X-vector; they are loaded with opposite values for the last
  // argument of LoadVector
  #pragma unroll
  for (int w=0; w<WPT; ++w) {
    const int xid = w*threads1 + thread1;
    const int xtid = w*threads0 + thread0;
    xvalues[w] = LoadVector(xid, n, xgm, x_offset, x_inc, !is_rowmajor);
    xtvalues[w] = LoadVector(xtid, n, xgm, x_offset, x_inc, is_rowmajor);
  }

  // Visits all WPT*WPT elements of A assigned to this thread
  #pragma unroll
  for (int w1=0; w1<WPT; ++w1) {
    const int id1 = w1*threads0 + thread0;
    #pragma unroll
    for (int w2=0; w2<WPT; ++w2) {
      const int id2 = w2*threads1 + thread1;

      // An element takes part in the update only when it lies inside the selected triangle
      // (the diagonal is always included)
      const int in_triangle = (is_upper) ? (id1 <= id2) : (id2 <= id1);
      if (in_triangle) {

        // Reads the element of A, applies the update, and writes it back
        MatrixUpdate(id1, id2, n, n, agm, a_offset, a_ld, alpha, xvalues[w2], xtvalues[w1], is_upper);
      }
    }
  }
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// End of the C++11 raw string literal
|
||||||
|
)"
|
||||||
|
|
||||||
|
// =================================================================================================
|
104
src/kernels/level2/xher2.opencl
Normal file
104
src/kernels/level2/xher2.opencl
Normal file
|
@ -0,0 +1,104 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file contains the Xher2 kernels for rank-2 matrix update.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
|
||||||
|
// literal). Comment-out this line for syntax-highlighting when developing.
|
||||||
|
R"(
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Rank-2 matrix update kernel, symmetric version (HER2, HPR2, SYR2, SPR2). Only the elements of A
// inside the triangle selected by 'is_upper' are updated.
__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
__kernel void Xher2(const int n, const real alpha,
                    const __global real* restrict xgm, const int x_offset, const int x_inc,
                    const __global real* restrict ygm, const int y_offset, const int y_inc,
                    __global real* restrict agm, const int a_offset, const int a_ld,
                    const int is_upper, const int is_rowmajor) {

  // Per-thread register copies of X and Y, plus their transposed counterparts
  real xvalues[WPT];
  real yvalues[WPT];
  real xtvalues[WPT];
  real ytvalues[WPT];

  // Position of this thread in the global range and the extent of that range
  const size_t thread0 = get_global_id(0);
  const size_t thread1 = get_global_id(1);
  const size_t threads0 = get_global_size(0);
  const size_t threads1 = get_global_size(1);

  // Gathers the vectors: X and Y-transposed follow the second dimension, X-transposed and Y the
  // first dimension
  #pragma unroll
  for (int w=0; w<WPT; ++w) {
    const int dim1_id = w*threads1 + thread1;
    const int dim0_id = w*threads0 + thread0;
    xvalues[w] = LoadVector(dim1_id, n, xgm, x_offset, x_inc, !is_rowmajor);
    xtvalues[w] = LoadVector(dim0_id, n, xgm, x_offset, x_inc, is_rowmajor);
    yvalues[w] = LoadVector(dim0_id, n, ygm, y_offset, y_inc, is_rowmajor);
    ytvalues[w] = LoadVector(dim1_id, n, ygm, y_offset, y_inc, !is_rowmajor);
  }

  // Two copies of alpha: for the Hermitian routines exactly one of them is conjugated, which one
  // depends on the memory layout
  real alpha1 = alpha;
  real alpha2 = alpha;
  #if defined(ROUTINE_HER2) || defined(ROUTINE_HPR2)
    if (is_rowmajor) {
      COMPLEX_CONJUGATE(alpha1);
    }
    else {
      COMPLEX_CONJUGATE(alpha2);
    }
  #endif

  // Visits all WPT*WPT elements of A assigned to this thread
  #pragma unroll
  for (int w1=0; w1<WPT; ++w1) {
    const int id1 = w1*threads0 + thread0;
    #pragma unroll
    for (int w2=0; w2<WPT; ++w2) {
      const int id2 = w2*threads1 + thread1;

      // An element takes part in the update only when it lies inside the selected triangle
      // (the diagonal is always included)
      const int in_triangle = (is_upper) ? (id1 <= id2) : (id2 <= id1);
      if (in_triangle) {

        // Reads the element of A, applies both updates, and writes it back
        MatrixUpdate2(id1, id2, n, n, agm, a_offset, a_ld,
                      alpha1, xvalues[w2], yvalues[w1],
                      alpha2, xtvalues[w1], ytvalues[w2], is_upper);
      }
    }
  }
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// End of the C++11 raw string literal
|
||||||
|
)"
|
||||||
|
|
||||||
|
// =================================================================================================
|
329
src/kernels/level3/xgemm_part1.opencl
Normal file
329
src/kernels/level3/xgemm_part1.opencl
Normal file
|
@ -0,0 +1,329 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file contains an optimized matrix-multiplication kernel according to the paper by Matsumoto
|
||||||
|
// et al. and the tutorial on http://www.cedricnugteren.nl/tutorial.php. It is fully configurable
|
||||||
|
// (and tunable!) using more or less the same parameters/naming conventions as in the paper. It
|
||||||
|
// supports single and double precision (SGEMM/DGEMM) through a pre-processor define.
|
||||||
|
//
|
||||||
|
// Matrices are accessed as follows:
|
||||||
|
// A: [k*M + m], with 'k' ranging from 0:K and 'm' from 0:M (m,k,m)
|
||||||
|
// B: [k*N + n], with 'k' ranging from 0:K and 'n' from 0:N (n,k,n)
|
||||||
|
// C: [n*M + m], with 'n' ranging from 0:N and 'm' from 0:M (m,n,m)
|
||||||
|
//
|
||||||
|
// Or as an image (assuming column-major)
|
||||||
|
// K
|
||||||
|
// o-------o
|
||||||
|
// | |
|
||||||
|
// N | [B^T] |
|
||||||
|
// | |
|
||||||
|
// o-------o
|
||||||
|
// K N
|
||||||
|
// o-------o o-----o
|
||||||
|
// M | [A] | M | [C] |
|
||||||
|
// | | | |
|
||||||
|
// o-------o o-----o
|
||||||
|
//
|
||||||
|
//
|
||||||
|
// This kernel is separated into two files. This is part 1 out of 2.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
|
||||||
|
// literal). Comment-out this line for syntax-highlighting when developing.
|
||||||
|
R"(
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
|
||||||
|
// this kernel file is used outside of the CLBlast library.
|
||||||
|
#ifndef MWG
|
||||||
|
#define MWG 8 // Tile-size in dimension M (e.g. 64, 128)
|
||||||
|
#endif
|
||||||
|
#ifndef NWG
|
||||||
|
#define NWG 8 // Tile-size in dimension N (e.g. 64, 128)
|
||||||
|
#endif
|
||||||
|
#ifndef KWG
|
||||||
|
#define KWG 8 // Tile-size in dimension K (e.g. 8, 16)
|
||||||
|
#endif
|
||||||
|
#ifndef MDIMC
|
||||||
|
#define MDIMC 8 // Threads per workgroup in M-dimension (e.g. 8, 16, 32)
|
||||||
|
#endif
|
||||||
|
#ifndef NDIMC
|
||||||
|
#define NDIMC 8 // Threads per workgroup in N-dimension (e.g. 8, 16, 32)
|
||||||
|
#endif
|
||||||
|
#ifndef MDIMA
|
||||||
|
#define MDIMA 8 // Re-shaped tile dimension of matrix A: KDIMA * MDIMA
|
||||||
|
#endif
|
||||||
|
#ifndef NDIMB
|
||||||
|
#define NDIMB 8 // Re-shaped tile dimension of matrix B: KDIMB * NDIMB
|
||||||
|
#endif
|
||||||
|
#ifndef KWI
|
||||||
|
#define KWI 1 // Unroll factor of the KWG loop (smaller than or equal to KWG)
|
||||||
|
#endif
|
||||||
|
#ifndef VWM
|
||||||
|
#define VWM 1 // Vector width of matrices A and C
|
||||||
|
#endif
|
||||||
|
#ifndef VWN
|
||||||
|
#define VWN 1 // Vector width of matrix B
|
||||||
|
#endif
|
||||||
|
#ifndef STRM
|
||||||
|
#define STRM 0 // Use strided access within a thread in the M-dimension (1) or not (0)
|
||||||
|
#endif
|
||||||
|
#ifndef STRN
|
||||||
|
#define STRN 0 // Use strided access within a thread in the N-dimension (1) or not (0)
|
||||||
|
#endif
|
||||||
|
#ifndef SA
|
||||||
|
#define SA 0 // Use local/shared memory to cache matrix A (1) or not (0)
|
||||||
|
#endif
|
||||||
|
#ifndef SB
|
||||||
|
#define SB 0 // Use local/shared memory to cache matrix B (1) or not (0)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Helper parameters based on the above tuning parameters
|
||||||
|
#define MWI (MWG/MDIMC) // Work per work-item (M-dimension)
|
||||||
|
#define NWI (NWG/NDIMC) // Work per work-item (N-dimension)
|
||||||
|
#define KDIMA ((MDIMC*NDIMC)/(MDIMA)) // Re-shaped tile dimension of matrix A: KDIMA * MDIMA
|
||||||
|
#define KDIMB ((MDIMC*NDIMC)/(NDIMB)) // Re-shaped tile dimension of matrix B: KDIMB * NDIMB
|
||||||
|
#define MWA (MWG/MDIMA) // Amount of loads-per-thread for matrix A (M-dimension)
|
||||||
|
#define KWA (KWG/KDIMA) // Amount of loads-per-thread for matrix A (K-dimension)
|
||||||
|
#define KWB (KWG/KDIMB) // Amount of loads-per-thread for matrix B (K-dimension)
|
||||||
|
#define NWB (NWG/NDIMB) // Amount of loads-per-thread for matrix B (N-dimension)
|
||||||
|
|
||||||
|
// Settings
|
||||||
|
#define USE_VECTOR_MAD 0 // Manually unroll the vector MAD (0) or don't unroll it (1)
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Data-widths in dimension M
|
||||||
|
#if VWM == 1
|
||||||
|
typedef real realM;
|
||||||
|
#elif VWM == 2
|
||||||
|
typedef real2 realM;
|
||||||
|
#elif VWM == 4
|
||||||
|
typedef real4 realM;
|
||||||
|
#elif VWM == 8
|
||||||
|
typedef real8 realM;
|
||||||
|
#elif VWM == 16
|
||||||
|
typedef real16 realM;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Data-widths in dimension N
|
||||||
|
#if VWN == 1
|
||||||
|
typedef real realN;
|
||||||
|
#elif VWN == 2
|
||||||
|
typedef real2 realN;
|
||||||
|
#elif VWN == 4
|
||||||
|
typedef real4 realN;
|
||||||
|
#elif VWN == 8
|
||||||
|
typedef real8 realN;
|
||||||
|
#elif VWN == 16
|
||||||
|
typedef real16 realN;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Clears the accumulation registers: every component of every element of 'cpm' is passed to
// SetToZero, one vector component at a time.
inline void InitAccRegisters(realM cpm[NWI][MWI/VWM]) {
  #pragma unroll
  for (int n_idx=0; n_idx<NWI; ++n_idx) {
    #pragma unroll
    for (int m_idx=0; m_idx<MWI/VWM; ++m_idx) {
      #if VWM == 1
        SetToZero(cpm[n_idx][m_idx]);
      #elif VWM == 2
        SetToZero(cpm[n_idx][m_idx].x);
        SetToZero(cpm[n_idx][m_idx].y);
      #elif VWM == 4
        SetToZero(cpm[n_idx][m_idx].x);
        SetToZero(cpm[n_idx][m_idx].y);
        SetToZero(cpm[n_idx][m_idx].z);
        SetToZero(cpm[n_idx][m_idx].w);
      #elif VWM == 8
        SetToZero(cpm[n_idx][m_idx].s0);
        SetToZero(cpm[n_idx][m_idx].s1);
        SetToZero(cpm[n_idx][m_idx].s2);
        SetToZero(cpm[n_idx][m_idx].s3);
        SetToZero(cpm[n_idx][m_idx].s4);
        SetToZero(cpm[n_idx][m_idx].s5);
        SetToZero(cpm[n_idx][m_idx].s6);
        SetToZero(cpm[n_idx][m_idx].s7);
      #elif VWM == 16
        SetToZero(cpm[n_idx][m_idx].s0);
        SetToZero(cpm[n_idx][m_idx].s1);
        SetToZero(cpm[n_idx][m_idx].s2);
        SetToZero(cpm[n_idx][m_idx].s3);
        SetToZero(cpm[n_idx][m_idx].s4);
        SetToZero(cpm[n_idx][m_idx].s5);
        SetToZero(cpm[n_idx][m_idx].s6);
        SetToZero(cpm[n_idx][m_idx].s7);
        SetToZero(cpm[n_idx][m_idx].s8);
        SetToZero(cpm[n_idx][m_idx].s9);
        SetToZero(cpm[n_idx][m_idx].sA);
        SetToZero(cpm[n_idx][m_idx].sB);
        SetToZero(cpm[n_idx][m_idx].sC);
        SetToZero(cpm[n_idx][m_idx].sD);
        SetToZero(cpm[n_idx][m_idx].sE);
        SetToZero(cpm[n_idx][m_idx].sF);
      #endif
    }
  }
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Copies a tile of the A input matrix from global (off-chip) memory into local (shared) on-chip
// memory. Each thread copies (MWA/VWM) * KWA elements of type realM. Only compiled when the
// local-memory cache for A is enabled (SA == 1).
#if SA == 1
inline void GlobalToLocalA(const __global realM* restrict agm, __local realM* alm,
                           const int kSizeM, const int tid, const int kwg) {

  // Splits the thread ID into an M-part and a K-part according to the re-shaped tile
  const int la0 = tid % MDIMA;
  const int la1 = tid / MDIMA;

  #pragma unroll
  for (int mia=0; mia<MWA/VWM; ++mia) {

    // M-index within the tile, for strided or non-strided access
    #if STRM == 0
      const int mg = mia + la0*(MWA/VWM);
    #elif STRM == 1
      const int mg = la0 + mia*MDIMA;
    #endif

    // M-index within the whole matrix: the tile index plus the offset of this work-group
    const int idm = mg + get_group_id(0)*(MWG/VWM);

    #pragma unroll
    for (int kia=0; kia<KWA; ++kia) {

      // K-index within the tile and within the whole matrix
      const int kg = kia + la1*KWA;
      const int idk = kg + kwg;

      // Copies one element from global memory (not transposed) into the local memory
      alm[kg*(MWG/VWM) + mg] = agm[idk*(kSizeM/VWM) + idm];
    }
  }
}
#endif
|
||||||
|
|
||||||
|
// Same as above, but now for the B input matrix
|
||||||
|
#if SB == 1
|
||||||
|
inline void GlobalToLocalB(const __global realN* restrict bgm, __local realN* blm,
|
||||||
|
const int kSizeN, const int tid, const int kwg) {
|
||||||
|
const int lb0 = tid % NDIMB;
|
||||||
|
const int lb1 = tid / NDIMB;
|
||||||
|
#pragma unroll
|
||||||
|
for (int kib=0; kib<KWB; ++kib) {
|
||||||
|
#pragma unroll
|
||||||
|
for (int nib=0; nib<NWB/VWN; ++nib) {
|
||||||
|
|
||||||
|
// Computes the indices based on strided/non-strided access
|
||||||
|
#if STRN == 0
|
||||||
|
int ng = nib + lb0*(NWB/VWN);
|
||||||
|
#elif STRN == 1
|
||||||
|
int ng = lb0 + nib*NDIMB;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Computes the indices for the global memory
|
||||||
|
int kg = kib + lb1*KWB;
|
||||||
|
int idn = ng + get_group_id(1)*(NWG/VWN);
|
||||||
|
int idk = kg + kwg;
|
||||||
|
|
||||||
|
// Loads the data from global memory (transposed) into the local memory
|
||||||
|
blm[kg*(NWG/VWN) + ng] = bgm[idk*(kSizeN/VWN) + idn];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Caches global off-chip memory directly into per-thread private memory (registers). This function
|
||||||
|
// is specific for caching the A input matrix.
|
||||||
|
#if SA == 0
|
||||||
|
inline void GlobalToPrivateA(const __global realM* restrict agm, realM apm[MWI/VWM],
|
||||||
|
const int kSizeM, const int idk, const int kwg) {
|
||||||
|
#pragma unroll
|
||||||
|
for (int mi=0; mi<MWI/VWM; ++mi) {
|
||||||
|
|
||||||
|
// Computes the indices based on strided/non-strided access
|
||||||
|
#if STRM == 0
|
||||||
|
int mg = mi + get_local_id(0)*(MWI/VWM);
|
||||||
|
#elif STRM == 1
|
||||||
|
int mg = get_local_id(0) + mi*MDIMC;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Computes the indices for the global memory
|
||||||
|
int idm = mg + get_group_id(0)*(MWG/VWM);
|
||||||
|
|
||||||
|
// Loads the data from global memory (not transposed) and stores into registers
|
||||||
|
apm[mi] = agm[idk*(kSizeM/VWM) + idm];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Same as above, but now for the B input matrix
|
||||||
|
#if SB == 0
|
||||||
|
inline void GlobalToPrivateB(const __global realN* restrict bgm, realN bpm[NWI/VWN],
|
||||||
|
const int kSizeN, const int idk) {
|
||||||
|
#pragma unroll
|
||||||
|
for (int ni=0; ni<NWI/VWN; ++ni) {
|
||||||
|
|
||||||
|
// Computes the indices based on strided/non-strided access
|
||||||
|
#if STRN == 0
|
||||||
|
int ng = ni + get_local_id(1)*(NWI/VWN);
|
||||||
|
#elif STRN == 1
|
||||||
|
int ng = get_local_id(1) + ni*NDIMC;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Computes the indices for the global memory
|
||||||
|
int idn = ng + get_group_id(1)*(NWG/VWN);
|
||||||
|
|
||||||
|
// Loads the data from global memory (transposed) and stores into registers
|
||||||
|
bpm[ni] = bgm[idk*(kSizeN/VWN) + idn];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Caches on-chip local memory into per-thread private memory (registers). This function is specific
|
||||||
|
// for caching the A input matrix.
|
||||||
|
#if SA == 1
|
||||||
|
inline void LocalToPrivateA(__local realM* alm, realM apm[MWI/VWM], const int kg) {
|
||||||
|
#pragma unroll
|
||||||
|
for (int mi=0; mi<MWI/VWM; ++mi) {
|
||||||
|
#if STRM == 0
|
||||||
|
int mg = mi + get_local_id(0)*(MWI/VWM);
|
||||||
|
#elif STRM == 1
|
||||||
|
int mg = get_local_id(0) + mi*MDIMC;
|
||||||
|
#endif
|
||||||
|
apm[mi] = alm[kg*(MWG/VWM) + mg];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Same as above, but now for the B input matrix
|
||||||
|
#if SB == 1
|
||||||
|
inline void LocalToPrivateB(__local realN* blm, realN bpm[NWI/VWN], const int kg) {
|
||||||
|
#pragma unroll
|
||||||
|
for (int ni=0; ni<NWI/VWN; ++ni) {
|
||||||
|
#if STRN == 0
|
||||||
|
int ng = ni + get_local_id(1)*(NWI/VWN);
|
||||||
|
#elif STRN == 1
|
||||||
|
int ng = get_local_id(1) + ni*NDIMC;
|
||||||
|
#endif
|
||||||
|
bpm[ni] = blm[kg*(NWG/VWN) + ng];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// End of the C++11 raw string literal
|
||||||
|
)"
|
||||||
|
|
||||||
|
// =================================================================================================
|
|
@ -7,29 +7,7 @@
|
||||||
// Author(s):
|
// Author(s):
|
||||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
//
|
//
|
||||||
// This file contains an optimized matrix-multiplication kernel according to the paper by Matsumoto
|
// This is part 2 of 2 of the GEMM kernel. See part 1 for more information.
|
||||||
// et al. and the tutorial on http://www.cedricnugteren.nl/tutorial.php. It is fully configurable
|
|
||||||
// (and tunable!) using more or less the same parameters/naming conventions as in the paper. It
|
|
||||||
// supports single and double precision (SGEMM/DGEMM) through a pre-processor define.
|
|
||||||
//
|
|
||||||
// Matrices are accessed as follows:
|
|
||||||
// A: [k*M + m], with 'k' ranging from 0:K and 'm' from 0:M (m,k,m)
|
|
||||||
// B: [k*N + n], with 'k' ranging from 0:K and 'n' from 0:N (n,k,n)
|
|
||||||
// C: [n*M + m], with 'n' ranging from 0:N and 'm' from 0:M (m,n,m)
|
|
||||||
//
|
|
||||||
// Or as an image (assuming column-major)
|
|
||||||
// K
|
|
||||||
// o-------o
|
|
||||||
// | |
|
|
||||||
// N | [B^T] |
|
|
||||||
// | |
|
|
||||||
// o-------o
|
|
||||||
// K N
|
|
||||||
// o-------o o-----o
|
|
||||||
// M | [A] | M | [C] |
|
|
||||||
// | | | |
|
|
||||||
// o-------o o-----o
|
|
||||||
//
|
|
||||||
//
|
//
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
|
@ -39,288 +17,6 @@ R"(
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
|
|
||||||
// this kernel file is used outside of the CLBlast library.
|
|
||||||
#ifndef MWG
|
|
||||||
#define MWG 8 // Tile-size in dimension M (e.g. 64, 128)
|
|
||||||
#endif
|
|
||||||
#ifndef NWG
|
|
||||||
#define NWG 8 // Tile-size in dimension N (e.g. 64, 128)
|
|
||||||
#endif
|
|
||||||
#ifndef KWG
|
|
||||||
#define KWG 8 // Tile-size in dimension K (e.g. 8, 16)
|
|
||||||
#endif
|
|
||||||
#ifndef MDIMC
|
|
||||||
#define MDIMC 8 // Threads per workgroup in M-dimension (e.g. 8, 16, 32)
|
|
||||||
#endif
|
|
||||||
#ifndef NDIMC
|
|
||||||
#define NDIMC 8 // Threads per workgroup in N-dimension (e.g. 8, 16, 32)
|
|
||||||
#endif
|
|
||||||
#ifndef MDIMA
|
|
||||||
#define MDIMA 8 // Re-shaped tile dimension of matrix A: KDIMA * MDIMA
|
|
||||||
#endif
|
|
||||||
#ifndef NDIMB
|
|
||||||
#define NDIMB 8 // Re-shaped tile dimension of matrix B: KDIMB * NDIMB
|
|
||||||
#endif
|
|
||||||
#ifndef KWI
|
|
||||||
#define KWI 1 // Unroll factor of the KWG loop (smaller or equal than KWG)
|
|
||||||
#endif
|
|
||||||
#ifndef VWM
|
|
||||||
#define VWM 1 // Vector width of matrices A and C
|
|
||||||
#endif
|
|
||||||
#ifndef VWN
|
|
||||||
#define VWN 1 // Vector width of matrix B
|
|
||||||
#endif
|
|
||||||
#ifndef STRM
|
|
||||||
#define STRM 0 // Use strided access within a thread in the M-dimension (1) or not (0)
|
|
||||||
#endif
|
|
||||||
#ifndef STRN
|
|
||||||
#define STRN 0 // Use strided access within a thread in the N-dimension (1) or not (0)
|
|
||||||
#endif
|
|
||||||
#ifndef SA
|
|
||||||
#define SA 0 // Use local/shared memory to cache matrix A (1) or not (0)
|
|
||||||
#endif
|
|
||||||
#ifndef SB
|
|
||||||
#define SB 0 // Use local/shared memory to cache matrix B (1) or not (0)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Helper parameters based on the above tuning parameters
|
|
||||||
#define MWI (MWG/MDIMC) // Work per work-item (M-dimension)
|
|
||||||
#define NWI (NWG/NDIMC) // Work per work-item (N-dimension)
|
|
||||||
#define KDIMA ((MDIMC*NDIMC)/(MDIMA)) // Re-shaped tile dimension of matrix A: KDIMA * MDIMA
|
|
||||||
#define KDIMB ((MDIMC*NDIMC)/(NDIMB)) // Re-shaped tile dimension of matrix B: KDIMB * NDIMB
|
|
||||||
#define MWA (MWG/MDIMA) // Amount of loads-per-thread for matrix A (M-dimension)
|
|
||||||
#define KWA (KWG/KDIMA) // Amount of loads-per-thread for matrix A (K-dimension)
|
|
||||||
#define KWB (KWG/KDIMB) // Amount of loads-per-thread for matrix B (K-dimension)
|
|
||||||
#define NWB (NWG/NDIMB) // Amount of loads-per-thread for matrix B (N-dimension)
|
|
||||||
|
|
||||||
// Settings
|
|
||||||
#define USE_VECTOR_MAD 0 // Unroll (0) or don't (1) unroll the vector MAD manually
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
// Data-widths in dimension M
|
|
||||||
#if VWM == 1
|
|
||||||
typedef real realM;
|
|
||||||
#elif VWM == 2
|
|
||||||
typedef real2 realM;
|
|
||||||
#elif VWM == 4
|
|
||||||
typedef real4 realM;
|
|
||||||
#elif VWM == 8
|
|
||||||
typedef real8 realM;
|
|
||||||
#elif VWM == 16
|
|
||||||
typedef real16 realM;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Data-widths in dimension N
|
|
||||||
#if VWN == 1
|
|
||||||
typedef real realN;
|
|
||||||
#elif VWN == 2
|
|
||||||
typedef real2 realN;
|
|
||||||
#elif VWN == 4
|
|
||||||
typedef real4 realN;
|
|
||||||
#elif VWN == 8
|
|
||||||
typedef real8 realN;
|
|
||||||
#elif VWN == 16
|
|
||||||
typedef real16 realN;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
// Initializes the accumulation registers to zero
|
|
||||||
inline void InitAccRegisters(realM cpm[NWI][MWI/VWM]) {
|
|
||||||
#pragma unroll
|
|
||||||
for (int mi=0; mi<MWI/VWM; ++mi) {
|
|
||||||
#pragma unroll
|
|
||||||
for (int ni=0; ni<NWI; ++ni) {
|
|
||||||
#if VWM == 1
|
|
||||||
SetToZero(cpm[ni][mi]);
|
|
||||||
#elif VWM == 2
|
|
||||||
SetToZero(cpm[ni][mi].x);
|
|
||||||
SetToZero(cpm[ni][mi].y);
|
|
||||||
#elif VWM == 4
|
|
||||||
SetToZero(cpm[ni][mi].x);
|
|
||||||
SetToZero(cpm[ni][mi].y);
|
|
||||||
SetToZero(cpm[ni][mi].z);
|
|
||||||
SetToZero(cpm[ni][mi].w);
|
|
||||||
#elif VWM == 8
|
|
||||||
SetToZero(cpm[ni][mi].s0);
|
|
||||||
SetToZero(cpm[ni][mi].s1);
|
|
||||||
SetToZero(cpm[ni][mi].s2);
|
|
||||||
SetToZero(cpm[ni][mi].s3);
|
|
||||||
SetToZero(cpm[ni][mi].s4);
|
|
||||||
SetToZero(cpm[ni][mi].s5);
|
|
||||||
SetToZero(cpm[ni][mi].s6);
|
|
||||||
SetToZero(cpm[ni][mi].s7);
|
|
||||||
#elif VWM == 16
|
|
||||||
SetToZero(cpm[ni][mi].s0);
|
|
||||||
SetToZero(cpm[ni][mi].s1);
|
|
||||||
SetToZero(cpm[ni][mi].s2);
|
|
||||||
SetToZero(cpm[ni][mi].s3);
|
|
||||||
SetToZero(cpm[ni][mi].s4);
|
|
||||||
SetToZero(cpm[ni][mi].s5);
|
|
||||||
SetToZero(cpm[ni][mi].s6);
|
|
||||||
SetToZero(cpm[ni][mi].s7);
|
|
||||||
SetToZero(cpm[ni][mi].s8);
|
|
||||||
SetToZero(cpm[ni][mi].s9);
|
|
||||||
SetToZero(cpm[ni][mi].sA);
|
|
||||||
SetToZero(cpm[ni][mi].sB);
|
|
||||||
SetToZero(cpm[ni][mi].sC);
|
|
||||||
SetToZero(cpm[ni][mi].sD);
|
|
||||||
SetToZero(cpm[ni][mi].sE);
|
|
||||||
SetToZero(cpm[ni][mi].sF);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
|
|
||||||
// caching the A input matrix.
|
|
||||||
#if SA == 1
|
|
||||||
inline void GlobalToLocalA(const __global realM* restrict agm, __local realM* alm,
|
|
||||||
const int kSizeM, const int tid, const int kwg) {
|
|
||||||
const int la0 = tid % MDIMA;
|
|
||||||
const int la1 = tid / MDIMA;
|
|
||||||
#pragma unroll
|
|
||||||
for (int mia=0; mia<MWA/VWM; ++mia) {
|
|
||||||
#pragma unroll
|
|
||||||
for (int kia=0; kia<KWA; ++kia) {
|
|
||||||
|
|
||||||
// Computes the indices based on strided/non-strided access
|
|
||||||
#if STRM == 0
|
|
||||||
int mg = mia + la0*(MWA/VWM);
|
|
||||||
#elif STRM == 1
|
|
||||||
int mg = la0 + mia*MDIMA;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Computes the indices for the global memory
|
|
||||||
int kg = kia + la1*KWA;
|
|
||||||
int idm = mg + get_group_id(0)*(MWG/VWM);
|
|
||||||
int idk = kg + kwg;
|
|
||||||
|
|
||||||
// Loads the data from global memory (not transposed) into the local memory
|
|
||||||
alm[kg*(MWG/VWM) + mg] = agm[idk*(kSizeM/VWM) + idm];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Same as above, but now for the B input matrix
|
|
||||||
#if SB == 1
|
|
||||||
inline void GlobalToLocalB(const __global realN* restrict bgm, __local realN* blm,
|
|
||||||
const int kSizeN, const int tid, const int kwg) {
|
|
||||||
const int lb0 = tid % NDIMB;
|
|
||||||
const int lb1 = tid / NDIMB;
|
|
||||||
#pragma unroll
|
|
||||||
for (int kib=0; kib<KWB; ++kib) {
|
|
||||||
#pragma unroll
|
|
||||||
for (int nib=0; nib<NWB/VWN; ++nib) {
|
|
||||||
|
|
||||||
// Computes the indices based on strided/non-strided access
|
|
||||||
#if STRN == 0
|
|
||||||
int ng = nib + lb0*(NWB/VWN);
|
|
||||||
#elif STRN == 1
|
|
||||||
int ng = lb0 + nib*NDIMB;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Computes the indices for the global memory
|
|
||||||
int kg = kib + lb1*KWB;
|
|
||||||
int idn = ng + get_group_id(1)*(NWG/VWN);
|
|
||||||
int idk = kg + kwg;
|
|
||||||
|
|
||||||
// Loads the data from global memory (transposed) into the local memory
|
|
||||||
blm[kg*(NWG/VWN) + ng] = bgm[idk*(kSizeN/VWN) + idn];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
// Caches global off-chip memory directly into per-thread private memory (registers). This function
|
|
||||||
// is specific for caching the A input matrix.
|
|
||||||
#if SA == 0
|
|
||||||
inline void GlobalToPrivateA(const __global realM* restrict agm, realM apm[MWI/VWM],
|
|
||||||
const int kSizeM, const int idk, const int kwg) {
|
|
||||||
#pragma unroll
|
|
||||||
for (int mi=0; mi<MWI/VWM; ++mi) {
|
|
||||||
|
|
||||||
// Computes the indices based on strided/non-strided access
|
|
||||||
#if STRM == 0
|
|
||||||
int mg = mi + get_local_id(0)*(MWI/VWM);
|
|
||||||
#elif STRM == 1
|
|
||||||
int mg = get_local_id(0) + mi*MDIMC;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Computes the indices for the global memory
|
|
||||||
int idm = mg + get_group_id(0)*(MWG/VWM);
|
|
||||||
|
|
||||||
// Loads the data from global memory (not transposed) and stores into registers
|
|
||||||
apm[mi] = agm[idk*(kSizeM/VWM) + idm];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Same as above, but now for the B input matrix
|
|
||||||
#if SB == 0
|
|
||||||
inline void GlobalToPrivateB(const __global realN* restrict bgm, realN bpm[NWI/VWN],
|
|
||||||
const int kSizeN, const int idk) {
|
|
||||||
#pragma unroll
|
|
||||||
for (int ni=0; ni<NWI/VWN; ++ni) {
|
|
||||||
|
|
||||||
// Computes the indices based on strided/non-strided access
|
|
||||||
#if STRN == 0
|
|
||||||
int ng = ni + get_local_id(1)*(NWI/VWN);
|
|
||||||
#elif STRN == 1
|
|
||||||
int ng = get_local_id(1) + ni*NDIMC;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Computes the indices for the global memory
|
|
||||||
int idn = ng + get_group_id(1)*(NWG/VWN);
|
|
||||||
|
|
||||||
// Loads the data from global memory (transposed) and stores into registers
|
|
||||||
bpm[ni] = bgm[idk*(kSizeN/VWN) + idn];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
// Caches on-chip local memory into per-thread private memory (registers). This function is specific
|
|
||||||
// for caching the A input matrix.
|
|
||||||
#if SA == 1
|
|
||||||
inline void LocalToPrivateA(__local realM* alm, realM apm[MWI/VWM], const int kg) {
|
|
||||||
#pragma unroll
|
|
||||||
for (int mi=0; mi<MWI/VWM; ++mi) {
|
|
||||||
#if STRM == 0
|
|
||||||
int mg = mi + get_local_id(0)*(MWI/VWM);
|
|
||||||
#elif STRM == 1
|
|
||||||
int mg = get_local_id(0) + mi*MDIMC;
|
|
||||||
#endif
|
|
||||||
apm[mi] = alm[kg*(MWG/VWM) + mg];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Same as above, but now for the B input matrix
|
|
||||||
#if SB == 1
|
|
||||||
inline void LocalToPrivateB(__local realN* blm, realN bpm[NWI/VWN], const int kg) {
|
|
||||||
#pragma unroll
|
|
||||||
for (int ni=0; ni<NWI/VWN; ++ni) {
|
|
||||||
#if STRN == 0
|
|
||||||
int ng = ni + get_local_id(1)*(NWI/VWN);
|
|
||||||
#elif STRN == 1
|
|
||||||
int ng = get_local_id(1) + ni*NDIMC;
|
|
||||||
#endif
|
|
||||||
bpm[ni] = blm[kg*(NWG/VWN) + ng];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
// The vectorised multiply-add function
|
// The vectorised multiply-add function
|
||||||
inline realM MultiplyAddVector(realM cvec, const realM avec, const real bval) {
|
inline realM MultiplyAddVector(realM cvec, const realM avec, const real bval) {
|
||||||
#if USE_VECTOR_MAD == 1
|
#if USE_VECTOR_MAD == 1
|
|
@ -14,7 +14,6 @@
|
||||||
#include "internal/routines/level1/xdotu.h"
|
#include "internal/routines/level1/xdotu.h"
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
|
@ -33,6 +33,7 @@ Xgemv<T>::Xgemv(Queue &queue, Event &event, const std::string &name):
|
||||||
Routine<T>(queue, event, name, {"Pad", "Xgemv"}, precision_) {
|
Routine<T>(queue, event, name, {"Pad", "Xgemv"}, precision_) {
|
||||||
source_string_ =
|
source_string_ =
|
||||||
#include "../../kernels/level2/xgemv.opencl"
|
#include "../../kernels/level2/xgemv.opencl"
|
||||||
|
#include "../../kernels/level2/xgemv_fast.opencl"
|
||||||
;
|
;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
112
src/routines/level2/xger.cc
Normal file
112
src/routines/level2/xger.cc
Normal file
|
@ -0,0 +1,112 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xger class (see the header for information about the class).
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#include "internal/routines/level2/xger.h"
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Specific implementations to get the memory-type based on a template argument
|
||||||
|
template <> const Precision Xger<float>::precision_ = Precision::kSingle;
|
||||||
|
template <> const Precision Xger<double>::precision_ = Precision::kDouble;
|
||||||
|
template <> const Precision Xger<float2>::precision_ = Precision::kComplexSingle;
|
||||||
|
template <> const Precision Xger<double2>::precision_ = Precision::kComplexDouble;
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Constructor: forwards to base class constructor
|
||||||
|
template <typename T>
|
||||||
|
Xger<T>::Xger(Queue &queue, Event &event, const std::string &name):
|
||||||
|
Routine<T>(queue, event, name, {"Xger"}, precision_) {
|
||||||
|
source_string_ =
|
||||||
|
#include "../../kernels/level2/level2.opencl"
|
||||||
|
#include "../../kernels/level2/xger.opencl"
|
||||||
|
;
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// The main routine
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Xger<T>::DoGer(const Layout layout,
|
||||||
|
const size_t m, const size_t n,
|
||||||
|
const T alpha,
|
||||||
|
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
|
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
|
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
|
||||||
|
|
||||||
|
// Makes sure all dimensions are larger than zero
|
||||||
|
if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; }
|
||||||
|
|
||||||
|
// Computes whether or not the matrix has an alternative layout (row or column-major).
|
||||||
|
const auto a_is_rowmajor = (layout == Layout::kRowMajor);
|
||||||
|
const auto a_one = (a_is_rowmajor) ? n : m;
|
||||||
|
const auto a_two = (a_is_rowmajor) ? m : n;
|
||||||
|
|
||||||
|
// Tests the matrix and the vectors for validity
|
||||||
|
auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
status = TestVectorX(m, x_buffer, x_offset, x_inc, sizeof(T));
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
|
// Retrieves the Xger kernel from the compiled binary
|
||||||
|
try {
|
||||||
|
auto& program = GetProgramFromCache();
|
||||||
|
auto kernel = Kernel(program, "Xger");
|
||||||
|
|
||||||
|
// Sets the kernel arguments
|
||||||
|
kernel.SetArgument(0, static_cast<int>(a_one));
|
||||||
|
kernel.SetArgument(1, static_cast<int>(a_two));
|
||||||
|
kernel.SetArgument(2, alpha);
|
||||||
|
kernel.SetArgument(3, x_buffer());
|
||||||
|
kernel.SetArgument(4, static_cast<int>(x_offset));
|
||||||
|
kernel.SetArgument(5, static_cast<int>(x_inc));
|
||||||
|
kernel.SetArgument(6, y_buffer());
|
||||||
|
kernel.SetArgument(7, static_cast<int>(y_offset));
|
||||||
|
kernel.SetArgument(8, static_cast<int>(y_inc));
|
||||||
|
kernel.SetArgument(9, a_buffer());
|
||||||
|
kernel.SetArgument(10, static_cast<int>(a_offset));
|
||||||
|
kernel.SetArgument(11, static_cast<int>(a_ld));
|
||||||
|
kernel.SetArgument(12, static_cast<int>(a_is_rowmajor));
|
||||||
|
|
||||||
|
// Launches the kernel
|
||||||
|
auto a_one_ceiled = Ceil(CeilDiv(a_one, db_["WPT"]), db_["WGS1"]);
|
||||||
|
auto a_two_ceiled = Ceil(CeilDiv(a_two, db_["WPT"]), db_["WGS2"]);
|
||||||
|
auto global = std::vector<size_t>{a_one_ceiled, a_two_ceiled};
|
||||||
|
auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
|
||||||
|
status = RunKernel(kernel, global, local);
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
|
// Waits for all kernels to finish
|
||||||
|
queue_.Finish();
|
||||||
|
|
||||||
|
// Successfully finished the computation
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
} catch (...) { return StatusCode::kInvalidKernel; }
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Compiles the templated class
|
||||||
|
template class Xger<float>;
|
||||||
|
template class Xger<double>;
|
||||||
|
template class Xger<float2>;
|
||||||
|
template class Xger<double2>;
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
53
src/routines/level2/xgerc.cc
Normal file
53
src/routines/level2/xgerc.cc
Normal file
|
@ -0,0 +1,53 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xgerc class (see the header for information about the class).
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#include "internal/routines/level2/xgerc.h"
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Constructor: forwards to base class constructor
|
||||||
|
template <typename T>
|
||||||
|
Xgerc<T>::Xgerc(Queue &queue, Event &event, const std::string &name):
|
||||||
|
Xger<T>(queue, event, name) {
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// The main routine
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Xgerc<T>::DoGerc(const Layout layout,
|
||||||
|
const size_t m, const size_t n,
|
||||||
|
const T alpha,
|
||||||
|
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
|
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
|
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
|
||||||
|
|
||||||
|
// Regular Ger operation on complex data, plus conjugation in the kernel guarded by the
|
||||||
|
// ROUTINE_GERC guard.
|
||||||
|
return DoGer(layout, m, n, alpha,
|
||||||
|
x_buffer, x_offset, x_inc,
|
||||||
|
y_buffer, y_offset, y_inc,
|
||||||
|
a_buffer, a_offset, a_ld);
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Compiles the templated class
|
||||||
|
template class Xgerc<float2>;
|
||||||
|
template class Xgerc<double2>;
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
52
src/routines/level2/xgeru.cc
Normal file
52
src/routines/level2/xgeru.cc
Normal file
|
@ -0,0 +1,52 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xgeru class (see the header for information about the class).
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#include "internal/routines/level2/xgeru.h"
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Constructor: forwards to base class constructor
|
||||||
|
template <typename T>
|
||||||
|
Xgeru<T>::Xgeru(Queue &queue, Event &event, const std::string &name):
|
||||||
|
Xger<T>(queue, event, name) {
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// The main routine
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Xgeru<T>::DoGeru(const Layout layout,
|
||||||
|
const size_t m, const size_t n,
|
||||||
|
const T alpha,
|
||||||
|
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
|
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
|
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
|
||||||
|
|
||||||
|
// Regular Ger operation on complex data
|
||||||
|
return DoGer(layout, m, n, alpha,
|
||||||
|
x_buffer, x_offset, x_inc,
|
||||||
|
y_buffer, y_offset, y_inc,
|
||||||
|
a_buffer, a_offset, a_ld);
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Compiles the templated class
|
||||||
|
template class Xgeru<float2>;
|
||||||
|
template class Xgeru<double2>;
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
122
src/routines/level2/xher.cc
Normal file
122
src/routines/level2/xher.cc
Normal file
|
@ -0,0 +1,122 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xher class (see the header for information about the class).
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#include "internal/routines/level2/xher.h"
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Specific implementations to get the memory-type based on a template argument
|
||||||
|
template <> const Precision Xher<float, float>::precision_ = Precision::kSingle;
|
||||||
|
template <> const Precision Xher<double, double>::precision_ = Precision::kDouble;
|
||||||
|
template <> const Precision Xher<float2, float>::precision_ = Precision::kComplexSingle;
|
||||||
|
template <> const Precision Xher<double2, double>::precision_ = Precision::kComplexDouble;
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Constructor: passes the queue, event and routine name on to the Routine base class, together
// with the {"Xger"} name list and this instantiation's precision. Afterwards the OpenCL source
// string is assembled from the shared level-2 file and the Xher kernel file.
// NOTE(review): {"Xger"} presumably selects the tuning-database entries to load - confirm in the
// Routine base class.
template <typename T, typename U>
Xher<T,U>::Xher(Queue &queue, Event &event, const std::string &name)
    : Routine<T>(queue, event, name, {"Xger"}, precision_) {

  // The included files supply the right-hand side of this assignment
  source_string_ =
    #include "../../kernels/level2/level2.opencl"
    #include "../../kernels/level2/xher.opencl"
  ;
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Specializations to compute alpha of type 'T'
|
||||||
|
template <> float2 Xher<float2,float>::GetAlpha(const float alpha) { return float2{alpha, 0.0f}; }
|
||||||
|
template <> double2 Xher<double2,double>::GetAlpha(const double alpha) { return double2{alpha, 0.0}; }
|
||||||
|
template <> float Xher<float,float>::GetAlpha(const float alpha) { return alpha; }
|
||||||
|
template <> double Xher<double,double>::GetAlpha(const double alpha) { return alpha; }
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// The main routine
|
||||||
|
template <typename T, typename U>
|
||||||
|
StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
|
||||||
|
const size_t n,
|
||||||
|
const U alpha,
|
||||||
|
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
|
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
|
const bool packed) {
|
||||||
|
|
||||||
|
// Makes sure the dimensions are larger than zero
|
||||||
|
if (n == 0) { return StatusCode::kInvalidDimension; }
|
||||||
|
|
||||||
|
// The data is either in the upper or lower triangle
|
||||||
|
const auto is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
|
||||||
|
(triangle == Triangle::kLower && layout == Layout::kRowMajor));
|
||||||
|
const auto is_rowmajor = (layout == Layout::kRowMajor);
|
||||||
|
|
||||||
|
// Creates a matching version of alpha
|
||||||
|
const auto matching_alpha = GetAlpha(alpha);
|
||||||
|
|
||||||
|
// Tests the matrix and the vectors for validity
|
||||||
|
auto status = StatusCode::kSuccess;
|
||||||
|
if (packed) { status = TestMatrixAP(n, a_buffer, a_offset, sizeof(T)); }
|
||||||
|
else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld, sizeof(T)); }
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
|
// If alpha is zero an update is not required
|
||||||
|
if (alpha == U{0}) { return StatusCode::kSuccess; }
|
||||||
|
|
||||||
|
// Retrieves the Xgemv kernel from the compiled binary
|
||||||
|
try {
|
||||||
|
auto& program = GetProgramFromCache();
|
||||||
|
auto kernel = Kernel(program, "Xher");
|
||||||
|
|
||||||
|
// Sets the kernel arguments
|
||||||
|
kernel.SetArgument(0, static_cast<int>(n));
|
||||||
|
kernel.SetArgument(1, matching_alpha);
|
||||||
|
kernel.SetArgument(2, x_buffer());
|
||||||
|
kernel.SetArgument(3, static_cast<int>(x_offset));
|
||||||
|
kernel.SetArgument(4, static_cast<int>(x_inc));
|
||||||
|
kernel.SetArgument(5, a_buffer());
|
||||||
|
kernel.SetArgument(6, static_cast<int>(a_offset));
|
||||||
|
kernel.SetArgument(7, static_cast<int>(a_ld));
|
||||||
|
kernel.SetArgument(8, static_cast<int>(is_upper));
|
||||||
|
kernel.SetArgument(9, static_cast<int>(is_rowmajor));
|
||||||
|
|
||||||
|
// Launches the kernel
|
||||||
|
auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]);
|
||||||
|
auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
|
||||||
|
auto global = std::vector<size_t>{global_one, global_two};
|
||||||
|
auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
|
||||||
|
status = RunKernel(kernel, global, local);
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
|
// Waits for all kernels to finish
|
||||||
|
queue_.Finish();
|
||||||
|
|
||||||
|
// Succesfully finished the computation
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
} catch (...) { return StatusCode::kInvalidKernel; }
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Compiles the templated class
|
||||||
|
template class Xher<float, float>;
|
||||||
|
template class Xher<double, double>;
|
||||||
|
template class Xher<float2, float>;
|
||||||
|
template class Xher<double2, double>;
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
114
src/routines/level2/xher2.cc
Normal file
114
src/routines/level2/xher2.cc
Normal file
|
@ -0,0 +1,114 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xher2 class (see the header for information about the class).
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#include "internal/routines/level2/xher2.h"
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Specific implementations to get the memory-type based on a template argument
|
||||||
|
template <> const Precision Xher2<float>::precision_ = Precision::kSingle;
|
||||||
|
template <> const Precision Xher2<double>::precision_ = Precision::kDouble;
|
||||||
|
template <> const Precision Xher2<float2>::precision_ = Precision::kComplexSingle;
|
||||||
|
template <> const Precision Xher2<double2>::precision_ = Precision::kComplexDouble;
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Constructor: passes the queue, event and routine name on to the Routine base class, together
// with the {"Xger"} name list and this instantiation's precision. Afterwards the OpenCL source
// string is assembled from the shared level-2 file and the Xher2 kernel file.
// NOTE(review): {"Xger"} presumably selects the tuning-database entries to load - confirm in the
// Routine base class.
template <typename T>
Xher2<T>::Xher2(Queue &queue, Event &event, const std::string &name)
    : Routine<T>(queue, event, name, {"Xger"}, precision_) {

  // The included files supply the right-hand side of this assignment
  source_string_ =
    #include "../../kernels/level2/level2.opencl"
    #include "../../kernels/level2/xher2.opencl"
  ;
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// The main routine
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
|
||||||
|
const size_t n,
|
||||||
|
const T alpha,
|
||||||
|
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
|
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
|
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
|
const bool packed) {
|
||||||
|
|
||||||
|
// Makes sure the dimensions are larger than zero
|
||||||
|
if (n == 0) { return StatusCode::kInvalidDimension; }
|
||||||
|
|
||||||
|
// The data is either in the upper or lower triangle
|
||||||
|
const auto is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
|
||||||
|
(triangle == Triangle::kLower && layout == Layout::kRowMajor));
|
||||||
|
const auto is_rowmajor = (layout == Layout::kRowMajor);
|
||||||
|
|
||||||
|
// Tests the matrix and the vectors for validity
|
||||||
|
auto status = StatusCode::kSuccess;
|
||||||
|
if (packed) { status = TestMatrixAP(n, a_buffer, a_offset, sizeof(T)); }
|
||||||
|
else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld, sizeof(T)); }
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
|
// Retrieves the Xgemv kernel from the compiled binary
|
||||||
|
try {
|
||||||
|
auto& program = GetProgramFromCache();
|
||||||
|
auto kernel = Kernel(program, "Xher2");
|
||||||
|
|
||||||
|
// Sets the kernel arguments
|
||||||
|
kernel.SetArgument(0, static_cast<int>(n));
|
||||||
|
kernel.SetArgument(1, alpha);
|
||||||
|
kernel.SetArgument(2, x_buffer());
|
||||||
|
kernel.SetArgument(3, static_cast<int>(x_offset));
|
||||||
|
kernel.SetArgument(4, static_cast<int>(x_inc));
|
||||||
|
kernel.SetArgument(5, y_buffer());
|
||||||
|
kernel.SetArgument(6, static_cast<int>(y_offset));
|
||||||
|
kernel.SetArgument(7, static_cast<int>(y_inc));
|
||||||
|
kernel.SetArgument(8, a_buffer());
|
||||||
|
kernel.SetArgument(9, static_cast<int>(a_offset));
|
||||||
|
kernel.SetArgument(10, static_cast<int>(a_ld));
|
||||||
|
kernel.SetArgument(11, static_cast<int>(is_upper));
|
||||||
|
kernel.SetArgument(12, static_cast<int>(is_rowmajor));
|
||||||
|
|
||||||
|
// Launches the kernel
|
||||||
|
auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]);
|
||||||
|
auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
|
||||||
|
auto global = std::vector<size_t>{global_one, global_two};
|
||||||
|
auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
|
||||||
|
status = RunKernel(kernel, global, local);
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
|
// Waits for all kernels to finish
|
||||||
|
queue_.Finish();
|
||||||
|
|
||||||
|
// Succesfully finished the computation
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
} catch (...) { return StatusCode::kInvalidKernel; }
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Compiles the templated class
|
||||||
|
template class Xher2<float>;
|
||||||
|
template class Xher2<double>;
|
||||||
|
template class Xher2<float2>;
|
||||||
|
template class Xher2<double2>;
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
51
src/routines/level2/xhpr.cc
Normal file
51
src/routines/level2/xhpr.cc
Normal file
|
@ -0,0 +1,51 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xhpr class (see the header for information about the class).
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#include "internal/routines/level2/xhpr.h"
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Constructor: forwards to base class constructor
|
||||||
|
template <typename T, typename U>
|
||||||
|
Xhpr<T,U>::Xhpr(Queue &queue, Event &event, const std::string &name):
|
||||||
|
Xher<T,U>(queue, event, name) {
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// The main routine
|
||||||
|
template <typename T, typename U>
|
||||||
|
StatusCode Xhpr<T,U>::DoHpr(const Layout layout, const Triangle triangle,
|
||||||
|
const size_t n,
|
||||||
|
const U alpha,
|
||||||
|
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
|
const Buffer<T> &ap_buffer, const size_t ap_offset) {
|
||||||
|
|
||||||
|
// Specific Xhpr functionality is implemented in the kernel using defines
|
||||||
|
return DoHer(layout, triangle, n, alpha,
|
||||||
|
x_buffer, x_offset, x_inc,
|
||||||
|
ap_buffer, ap_offset, n,
|
||||||
|
true); // packed matrix
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Compiles the templated class
|
||||||
|
template class Xhpr<float2, float>;
|
||||||
|
template class Xhpr<double2, double>;
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
53
src/routines/level2/xhpr2.cc
Normal file
53
src/routines/level2/xhpr2.cc
Normal file
|
@ -0,0 +1,53 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xhpr2 class (see the header for information about the class).
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#include "internal/routines/level2/xhpr2.h"
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Constructor: forwards to base class constructor
|
||||||
|
template <typename T>
|
||||||
|
Xhpr2<T>::Xhpr2(Queue &queue, Event &event, const std::string &name):
|
||||||
|
Xher2<T>(queue, event, name) {
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// The main routine
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Xhpr2<T>::DoHpr2(const Layout layout, const Triangle triangle,
|
||||||
|
const size_t n,
|
||||||
|
const T alpha,
|
||||||
|
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
|
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
|
const Buffer<T> &ap_buffer, const size_t ap_offset) {
|
||||||
|
|
||||||
|
// Specific Xhpr2 functionality is implemented in the kernel using defines
|
||||||
|
return DoHer2(layout, triangle, n, alpha,
|
||||||
|
x_buffer, x_offset, x_inc,
|
||||||
|
y_buffer, y_offset, y_inc,
|
||||||
|
ap_buffer, ap_offset, n,
|
||||||
|
true); // packed matrix
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Compiles the templated class
|
||||||
|
template class Xhpr2<float2>;
|
||||||
|
template class Xhpr2<double2>;
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
51
src/routines/level2/xspr.cc
Normal file
51
src/routines/level2/xspr.cc
Normal file
|
@ -0,0 +1,51 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xspr class (see the header for information about the class).
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#include "internal/routines/level2/xspr.h"
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Constructor: forwards to base class constructor
|
||||||
|
template <typename T>
|
||||||
|
Xspr<T>::Xspr(Queue &queue, Event &event, const std::string &name):
|
||||||
|
Xher<T,T>(queue, event, name) {
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// The main routine
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Xspr<T>::DoSpr(const Layout layout, const Triangle triangle,
|
||||||
|
const size_t n,
|
||||||
|
const T alpha,
|
||||||
|
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
|
const Buffer<T> &ap_buffer, const size_t ap_offset) {
|
||||||
|
|
||||||
|
// Specific Xspr functionality is implemented in the kernel using defines
|
||||||
|
return DoHer(layout, triangle, n, alpha,
|
||||||
|
x_buffer, x_offset, x_inc,
|
||||||
|
ap_buffer, ap_offset, n,
|
||||||
|
true); // packed matrix
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Compiles the templated class
|
||||||
|
template class Xspr<float>;
|
||||||
|
template class Xspr<double>;
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
53
src/routines/level2/xspr2.cc
Normal file
53
src/routines/level2/xspr2.cc
Normal file
|
@ -0,0 +1,53 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xspr2 class (see the header for information about the class).
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#include "internal/routines/level2/xspr2.h"
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Constructor: forwards to base class constructor
|
||||||
|
template <typename T>
|
||||||
|
Xspr2<T>::Xspr2(Queue &queue, Event &event, const std::string &name):
|
||||||
|
Xher2<T>(queue, event, name) {
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// The main routine
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Xspr2<T>::DoSpr2(const Layout layout, const Triangle triangle,
|
||||||
|
const size_t n,
|
||||||
|
const T alpha,
|
||||||
|
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
|
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
|
const Buffer<T> &ap_buffer, const size_t ap_offset) {
|
||||||
|
|
||||||
|
// Specific Xspr2 functionality is implemented in the kernel using defines
|
||||||
|
return DoHer2(layout, triangle, n, alpha,
|
||||||
|
x_buffer, x_offset, x_inc,
|
||||||
|
y_buffer, y_offset, y_inc,
|
||||||
|
ap_buffer, ap_offset, n,
|
||||||
|
true); // packed matrix
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Compiles the templated class
|
||||||
|
template class Xspr2<float>;
|
||||||
|
template class Xspr2<double>;
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
50
src/routines/level2/xsyr.cc
Normal file
50
src/routines/level2/xsyr.cc
Normal file
|
@ -0,0 +1,50 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xsyr class (see the header for information about the class).
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#include "internal/routines/level2/xsyr.h"
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Constructor: forwards to base class constructor
|
||||||
|
template <typename T>
|
||||||
|
Xsyr<T>::Xsyr(Queue &queue, Event &event, const std::string &name):
|
||||||
|
Xher<T,T>(queue, event, name) {
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// The main routine
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Xsyr<T>::DoSyr(const Layout layout, const Triangle triangle,
|
||||||
|
const size_t n,
|
||||||
|
const T alpha,
|
||||||
|
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
|
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
|
||||||
|
|
||||||
|
// Specific Xsyr functionality is implemented in the kernel using defines
|
||||||
|
return DoHer(layout, triangle, n, alpha,
|
||||||
|
x_buffer, x_offset, x_inc,
|
||||||
|
a_buffer, a_offset, a_ld);
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Compiles the templated class
|
||||||
|
template class Xsyr<float>;
|
||||||
|
template class Xsyr<double>;
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
52
src/routines/level2/xsyr2.cc
Normal file
52
src/routines/level2/xsyr2.cc
Normal file
|
@ -0,0 +1,52 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xsyr2 class (see the header for information about the class).
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#include "internal/routines/level2/xsyr2.h"
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Constructor: forwards to base class constructor
|
||||||
|
template <typename T>
|
||||||
|
Xsyr2<T>::Xsyr2(Queue &queue, Event &event, const std::string &name):
|
||||||
|
Xher2<T>(queue, event, name) {
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// The main routine
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Xsyr2<T>::DoSyr2(const Layout layout, const Triangle triangle,
|
||||||
|
const size_t n,
|
||||||
|
const T alpha,
|
||||||
|
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
|
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
|
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
|
||||||
|
|
||||||
|
// Specific Xsyr2 functionality is implemented in the kernel using defines
|
||||||
|
return DoHer2(layout, triangle, n, alpha,
|
||||||
|
x_buffer, x_offset, x_inc,
|
||||||
|
y_buffer, y_offset, y_inc,
|
||||||
|
a_buffer, a_offset, a_ld);
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Compiles the templated class
|
||||||
|
template class Xsyr2<float>;
|
||||||
|
template class Xsyr2<double>;
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
|
@ -30,13 +30,14 @@ template <> const Precision Xgemm<double2>::precision_ = Precision::kComplexDoub
|
||||||
// Constructor: forwards to base class constructor
|
// Constructor: forwards to base class constructor
|
||||||
template <typename T>
|
template <typename T>
|
||||||
Xgemm<T>::Xgemm(Queue &queue, Event &event, const std::string &name):
|
Xgemm<T>::Xgemm(Queue &queue, Event &event, const std::string &name):
|
||||||
Routine<T>(queue, event, name, {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
|
Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) {
|
||||||
source_string_ =
|
source_string_ =
|
||||||
#include "../../kernels/level3/copy.opencl"
|
#include "../../kernels/level3/copy.opencl"
|
||||||
#include "../../kernels/level3/pad.opencl"
|
#include "../../kernels/level3/pad.opencl"
|
||||||
#include "../../kernels/level3/transpose.opencl"
|
#include "../../kernels/level3/transpose.opencl"
|
||||||
#include "../../kernels/level3/padtranspose.opencl"
|
#include "../../kernels/level3/padtranspose.opencl"
|
||||||
#include "../../kernels/level3/xgemm.opencl"
|
#include "../../kernels/level3/xgemm_part1.opencl"
|
||||||
|
#include "../../kernels/level3/xgemm_part2.opencl"
|
||||||
;
|
;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -28,13 +28,14 @@ template <> const Precision Xher2k<double2,double>::precision_ = Precision::kCom
|
||||||
// Constructor: forwards to base class constructor
|
// Constructor: forwards to base class constructor
|
||||||
template <typename T, typename U>
|
template <typename T, typename U>
|
||||||
Xher2k<T,U>::Xher2k(Queue &queue, Event &event, const std::string &name):
|
Xher2k<T,U>::Xher2k(Queue &queue, Event &event, const std::string &name):
|
||||||
Routine<T>(queue, event, name, {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
|
Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) {
|
||||||
source_string_ =
|
source_string_ =
|
||||||
#include "../../kernels/level3/copy.opencl"
|
#include "../../kernels/level3/copy.opencl"
|
||||||
#include "../../kernels/level3/pad.opencl"
|
#include "../../kernels/level3/pad.opencl"
|
||||||
#include "../../kernels/level3/transpose.opencl"
|
#include "../../kernels/level3/transpose.opencl"
|
||||||
#include "../../kernels/level3/padtranspose.opencl"
|
#include "../../kernels/level3/padtranspose.opencl"
|
||||||
#include "../../kernels/level3/xgemm.opencl"
|
#include "../../kernels/level3/xgemm_part1.opencl"
|
||||||
|
#include "../../kernels/level3/xgemm_part2.opencl"
|
||||||
;
|
;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -28,13 +28,14 @@ template <> const Precision Xherk<double2,double>::precision_ = Precision::kComp
|
||||||
// Constructor: forwards to base class constructor
|
// Constructor: forwards to base class constructor
|
||||||
template <typename T, typename U>
|
template <typename T, typename U>
|
||||||
Xherk<T,U>::Xherk(Queue &queue, Event &event, const std::string &name):
|
Xherk<T,U>::Xherk(Queue &queue, Event &event, const std::string &name):
|
||||||
Routine<T>(queue, event, name, {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
|
Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) {
|
||||||
source_string_ =
|
source_string_ =
|
||||||
#include "../../kernels/level3/copy.opencl"
|
#include "../../kernels/level3/copy.opencl"
|
||||||
#include "../../kernels/level3/pad.opencl"
|
#include "../../kernels/level3/pad.opencl"
|
||||||
#include "../../kernels/level3/transpose.opencl"
|
#include "../../kernels/level3/transpose.opencl"
|
||||||
#include "../../kernels/level3/padtranspose.opencl"
|
#include "../../kernels/level3/padtranspose.opencl"
|
||||||
#include "../../kernels/level3/xgemm.opencl"
|
#include "../../kernels/level3/xgemm_part1.opencl"
|
||||||
|
#include "../../kernels/level3/xgemm_part2.opencl"
|
||||||
;
|
;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -30,13 +30,14 @@ template <> const Precision Xsyr2k<double2>::precision_ = Precision::kComplexDou
|
||||||
// Constructor: forwards to base class constructor
|
// Constructor: forwards to base class constructor
|
||||||
template <typename T>
|
template <typename T>
|
||||||
Xsyr2k<T>::Xsyr2k(Queue &queue, Event &event, const std::string &name):
|
Xsyr2k<T>::Xsyr2k(Queue &queue, Event &event, const std::string &name):
|
||||||
Routine<T>(queue, event, name, {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
|
Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) {
|
||||||
source_string_ =
|
source_string_ =
|
||||||
#include "../../kernels/level3/copy.opencl"
|
#include "../../kernels/level3/copy.opencl"
|
||||||
#include "../../kernels/level3/pad.opencl"
|
#include "../../kernels/level3/pad.opencl"
|
||||||
#include "../../kernels/level3/transpose.opencl"
|
#include "../../kernels/level3/transpose.opencl"
|
||||||
#include "../../kernels/level3/padtranspose.opencl"
|
#include "../../kernels/level3/padtranspose.opencl"
|
||||||
#include "../../kernels/level3/xgemm.opencl"
|
#include "../../kernels/level3/xgemm_part1.opencl"
|
||||||
|
#include "../../kernels/level3/xgemm_part2.opencl"
|
||||||
;
|
;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -30,13 +30,14 @@ template <> const Precision Xsyrk<double2>::precision_ = Precision::kComplexDoub
|
||||||
// Constructor: forwards to base class constructor
|
// Constructor: forwards to base class constructor
|
||||||
template <typename T>
|
template <typename T>
|
||||||
Xsyrk<T>::Xsyrk(Queue &queue, Event &event, const std::string &name):
|
Xsyrk<T>::Xsyrk(Queue &queue, Event &event, const std::string &name):
|
||||||
Routine<T>(queue, event, name, {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
|
Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) {
|
||||||
source_string_ =
|
source_string_ =
|
||||||
#include "../../kernels/level3/copy.opencl"
|
#include "../../kernels/level3/copy.opencl"
|
||||||
#include "../../kernels/level3/pad.opencl"
|
#include "../../kernels/level3/pad.opencl"
|
||||||
#include "../../kernels/level3/transpose.opencl"
|
#include "../../kernels/level3/transpose.opencl"
|
||||||
#include "../../kernels/level3/padtranspose.opencl"
|
#include "../../kernels/level3/padtranspose.opencl"
|
||||||
#include "../../kernels/level3/xgemm.opencl"
|
#include "../../kernels/level3/xgemm_part1.opencl"
|
||||||
|
#include "../../kernels/level3/xgemm_part2.opencl"
|
||||||
;
|
;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -31,7 +31,8 @@ class TuneXgemm {
|
||||||
static std::string GetSources() {
|
static std::string GetSources() {
|
||||||
return
|
return
|
||||||
#include "../src/kernels/common.opencl"
|
#include "../src/kernels/common.opencl"
|
||||||
#include "../src/kernels/level3/xgemm.opencl"
|
#include "../src/kernels/level3/xgemm_part1.opencl"
|
||||||
|
#include "../src/kernels/level3/xgemm_part2.opencl"
|
||||||
;
|
;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -35,6 +35,7 @@ class TuneXgemv {
|
||||||
return
|
return
|
||||||
#include "../src/kernels/common.opencl"
|
#include "../src/kernels/common.opencl"
|
||||||
#include "../src/kernels/level2/xgemv.opencl"
|
#include "../src/kernels/level2/xgemv.opencl"
|
||||||
|
#include "../src/kernels/level2/xgemv_fast.opencl"
|
||||||
;
|
;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -60,8 +61,8 @@ class TuneXgemv {
|
||||||
|
|
||||||
// Sets the tuning parameters and their possible values
|
// Sets the tuning parameters and their possible values
|
||||||
static void SetParameters(cltune::Tuner &tuner, const size_t id) {
|
static void SetParameters(cltune::Tuner &tuner, const size_t id) {
|
||||||
tuner.AddParameter(id, "WGS"+std::to_string(V), {64, 128, 256, 512, 1024, 1536, 2048});
|
tuner.AddParameter(id, "WGS"+std::to_string(V), {64, 128, 256});
|
||||||
tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4, 8});
|
tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4});
|
||||||
if (V==2 || V==3) { tuner.AddParameter(id, "VW"+std::to_string(V), {1, 2, 4, 8}); }
|
if (V==2 || V==3) { tuner.AddParameter(id, "VW"+std::to_string(V), {1, 2, 4, 8}); }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -72,7 +73,10 @@ class TuneXgemv {
|
||||||
tuner.AddConstraint(id, MultipleOfX, {"WPT"+std::to_string(V), "VW"+std::to_string(V)});
|
tuner.AddConstraint(id, MultipleOfX, {"WPT"+std::to_string(V), "VW"+std::to_string(V)});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { }
|
static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
|
||||||
|
auto LocalMemorySize = [args] (std::vector<size_t> v) { return v[0]*GetBytes(args.precision); };
|
||||||
|
tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGS"+std::to_string(V)});
|
||||||
|
}
|
||||||
|
|
||||||
// Sets the base thread configuration
|
// Sets the base thread configuration
|
||||||
static std::vector<size_t> GlobalSize(const Arguments<T> &args) { return {args.m}; }
|
static std::vector<size_t> GlobalSize(const Arguments<T> &args) { return {args.m}; }
|
||||||
|
@ -108,6 +112,9 @@ class TuneXgemv {
|
||||||
tuner.AddArgumentScalar(0);
|
tuner.AddArgumentScalar(0);
|
||||||
tuner.AddArgumentScalar(1);
|
tuner.AddArgumentScalar(1);
|
||||||
tuner.AddArgumentScalar(0); // Conjugate transpose
|
tuner.AddArgumentScalar(0); // Conjugate transpose
|
||||||
|
tuner.AddArgumentScalar(0); // Additional parameter
|
||||||
|
tuner.AddArgumentScalar(0); // Banded 'kl'
|
||||||
|
tuner.AddArgumentScalar(0); // Banded 'ku'
|
||||||
}
|
}
|
||||||
|
|
||||||
// Describes how to compute the performance metrics
|
// Describes how to compute the performance metrics
|
||||||
|
|
129
src/tuning/xger.cc
Normal file
129
src/tuning/xger.cc
Normal file
|
@ -0,0 +1,129 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file uses the CLTune auto-tuner to tune the xger OpenCL kernels.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "internal/utilities.h"
|
||||||
|
#include "internal/tuning.h"
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// See comment at top of file for a description of the class
|
||||||
|
template <typename T>
|
||||||
|
class TuneXger {
|
||||||
|
public:
|
||||||
|
|
||||||
|
// The representative kernel and the source code
|
||||||
|
static std::string KernelFamily() { return "xger"; }
|
||||||
|
static std::string KernelName() { return "Xger"; }
|
||||||
|
static std::string GetSources() {
|
||||||
|
return
|
||||||
|
#include "../src/kernels/common.opencl"
|
||||||
|
#include "../src/kernels/level2/level2.opencl"
|
||||||
|
#include "../src/kernels/level2/xger.opencl"
|
||||||
|
;
|
||||||
|
}
|
||||||
|
|
||||||
|
// The list of arguments relevant for this routine
|
||||||
|
static std::vector<std::string> GetOptions() { return {kArgN, kArgM, kArgAlpha}; }
|
||||||
|
|
||||||
|
// Tests for valid arguments
|
||||||
|
static void TestValidArguments(const Arguments<T> &) { }
|
||||||
|
|
||||||
|
// Sets the default values for the arguments
|
||||||
|
static size_t DefaultM() { return 1024; }
|
||||||
|
static size_t DefaultN() { return 1024; }
|
||||||
|
static size_t DefaultK() { return 1; } // N/A for this kernel
|
||||||
|
static double DefaultFraction() { return 1.0; } // N/A for this kernel
|
||||||
|
|
||||||
|
// Describes how to obtain the sizes of the buffers
|
||||||
|
static size_t GetSizeX(const Arguments<T> &args) { return args.m; }
|
||||||
|
static size_t GetSizeY(const Arguments<T> &args) { return args.n; }
|
||||||
|
static size_t GetSizeA(const Arguments<T> &args) { return args.m * args.n; }
|
||||||
|
static size_t GetSizeB(const Arguments<T> &) { return 1; } // N/A for this kernel
|
||||||
|
static size_t GetSizeC(const Arguments<T> &) { return 1; } // N/A for this kernel
|
||||||
|
static size_t GetSizeTemp(const Arguments<T> &) { return 1; } // N/A for this kernel
|
||||||
|
|
||||||
|
// Sets the tuning parameters and their possible values
|
||||||
|
static void SetParameters(cltune::Tuner &tuner, const size_t id) {
|
||||||
|
tuner.AddParameter(id, "WGS1", {4, 8, 16, 32, 64, 128, 256, 512});
|
||||||
|
tuner.AddParameter(id, "WGS2", {1, 2, 4, 8, 16, 32, 64, 128, 256});
|
||||||
|
tuner.AddParameter(id, "WPT", {1, 2, 4});
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sets the constraints and local memory size
|
||||||
|
static void SetConstraints(cltune::Tuner &, const size_t) { }
|
||||||
|
static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { }
|
||||||
|
|
||||||
|
// Sets the base thread configuration
|
||||||
|
static std::vector<size_t> GlobalSize(const Arguments<T> &args) { return {args.m, args.n}; }
|
||||||
|
static std::vector<size_t> GlobalSizeRef(const Arguments<T> &args) { return GlobalSize(args); }
|
||||||
|
static std::vector<size_t> LocalSize() { return {1, 1}; }
|
||||||
|
static std::vector<size_t> LocalSizeRef() { return {8, 8}; }
|
||||||
|
|
||||||
|
// Transforms the thread configuration based on the parameters
|
||||||
|
using TransformVector = std::vector<std::vector<std::string>>;
|
||||||
|
static TransformVector MulLocal() { return {{"WGS1", "WGS2"}}; }
|
||||||
|
static TransformVector DivLocal() { return {}; }
|
||||||
|
static TransformVector MulGlobal() { return {}; }
|
||||||
|
static TransformVector DivGlobal() { return {{"WPT", "WPT"}}; }
|
||||||
|
|
||||||
|
// Sets the kernel's arguments
|
||||||
|
static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
|
||||||
|
std::vector<T> &x_vec, std::vector<T> &y_vec,
|
||||||
|
std::vector<T> &a_mat, std::vector<T> &, std::vector<T> &,
|
||||||
|
std::vector<T> &) {
|
||||||
|
tuner.AddArgumentScalar(static_cast<int>(args.m));
|
||||||
|
tuner.AddArgumentScalar(static_cast<int>(args.n));
|
||||||
|
tuner.AddArgumentScalar(args.alpha);
|
||||||
|
tuner.AddArgumentInput(x_vec);
|
||||||
|
tuner.AddArgumentScalar(0); // x_offset
|
||||||
|
tuner.AddArgumentScalar(1); // x_increment
|
||||||
|
tuner.AddArgumentInput(y_vec);
|
||||||
|
tuner.AddArgumentScalar(0); // y_offset
|
||||||
|
tuner.AddArgumentScalar(1); // y_increment
|
||||||
|
tuner.AddArgumentOutput(a_mat);
|
||||||
|
tuner.AddArgumentScalar(0); // a_offset
|
||||||
|
tuner.AddArgumentScalar(static_cast<int>(args.m)); // a_ld
|
||||||
|
tuner.AddArgumentScalar(0); // a_is_rowmajor
|
||||||
|
}
|
||||||
|
|
||||||
|
// Describes how to compute the performance metrics
|
||||||
|
static size_t GetMetric(const Arguments<T> &args) {
|
||||||
|
return (2*args.m*args.n + args.m + args.n) * GetBytes(args.precision);
|
||||||
|
}
|
||||||
|
static std::string PerformanceUnit() { return "GB/s"; }
|
||||||
|
};
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
||||||
|
|
||||||
|
// Shortcuts to the clblast namespace
|
||||||
|
using float2 = clblast::float2;
|
||||||
|
using double2 = clblast::double2;
|
||||||
|
|
||||||
|
// Main function (not within the clblast namespace)
|
||||||
|
int main(int argc, char *argv[]) {
|
||||||
|
switch(clblast::GetPrecision(argc, argv)) {
|
||||||
|
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
||||||
|
case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneXger<float>, float>(argc, argv); break;
|
||||||
|
case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneXger<double>, double>(argc, argv); break;
|
||||||
|
case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneXger<float2>, float2>(argc, argv); break;
|
||||||
|
case clblast::Precision::kComplexDouble: clblast::Tuner<clblast::TuneXger<double2>, double2>(argc, argv); break;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
|
@ -103,7 +103,13 @@ std::string ToString(Precision value) {
|
||||||
// both the real and imaginary parts.
|
// both the real and imaginary parts.
|
||||||
template <typename T>
|
template <typename T>
|
||||||
T ConvertArgument(const char* value) {
|
T ConvertArgument(const char* value) {
|
||||||
return static_cast<T>(std::stod(value));
|
return static_cast<T>(std::stoi(value));
|
||||||
|
}
|
||||||
|
template <> float ConvertArgument(const char* value) {
|
||||||
|
return static_cast<float>(std::stod(value));
|
||||||
|
}
|
||||||
|
template <> double ConvertArgument(const char* value) {
|
||||||
|
return static_cast<double>(std::stod(value));
|
||||||
}
|
}
|
||||||
template <> float2 ConvertArgument(const char* value) {
|
template <> float2 ConvertArgument(const char* value) {
|
||||||
auto val = static_cast<float>(std::stod(value));
|
auto val = static_cast<float>(std::stod(value));
|
||||||
|
@ -139,7 +145,6 @@ T GetArgument(const int argc, char *argv[], std::string &help,
|
||||||
}
|
}
|
||||||
|
|
||||||
// Compiles the above function
|
// Compiles the above function
|
||||||
template bool GetArgument<bool>(const int, char **, std::string&, const std::string&, const bool);
|
|
||||||
template int GetArgument<int>(const int, char **, std::string&, const std::string&, const int);
|
template int GetArgument<int>(const int, char **, std::string&, const std::string&, const int);
|
||||||
template size_t GetArgument<size_t>(const int, char **, std::string&, const std::string&, const size_t);
|
template size_t GetArgument<size_t>(const int, char **, std::string&, const std::string&, const size_t);
|
||||||
template float GetArgument<float>(const int, char **, std::string&, const std::string&, const float);
|
template float GetArgument<float>(const int, char **, std::string&, const std::string&, const float);
|
||||||
|
@ -156,9 +161,9 @@ template Precision GetArgument<Precision>(const int, char **, std::string&, cons
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
// Returns only the precision argument
|
// Returns only the precision argument
|
||||||
Precision GetPrecision(const int argc, char *argv[]) {
|
Precision GetPrecision(const int argc, char *argv[], const Precision default_precision) {
|
||||||
auto dummy = std::string{};
|
auto dummy = std::string{};
|
||||||
return GetArgument(argc, argv, dummy, kArgPrecision, Precision::kSingle);
|
return GetArgument(argc, argv, dummy, kArgPrecision, default_precision);
|
||||||
}
|
}
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
|
@ -35,7 +35,7 @@ TestBlas<T,U>::TestBlas(int argc, char *argv[], const bool silent,
|
||||||
const Routine run_routine, const Routine run_reference,
|
const Routine run_routine, const Routine run_reference,
|
||||||
const ResultGet get_result, const ResultIndex get_index,
|
const ResultGet get_result, const ResultIndex get_index,
|
||||||
const ResultIterator get_id1, const ResultIterator get_id2):
|
const ResultIterator get_id1, const ResultIterator get_id2):
|
||||||
Tester<T,U>{argc, argv, silent, name, options},
|
Tester<T,U>(argc, argv, silent, name, options),
|
||||||
run_routine_(run_routine),
|
run_routine_(run_routine),
|
||||||
run_reference_(run_reference),
|
run_reference_(run_reference),
|
||||||
get_result_(get_result),
|
get_result_(get_result),
|
||||||
|
|
|
@ -80,11 +80,11 @@ template <typename T, typename U>
|
||||||
Tester<T,U>::~Tester() {
|
Tester<T,U>::~Tester() {
|
||||||
if (PrecisionSupported<T>(device_)) {
|
if (PrecisionSupported<T>(device_)) {
|
||||||
fprintf(stdout, "* Completed all test-cases for this routine. Results:\n");
|
fprintf(stdout, "* Completed all test-cases for this routine. Results:\n");
|
||||||
fprintf(stdout, " %lu test(s) passed\n", tests_passed_);
|
fprintf(stdout, " %zu test(s) passed\n", tests_passed_);
|
||||||
if (tests_skipped_ > 0) { fprintf(stdout, "%s", kPrintWarning.c_str()); }
|
if (tests_skipped_ > 0) { fprintf(stdout, "%s", kPrintWarning.c_str()); }
|
||||||
fprintf(stdout, " %lu test(s) skipped%s\n", tests_skipped_, kPrintEnd.c_str());
|
fprintf(stdout, " %zu test(s) skipped%s\n", tests_skipped_, kPrintEnd.c_str());
|
||||||
if (tests_failed_ > 0) { fprintf(stdout, "%s", kPrintError.c_str()); }
|
if (tests_failed_ > 0) { fprintf(stdout, "%s", kPrintError.c_str()); }
|
||||||
fprintf(stdout, " %lu test(s) failed%s\n", tests_failed_, kPrintEnd.c_str());
|
fprintf(stdout, " %zu test(s) failed%s\n", tests_failed_, kPrintEnd.c_str());
|
||||||
}
|
}
|
||||||
fprintf(stdout, "\n");
|
fprintf(stdout, "\n");
|
||||||
clblasTeardown();
|
clblasTeardown();
|
||||||
|
@ -129,29 +129,29 @@ void Tester<T,U>::TestEnd() {
|
||||||
fprintf(stdout, " Status code %d (expected %d): ", entry.status_found, entry.status_expect);
|
fprintf(stdout, " Status code %d (expected %d): ", entry.status_found, entry.status_expect);
|
||||||
}
|
}
|
||||||
for (auto &o: options_) {
|
for (auto &o: options_) {
|
||||||
if (o == kArgM) { fprintf(stdout, "%s=%lu ", kArgM, entry.args.m); }
|
if (o == kArgM) { fprintf(stdout, "%s=%zu ", kArgM, entry.args.m); }
|
||||||
if (o == kArgN) { fprintf(stdout, "%s=%lu ", kArgN, entry.args.n); }
|
if (o == kArgN) { fprintf(stdout, "%s=%zu ", kArgN, entry.args.n); }
|
||||||
if (o == kArgK) { fprintf(stdout, "%s=%lu ", kArgK, entry.args.k); }
|
if (o == kArgK) { fprintf(stdout, "%s=%zu ", kArgK, entry.args.k); }
|
||||||
if (o == kArgKU) { fprintf(stdout, "%s=%lu ", kArgKU, entry.args.ku); }
|
if (o == kArgKU) { fprintf(stdout, "%s=%zu ", kArgKU, entry.args.ku); }
|
||||||
if (o == kArgKL) { fprintf(stdout, "%s=%lu ", kArgKL, entry.args.kl); }
|
if (o == kArgKL) { fprintf(stdout, "%s=%zu ", kArgKL, entry.args.kl); }
|
||||||
if (o == kArgLayout) { fprintf(stdout, "%s=%d ", kArgLayout, entry.args.layout);}
|
if (o == kArgLayout) { fprintf(stdout, "%s=%d ", kArgLayout, entry.args.layout);}
|
||||||
if (o == kArgATransp) { fprintf(stdout, "%s=%d ", kArgATransp, entry.args.a_transpose);}
|
if (o == kArgATransp) { fprintf(stdout, "%s=%d ", kArgATransp, entry.args.a_transpose);}
|
||||||
if (o == kArgBTransp) { fprintf(stdout, "%s=%d ", kArgBTransp, entry.args.b_transpose);}
|
if (o == kArgBTransp) { fprintf(stdout, "%s=%d ", kArgBTransp, entry.args.b_transpose);}
|
||||||
if (o == kArgSide) { fprintf(stdout, "%s=%d ", kArgSide, entry.args.side);}
|
if (o == kArgSide) { fprintf(stdout, "%s=%d ", kArgSide, entry.args.side);}
|
||||||
if (o == kArgTriangle) { fprintf(stdout, "%s=%d ", kArgTriangle, entry.args.triangle);}
|
if (o == kArgTriangle) { fprintf(stdout, "%s=%d ", kArgTriangle, entry.args.triangle);}
|
||||||
if (o == kArgDiagonal) { fprintf(stdout, "%s=%d ", kArgDiagonal, entry.args.diagonal);}
|
if (o == kArgDiagonal) { fprintf(stdout, "%s=%d ", kArgDiagonal, entry.args.diagonal);}
|
||||||
if (o == kArgXInc) { fprintf(stdout, "%s=%lu ", kArgXInc, entry.args.x_inc);}
|
if (o == kArgXInc) { fprintf(stdout, "%s=%zu ", kArgXInc, entry.args.x_inc);}
|
||||||
if (o == kArgYInc) { fprintf(stdout, "%s=%lu ", kArgYInc, entry.args.y_inc);}
|
if (o == kArgYInc) { fprintf(stdout, "%s=%zu ", kArgYInc, entry.args.y_inc);}
|
||||||
if (o == kArgXOffset) { fprintf(stdout, "%s=%lu ", kArgXOffset, entry.args.x_offset);}
|
if (o == kArgXOffset) { fprintf(stdout, "%s=%zu ", kArgXOffset, entry.args.x_offset);}
|
||||||
if (o == kArgYOffset) { fprintf(stdout, "%s=%lu ", kArgYOffset, entry.args.y_offset);}
|
if (o == kArgYOffset) { fprintf(stdout, "%s=%zu ", kArgYOffset, entry.args.y_offset);}
|
||||||
if (o == kArgALeadDim) { fprintf(stdout, "%s=%lu ", kArgALeadDim, entry.args.a_ld);}
|
if (o == kArgALeadDim) { fprintf(stdout, "%s=%zu ", kArgALeadDim, entry.args.a_ld);}
|
||||||
if (o == kArgBLeadDim) { fprintf(stdout, "%s=%lu ", kArgBLeadDim, entry.args.b_ld);}
|
if (o == kArgBLeadDim) { fprintf(stdout, "%s=%zu ", kArgBLeadDim, entry.args.b_ld);}
|
||||||
if (o == kArgCLeadDim) { fprintf(stdout, "%s=%lu ", kArgCLeadDim, entry.args.c_ld);}
|
if (o == kArgCLeadDim) { fprintf(stdout, "%s=%zu ", kArgCLeadDim, entry.args.c_ld);}
|
||||||
if (o == kArgAOffset) { fprintf(stdout, "%s=%lu ", kArgAOffset, entry.args.a_offset);}
|
if (o == kArgAOffset) { fprintf(stdout, "%s=%zu ", kArgAOffset, entry.args.a_offset);}
|
||||||
if (o == kArgBOffset) { fprintf(stdout, "%s=%lu ", kArgBOffset, entry.args.b_offset);}
|
if (o == kArgBOffset) { fprintf(stdout, "%s=%zu ", kArgBOffset, entry.args.b_offset);}
|
||||||
if (o == kArgCOffset) { fprintf(stdout, "%s=%lu ", kArgCOffset, entry.args.c_offset);}
|
if (o == kArgCOffset) { fprintf(stdout, "%s=%zu ", kArgCOffset, entry.args.c_offset);}
|
||||||
if (o == kArgAPOffset) { fprintf(stdout, "%s=%lu ", kArgAPOffset, entry.args.ap_offset);}
|
if (o == kArgAPOffset) { fprintf(stdout, "%s=%zu ", kArgAPOffset, entry.args.ap_offset);}
|
||||||
if (o == kArgDotOffset){ fprintf(stdout, "%s=%lu ", kArgDotOffset, entry.args.dot_offset);}
|
if (o == kArgDotOffset){ fprintf(stdout, "%s=%zu ", kArgDotOffset, entry.args.dot_offset);}
|
||||||
}
|
}
|
||||||
fprintf(stdout, "\n");
|
fprintf(stdout, "\n");
|
||||||
}
|
}
|
||||||
|
@ -159,18 +159,18 @@ void Tester<T,U>::TestEnd() {
|
||||||
// Prints a test summary
|
// Prints a test summary
|
||||||
auto pass_rate = 100*num_passed_ / static_cast<float>(num_passed_ + num_skipped_ + num_failed_);
|
auto pass_rate = 100*num_passed_ / static_cast<float>(num_passed_ + num_skipped_ + num_failed_);
|
||||||
fprintf(stdout, " Pass rate %s%5.1lf%%%s:", kPrintMessage.c_str(), pass_rate, kPrintEnd.c_str());
|
fprintf(stdout, " Pass rate %s%5.1lf%%%s:", kPrintMessage.c_str(), pass_rate, kPrintEnd.c_str());
|
||||||
fprintf(stdout, " %lu passed /", num_passed_);
|
fprintf(stdout, " %zu passed /", num_passed_);
|
||||||
if (num_skipped_ != 0) {
|
if (num_skipped_ != 0) {
|
||||||
fprintf(stdout, " %s%lu skipped%s /", kPrintWarning.c_str(), num_skipped_, kPrintEnd.c_str());
|
fprintf(stdout, " %s%zu skipped%s /", kPrintWarning.c_str(), num_skipped_, kPrintEnd.c_str());
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
fprintf(stdout, " %lu skipped /", num_skipped_);
|
fprintf(stdout, " %zu skipped /", num_skipped_);
|
||||||
}
|
}
|
||||||
if (num_failed_ != 0) {
|
if (num_failed_ != 0) {
|
||||||
fprintf(stdout, " %s%lu failed%s\n", kPrintError.c_str(), num_failed_, kPrintEnd.c_str());
|
fprintf(stdout, " %s%zu failed%s\n", kPrintError.c_str(), num_failed_, kPrintEnd.c_str());
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
fprintf(stdout, " %lu failed\n", num_failed_);
|
fprintf(stdout, " %zu failed\n", num_failed_);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -280,21 +280,21 @@ bool TestSimilarity(const T val1, const T val2) {
|
||||||
const auto difference = std::fabs(val1 - val2);
|
const auto difference = std::fabs(val1 - val2);
|
||||||
|
|
||||||
// Set the allowed error margin for floating-point comparisons
|
// Set the allowed error margin for floating-point comparisons
|
||||||
constexpr auto kErrorMarginRelative = 1.0e-2;
|
constexpr auto kErrorMarginRelative = T{0.025};
|
||||||
constexpr auto kErrorMarginAbsolute = 1.0e-10;
|
constexpr auto kErrorMarginAbsolute = T{1.0e-6};
|
||||||
|
|
||||||
// Shortcut, handles infinities
|
// Shortcut, handles infinities
|
||||||
if (val1 == val2) {
|
if (val1 == val2) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
// The values are zero or very small: the relative error is less meaningful
|
// The values are zero or very small: the relative error is less meaningful
|
||||||
else if (val1 == 0 || val2 == 0 || difference < static_cast<T>(kErrorMarginAbsolute)) {
|
else if (val1 == 0 || val2 == 0 || difference < kErrorMarginAbsolute) {
|
||||||
return (difference < static_cast<T>(kErrorMarginAbsolute));
|
return (difference < kErrorMarginAbsolute);
|
||||||
}
|
}
|
||||||
// Use relative error
|
// Use relative error
|
||||||
else {
|
else {
|
||||||
const auto absolute_sum = std::fabs(val1) + std::fabs(val2);
|
const auto absolute_sum = std::fabs(val1) + std::fabs(val2);
|
||||||
return (difference / absolute_sum) < static_cast<T>(kErrorMarginRelative);
|
return (difference / absolute_sum) < kErrorMarginRelative;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -15,6 +15,7 @@
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
#include <utility>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <chrono>
|
#include <chrono>
|
||||||
|
|
||||||
|
@ -48,11 +49,11 @@ Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const GetMetric
|
||||||
for (auto &o: options_) {
|
for (auto &o: options_) {
|
||||||
|
|
||||||
// Data-sizes
|
// Data-sizes
|
||||||
if (o == kArgM) { args.m = GetArgument(argc, argv, help, kArgM, 512UL); }
|
if (o == kArgM) { args.m = GetArgument(argc, argv, help, kArgM, size_t{512}); }
|
||||||
if (o == kArgN) { args.n = GetArgument(argc, argv, help, kArgN, 512UL); }
|
if (o == kArgN) { args.n = GetArgument(argc, argv, help, kArgN, size_t{512}); }
|
||||||
if (o == kArgK) { args.k = GetArgument(argc, argv, help, kArgK, 512UL); }
|
if (o == kArgK) { args.k = GetArgument(argc, argv, help, kArgK, size_t{512}); }
|
||||||
if (o == kArgKU) { args.ku = GetArgument(argc, argv, help, kArgKU, 128UL); }
|
if (o == kArgKU) { args.ku = GetArgument(argc, argv, help, kArgKU, size_t{128}); }
|
||||||
if (o == kArgKL) { args.kl = GetArgument(argc, argv, help, kArgKL, 128UL); }
|
if (o == kArgKL) { args.kl = GetArgument(argc, argv, help, kArgKL, size_t{128}); }
|
||||||
|
|
||||||
// Data-layouts
|
// Data-layouts
|
||||||
if (o == kArgLayout) { args.layout = GetArgument(argc, argv, help, kArgLayout, Layout::kRowMajor); }
|
if (o == kArgLayout) { args.layout = GetArgument(argc, argv, help, kArgLayout, Layout::kRowMajor); }
|
||||||
|
@ -89,7 +90,7 @@ Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const GetMetric
|
||||||
args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0});
|
args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0});
|
||||||
args.device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0});
|
args.device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0});
|
||||||
args.precision = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle);
|
args.precision = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle);
|
||||||
args.compare_clblas = GetArgument(argc, argv, help, kArgCompareclblas, true);
|
args.compare_clblas = GetArgument(argc, argv, help, kArgCompareclblas, 1);
|
||||||
args.step = GetArgument(argc, argv, help, kArgStepSize, size_t{1});
|
args.step = GetArgument(argc, argv, help, kArgStepSize, size_t{1});
|
||||||
args.num_steps = GetArgument(argc, argv, help, kArgNumSteps, size_t{0});
|
args.num_steps = GetArgument(argc, argv, help, kArgNumSteps, size_t{0});
|
||||||
args.num_runs = GetArgument(argc, argv, help, kArgNumRuns, size_t{10});
|
args.num_runs = GetArgument(argc, argv, help, kArgNumRuns, size_t{10});
|
||||||
|
@ -112,7 +113,7 @@ template <typename T, typename U>
|
||||||
void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes) {
|
void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes) {
|
||||||
|
|
||||||
// Prints the header of the output table
|
// Prints the header of the output table
|
||||||
PrintTableHeader(args.silent, options_);
|
PrintTableHeader(args);
|
||||||
|
|
||||||
// Initializes OpenCL and the libraries
|
// Initializes OpenCL and the libraries
|
||||||
auto platform = Platform(args.platform_id);
|
auto platform = Platform(args.platform_id);
|
||||||
|
@ -162,11 +163,16 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
|
||||||
auto buffers = Buffers<T>{x_vec, y_vec, a_mat, b_mat, c_mat, ap_mat, dot};
|
auto buffers = Buffers<T>{x_vec, y_vec, a_mat, b_mat, c_mat, ap_mat, dot};
|
||||||
|
|
||||||
// Runs the routines and collects the timings
|
// Runs the routines and collects the timings
|
||||||
|
auto timings = std::vector<std::pair<std::string, double>>();
|
||||||
auto ms_clblast = TimedExecution(args.num_runs, args, buffers, queue, run_routine_, "CLBlast");
|
auto ms_clblast = TimedExecution(args.num_runs, args, buffers, queue, run_routine_, "CLBlast");
|
||||||
auto ms_clblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference_, "clBLAS");
|
timings.push_back(std::pair<std::string, double>("CLBlast", ms_clblast));
|
||||||
|
if (args.compare_clblas) {
|
||||||
|
auto ms_clblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference_, "clBLAS");
|
||||||
|
timings.push_back(std::pair<std::string, double>("clBLAS", ms_clblas));
|
||||||
|
}
|
||||||
|
|
||||||
// Prints the performance of both libraries
|
// Prints the performance of the tested libraries
|
||||||
PrintTableRow(args, ms_clblast, ms_clblas);
|
PrintTableRow(args, timings);
|
||||||
|
|
||||||
// Makes the jump to the next step
|
// Makes the jump to the next step
|
||||||
++s;
|
++s;
|
||||||
|
@ -213,20 +219,27 @@ double Client<T,U>::TimedExecution(const size_t num_runs, const Arguments<U> &ar
|
||||||
|
|
||||||
// Prints the header of the performance table
|
// Prints the header of the performance table
|
||||||
template <typename T, typename U>
|
template <typename T, typename U>
|
||||||
void Client<T,U>::PrintTableHeader(const bool silent, const std::vector<std::string> &args) {
|
void Client<T,U>::PrintTableHeader(const Arguments<U>& args) {
|
||||||
if (!silent) {
|
|
||||||
for (auto i=size_t{0}; i<args.size(); ++i) { fprintf(stdout, "%9s ", ""); }
|
// First line (optional)
|
||||||
fprintf(stdout, " | <-- CLBlast --> | <-- clBLAS --> |\n");
|
if (!args.silent) {
|
||||||
|
for (auto i=size_t{0}; i<options_.size(); ++i) { fprintf(stdout, "%9s ", ""); }
|
||||||
|
fprintf(stdout, " | <-- CLBlast -->");
|
||||||
|
if (args.compare_clblas) { fprintf(stdout, " | <-- clBLAS -->"); }
|
||||||
|
fprintf(stdout, " |\n");
|
||||||
}
|
}
|
||||||
for (auto &argument: args) { fprintf(stdout, "%9s;", argument.c_str()); }
|
|
||||||
fprintf(stdout, "%9s;%9s;%9s;%9s;%9s;%9s\n",
|
// Second line
|
||||||
"ms_1", "GFLOPS_1", "GBs_1", "ms_2", "GFLOPS_2", "GBs_2");
|
for (auto &option: options_) { fprintf(stdout, "%9s;", option.c_str()); }
|
||||||
|
fprintf(stdout, "%9s;%9s;%9s", "ms_1", "GFLOPS_1", "GBs_1");
|
||||||
|
if (args.compare_clblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_2", "GFLOPS_2", "GBs_2"); }
|
||||||
|
fprintf(stdout, "\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Print a performance-result row
|
// Print a performance-result row
|
||||||
template <typename T, typename U>
|
template <typename T, typename U>
|
||||||
void Client<T,U>::PrintTableRow(const Arguments<U>& args, const double ms_clblast,
|
void Client<T,U>::PrintTableRow(const Arguments<U>& args,
|
||||||
const double ms_clblas) {
|
const std::vector<std::pair<std::string, double>>& timings) {
|
||||||
|
|
||||||
// Creates a vector of relevant variables
|
// Creates a vector of relevant variables
|
||||||
auto integers = std::vector<size_t>{};
|
auto integers = std::vector<size_t>{};
|
||||||
|
@ -261,34 +274,36 @@ void Client<T,U>::PrintTableRow(const Arguments<U>& args, const double ms_clblas
|
||||||
else if (o == kArgBeta) { strings.push_back(ToString(args.beta)); }
|
else if (o == kArgBeta) { strings.push_back(ToString(args.beta)); }
|
||||||
}
|
}
|
||||||
|
|
||||||
// Computes the GFLOPS and GB/s metrics
|
|
||||||
auto flops = get_flops_(args);
|
|
||||||
auto bytes = get_bytes_(args);
|
|
||||||
auto gflops_clblast = (ms_clblast != 0.0) ? (flops*1e-6)/ms_clblast : 0;
|
|
||||||
auto gflops_clblas = (ms_clblas != 0.0) ? (flops*1e-6)/ms_clblas: 0;
|
|
||||||
auto gbs_clblast = (ms_clblast != 0.0) ? (bytes*1e-6)/ms_clblast : 0;
|
|
||||||
auto gbs_clblas = (ms_clblas != 0.0) ? (bytes*1e-6)/ms_clblas: 0;
|
|
||||||
|
|
||||||
// Outputs the argument values
|
// Outputs the argument values
|
||||||
for (auto &argument: integers) {
|
for (auto &argument: integers) {
|
||||||
if (!args.no_abbrv && argument >= 1024*1024 && IsMultiple(argument, 1024*1024)) {
|
if (!args.no_abbrv && argument >= 1024*1024 && IsMultiple(argument, 1024*1024)) {
|
||||||
fprintf(stdout, "%8luM;", argument/(1024*1024));
|
fprintf(stdout, "%8zuM;", argument/(1024*1024));
|
||||||
}
|
}
|
||||||
else if (!args.no_abbrv && argument >= 1024 && IsMultiple(argument, 1024)) {
|
else if (!args.no_abbrv && argument >= 1024 && IsMultiple(argument, 1024)) {
|
||||||
fprintf(stdout, "%8luK;", argument/1024);
|
fprintf(stdout, "%8zuK;", argument/1024);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
fprintf(stdout, "%9lu;", argument);
|
fprintf(stdout, "%9zu;", argument);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (auto &argument: strings) {
|
for (auto &argument: strings) {
|
||||||
fprintf(stdout, "%9s;", argument.c_str());
|
fprintf(stdout, "%9s;", argument.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
// Outputs the performance numbers
|
// Loops over all tested libraries
|
||||||
fprintf(stdout, "%9.2lf;%9.1lf;%9.1lf;%9.2lf;%9.1lf;%9.1lf\n",
|
for (const auto& timing : timings) {
|
||||||
ms_clblast, gflops_clblast, gbs_clblast,
|
|
||||||
ms_clblas, gflops_clblas, gbs_clblas);
|
// Computes the GFLOPS and GB/s metrics
|
||||||
|
auto flops = get_flops_(args);
|
||||||
|
auto bytes = get_bytes_(args);
|
||||||
|
auto gflops = (timing.second != 0.0) ? (flops*1e-6)/timing.second : 0;
|
||||||
|
auto gbs = (timing.second != 0.0) ? (bytes*1e-6)/timing.second : 0;
|
||||||
|
|
||||||
|
// Outputs the performance numbers
|
||||||
|
if (timing.first != "CLBlast") { fprintf(stdout, ";"); }
|
||||||
|
fprintf(stdout, "%9.2lf;%9.1lf;%9.1lf", timing.second, gflops, gbs);
|
||||||
|
}
|
||||||
|
fprintf(stdout, "\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
|
@ -23,6 +23,7 @@
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
#include <utility>
|
||||||
|
|
||||||
// The libraries to test
|
// The libraries to test
|
||||||
#include <clBLAS.h>
|
#include <clBLAS.h>
|
||||||
|
@ -64,10 +65,11 @@ class Client {
|
||||||
Queue &queue, Routine run_blas, const std::string &library_name);
|
Queue &queue, Routine run_blas, const std::string &library_name);
|
||||||
|
|
||||||
// Prints the header of a performance-data table
|
// Prints the header of a performance-data table
|
||||||
void PrintTableHeader(const bool silent, const std::vector<std::string> &args);
|
void PrintTableHeader(const Arguments<U>& args);
|
||||||
|
|
||||||
// Prints a row of performance data, including results of two libraries
|
// Prints a row of performance data, including results of two libraries
|
||||||
void PrintTableRow(const Arguments<U>& args, const double ms_clblast, const double ms_clblas);
|
void PrintTableRow(const Arguments<U>& args,
|
||||||
|
const std::vector<std::pair<std::string, double>>& timings);
|
||||||
|
|
||||||
// The routine-specific functions passed to the tester
|
// The routine-specific functions passed to the tester
|
||||||
const Routine run_routine_;
|
const Routine run_routine_;
|
||||||
|
|
|
@ -63,7 +63,7 @@ main <- function(routine_name, precision, test_names, test_values,
|
||||||
if (precision == 64) { display_name <- gsub("^X","D",display_name); }
|
if (precision == 64) { display_name <- gsub("^X","D",display_name); }
|
||||||
if (precision == 3232) { display_name <- gsub("^X","C",display_name); }
|
if (precision == 3232) { display_name <- gsub("^X","C",display_name); }
|
||||||
if (precision == 6464) { display_name <- gsub("^X","Z",display_name); }
|
if (precision == 6464) { display_name <- gsub("^X","Z",display_name); }
|
||||||
executable <- paste("./client_", routine_name, sep="")
|
executable <- paste("./clblast_client_", routine_name, sep="")
|
||||||
|
|
||||||
# Configures the outputfile
|
# Configures the outputfile
|
||||||
pdf(paste(display_name, ".pdf", sep=""), height=8, width=13)
|
pdf(paste(display_name, ".pdf", sep=""), height=8, width=13)
|
||||||
|
|
|
@ -18,7 +18,7 @@ using double2 = clblast::double2;
|
||||||
|
|
||||||
// Main function (not within the clblast namespace)
|
// Main function (not within the clblast namespace)
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
switch(clblast::GetPrecision(argc, argv)) {
|
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
|
||||||
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
||||||
case clblast::Precision::kSingle:
|
case clblast::Precision::kSingle:
|
||||||
clblast::RunClient<clblast::TestXaxpy<float>, float, float>(argc, argv); break;
|
clblast::RunClient<clblast::TestXaxpy<float>, float, float>(argc, argv); break;
|
||||||
|
|
|
@ -18,7 +18,7 @@ using double2 = clblast::double2;
|
||||||
|
|
||||||
// Main function (not within the clblast namespace)
|
// Main function (not within the clblast namespace)
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
switch(clblast::GetPrecision(argc, argv)) {
|
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
|
||||||
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
||||||
case clblast::Precision::kSingle:
|
case clblast::Precision::kSingle:
|
||||||
clblast::RunClient<clblast::TestXcopy<float>, float, float>(argc, argv); break;
|
clblast::RunClient<clblast::TestXcopy<float>, float, float>(argc, argv); break;
|
||||||
|
|
|
@ -18,7 +18,7 @@ using double2 = clblast::double2;
|
||||||
|
|
||||||
// Main function (not within the clblast namespace)
|
// Main function (not within the clblast namespace)
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
switch(clblast::GetPrecision(argc, argv)) {
|
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
|
||||||
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
||||||
case clblast::Precision::kSingle:
|
case clblast::Precision::kSingle:
|
||||||
clblast::RunClient<clblast::TestXdot<float>, float, float>(argc, argv); break;
|
clblast::RunClient<clblast::TestXdot<float>, float, float>(argc, argv); break;
|
||||||
|
|
|
@ -18,7 +18,7 @@ using double2 = clblast::double2;
|
||||||
|
|
||||||
// Main function (not within the clblast namespace)
|
// Main function (not within the clblast namespace)
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
switch(clblast::GetPrecision(argc, argv)) {
|
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) {
|
||||||
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
||||||
case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
|
case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
|
||||||
case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");
|
case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");
|
||||||
|
|
|
@ -18,7 +18,7 @@ using double2 = clblast::double2;
|
||||||
|
|
||||||
// Main function (not within the clblast namespace)
|
// Main function (not within the clblast namespace)
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
switch(clblast::GetPrecision(argc, argv)) {
|
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) {
|
||||||
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
||||||
case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
|
case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
|
||||||
case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");
|
case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");
|
||||||
|
|
|
@ -18,7 +18,7 @@ using double2 = clblast::double2;
|
||||||
|
|
||||||
// Main function (not within the clblast namespace)
|
// Main function (not within the clblast namespace)
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
switch(clblast::GetPrecision(argc, argv)) {
|
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
|
||||||
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
||||||
case clblast::Precision::kSingle:
|
case clblast::Precision::kSingle:
|
||||||
clblast::RunClient<clblast::TestXscal<float>, float, float>(argc, argv); break;
|
clblast::RunClient<clblast::TestXscal<float>, float, float>(argc, argv); break;
|
||||||
|
|
|
@ -18,7 +18,7 @@ using double2 = clblast::double2;
|
||||||
|
|
||||||
// Main function (not within the clblast namespace)
|
// Main function (not within the clblast namespace)
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
switch(clblast::GetPrecision(argc, argv)) {
|
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
|
||||||
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
||||||
case clblast::Precision::kSingle:
|
case clblast::Precision::kSingle:
|
||||||
clblast::RunClient<clblast::TestXswap<float>, float, float>(argc, argv); break;
|
clblast::RunClient<clblast::TestXswap<float>, float, float>(argc, argv); break;
|
||||||
|
|
|
@ -18,7 +18,7 @@ using double2 = clblast::double2;
|
||||||
|
|
||||||
// Main function (not within the clblast namespace)
|
// Main function (not within the clblast namespace)
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
switch(clblast::GetPrecision(argc, argv)) {
|
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
|
||||||
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
||||||
case clblast::Precision::kSingle:
|
case clblast::Precision::kSingle:
|
||||||
clblast::RunClient<clblast::TestXgbmv<float>, float, float>(argc, argv); break;
|
clblast::RunClient<clblast::TestXgbmv<float>, float, float>(argc, argv); break;
|
||||||
|
|
|
@ -18,7 +18,7 @@ using double2 = clblast::double2;
|
||||||
|
|
||||||
// Main function (not within the clblast namespace)
|
// Main function (not within the clblast namespace)
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
switch(clblast::GetPrecision(argc, argv)) {
|
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
|
||||||
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
||||||
case clblast::Precision::kSingle:
|
case clblast::Precision::kSingle:
|
||||||
clblast::RunClient<clblast::TestXgemv<float>, float, float>(argc, argv); break;
|
clblast::RunClient<clblast::TestXgemv<float>, float, float>(argc, argv); break;
|
||||||
|
|
|
@ -18,7 +18,7 @@ using double2 = clblast::double2;
|
||||||
|
|
||||||
// Main function (not within the clblast namespace)
|
// Main function (not within the clblast namespace)
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
switch(clblast::GetPrecision(argc, argv)) {
|
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
|
||||||
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
||||||
case clblast::Precision::kSingle:
|
case clblast::Precision::kSingle:
|
||||||
clblast::RunClient<clblast::TestXger<float>, float, float>(argc, argv); break;
|
clblast::RunClient<clblast::TestXger<float>, float, float>(argc, argv); break;
|
||||||
|
|
|
@ -18,7 +18,7 @@ using double2 = clblast::double2;
|
||||||
|
|
||||||
// Main function (not within the clblast namespace)
|
// Main function (not within the clblast namespace)
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
switch(clblast::GetPrecision(argc, argv)) {
|
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) {
|
||||||
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
||||||
case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
|
case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
|
||||||
case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");
|
case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");
|
||||||
|
|
|
@ -18,7 +18,7 @@ using double2 = clblast::double2;
|
||||||
|
|
||||||
// Main function (not within the clblast namespace)
|
// Main function (not within the clblast namespace)
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
switch(clblast::GetPrecision(argc, argv)) {
|
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) {
|
||||||
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
||||||
case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
|
case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
|
||||||
case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");
|
case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");
|
||||||
|
|
|
@ -18,7 +18,7 @@ using double2 = clblast::double2;
|
||||||
|
|
||||||
// Main function (not within the clblast namespace)
|
// Main function (not within the clblast namespace)
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
switch(clblast::GetPrecision(argc, argv)) {
|
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) {
|
||||||
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
||||||
case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
|
case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
|
||||||
case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");
|
case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");
|
||||||
|
|
|
@ -18,7 +18,7 @@ using double2 = clblast::double2;
|
||||||
|
|
||||||
// Main function (not within the clblast namespace)
|
// Main function (not within the clblast namespace)
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
switch(clblast::GetPrecision(argc, argv)) {
|
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) {
|
||||||
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
||||||
case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
|
case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
|
||||||
case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");
|
case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");
|
||||||
|
|
|
@ -18,7 +18,7 @@ using double2 = clblast::double2;
|
||||||
|
|
||||||
// Main function (not within the clblast namespace)
|
// Main function (not within the clblast namespace)
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
switch(clblast::GetPrecision(argc, argv)) {
|
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) {
|
||||||
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
||||||
case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
|
case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
|
||||||
case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");
|
case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");
|
||||||
|
|
|
@ -18,7 +18,7 @@ using double2 = clblast::double2;
|
||||||
|
|
||||||
// Main function (not within the clblast namespace)
|
// Main function (not within the clblast namespace)
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
switch(clblast::GetPrecision(argc, argv)) {
|
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) {
|
||||||
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
||||||
case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
|
case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
|
||||||
case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");
|
case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Reference in a new issue