Merge pull request #31 from CNugteren/development

Update to version 0.6.0
This commit is contained in:
Cedric Nugteren 2016-03-13 11:05:51 +01:00
commit d190becd89
137 changed files with 6198 additions and 1463 deletions

3
.gitignore vendored
View file

@ -1,4 +1,5 @@
build
stash
.*
*.pyc
*.pyc
*.db

View file

@ -1,29 +1,69 @@
language: cpp
sudo: required
dist: trusty
compiler:
- gcc
- clang
addons:
apt:
sources:
# kubuntu-backports contains newer versions of cmake to install
- kubuntu-backports
packages:
- cmake
env:
global:
- CLBLAST_ROOT=${TRAVIS_BUILD_DIR}/bin/make/release
- OPENCL_REGISTRY=https://www.khronos.org/registry/cl
- OPENCL_ROOT=${TRAVIS_BUILD_DIR}/bin/opencl
before_install:
- sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
- sudo add-apt-repository -y ppa:kalakris/cmake
- sudo apt-get update -qq
- sudo apt-get install -qq gcc-4.8 g++-4.8 clang
- sudo apt-get install -qq fglrx=2:8.960-0ubuntu1 opencl-headers
- sudo apt-get install -qq cmake
- cmake --version;
- ${CC} --version;
- ${CXX} --version;
install:
- if [ "$CXX" = "g++" ]; then export CXX="g++-4.8" CC="gcc-4.8"; fi
# The following linux logic is necessary because of Travis's move to the GCE platform, which does not
# currently contain packages for fglrx: https://github.com/travis-ci/travis-ci/issues/5221
# We build our own linkable .so file
- if [ ${TRAVIS_OS_NAME} == "linux" ]; then
mkdir -p ${OPENCL_ROOT};
pushd ${OPENCL_ROOT};
travis_retry git clone --depth 1 https://github.com/KhronosGroup/OpenCL-ICD-Loader.git;
mv ./OpenCL-ICD-Loader/* .;
travis_retry git clone --depth 1 https://github.com/KhronosGroup/OpenCL-Headers.git inc/CL;
pushd inc/CL;
travis_retry wget -w 1 -np -nd -nv -A h,hpp ${OPENCL_REGISTRY}/api/2.1/cl.hpp;
popd;
mkdir -p lib;
pushd lib;
cmake -G "Unix Makefiles" ..;
make;
cp ./bin/libOpenCL.so .;
popd;
pushd inc/CL;
travis_retry git fetch origin opencl12:opencl12;
git checkout opencl12;
popd;
mv inc/ include/;
popd;
fi
before_script:
- mkdir install
- export PATH=`pwd`/install/bin:${PATH}
- export LD_LIBRARY_PATH=`pwd`/install/lib64:`pwd`/install/lib:${LD_LIBRARY_PATH}
- mkdir build
- cd build
- cmake -DCMAKE_INSTALL_PREFIX:PATH=../install ..
- mkdir -p ${CLBLAST_ROOT}
- pushd ${CLBLAST_ROOT}
- cmake -DOPENCL_ROOT=${OPENCL_ROOT} ${TRAVIS_BUILD_DIR}
script:
- make
- make install
branches:
only:
- master
- development
notifications:
email: false

View file

@ -1,4 +1,21 @@
Version 0.6.0
- Added support for MSVC (Visual Studio) 2015
- Added tuned parameters for various devices (see README)
- Now automatically generates C++ code from JSON tuning results
- Added level-2 routines:
* SGER/DGER
* CGERU/ZGERU
* CGERC/ZGERC
* CHER/ZHER
* CHPR/ZHPR
* CHER2/ZHER2
* CHPR2/ZHPR2
* CSYR/ZSYR
* CSPR/ZSPR
* CSYR2/ZSYR2
* CSPR2/ZSPR2
Version 0.5.0
- Improved structure and performance of level-2 routines (xSYMV/xHEMV)
- Reduced compilation time of level-3 OpenCL kernels

View file

@ -13,7 +13,7 @@
cmake_minimum_required(VERSION 2.8.10)
project("clblast" C CXX)
set(clblast_VERSION_MAJOR 0)
set(clblast_VERSION_MINOR 5)
set(clblast_VERSION_MINOR 6)
set(clblast_VERSION_PATCH 0)
# Options and their default values
@ -55,16 +55,21 @@ elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
endif()
# C++ compiler settings
set(FLAGS "-O3 -std=c++11")
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
set(FLAGS "${FLAGS} -Wall -Wno-comment -Wno-return-type -Wno-switch -Wno-missing-noreturn")
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9.0)
set(FLAGS "${FLAGS} -Wno-attributes -Wno-unused-variable")
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
set(FLAGS "/Ox")
set(FLAGS "${FLAGS} /wd4715")
else ()
set(FLAGS "-O3 -std=c++11")
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
set(FLAGS "${FLAGS} -Wall -Wno-comment -Wno-return-type -Wno-switch -Wno-missing-noreturn")
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9.0)
set(FLAGS "${FLAGS} -Wno-attributes -Wno-unused-variable")
endif()
elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
set(FLAGS "${FLAGS} -Weverything -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded")
set(FLAGS "${FLAGS} -Wno-missing-prototypes -Wno-float-equal -Wno-switch-enum -Wno-switch")
set(FLAGS "${FLAGS} -Wno-exit-time-destructors -Wno-global-constructors -Wno-missing-noreturn")
endif()
elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
set(FLAGS "${FLAGS} -Weverything -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded")
set(FLAGS "${FLAGS} -Wno-missing-prototypes -Wno-float-equal -Wno-switch-enum -Wno-switch")
set(FLAGS "${FLAGS} -Wno-exit-time-destructors -Wno-global-constructors -Wno-missing-noreturn")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}")
@ -102,14 +107,15 @@ include_directories(${clblast_SOURCE_DIR}/include ${OPENCL_INCLUDE_DIRS})
# ==================================================================================================
# Sets the supported routines and the used kernels. New routines and kernels should be added here.
set(KERNELS copy pad transpose padtranspose xaxpy xdot xgemv xgemm)
set(KERNELS copy pad transpose padtranspose xaxpy xdot xger xgemm xgemv)
set(SAMPLE_PROGRAMS_CPP sgemm)
set(SAMPLE_PROGRAMS_C sgemm)
set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc)
set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv)
set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv
xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm)
set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES})
set(PRECISIONS 32 3232 64 6464)
set(PRECISIONS 32 64 3232 6464)
# ==================================================================================================

View file

@ -6,7 +6,7 @@ CLBlast: The tuned OpenCL BLAS library
CLBlast is a modern, lightweight, performant and tunable OpenCL BLAS library written in C++11. It is designed to leverage the full performance potential of a wide variety of OpenCL devices from different vendors, including desktop and laptop GPUs, embedded GPUs, and other accelerators. CLBlast implements BLAS routines: basic linear algebra subprograms operating on vectors and matrices.
__Note that the CLBlast library is actively being developed, and is not mature enough for production environments__. This preview-version doesn't support the less commonly used routines yet: they will be added in due time. It also lacks extensive tuning on some common OpenCL platforms: __out-of-the-box performance on some devices might be poor__. See below for more details.
__Note that the CLBlast library is actively being developed, and might not be mature enough for production environments__. This preview-version doesn't support the less commonly used routines yet: they will be added in due time. It also lacks extensive tuning on some common OpenCL platforms: __out-of-the-box performance on some devices might be poor__. See below for more details (and how to tune yourself).
Why CLBlast and not clBLAS or cuBLAS?
@ -17,6 +17,9 @@ Use CLBlast instead of clBLAS:
* When you care about achieving maximum performance.
* When you want to be able to inspect the BLAS kernels or easily customize them to your needs.
* When you run on exotic OpenCL devices which you need to tune yourself.
* When you are still running on OpenCL 1.1 hardware.
* When you value an organized and modern C++ codebase.
* When you target Intel CPUs and GPUs or embedded devices
Use CLBlast instead of cuBLAS:
@ -41,10 +44,13 @@ The pre-requisites for compilation of CLBlast are:
- Clang 3.3 or newer
- AppleClang 5.0 or newer
- ICC 14.0 or newer
- MSVC (Visual Studio) 2015 or newer
* An OpenCL 1.1 or newer library, for example:
- Apple OpenCL
- NVIDIA CUDA SDK
- AMD APP SDK
- Intel OpenCL
- Beignet
An example of an out-of-source build (starting from the root of the CLBlast folder):
@ -79,13 +85,27 @@ Using the tuners (optional)
The CLBlast library will be tuned in the future for the most commonly used OpenCL devices. This pre-release of CLBlast is only tuned for a limited number of devices, in particular those with the following `CL_DEVICE_NAME` values:
* NVIDIA GPUs:
- GeForce GTX480
- GeForce GTX 480
- GeForce GTX 680
- GeForce GTX 750 Ti
- GeForce GTX 980
- GeForce GTX Titan
- GeForce GTX Titan X
- Tesla K20m
- Tesla K40m
* AMD GPUs:
- Tahiti
- R9 M370X
* Intel GPUs:
- Iris
- Iris Pro
* Intel CPUs:
- Core i5-6200U
- Core i7-3770K
- Core i7-5930K
* Other devices:
- ARM Mali-T628 GPU
- Intel MIC
If your device is not (yet) among this list or if you want to tune CLBlast for specific parameters (e.g. rectangular matrix sizes), you should compile the library with the optional tuners:
@ -93,9 +113,19 @@ If your device is not (yet) among this list or if you want to tune CLBlast for s
Note that CLBlast's tuners are based on the CLTune auto-tuning library, which has to be installed separately (version 1.7.0 or higher). CLTune is available from GitHub.
Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels correspond to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance.
Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels correspond to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance. Running `make alltuners` runs all tuners for all precisions in one go. You can set the default device and platform for `alltuners` by setting the `DEFAULT_DEVICE` and `DEFAULT_PLATFORM` environment variables before running CMake.
The tuner will output a C++ database compatible line with the results, which can be added to `include/internal/database/xxxxx.h` in the appropriate section. Or, if tuning parameters already exist for your device but you believe they can be improved, this is also the place where they can be modified. If you want the found parameters to be included in future releases of CLBlast, please post the JSON output in the corresponding issue on GitHub or [email the main author](http://www.cedricnugteren.nl).
The tuners output a JSON file with the results. The best results need to be added to `include/internal/database/xxxxx.h` in the appropriate section. However, this can be done automatically based on the JSON data using a Python script in `scripts/database/database.py`. If you want the found parameters to be included in future releases of CLBlast, please attach the JSON files to the corresponding issue on GitHub or [email the main author](http://www.cedricnugteren.nl).
In summary, tuning the entire library for your device can be done as follows (starting from the root of the CLBlast folder):
mkdir build
cd build
cmake -DTUNERS=ON ..
make
make alltuners
python ../scripts/database/database.py . ..
make
Compiling the tests (optional)
@ -127,10 +157,11 @@ These graphs can be generated automatically on your own device. First, compile C
Rscript path/to/test/performance/graphs/xgemm.r 0 1
Supported routines
-------------
CLBlast is in active development but already supports the majority of BLAS routines. The currently supported routines are marked with '✔' in the following tables:
CLBlast is in active development but already supports almost all the BLAS routines. The currently supported routines are marked with '✔' in the following tables. Empty boxes represent routines that still need to be implemented in a future release, whereas routines marked with '-' are not part of BLAS at all.
| Level-1 | S | D | C | Z | Notes |
| ---------|---|---|---|---|---------|
@ -149,7 +180,6 @@ CLBlast is in active development but already supports the majority of BLAS routi
| xASUM | | | - | - | +SC +DZ |
| IxAMAX | | | | | |
| Level-2 | S | D | C | Z | Notes |
| ---------|---|---|---|---|---------|
| xGEMV | ✔ | ✔ | ✔ | ✔ | |
@ -166,17 +196,17 @@ CLBlast is in active development but already supports the majority of BLAS routi
| xTRSV | | | | | |
| xTBSV | | | | | |
| xTPSV | | | | | |
| xGER | | | - | - | |
| xGERU | - | - | | | |
| xGERC | - | - | | | |
| xHER | - | - | | | |
| xHPR | - | - | | | |
| xHER2 | - | - | | | |
| xHPR2 | - | - | | | |
| xSYR | | | - | - | |
| xSPR | | | - | - | |
| xSYR2 | | | - | - | |
| xSPR2 | | | - | - | |
| xGER | ✔ | ✔ | - | - | |
| xGERU | - | - | ✔ | ✔ | |
| xGERC | - | - | ✔ | ✔ | |
| xHER | - | - | ✔ | ✔ | |
| xHPR | - | - | ✔ | ✔ | |
| xHER2 | - | - | ✔ | ✔ | |
| xHPR2 | - | - | ✔ | ✔ | |
| xSYR | ✔ | ✔ | - | - | |
| xSPR | ✔ | ✔ | - | - | |
| xSYR2 | ✔ | ✔ | - | - | |
| xSPR2 | ✔ | ✔ | - | - | |
| Level-3 | S | D | C | Z | Notes |
| ---------|---|---|---|---|---------|
@ -200,6 +230,12 @@ The contributing authors so far are:
* [Cedric Nugteren](http://www.cedricnugteren.nl)
Tuning and testing on a variety of OpenCL devices was made possible by:
* [TU/e ES research group](http://www.es.ele.tue.nl/)
* [ASCI DAS4 and DAS5](http://www.cs.vu.nl/das4/)
* [Dividiti](http://www.dividiti.com)
* [SURFsara HPC center](http://www.surfsara.com)
Support us
-------------
@ -210,20 +246,8 @@ This project started in March 2015 as an evenings and weekends free-time project
To-do list before release of version 1.0
-------------
- Increase the functionality:
* Support all routines supported by clBLAS
* Allow the user control over events and synchronization
* Add half-precision routines (e.g. HGEMM)
- Improve host performance:
* Allow initialization to pre-compile kernels and store to disk
- Improve device performance:
* Tune for a wider range of devices
* Allow users to define custom tuned parameters
- Improve the tuning
* Make the tuners upload their data to a central server
- Improve the performance comparisons:
* Enable comparison against optionally: ViennaCL, cuBLAS, MAGMA OpenCL
- Further reduce the likelihood of crashes:
* Add checks for proper command-line arguments in the tuner, tester and client
* Add checks for valid database parameters
* Test in multi-threaded environments
- Support all routines supported by clBLAS
- Allow the user control over events and synchronization
- Add half-precision routines (e.g. HGEMM)
- Enable correctness and performance testing against a CPU-based BLAS library
- Test in multi-threaded environments

View file

@ -34,6 +34,7 @@ set(OPENCL_HINTS
set(OPENCL_PATHS
/usr/local/cuda
/opt/cuda
/opt/intel/opencl
/usr
/usr/local
)
@ -52,7 +53,7 @@ mark_as_advanced(OPENCL_INCLUDE_DIRS)
find_library(OPENCL_LIBRARIES
NAMES OpenCL
HINTS ${OPENCL_HINTS}
PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 OpenCL/common/lib/x64
PATH_SUFFIXES lib lib64 lib/x86_64 lib/x86_64/sdk lib/x64 lib/x86 lib/Win32 OpenCL/common/lib/x64
PATHS ${OPENCL_PATHS}
DOC "OpenCL library"
)

View file

@ -45,7 +45,7 @@ mark_as_advanced(CLBLAS_INCLUDE_DIRS)
find_library(CLBLAS_LIBRARIES
NAMES clBLAS
HINTS ${CLBLAS_HINTS}
PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32
PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 lib/import lib64/import
PATHS ${CLBLAS_PATHS}
DOC "clBLAS library"
)

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -76,7 +76,7 @@ class Event {
explicit Event(const cl_event event): event_(event) { }
// Regular constructor
explicit Event() { }
explicit Event(): event_(nullptr) { }
// Retrieves the elapsed time of the last recorded event. Note that no error checking is done on
// the 'clGetEventProfilingInfo' function, since there is a bug in Apple's OpenCL implementation:
@ -119,6 +119,13 @@ class Platform {
platform_ = platforms[platform_id];
}
// Returns the number of devices on this platform
size_t NumDevices() const {
auto result = cl_uint{0};
CheckError(clGetDeviceIDs(platform_, CL_DEVICE_TYPE_ALL, 0, nullptr, &result));
return static_cast<size_t>(result);
}
// Accessor to the private data-member
const cl_platform_id& operator()() const { return platform_; }
private:
@ -136,11 +143,11 @@ class Device {
// Initialize the device. Note that this constructor can throw exceptions!
explicit Device(const Platform &platform, const size_t device_id) {
auto num_devices = cl_uint{0};
CheckError(clGetDeviceIDs(platform(), CL_DEVICE_TYPE_ALL, 0, nullptr, &num_devices));
auto num_devices = platform.NumDevices();
if (num_devices == 0) { Error("no devices found"); }
auto devices = std::vector<cl_device_id>(num_devices);
CheckError(clGetDeviceIDs(platform(), CL_DEVICE_TYPE_ALL, num_devices, devices.data(), nullptr));
CheckError(clGetDeviceIDs(platform(), CL_DEVICE_TYPE_ALL, static_cast<cl_uint>(num_devices),
devices.data(), nullptr));
if (device_id >= num_devices) { Error("invalid device ID "+std::to_string(device_id)); }
device_ = devices[device_id];
}
@ -172,6 +179,7 @@ class Device {
size_t CoreClock() const { return GetInfo(CL_DEVICE_MAX_CLOCK_FREQUENCY); }
size_t ComputeUnits() const { return GetInfo(CL_DEVICE_MAX_COMPUTE_UNITS); }
size_t MemorySize() const { return GetInfo(CL_DEVICE_GLOBAL_MEM_SIZE); }
size_t MaxAllocSize() const { return GetInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE); }
size_t MemoryClock() const { return 0; } // Not exposed in OpenCL
size_t MemoryBusWidth() const { return 0; } // Not exposed in OpenCL
@ -225,7 +233,7 @@ class Device {
auto result = std::string{};
result.resize(bytes);
CheckError(clGetDeviceInfo(device_, info, bytes, &result[0], nullptr));
return std::string{result.c_str()};
return std::string{result.c_str()}; // Removes any trailing '\0'-characters
}
};
@ -342,7 +350,12 @@ class Queue {
queue_(new cl_command_queue, [](cl_command_queue* s) { CheckError(clReleaseCommandQueue(*s));
delete s; }) {
auto status = CL_SUCCESS;
*queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
#ifdef CL_VERSION_2_0
cl_queue_properties properties[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
*queue_ = clCreateCommandQueueWithProperties(context(), device(), properties, &status);
#else
*queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
#endif
CheckError(status);
}
@ -408,7 +421,7 @@ class BufferHost {
// =================================================================================================
// Enumeration of buffer access types
enum class BufferAccess { kReadOnly, kWriteOnly, kReadWrite };
enum class BufferAccess { kReadOnly, kWriteOnly, kReadWrite, kNotOwned };
// C++11 version of 'cl_mem'
template <typename T>
@ -418,13 +431,17 @@ class Buffer {
// Constructor based on the regular OpenCL data-type: memory management is handled elsewhere
explicit Buffer(const cl_mem buffer):
buffer_(new cl_mem),
access_(BufferAccess::kReadWrite) {
access_(BufferAccess::kNotOwned) {
*buffer_ = buffer;
}
// Regular constructor with memory management
// Regular constructor with memory management. If this class does not own the buffer object, then
// the memory will not be freed automatically afterwards.
explicit Buffer(const Context &context, const BufferAccess access, const size_t size):
buffer_(new cl_mem, [](cl_mem* m) { CheckError(clReleaseMemObject(*m)); delete m; }),
buffer_(new cl_mem, [access](cl_mem* m) {
if (access != BufferAccess::kNotOwned) { CheckError(clReleaseMemObject(*m)); }
delete m;
}),
access_(access) {
auto flags = cl_mem_flags{CL_MEM_READ_WRITE};
if (access_ == BufferAccess::kReadOnly) { flags = CL_MEM_READ_ONLY; }
@ -439,57 +456,74 @@ class Buffer {
Buffer<T>(context, BufferAccess::kReadWrite, size) {
}
// Constructs a new buffer based on an existing host-container
template <typename Iterator>
explicit Buffer(const Context &context, const Queue &queue, Iterator start, Iterator end):
Buffer(context, BufferAccess::kReadWrite, static_cast<size_t>(end - start)) {
auto size = static_cast<size_t>(end - start);
auto pointer = &*start;
CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, 0, size*sizeof(T), pointer, 0,
nullptr, nullptr));
queue.Finish();
}
// Copies from device to host: reading the device buffer a-synchronously
void ReadAsync(const Queue &queue, const size_t size, T* host) {
void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) {
if (access_ == BufferAccess::kWriteOnly) { Error("reading from a write-only buffer"); }
CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, 0, size*sizeof(T), host, 0,
nullptr, nullptr));
CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
host, 0, nullptr, nullptr));
}
void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host) {
void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host,
const size_t offset = 0) {
if (host.size() < size) { Error("target host buffer is too small"); }
ReadAsync(queue, size, host.data());
ReadAsync(queue, size, host.data(), offset);
}
void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host) {
void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host,
const size_t offset = 0) {
if (host.size() < size) { Error("target host buffer is too small"); }
ReadAsync(queue, size, host.data());
ReadAsync(queue, size, host.data(), offset);
}
// Copies from device to host: reading the device buffer
void Read(const Queue &queue, const size_t size, T* host) {
ReadAsync(queue, size, host);
void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) {
ReadAsync(queue, size, host, offset);
queue.Finish();
}
void Read(const Queue &queue, const size_t size, std::vector<T> &host) {
Read(queue, size, host.data());
void Read(const Queue &queue, const size_t size, std::vector<T> &host, const size_t offset = 0) {
Read(queue, size, host.data(), offset);
}
void Read(const Queue &queue, const size_t size, BufferHost<T> &host) {
Read(queue, size, host.data());
void Read(const Queue &queue, const size_t size, BufferHost<T> &host, const size_t offset = 0) {
Read(queue, size, host.data(), offset);
}
// Copies from host to device: writing the device buffer a-synchronously
void WriteAsync(const Queue &queue, const size_t size, const T* host) {
void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
if (access_ == BufferAccess::kReadOnly) { Error("writing to a read-only buffer"); }
if (GetSize() < size*sizeof(T)) { Error("target device buffer is too small"); }
CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, 0, size*sizeof(T), host, 0,
nullptr, nullptr));
if (GetSize() < (offset+size)*sizeof(T)) { Error("target device buffer is too small"); }
CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
host, 0, nullptr, nullptr));
}
void WriteAsync(const Queue &queue, const size_t size, const std::vector<T> &host) {
WriteAsync(queue, size, host.data());
void WriteAsync(const Queue &queue, const size_t size, const std::vector<T> &host,
const size_t offset = 0) {
WriteAsync(queue, size, host.data(), offset);
}
void WriteAsync(const Queue &queue, const size_t size, const BufferHost<T> &host) {
WriteAsync(queue, size, host.data());
void WriteAsync(const Queue &queue, const size_t size, const BufferHost<T> &host,
const size_t offset = 0) {
WriteAsync(queue, size, host.data(), offset);
}
// Copies from host to device: writing the device buffer
void Write(const Queue &queue, const size_t size, const T* host) {
WriteAsync(queue, size, host);
void Write(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
WriteAsync(queue, size, host, offset);
queue.Finish();
}
void Write(const Queue &queue, const size_t size, const std::vector<T> &host) {
Write(queue, size, host.data());
void Write(const Queue &queue, const size_t size, const std::vector<T> &host,
const size_t offset = 0) {
Write(queue, size, host.data(), offset);
}
void Write(const Queue &queue, const size_t size, const BufferHost<T> &host) {
Write(queue, size, host.data());
void Write(const Queue &queue, const size_t size, const BufferHost<T> &host,
const size_t offset = 0) {
Write(queue, size, host.data(), offset);
}
// Copies the contents of this buffer into another device buffer
@ -573,6 +607,13 @@ class Kernel {
0, nullptr, &(event())));
}
// As above, but with the default local workgroup size
void Launch(const Queue &queue, const std::vector<size_t> &global, Event &event) {
CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
nullptr, global.data(), nullptr,
0, nullptr, &(event())));
}
// Accessor to the private data-member
const cl_kernel& operator()() const { return *kernel_; }
private:

View file

@ -56,24 +56,26 @@ class Database {
static constexpr auto kDeviceTypeAll = "default";
// The OpenCL device vendors
static constexpr auto kDeviceVendorNVIDIA = "NVIDIA Corporation";
static constexpr auto kDeviceVendorAMD = "Advanced Micro Devices, Inc.";
static constexpr auto kDeviceVendorIntel = "Intel";
static constexpr auto kDeviceVendorAll = "default";
// The OpenCL device names
static constexpr auto kDefaultDevice = "default";
// Alternative names for some OpenCL vendors
const std::unordered_map<std::string,std::string> kVendorNames {
{"Intel(R) Corporation", "Intel"},
{"GenuineIntel", "Intel"},
{"Advanced Micro Devices, Inc.", "AMD"},
{"NVIDIA Corporation", "NVIDIA"},
};
// The database consists of separate database entries, stored together in a vector
static const DatabaseEntry XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble;
static const DatabaseEntry XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble;
static const DatabaseEntry XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble;
static const DatabaseEntry XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble;
static const DatabaseEntry XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble;
static const DatabaseEntry CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble;
static const DatabaseEntry PadSingle, PadDouble, PadComplexSingle, PadComplexDouble;
static const DatabaseEntry TraSingle, TraDouble, TraComplexSingle, TraComplexDouble;
static const DatabaseEntry PadTraSingle, PadTraDouble, PadTraComplexSingle, PadTraComplexDouble;
static const DatabaseEntry TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble;
static const DatabaseEntry PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble;
static const std::vector<DatabaseEntry> database;
// The constructor

View file

@ -5,9 +5,9 @@
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
// Database generator <database.py>
//
// This file populates the database with best-found tuning parameters for the Copy kernels.
// This file populates the database with best-found tuning parameters for the 'Copy' kernels.
//
// =================================================================================================
@ -16,54 +16,56 @@ namespace clblast {
const Database::DatabaseEntry Database::CopySingle = {
"Copy", Precision::kSingle, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",2} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_WPT",2}, {"COPY_VW",4} } },
{ "Tesla K40m", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_WPT",4}, {"COPY_VW",4} } },
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "Tahiti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",4}, {"COPY_VW",2} } },
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ "Iris", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",4} } },
kDeviceTypeGPU, "Intel", {
{ "Iris", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Iris Pro", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::CopyDouble = {
"Copy", Precision::kDouble, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
{ "Tesla K20m", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",2} } },
{ "Tesla K40m", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",2} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",2}, {"COPY_VW",4} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "GeForce GTX 680", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 980", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
{ "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
kDeviceTypeAll, "default", {
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
}
@ -73,26 +75,100 @@ const Database::DatabaseEntry Database::CopyDouble = {
const Database::DatabaseEntry Database::CopyComplexSingle = {
"Copy", Precision::kComplexSingle, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_WPT",1}, {"COPY_VW",1} } },
{ "Tesla K20m", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",2}, {"COPY_VW",1} } },
{ "Tesla K40m", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ "Iris", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
kDeviceTypeGPU, "Intel", {
{ "Iris", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Iris Pro", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",4} } },
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 980", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",4} } },
{ "Tesla K40m", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
kDeviceTypeAll, "default", {
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::CopyDouble = {
"Copy", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",2} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 680", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 980", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
}
@ -102,25 +178,49 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
const Database::DatabaseEntry Database::CopyComplexDouble = {
"Copy", Precision::kComplexDouble, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_WPT",1}, {"COPY_VW",1} } },
{ "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_WPT",4}, {"COPY_VW",2} } },
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 680", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 980", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_WPT",1}, {"COPY_VW",1} } },
kDeviceTypeAll, "default", {
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
}

View file

@ -5,9 +5,9 @@
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
// Database generator <database.py>
//
// This file populates the database with best-found tuning parameters for the Pad kernels.
// This file populates the database with best-found tuning parameters for the 'Pad' kernels.
//
// =================================================================================================
@ -16,54 +16,56 @@ namespace clblast {
const Database::DatabaseEntry Database::PadSingle = {
"Pad", Precision::kSingle, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
{ "Tesla K20m", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Tesla K40m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
kDeviceTypeGPU, "Intel", {
{ "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Iris Pro", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::PadDouble = {
"Pad", Precision::kDouble, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
{ "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "GeForce GTX 750 Ti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Tesla K40m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
kDeviceTypeAll, "default", {
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
}
@ -73,26 +75,108 @@ const Database::DatabaseEntry Database::PadDouble = {
const Database::DatabaseEntry Database::PadComplexSingle = {
"Pad", Precision::kComplexSingle, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tahiti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
kDeviceTypeGPU, "Intel", {
{ "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
{ "Iris Pro", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX 750 Ti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
kDeviceTypeAll, "default", {
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::PadDouble = {
"Pad", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 680", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX 750 Ti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 980", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
}
@ -102,25 +186,49 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
const Database::DatabaseEntry Database::PadComplexDouble = {
"Pad", Precision::kComplexDouble, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K40m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 680", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 750 Ti", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tesla K40m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
kDeviceTypeAll, "default", {
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
}

View file

@ -5,37 +5,67 @@
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
// Database generator <database.py>
//
// This file populates the database with best-found tuning parameters for the PadTranspose kernels.
// This file populates the database with best-found tuning parameters for the 'Padtranspose' kernels.
//
// =================================================================================================
namespace clblast {
// =================================================================================================
const Database::DatabaseEntry Database::PadTraSingle = {
"PadTranspose", Precision::kSingle, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
{ "Tesla K20m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
{ "Tesla K40m", { {"PADTRA_TILE",32}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
const Database::DatabaseEntry Database::PadtransposeSingle = {
"Padtranspose", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"PADTRA_TILE",16}, {"PADTRA_WPT",4}, {"PADTRA_PAD",0} } },
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ "Iris", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",0} } },
kDeviceTypeGPU, "Intel", {
{ "Iris", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Iris Pro", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "Tesla K20m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } },
kDeviceTypeAll, "default", {
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
}
},
}
@ -43,27 +73,58 @@ const Database::DatabaseEntry Database::PadTraSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::PadTraDouble = {
"PadTranspose", Precision::kDouble, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
{ "Tesla K20m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
{ "Tesla K40m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
const Database::DatabaseEntry Database::PadtransposeComplexSingle = {
"Padtranspose", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"PADTRA_TILE",8}, {"PADTRA_WPT",4}, {"PADTRA_PAD",0} } },
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
kDeviceTypeGPU, "Intel", {
{ "Iris", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Iris Pro", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "Tesla K20m", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } },
kDeviceTypeAll, "default", {
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
}
},
}
@ -71,28 +132,51 @@ const Database::DatabaseEntry Database::PadTraDouble = {
// =================================================================================================
const Database::DatabaseEntry Database::PadTraComplexSingle = {
"PadTranspose", Precision::kComplexSingle, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
{ "Tesla K20m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
{ "Tesla K40m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } },
}
},
const Database::DatabaseEntry Database::PadtransposeDouble = {
"Padtranspose", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",0} } },
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ "Iris", { {"PADTRA_TILE",16}, {"PADTRA_WPT",2}, {"PADTRA_PAD",0} } },
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 980", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "Tesla K20m", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } },
kDeviceTypeAll, "default", {
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
}
},
}
@ -100,27 +184,51 @@ const Database::DatabaseEntry Database::PadTraComplexSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::PadTraComplexDouble = {
"PadTranspose", Precision::kComplexDouble, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
{ "Tesla K20m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
{ "Tesla K40m", { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",1} } },
}
},
const Database::DatabaseEntry Database::PadtransposeComplexDouble = {
"Padtranspose", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"PADTRA_TILE",8}, {"PADTRA_WPT",2}, {"PADTRA_PAD",1} } },
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "Tesla K20m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"PADTRA_TILE",16}, {"PADTRA_WPT",1}, {"PADTRA_PAD",0} } },
kDeviceTypeAll, "default", {
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
}
},
}

View file

@ -5,37 +5,67 @@
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
// Database generator <database.py>
//
// This file populates the database with best-found tuning parameters for the Transpose kernels.
// This file populates the database with best-found tuning parameters for the 'Transpose' kernels.
//
// =================================================================================================
namespace clblast {
// =================================================================================================
const Database::DatabaseEntry Database::TraSingle = {
const Database::DatabaseEntry Database::TransposeSingle = {
"Transpose", Precision::kSingle, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
{ "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_WPT",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1} } },
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ "Iris", { {"TRA_DIM",8}, {"TRA_WPT",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
kDeviceTypeGPU, "Intel", {
{ "Iris", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Iris Pro", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "GeForce GTX 750 Ti", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "GeForce GTX TITAN X", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Tesla K20m", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Tesla K40m", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
kDeviceTypeAll, "default", {
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
}
},
}
@ -43,56 +73,52 @@ const Database::DatabaseEntry Database::TraSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::TraDouble = {
"Transpose", Precision::kDouble, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::TraComplexSingle = {
const Database::DatabaseEntry Database::TransposeComplexSingle = {
"Transpose", Precision::kComplexSingle, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_WPT",2}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1} } },
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ "Iris", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
kDeviceTypeGPU, "Intel", {
{ "Iris", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "Iris Pro", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
kDeviceTypeAll, "default", {
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
}
},
}
@ -100,27 +126,97 @@ const Database::DatabaseEntry Database::TraComplexSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::TraComplexDouble = {
"Transpose", Precision::kComplexDouble, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0} } },
}
},
const Database::DatabaseEntry Database::TransposeDouble = {
"Transpose", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1} } },
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "GeForce GTX 750 Ti", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX TITAN", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"TRA_DIM",16}, {"TRA_WPT",1}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0} } },
kDeviceTypeAll, "default", {
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::TransposeComplexDouble = {
"Transpose", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
}
},
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 980", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
}
},
}

View file

@ -5,9 +5,9 @@
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
// Database generator <database.py>
//
// This file populates the database with best-found tuning parameters for the Xaxpy kernels.
// This file populates the database with best-found tuning parameters for the 'Xaxpy' kernels.
//
// =================================================================================================
@ -16,26 +16,115 @@ namespace clblast {
const Database::DatabaseEntry Database::XaxpySingle = {
"Xaxpy", Precision::kSingle, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"WGS",128}, {"WPT",1}, {"VW",2} } },
{ "Tesla K20m", { {"WGS",128}, {"WPT",2}, {"VW",2} } },
{ "Tesla K40m", { {"WGS",128}, {"WPT",1}, {"VW",4} } },
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Tahiti", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"WGS",64}, {"WPT",1}, {"VW",2} } },
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
{ "default", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ "Iris", { {"WGS",512}, {"WPT",1}, {"VW",1} } },
kDeviceTypeGPU, "Intel", {
{ "Iris", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Iris Pro", { {"VW",1}, {"WGS",128}, {"WPT",2} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW",2}, {"WGS",1024}, {"WPT",2} } },
{ "default", { {"VW",2}, {"WGS",1024}, {"WPT",2} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"VW",4}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 680", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Tesla K20m", { {"VW",4}, {"WGS",128}, {"WPT",1} } },
{ "Tesla K40m", { {"VW",4}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"WGS",128}, {"WPT",1}, {"VW",1} } },
kDeviceTypeAll, "default", {
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::XaxpyComplexSingle = {
"Xaxpy", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",2}, {"WGS",64}, {"WPT",8} } },
{ "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
}
},
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS",1024}, {"WPT",2} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Iris", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "Iris Pro", { {"VW",1}, {"WGS",256}, {"WPT",8} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
{ "Tesla K20m", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Tesla K40m", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
}
},
}
@ -45,53 +134,49 @@ const Database::DatabaseEntry Database::XaxpySingle = {
const Database::DatabaseEntry Database::XaxpyDouble = {
"Xaxpy", Precision::kDouble, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"WGS",128}, {"WPT",1}, {"VW",1} } },
{ "Tesla K20m", { {"WGS",512}, {"WPT",1}, {"VW",2} } },
{ "Tesla K40m", { {"WGS",64}, {"WPT",1}, {"VW",2} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"WGS",256}, {"WPT",1}, {"VW",1} } },
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"VW",2}, {"WGS",128}, {"WPT",2} } },
{ "default", { {"VW",2}, {"WGS",128}, {"WPT",2} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",8}, {"WGS",64}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",8}, {"WGS",2048}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW",2}, {"WGS",512}, {"WPT",1} } },
{ "default", { {"VW",2}, {"WGS",512}, {"WPT",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
{ "Tesla K20m", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "Tesla K40m", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"WGS",128}, {"WPT",1}, {"VW",1} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::XaxpyComplexSingle = {
"Xaxpy", Precision::kComplexSingle, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"WGS",256}, {"WPT",1}, {"VW",1} } },
{ "Tesla K20m", { {"WGS",128}, {"WPT",1}, {"VW",1} } },
{ "Tesla K40m", { {"WGS",128}, {"WPT",2}, {"VW",1} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"WGS",64}, {"WPT",1}, {"VW",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ "Iris", { {"WGS",256}, {"WPT",1}, {"VW",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"WGS",128}, {"WPT",1}, {"VW",1} } },
kDeviceTypeAll, "default", {
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
}
},
}
@ -101,25 +186,49 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = {
const Database::DatabaseEntry Database::XaxpyComplexDouble = {
"Xaxpy", Precision::kComplexDouble, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"WGS",128}, {"WPT",2}, {"VW",1} } },
{ "Tesla K20m", { {"WGS",256}, {"WPT",1}, {"VW",1} } },
{ "Tesla K40m", { {"WGS",64}, {"WPT",2}, {"VW",1} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"WGS",64}, {"WPT",1}, {"VW",1} } },
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Tahiti", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"VW",1}, {"WGS",64}, {"WPT",8} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",8} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",8}, {"WGS",128}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",8}, {"WGS",512}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",256}, {"WPT",2} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS",64}, {"WPT",4} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "Tesla K20m", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Tesla K40m", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"WGS",128}, {"WPT",1}, {"VW",1} } },
kDeviceTypeAll, "default", {
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
}
},
}

View file

@ -5,9 +5,9 @@
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
// Database generator <database.py>
//
// This file populates the database with best-found tuning parameters for the Xdot kernels.
// This file populates the database with best-found tuning parameters for the 'Xdot' kernels.
//
// =================================================================================================
@ -16,22 +16,115 @@ namespace clblast {
const Database::DatabaseEntry Database::XdotSingle = {
"Xdot", Precision::kSingle, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
{ "Tahiti", { {"VW",1}, {"WGS1",256}, {"WGS2",256} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"VW",1}, {"WGS1",128}, {"WGS2",256} } },
{ "default", { {"VW",1}, {"WGS1",128}, {"WGS2",256} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ "Iris", { {"WGS1",512}, {"WGS2",512} } },
kDeviceTypeGPU, "Intel", {
{ "Iris", { {"VW",1}, {"WGS1",512}, {"WGS2",32} } },
{ "Iris Pro", { {"VW",1}, {"WGS1",128}, {"WGS2",512} } },
{ "default", { {"VW",1}, {"WGS1",128}, {"WGS2",32} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"VW",1}, {"WGS1",256}, {"WGS2",128} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS1",128}, {"WGS2",128} } },
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "Tesla K20m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "Tesla K40m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"WGS1",64}, {"WGS2",64} } },
kDeviceTypeAll, "default", {
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::XdotComplexSingle = {
"Xdot", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
{ "Tahiti", { {"VW",1}, {"WGS1",64}, {"WGS2",256} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
}
},
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"VW",1}, {"WGS1",128}, {"WGS2",512} } },
{ "default", { {"VW",1}, {"WGS1",128}, {"WGS2",512} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Iris", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
{ "Iris Pro", { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"VW",1}, {"WGS1",512}, {"WGS2",512} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS1",256}, {"WGS2",32} } },
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS1",128}, {"WGS2",32} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "Tesla K20m", { {"VW",1}, {"WGS1",256}, {"WGS2",512} } },
{ "Tesla K40m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
}
},
}
@ -41,45 +134,49 @@ const Database::DatabaseEntry Database::XdotSingle = {
const Database::DatabaseEntry Database::XdotDouble = {
"Xdot", Precision::kDouble, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
{ "Tahiti", { {"VW",1}, {"WGS1",64}, {"WGS2",256} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"VW",1}, {"WGS1",64}, {"WGS2",512} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",512} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS1",512}, {"WGS2",512} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS1",1024}, {"WGS2",512} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",512} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS1",32}, {"WGS2",512} } },
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS1",128}, {"WGS2",128} } },
{ "Tesla K20m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "Tesla K40m", { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",32}, {"WGS2",128} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"WGS1",64}, {"WGS2",64} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::XdotComplexSingle = {
"Xdot", Precision::kComplexSingle, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ "Iris", { {"WGS1",512}, {"WGS2",512} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"WGS1",64}, {"WGS2",64} } },
kDeviceTypeAll, "default", {
{ "default", { {"VW",1}, {"WGS1",32}, {"WGS2",128} } },
}
},
}
@ -89,21 +186,49 @@ const Database::DatabaseEntry Database::XdotComplexSingle = {
const Database::DatabaseEntry Database::XdotComplexDouble = {
"Xdot", Precision::kComplexDouble, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
{ "Tahiti", { {"VW",1}, {"WGS1",64}, {"WGS2",256} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"VW",1}, {"WGS1",32}, {"WGS2",64} } },
{ "default", { {"VW",1}, {"WGS1",32}, {"WGS2",64} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS1",32}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",32}, {"WGS2",1024} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"VW",1}, {"WGS1",512}, {"WGS2",512} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS1",256}, {"WGS2",64} } },
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS1",32}, {"WGS2",64} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS1",32}, {"WGS2",128} } },
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS1",128}, {"WGS2",512} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS1",128}, {"WGS2",128} } },
{ "Tesla K20m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "Tesla K40m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",32}, {"WGS2",64} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"WGS1",64}, {"WGS2",64} } },
kDeviceTypeAll, "default", {
{ "default", { {"VW",1}, {"WGS1",32}, {"WGS2",32} } },
}
},
}

View file

@ -5,9 +5,9 @@
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
// Database generator <database.py>
//
// This file populates the database with best-found tuning parameters for the Xgemm kernels.
// This file populates the database with best-found tuning parameters for the 'Xgemm' kernels.
//
// =================================================================================================
@ -16,56 +16,56 @@ namespace clblast {
const Database::DatabaseEntry Database::XgemmSingle = {
"Xgemm", Precision::kSingle, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"MWG",128}, {"NWG",64}, {"KWG",32}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",2}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
{ "Tesla K20m", { {"MWG",128}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",2}, {"VWM",4}, {"VWN",1}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
{ "Tesla K40m", { {"MWG",128}, {"NWG",128}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",2}, {"VWN",1}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
{ kDefaultDevice, { {"MWG",128}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",2}, {"VWN",1}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",1} } },
{ "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"MWG",128}, {"NWG",128}, {"KWG",32}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",32}, {"NDIMB",8}, {"KWI",2}, {"VWM",4}, {"VWN",4}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } },
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",8}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",8}, {"VWN",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ "Iris", { {"MWG",64}, {"NWG",64}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",8}, {"KWI",8}, {"VWM",4}, {"VWN",4}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",0} } },
kDeviceTypeGPU, "Intel", {
{ "Iris", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",1} } },
{ "Iris Pro", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
{ "default", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::XgemmDouble = {
"Xgemm", Precision::kDouble, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",2}, {"VWM",1}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
{ "Tesla K20m", { {"MWG",64}, {"NWG",128}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",32}, {"MDIMA",32}, {"NDIMB",32}, {"KWI",8}, {"VWM",2}, {"VWN",4}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } },
{ "Tesla K40m", { {"MWG",64}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",1} } },
{ kDefaultDevice, { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",16}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"MWG",128}, {"NWG",64}, {"KWG",16}, {"MDIMC",32}, {"NDIMC",8}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",1}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",0}, {"SB",0} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
{ "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",4} } },
{ "GeForce GTX 980", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",8} } },
{ "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } },
{ "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
kDeviceTypeAll, "default", {
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
}
},
}
@ -75,27 +75,108 @@ const Database::DatabaseEntry Database::XgemmDouble = {
const Database::DatabaseEntry Database::XgemmComplexSingle = {
"Xgemm", Precision::kComplexSingle, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } },
{ "Tesla K20m", { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",8}, {"MDIMA",8}, {"NDIMB",8}, {"KWI",8}, {"VWM",2}, {"VWN",2}, {"STRM",1}, {"STRN",0}, {"SA",1}, {"SB",0} } },
{ "Tesla K40m", { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",32}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",1}, {"VWN",1}, {"STRM",0}, {"STRN",1}, {"SA",1}, {"SB",1} } },
{ kDefaultDevice, { {"MWG",32}, {"NWG",64}, {"KWG",16}, {"MDIMC",16}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",1} } },
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
{ "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"MWG",16}, {"NWG",64}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",8}, {"NDIMB",16}, {"KWI",2}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",0} } },
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",8}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",8}, {"VWN",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ "Iris", { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
kDeviceTypeGPU, "Intel", {
{ "Iris", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Iris Pro", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX 680", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "GeForce GTX 980", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
{ "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
{ "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "Tesla K40m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
kDeviceTypeAll, "default", {
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::XgemmDouble = {
"Xgemm", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
{ "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
}
},
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",8}, {"VWN",2} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",8}, {"VWN",2} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",8} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "default", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
{ "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
{ "GeForce GTX 980", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K40m", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
}
@ -105,29 +186,52 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
const Database::DatabaseEntry Database::XgemmComplexDouble = {
"Xgemm", Precision::kComplexDouble, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"MWG",16}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",8}, {"KWI",2}, {"VWM",1}, {"VWN",4}, {"STRM",1}, {"STRN",0}, {"SA",0}, {"SB",0} } },
{ "Tesla K20m", { {"MWG",16}, {"NWG",128}, {"KWG",32}, {"MDIMC",8}, {"NDIMC",32}, {"MDIMA",8}, {"NDIMB",32}, {"KWI",2}, {"VWM",1}, {"VWN",4}, {"STRM",1}, {"STRN",1}, {"SA",1}, {"SB",0} } },
{ "Tesla K40m", { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",32}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",32}, {"KWI",8}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",1} } },
{ kDefaultDevice, { {"MWG",16}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",8}, {"KWI",2}, {"VWM",1}, {"VWN",4}, {"STRM",1}, {"STRN",0}, {"SA",0}, {"SB",0} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"MWG",128}, {"NWG",32}, {"KWG",16}, {"MDIMC",32}, {"NDIMC",8}, {"MDIMA",32}, {"NDIMB",16}, {"KWI",8}, {"VWM",2}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",8}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",8}, {"VWN",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX 680", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "GeForce GTX 980", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX TITAN X", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"MWG",32}, {"NWG",32}, {"KWG",16}, {"MDIMC",8}, {"NDIMC",8}, {"MDIMA",16}, {"NDIMB",16}, {"KWI",1}, {"VWM",1}, {"VWN",1}, {"STRM",1}, {"STRN",1}, {"SA",0}, {"SB",0} } },
kDeviceTypeAll, "default", {
{ "default", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
}
},
}
};
// =================================================================================================
} // namespace clblast

View file

@ -5,9 +5,9 @@
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
// Database generator <database.py>
//
// This file populates the database with best-found tuning parameters for the Xgemv kernels.
// This file populates the database with best-found tuning parameters for the 'Xgemv' kernels.
//
// =================================================================================================
@ -16,26 +16,97 @@ namespace clblast {
const Database::DatabaseEntry Database::XgemvSingle = {
"Xgemv", Precision::kSingle, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
{ "Tesla K20m", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
{ "Tesla K40m", { {"WGS1",256}, {"WPT1",1}, {"WGS2",256}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",4} } },
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",2}, {"WGS3",64}, {"WPT3",4} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ "Iris", { {"WGS1",256}, {"WPT1",2}, {"WGS2",64}, {"WPT2",4}, {"VW2",4}, {"WGS3",256}, {"WPT3",2}, {"VW3",8} } },
kDeviceTypeGPU, "Intel", {
{ "Iris", { {"WGS1",64}, {"WPT1",2}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",64}, {"WPT3",8} } },
{ "Iris Pro", { {"WGS1",256}, {"WPT1",2}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } },
{ "default", { {"WGS1",64}, {"WPT1",2}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
{ "GeForce GTX 680", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",128}, {"WPT3",2} } },
{ "GeForce GTX 750 Ti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",4}, {"WGS3",128}, {"WPT3",4} } },
{ "GeForce GTX 980", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
{ "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
{ "Tesla K20m", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
{ "Tesla K40m", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::XgemvComplexSingle = {
"Xgemv", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",256}, {"WPT2",2}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4}, {"VW2",4}, {"WGS2",64}, {"WPT2",4}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Iris", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Iris Pro", { {"WGS1",64}, {"WPT1",1}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "GeForce GTX 680", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "GeForce GTX 750 Ti", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
}
@ -45,53 +116,42 @@ const Database::DatabaseEntry Database::XgemvSingle = {
const Database::DatabaseEntry Database::XgemvDouble = {
"Xgemv", Precision::kDouble, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
{ "Tesla K20m", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
{ "Tesla K40m", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
{ "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",2}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",2} } },
{ "default", { {"WGS1",64}, {"WPT1",2}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",128}, {"WPT3",2} } },
{ "GeForce GTX 750 Ti", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",2}, {"WGS3",256}, {"WPT3",2} } },
{ "GeForce GTX 980", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
{ "GeForce GTX TITAN X", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
{ "Tesla K20m", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Tesla K40m", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::XgemvComplexSingle = {
"Xgemv", Precision::kComplexSingle, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
{ "Tesla K20m", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
{ "Tesla K40m", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ "Iris", { {"WGS1",256}, {"WPT1",1}, {"WGS2",64}, {"WPT2",4}, {"VW2",2}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
}
@ -101,25 +161,35 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = {
const Database::DatabaseEntry Database::XgemvComplexDouble = {
"Xgemv", Precision::kComplexDouble, {
{ // NVIDIA GPUs
kDeviceTypeGPU, kDeviceVendorNVIDIA, {
{ "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
{ "Tesla K20m", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
{ "Tesla K40m", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
}
},
{ // AMD GPUs
kDeviceTypeGPU, kDeviceVendorAMD, {
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
{ "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, kDeviceVendorIntel, {
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4}, {"VW2",4}, {"WGS2",64}, {"WPT2",4}, {"VW3",2}, {"WGS3",256}, {"WPT3",2} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
{ // Default
kDeviceTypeAll, kDeviceVendorAll, {
{ kDefaultDevice, { {"WGS1",64}, {"WPT1",1}, {"WGS2",64}, {"WPT2",1}, {"VW2",1}, {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
}

View file

@ -0,0 +1,188 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Database generator <database.py>
//
// This file populates the database with best-found tuning parameters for the 'Xger' kernels.
//
// =================================================================================================
namespace clblast {
// =================================================================================================
// Best-found tuning parameters for the 'Xger' kernel in single precision.
// Per-device parameters are WGS1, WGS2 and WPT (presumably the 2D work-group
// sizes and the work-per-thread factor — confirm against the Xger OpenCL
// kernel and tuner). Entries are grouped by (device type, vendor); each vendor
// section ends with a "default" fallback, and the final kDeviceTypeAll/"default"
// section is the global fallback for unknown devices.
const Database::DatabaseEntry Database::XgerSingle = {
"Xger", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
{ "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
}
},
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"WGS1",64}, {"WGS2",4}, {"WPT",4} } },
{ "default", { {"WGS1",64}, {"WGS2",4}, {"WPT",4} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",128}, {"WGS2",2}, {"WPT",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } },
{ "default", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Iris Pro", { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } },
{ "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } },
{ "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } },
}
},
{ // Default: global fallback for any device not matched above
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } },
}
},
}
};
// =================================================================================================
// Best-found tuning parameters for the 'Xger' kernel in complex single
// precision. Same parameter set as the single-precision table (WGS1/WGS2/WPT);
// each vendor section ends with a "default" fallback and the final
// kDeviceTypeAll/"default" section covers unknown devices.
const Database::DatabaseEntry Database::XgerComplexSingle = {
"Xger", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
{ "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
}
},
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"WGS1",128}, {"WGS2",1}, {"WPT",1} } },
{ "default", { {"WGS1",128}, {"WGS2",1}, {"WPT",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } },
{ "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Iris Pro", { {"WGS1",16}, {"WGS2",2}, {"WPT",4} } },
{ "default", { {"WGS1",16}, {"WGS2",2}, {"WPT",4} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"WGS1",128}, {"WGS2",2}, {"WPT",2} } },
{ "GeForce GTX 680", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
{ "default", { {"WGS1",16}, {"WGS2",2}, {"WPT",2} } },
}
},
{ // Default: global fallback for any device not matched above
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",16}, {"WGS2",1}, {"WPT",1} } },
}
},
}
};
// =================================================================================================
// Best-found tuning parameters for the 'Xger' kernel in double precision.
// Same parameter set as the single-precision table (WGS1/WGS2/WPT). Note there
// is no Intel-GPU section here (no double-precision results for those devices);
// such devices fall through to the kDeviceTypeAll/"default" section.
const Database::DatabaseEntry Database::XgerDouble = {
"Xger", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
{ "default", { {"WGS1",32}, {"WGS2",2}, {"WPT",1} } },
}
},
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
{ "default", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",16}, {"WPT",1} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",512}, {"WGS2",8}, {"WPT",2} } },
{ "default", { {"WGS1",512}, {"WGS2",8}, {"WPT",1} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } },
{ "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
{ "default", { {"WGS1",16}, {"WGS2",4}, {"WPT",2} } },
}
},
{ // Default: global fallback for any device not matched above
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",16}, {"WGS2",2}, {"WPT",1} } },
}
},
}
};
// =================================================================================================
// Best-found tuning parameters for the 'Xger' kernel in complex double
// precision. Same parameter set as the other tables (WGS1/WGS2/WPT); as in the
// double-precision table there is no Intel-GPU section, so those devices use
// the kDeviceTypeAll/"default" fallback.
const Database::DatabaseEntry Database::XgerComplexDouble = {
"Xger", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
{ "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } },
}
},
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"WGS1",64}, {"WGS2",2}, {"WPT",4} } },
{ "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",4} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
{ "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
{ "GeForce GTX 680", { {"WGS1",8}, {"WGS2",16}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "default", { {"WGS1",8}, {"WGS2",2}, {"WPT",1} } },
}
},
{ // Default: global fallback for any device not matched above
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",8}, {"WGS2",1}, {"WPT",1} } },
}
},
}
};
// =================================================================================================
} // namespace clblast

View file

@ -0,0 +1,58 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xger routine. The precision is implemented using a template argument.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XGER_H_
#define CLBLAST_ROUTINES_XGER_H_
#include "internal/routine.h"
namespace clblast {
// =================================================================================================
// Xger implements the BLAS 'GER' routine: a rank-1 update of the m-by-n matrix
// A using vectors x and y. The element type is the template parameter T.
template <typename T>
class Xger: public Routine<T> {
public:
// Members and methods inherited from the base Routine class: the tuning-
// parameter database, the kernel source string, the command queue, program
// caching, argument checks for the two vectors and the matrix, kernel
// launching, and error reporting.
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorY;
using Routine<T>::TestMatrixA;
using Routine<T>::RunKernel;
using Routine<T>::ErrorIn;
// Constructor. 'name' selects the routine name; derived classes override it
// (e.g. Xgerc passes "GERC", Xgeru passes "GERU").
Xger(Queue &queue, Event &event, const std::string &name = "GER");
// Templated-precision implementation of the routine. Vectors are described by
// (buffer, offset, increment); matrix A by (buffer, offset, leading dimension).
StatusCode DoGer(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
private:
// Static variable to get the precision of this routine instantiation
const static Precision precision_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XGER_H_
#endif

View file

@ -0,0 +1,46 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xgerc routine. The precision is implemented using a template argument.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XGERC_H_
#define CLBLAST_ROUTINES_XGERC_H_
#include "internal/routines/level2/xger.h"
namespace clblast {
// =================================================================================================
// Xgerc implements the complex rank-1 update 'GERC'. It derives from Xger and
// reuses its DoGer implementation, constructing the base with name "GERC".
template <typename T>
class Xgerc: public Xger<T> {
public:
// Uses the regular Xger routine's implementation
using Xger<T>::DoGer;
// Constructor: forwards to Xger with the routine name "GERC"
Xgerc(Queue &queue, Event &event, const std::string &name = "GERC");
// Templated-precision implementation of the routine; same argument conventions
// as Xger::DoGer (vectors as buffer/offset/increment, matrix A as
// buffer/offset/leading dimension).
StatusCode DoGerc(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XGERC_H_
#endif

View file

@ -0,0 +1,46 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xgeru routine. The precision is implemented using a template argument.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XGERU_H_
#define CLBLAST_ROUTINES_XGERU_H_
#include "internal/routines/level2/xger.h"
namespace clblast {
// =================================================================================================
// Xgeru implements the complex rank-1 update 'GERU'. It derives from Xger and
// reuses its DoGer implementation, constructing the base with name "GERU".
template <typename T>
class Xgeru: public Xger<T> {
public:
// Uses the regular Xger routine's implementation
using Xger<T>::DoGer;
// Constructor: forwards to Xger with the routine name "GERU"
Xgeru(Queue &queue, Event &event, const std::string &name = "GERU");
// Templated-precision implementation of the routine; same argument conventions
// as Xger::DoGer (vectors as buffer/offset/increment, matrix A as
// buffer/offset/leading dimension).
StatusCode DoGeru(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XGERU_H_
#endif

View file

@ -0,0 +1,61 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xher routine. The precision is implemented using a template argument.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XHER_H_
#define CLBLAST_ROUTINES_XHER_H_
#include "internal/routine.h"
namespace clblast {
// =================================================================================================
// Xher implements the Hermitian/symmetric rank-1 update 'HER'. T is the matrix
// element type; U is the type of the (real-valued for complex HER) scalar
// alpha, translated to T via GetAlpha. The 'packed' flag of DoHer supports the
// packed-storage variants (used by derived classes such as Xhpr).
template <typename T, typename U>
class Xher: public Routine<T> {
public:
// Members and methods inherited from the base Routine class: the tuning-
// parameter database, kernel source, command queue, program caching, argument
// checks for the vector and the (regular or packed) matrix, kernel launching,
// and error reporting.
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestMatrixA;
using Routine<T>::TestMatrixAP;
using Routine<T>::RunKernel;
using Routine<T>::ErrorIn;
// Constructor. 'name' selects the routine name; derived classes override it.
Xher(Queue &queue, Event &event, const std::string &name = "HER");
// Translates alpha of type 'U' into type 'T'
T GetAlpha(const U alpha);
// Templated-precision implementation of the routine. When 'packed' is true,
// matrix A is expected in packed storage (a_ld then presumably unused —
// confirm against the implementation).
StatusCode DoHer(const Layout layout, const Triangle triangle,
const size_t n,
const U alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const bool packed = false);
private:
// Static variable to get the precision of this routine instantiation
const static Precision precision_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XHER_H_
#endif

View file

@ -0,0 +1,60 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xher2 routine. The precision is implemented using a template argument.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XHER2_H_
#define CLBLAST_ROUTINES_XHER2_H_
#include "internal/routine.h"
namespace clblast {
// =================================================================================================
// Xher2 implements the Hermitian/symmetric rank-2 update 'HER2' using vectors
// x and y. The element type is the template parameter T. The 'packed' flag of
// DoHer2 supports the packed-storage variants (used by derived classes such as
// Xhpr2).
template <typename T>
class Xher2: public Routine<T> {
public:
// Members and methods inherited from the base Routine class: the tuning-
// parameter database, kernel source, command queue, program caching, argument
// checks for the two vectors and the (regular or packed) matrix, kernel
// launching, and error reporting.
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorY;
using Routine<T>::TestMatrixA;
using Routine<T>::TestMatrixAP;
using Routine<T>::RunKernel;
using Routine<T>::ErrorIn;
// Constructor. 'name' selects the routine name; derived classes override it.
Xher2(Queue &queue, Event &event, const std::string &name = "HER2");
// Templated-precision implementation of the routine. When 'packed' is true,
// matrix A is expected in packed storage (a_ld then presumably unused —
// confirm against the implementation).
StatusCode DoHer2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const bool packed = false);
private:
// Static variable to get the precision of this routine instantiation
const static Precision precision_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XHER2_H_
#endif

View file

@ -0,0 +1,45 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xhpr routine. The precision is implemented using a template argument.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XHPR_H_
#define CLBLAST_ROUTINES_XHPR_H_
#include "internal/routines/level2/xher.h"
namespace clblast {
// =================================================================================================
// Xhpr implements the packed Hermitian/symmetric rank-1 update 'HPR'. It
// derives from Xher and reuses its DoHer implementation (presumably with the
// 'packed' flag enabled — confirm in the .cc file); the packed matrix AP has
// no leading dimension, only a buffer and an offset.
template <typename T, typename U>
class Xhpr: public Xher<T,U> {
public:
// Uses the regular Xher routine's implementation
using Xher<T,U>::DoHer;
// Constructor: forwards to Xher with the routine name "HPR"
Xhpr(Queue &queue, Event &event, const std::string &name = "HPR");
// Templated-precision implementation of the routine
StatusCode DoHpr(const Layout layout, const Triangle triangle,
const size_t n,
const U alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &ap_buffer, const size_t ap_offset);
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XHPR_H_
#endif

View file

@ -0,0 +1,46 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xhpr2 routine. The precision is implemented using a template argument.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XHPR2_H_
#define CLBLAST_ROUTINES_XHPR2_H_
#include "internal/routines/level2/xher2.h"
namespace clblast {
// =================================================================================================
// Xhpr2 implements the packed Hermitian/symmetric rank-2 update 'HPR2'. It
// derives from Xher2 and reuses its DoHer2 implementation (presumably with the
// 'packed' flag enabled — confirm in the .cc file); the packed matrix AP has
// no leading dimension, only a buffer and an offset.
template <typename T>
class Xhpr2: public Xher2<T> {
public:
// Uses the regular Xher2 routine's implementation
using Xher2<T>::DoHer2;
// Constructor: forwards to Xher2 with the routine name "HPR2"
Xhpr2(Queue &queue, Event &event, const std::string &name = "HPR2");
// Templated-precision implementation of the routine
StatusCode DoHpr2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &ap_buffer, const size_t ap_offset);
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XHPR2_H_
#endif

View file

@ -0,0 +1,45 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xspr routine (symmetric packed rank-1 matrix update). The precision
// is implemented using a template argument.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XSPR_H_
#define CLBLAST_ROUTINES_XSPR_H_

#include "internal/routines/level2/xher.h"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class. Instantiates Xher with U == T,
// since in the symmetric (real-valued) case alpha has the same type as the matrix elements.
template <typename T>
class Xspr: public Xher<T,T> {
 public:

  // Uses the regular Xher routine (SPR is implemented on top of HER)
  using Xher<T,T>::DoHer;

  // Constructor: forwards to the base class with routine name "SPR"
  Xspr(Queue &queue, Event &event, const std::string &name = "SPR");

  // Templated-precision implementation of the routine, with the matrix stored in packed form
  // in 'ap_buffer'
  StatusCode DoSpr(const Layout layout, const Triangle triangle,
                   const size_t n,
                   const T alpha,
                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                   const Buffer<T> &ap_buffer, const size_t ap_offset);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XSPR_H_
#endif

View file

@ -0,0 +1,46 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xspr2 routine (symmetric packed rank-2 matrix update). The precision
// is implemented using a template argument.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XSPR2_H_
#define CLBLAST_ROUTINES_XSPR2_H_

#include "internal/routines/level2/xher2.h"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xspr2: public Xher2<T> {
 public:

  // Uses the regular Xher2 routine (SPR2 is implemented on top of HER2)
  using Xher2<T>::DoHer2;

  // Constructor: forwards to the base class with routine name "SPR2"
  Xspr2(Queue &queue, Event &event, const std::string &name = "SPR2");

  // Templated-precision implementation of the routine, with the matrix stored in packed form
  // in 'ap_buffer'
  StatusCode DoSpr2(const Layout layout, const Triangle triangle,
                    const size_t n,
                    const T alpha,
                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                    const Buffer<T> &ap_buffer, const size_t ap_offset);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XSPR2_H_
#endif

View file

@ -0,0 +1,45 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xsyr routine (symmetric rank-1 matrix update). The precision is
// implemented using a template argument.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XSYR_H_
#define CLBLAST_ROUTINES_XSYR_H_

#include "internal/routines/level2/xher.h"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class. Instantiates Xher with U == T,
// since in the symmetric (real-valued) case alpha has the same type as the matrix elements.
template <typename T>
class Xsyr: public Xher<T,T> {
 public:

  // Uses the regular Xher routine (SYR is implemented on top of HER)
  using Xher<T,T>::DoHer;

  // Constructor: forwards to the base class with routine name "SYR"
  Xsyr(Queue &queue, Event &event, const std::string &name = "SYR");

  // Templated-precision implementation of the routine. The matrix is stored in regular (non-
  // packed) form in 'a_buffer' with leading dimension 'a_ld'.
  StatusCode DoSyr(const Layout layout, const Triangle triangle,
                   const size_t n,
                   const T alpha,
                   const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                   const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XSYR_H_
#endif

View file

@ -0,0 +1,46 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xsyr2 routine (symmetric rank-2 matrix update). The precision is
// implemented using a template argument.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XSYR2_H_
#define CLBLAST_ROUTINES_XSYR2_H_

#include "internal/routines/level2/xher2.h"

namespace clblast {
// =================================================================================================

// See comment at top of file for a description of the class
template <typename T>
class Xsyr2: public Xher2<T> {
 public:

  // Uses the regular Xher2 routine (SYR2 is implemented on top of HER2)
  using Xher2<T>::DoHer2;

  // Constructor: forwards to the base class with routine name "SYR2"
  Xsyr2(Queue &queue, Event &event, const std::string &name = "SYR2");

  // Templated-precision implementation of the routine. The matrix is stored in regular (non-
  // packed) form in 'a_buffer' with leading dimension 'a_ld'.
  StatusCode DoSyr2(const Layout layout, const Triangle triangle,
                    const size_t n,
                    const T alpha,
                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
};

// =================================================================================================
} // namespace clblast

// CLBLAST_ROUTINES_XSYR2_H_
#endif

View file

@ -127,9 +127,11 @@ void Tuner(int argc, char* argv[]) {
{"precision", precision_string}
};
for (auto &o: C::GetOptions()) {
if (o == kArgM) { metadata.push_back({"arg_m", std::to_string(args.m)}); }
if (o == kArgN) { metadata.push_back({"arg_n", std::to_string(args.n)}); }
if (o == kArgK) { metadata.push_back({"arg_k", std::to_string(args.k)}); }
if (o == kArgM) { metadata.push_back({"arg_m", std::to_string(args.m)}); }
if (o == kArgN) { metadata.push_back({"arg_n", std::to_string(args.n)}); }
if (o == kArgK) { metadata.push_back({"arg_k", std::to_string(args.k)}); }
if (o == kArgAlpha) { metadata.push_back({"arg_alpha", ToString(args.alpha)}); }
if (o == kArgBeta) { metadata.push_back({"arg_beta", ToString(args.beta)}); }
}
tuner.PrintJSON("clblast_"+C::KernelFamily()+"_"+precision_string+".json", metadata);
}

View file

@ -125,7 +125,7 @@ struct Arguments {
// Tuner-specific arguments
double fraction = 1.0;
// Client-specific arguments
bool compare_clblas = 1;
int compare_clblas = 1;
size_t step = 1;
size_t num_steps = 0;
size_t num_runs = 10;
@ -171,7 +171,8 @@ T GetArgument(const int argc, char *argv[], std::string &help,
const std::string &option, const T default_value);
// Returns the precision only
Precision GetPrecision(const int argc, char *argv[]);
Precision GetPrecision(const int argc, char *argv[],
const Precision default_precision = Precision::kSingle);
// As in "GetArgument", but now only checks whether an argument is given or not
bool CheckArgument(const int argc, char *argv[], std::string &help, const std::string &option);

View file

@ -15,12 +15,36 @@ import os.path
import glob
import re
import json
try:
from urllib.request import urlopen # Python 3
except ImportError:
from urllib2 import urlopen # Python 2
# Additional modules
import pandas as pd
# Server storing a copy of the database
DATABASE_SERVER_URL = "http://www.cedricnugteren.nl/tuning/clblast.db"
# Constants
ATTRIBUTES = ["device", "type", "vendor", "precision", "kernel_family", "arg_m", "arg_n", "arg_k"]
VENDOR_DEFAULT = "default"
DEVICETYPE_DEFAULT = "All"
DEVICENAME_DEFAULT = "default"
# Attributes
DEVICETYPE_ATTRIBUTES = ["device_vendor", "device_type"]
DEVICE_ATTRIBUTES = ["device", "device_core_clock", "device_compute_units"]
KERNEL_ATTRIBUTES = ["precision", "kernel_family",
"arg_m", "arg_n", "arg_k", "arg_alpha", "arg_beta"]
ATTRIBUTES = DEVICE_ATTRIBUTES + DEVICETYPE_ATTRIBUTES + KERNEL_ATTRIBUTES
# OpenCL vendor names and their short name
VENDOR_NAMES = { "device_vendor": {
"GenuineIntel": "Intel",
"Intel(R) Corporation": "Intel",
"Advanced Micro Devices, Inc.": "AMD",
"NVIDIA Corporation": "NVIDIA",
}}
# Pandas options
pd.set_option('display.width', 1000)
@ -29,6 +53,14 @@ pd.set_option('display.width', 1000)
# Database operations
# ==================================================================================================
# Downloads the database from the server and saves it to disk under the given filename
def DownloadDatabase(filename):
  print("## Downloading database from '"+DATABASE_SERVER_URL+"'...")
  df = urlopen(DATABASE_SERVER_URL)
  # Fix: write to the 'filename' argument instead of relying on the global 'file_db', and make
  # sure the file handle is closed even if the write fails
  output = open(filename, 'wb')
  try:
    output.write(df.read())
  finally:
    output.close()
# Loads the database from disk: the file is a pickled pandas DataFrame
def LoadDatabase(filename):
  return pd.read_pickle(filename)
@ -60,15 +92,58 @@ def ConcatenateData(df1, df2):
def RemoveDuplicates(df):
return df.drop_duplicates()
# Bests
# Drops every row of the database that belongs to the given device name
def RemoveEntriesByDevice(df, devicename):
  keep_mask = df["device"] != devicename
  return df[keep_mask]
# Selects only the rows of the database where column 'field' equals 'value'
def GetEntriesByField(df, field, value):
  selection = df[field] == value
  return df[selection]
# Fixes the problem that some vendors use multiple different names: maps every known spelling
# onto a single canonical short name using the VENDOR_NAMES table
def SanitizeVendorNames(df):
  return df.replace(VENDOR_NAMES)
# Retrieves the results with the lowest execution times
def GetBestResults(df):
dfbest = pd.DataFrame()
grouped = df.groupby(ATTRIBUTES+["kernel"])
for name, dfgroup in grouped:
bestcase = dfgroup.loc[[dfgroup["time"].idxmin()]]
dfbest = ConcatenateData(dfbest, bestcase)
besttime = dfgroup["time"].min()
bestcase = dfgroup[dfgroup["time"] == besttime].iloc[0]
dfbest = dfbest.append(bestcase, ignore_index=True)
return dfbest
# Sets defaults for devices of the same type/vendor based on the smallest values of all known
# entries. The average might be better for performance but some parameters might not be supported
# on other devices.
def CalculateDefaults(df):
  dfdefault = pd.DataFrame()

  # Defaults per device type/vendor: group on everything except the device-identifying
  # attributes, then take the per-column minimum as the default parameter values
  groups = df.groupby(DEVICETYPE_ATTRIBUTES+KERNEL_ATTRIBUTES+["kernel"])
  for name, dfgroup in groups:
    default_values = dfgroup.min(axis=0)
    default_values["device"] = DEVICENAME_DEFAULT
    # Compute-units/clock/time are meaningless for a synthetic default entry, so zero them out
    default_values["device_compute_units"] = 0
    default_values["device_core_clock"] = 0
    default_values["time"] = 0.0
    dfdefault = dfdefault.append(default_values, ignore_index=True)

  # Defaults in general: group only on the kernel attributes so the resulting entry applies to
  # any vendor and any device type
  groups = df.groupby(KERNEL_ATTRIBUTES+["kernel"])
  for name, dfgroup in groups:
    default_values = dfgroup.min(axis=0)
    default_values["device_vendor"] = VENDOR_DEFAULT
    default_values["device_type"] = DEVICETYPE_DEFAULT
    default_values["device"] = DEVICENAME_DEFAULT
    default_values["device_compute_units"] = 0
    default_values["device_core_clock"] = 0
    default_values["time"] = 0.0
    dfdefault = dfdefault.append(default_values, ignore_index=True)

  # Database with both types of defaults only
  return dfdefault
# ==================================================================================================
# C++ header generation
# ==================================================================================================
@ -110,27 +185,28 @@ def GetPrecision(family, precision):
# The C++ device type and vendor
def GetDeviceVendor(vendor, devtype):
return(" { // %s %ss\n kDeviceType%s, kDeviceVendor%s, {\n"
% (vendor, devtype, devtype, vendor))
if vendor == VENDOR_DEFAULT and devtype == DEVICETYPE_DEFAULT:
return(" { // Default\n kDeviceType%s, \"%s\", {\n" % (devtype, vendor))
return(" { // %s %ss\n kDeviceType%s, \"%s\", {\n" % (vendor, devtype, devtype[0].upper() + devtype[1:], vendor))
# Prints the data to a C++ database
def PrintData(df):
def PrintData(df, outputdir):
# Iterates over the kernel families: creates a new file per family
for family, dffamily in df.groupby(["kernel_family"]):
dffamily = dffamily.dropna(axis=1, how='all')
f = open(family+'.h', 'w+')
f = open(os.path.join(outputdir, family+'.h'), 'w+')
f.write(GetHeader(family))
# Loops over the different entries for this family and prints their headers
for precision, dfprecision in dffamily.groupby(["precision"]):
f.write(GetPrecision(family, precision))
for vendor, dfvendor in dfprecision.groupby(["vendor"]):
for devtype, dfdevtype in dfvendor.groupby(["type"]):
for vendor, dfvendor in dfprecision.groupby(["device_vendor"]):
for devtype, dfdevtype in dfvendor.groupby(["device_type"]):
f.write(GetDeviceVendor(vendor, devtype))
for device, dfdevice in dfdevtype.groupby(["device"]):
devicename = "\"%s\"," % device
f.write(" { %-20s { " % devicename)
f.write(" { %-50s { " % devicename)
# Collects the parameters for this case and prints them
parameters = []
@ -152,57 +228,70 @@ def PrintData(df):
# Checks for the number of command-line arguments
if len(sys.argv) != 3:
print "[ERROR] Usage: database.py <folder_with_json_files> <root_of_clblast>"
print("[ERROR] Usage: database.py <folder_with_json_files> <root_of_clblast>")
sys.exit()
# Parses the command-line arguments
path_json = sys.argv[1]
path_clblast = sys.argv[2]
file_db = path_clblast+"/src/database.db"
glob_json = path_json+"/*.json"
file_db = os.path.join(path_clblast, "scripts", "database", "database.db")
glob_json = os.path.join(path_json, "*.json")
# Checks whether the command-line arguments are valid; exits otherwise
clblast_h = path_clblast+"/include/clblast.h" # Not used but just for validation
clblast_h = os.path.join(path_clblast, "include", "clblast.h") # Not used but just for validation
if not os.path.isfile(clblast_h):
print "[ERROR] The path '"+path_clblast+"' does not point to the root of the CLBlast library"
print("[ERROR] The path '"+path_clblast+"' does not point to the root of the CLBlast library")
sys.exit()
if len(glob.glob(glob_json)) < 1:
print "[ERROR] The path '"+path_json+"' does not contain any JSON files"
sys.exit()
print("## The path '"+path_json+"' does not contain any JSON files")
# ==================================================================================================
# The main body of the script
# ==================================================================================================
# Loads the database if it exists. If not, a new database is initialized
# Downloads the database if a local copy is not present
db_exists = os.path.isfile(file_db)
database = LoadDatabase(file_db) if db_exists else pd.DataFrame()
if not db_exists:
DownloadDatabase(file_db)
# Loads the database from disk
print("## Loading the database from disk...")
database = LoadDatabase(file_db)
# Loops over all JSON files in the supplied folder
for file_json in glob.glob(glob_json):
# Loads the newly imported data
print "## Processing '"+file_json+"'",
sys.stdout.write("## Processing '"+file_json+"' ")
imported_data = ImportDataFromFile(file_json)
imported_data = SanitizeVendorNames(imported_data)
# Adds the new data to the database
old_size = len(database.index)
database = ConcatenateData(database, imported_data)
database = RemoveDuplicates(database)
new_size = len(database.index)
print "with "+str(new_size-old_size)+" new items"
print("with "+str(new_size-old_size)+" new items")
# Stores the new database back to disk
SaveDatabase(database, file_db)
# Stores the modified database back to disk
if len(glob.glob(glob_json)) >= 1:
print("## Storing the database to disk...")
SaveDatabase(database, file_db)
# Retrieves the best performing results
print("## Calculating the best results per device/kernel...")
bests = GetBestResults(database)
# TODO: Determines the defaults for other vendors and per vendor
#defaults = CalculateDefaults(bests)
#bests = ConcatenateData(bests, defaults)
# Determines the defaults for other vendors and per vendor
defaults = CalculateDefaults(bests)
bests = ConcatenateData(bests, defaults)
# Outputs the data as a C++ database
PrintData(bests)
path_cpp_database = os.path.join(path_clblast, "include", "internal", "database")
print("## Producing a C++ database in '"+path_cpp_database+"'...")
PrintData(bests, path_cpp_database)
print("## All done")
# ==================================================================================================

View file

@ -78,17 +78,17 @@ routines = [
Routine(False, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], False, "Solves a banded triangular system of equations"),
Routine(False, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], False, "Solves a packed triangular system of equations"),
# Level 2: matrix update
Routine(False, "2b", "ger", T, [S,D], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 matrix update"),
Routine(False, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 complex matrix update"),
Routine(False, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 complex conjugated matrix update"),
Routine(False, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], False, "Hermitian rank-1 matrix update"),
Routine(False, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], False, "Hermitian packed rank-1 matrix update"),
Routine(False, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], False, "Hermitian rank-2 matrix update"),
Routine(False, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], False, "Hermitian packed rank-2 matrix update"),
Routine(False, "2b", "syr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], False, "Symmetric rank-1 matrix update"),
Routine(False, "2b", "spr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], False, "Symmetric packed rank-1 matrix update"),
Routine(False, "2b", "syr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], False, "Symmetric rank-2 matrix update"),
Routine(False, "2b", "spr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], False, "Symmetric packed rank-2 matrix update"),
Routine(True, "2b", "ger", T, [S,D], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 matrix update"),
Routine(True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 complex matrix update"),
Routine(True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 complex conjugated matrix update"),
Routine(True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], False, "Hermitian rank-1 matrix update"),
Routine(True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], False, "Hermitian packed rank-1 matrix update"),
Routine(True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], False, "Hermitian rank-2 matrix update"),
Routine(True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], False, "Hermitian packed rank-2 matrix update"),
Routine(True, "2b", "syr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], False, "Symmetric rank-1 matrix update"),
Routine(True, "2b", "spr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], False, "Symmetric packed rank-1 matrix update"),
Routine(True, "2b", "syr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], False, "Symmetric rank-2 matrix update"),
Routine(True, "2b", "spr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], False, "Symmetric packed rank-2 matrix update"),
],
[ # Level 3: matrix-matrix
Routine(True, "3", "gemm", T, [S,D,C,Z], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], False, "General matrix-matrix multiplication"),
@ -103,7 +103,17 @@ routines = [
]]
# ==================================================================================================
# Translates a single-letter precision code into the corresponding full CLBlast precision name
def PrecisionToFullName(x):
  full_names = {
    'H': "Half",
    'S': "Single",
    'D': "Double",
    'C': "ComplexSingle",
    'Z': "ComplexDouble",
  }
  return full_names[x]
# ==================================================================================================
# Separators for the BLAS levels
separators = ["""
// =================================================================================================
@ -237,7 +247,7 @@ files = [
path_clblast+"/src/clblast_c.cc",
path_clblast+"/test/wrapper_clblas.h",
]
header_lines = [84, 52, 80, 24, 22]
header_lines = [84, 63, 80, 24, 22]
footer_lines = [6, 3, 5, 2, 6]
# Checks whether the command-line arguments are valid; exits otherwise
@ -315,16 +325,10 @@ for level in [1,2,3]:
body += "using double2 = clblast::double2;\n\n"
body += "// Main function (not within the clblast namespace)\n"
body += "int main(int argc, char *argv[]) {\n"
body += " switch(clblast::GetPrecision(argc, argv)) {\n"
default = PrecisionToFullName(routine.flavours[0].name)
body += " switch(clblast::GetPrecision(argc, argv, clblast::Precision::k"+default+")) {\n"
for precision in ["H","S","D","C","Z"]:
enum = {
'H': "Half",
'S': "Single",
'D': "Double",
'C': "ComplexSingle",
'Z': "ComplexDouble",
}[precision]
body += " case clblast::Precision::k"+enum+":"
body += " case clblast::Precision::k"+PrecisionToFullName(precision)+":"
found = False
for flavour in routine.flavours:
if flavour.name == precision:

View file

@ -38,6 +38,17 @@
#include "internal/routines/level2/xtrmv.h"
#include "internal/routines/level2/xtbmv.h"
#include "internal/routines/level2/xtpmv.h"
#include "internal/routines/level2/xger.h"
#include "internal/routines/level2/xgeru.h"
#include "internal/routines/level2/xgerc.h"
#include "internal/routines/level2/xher.h"
#include "internal/routines/level2/xhpr.h"
#include "internal/routines/level2/xher2.h"
#include "internal/routines/level2/xhpr2.h"
#include "internal/routines/level2/xsyr.h"
#include "internal/routines/level2/xspr.h"
#include "internal/routines/level2/xsyr2.h"
#include "internal/routines/level2/xspr2.h"
// BLAS level-3 includes
#include "internal/routines/level3/xgemm.h"
@ -835,14 +846,24 @@ template StatusCode Tpsv<double2>(const Layout, const Triangle, const Transpose,
// General rank-1 matrix update: SGER/DGER
template <typename T>
StatusCode Ger(const Layout,
const size_t, const size_t,
const T,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*) {
return StatusCode::kNotImplemented;
StatusCode Ger(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
auto event_cpp = Event(*event);
auto routine = Xger<T>(queue_cpp, event_cpp);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoGer(layout,
m, n,
alpha,
Buffer<T>(x_buffer), x_offset, x_inc,
Buffer<T>(y_buffer), y_offset, y_inc,
Buffer<T>(a_buffer), a_offset, a_ld);
}
template StatusCode Ger<float>(const Layout,
const size_t, const size_t,
@ -861,14 +882,24 @@ template StatusCode Ger<double>(const Layout,
// General rank-1 complex matrix update: CGERU/ZGERU
template <typename T>
StatusCode Geru(const Layout,
const size_t, const size_t,
const T,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*) {
return StatusCode::kNotImplemented;
StatusCode Geru(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
auto event_cpp = Event(*event);
auto routine = Xgeru<T>(queue_cpp, event_cpp);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoGeru(layout,
m, n,
alpha,
Buffer<T>(x_buffer), x_offset, x_inc,
Buffer<T>(y_buffer), y_offset, y_inc,
Buffer<T>(a_buffer), a_offset, a_ld);
}
template StatusCode Geru<float2>(const Layout,
const size_t, const size_t,
@ -887,14 +918,24 @@ template StatusCode Geru<double2>(const Layout,
// General rank-1 complex conjugated matrix update: CGERC/ZGERC
template <typename T>
StatusCode Gerc(const Layout,
const size_t, const size_t,
const T,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*) {
return StatusCode::kNotImplemented;
StatusCode Gerc(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
auto event_cpp = Event(*event);
auto routine = Xgerc<T>(queue_cpp, event_cpp);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoGerc(layout,
m, n,
alpha,
Buffer<T>(x_buffer), x_offset, x_inc,
Buffer<T>(y_buffer), y_offset, y_inc,
Buffer<T>(a_buffer), a_offset, a_ld);
}
template StatusCode Gerc<float2>(const Layout,
const size_t, const size_t,
@ -913,13 +954,22 @@ template StatusCode Gerc<double2>(const Layout,
// Hermitian rank-1 matrix update: CHER/ZHER
template <typename T>
StatusCode Her(const Layout, const Triangle,
const size_t,
const T,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*) {
return StatusCode::kNotImplemented;
StatusCode Her(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
auto event_cpp = Event(*event);
auto routine = Xher<std::complex<T>,T>(queue_cpp, event_cpp);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoHer(layout, triangle,
n,
alpha,
Buffer<std::complex<T>>(x_buffer), x_offset, x_inc,
Buffer<std::complex<T>>(a_buffer), a_offset, a_ld);
}
template StatusCode Her<float>(const Layout, const Triangle,
const size_t,
@ -936,13 +986,22 @@ template StatusCode Her<double>(const Layout, const Triangle,
// Hermitian packed rank-1 matrix update: CHPR/ZHPR
template <typename T>
StatusCode Hpr(const Layout, const Triangle,
const size_t,
const T,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t,
cl_command_queue*, cl_event*) {
return StatusCode::kNotImplemented;
StatusCode Hpr(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem ap_buffer, const size_t ap_offset,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
auto event_cpp = Event(*event);
auto routine = Xhpr<std::complex<T>,T>(queue_cpp, event_cpp);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoHpr(layout, triangle,
n,
alpha,
Buffer<std::complex<T>>(x_buffer), x_offset, x_inc,
Buffer<std::complex<T>>(ap_buffer), ap_offset);
}
template StatusCode Hpr<float>(const Layout, const Triangle,
const size_t,
@ -959,14 +1018,24 @@ template StatusCode Hpr<double>(const Layout, const Triangle,
// Hermitian rank-2 matrix update: CHER2/ZHER2
template <typename T>
StatusCode Her2(const Layout, const Triangle,
const size_t,
const T,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*) {
return StatusCode::kNotImplemented;
StatusCode Her2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
auto event_cpp = Event(*event);
auto routine = Xher2<T>(queue_cpp, event_cpp);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoHer2(layout, triangle,
n,
alpha,
Buffer<T>(x_buffer), x_offset, x_inc,
Buffer<T>(y_buffer), y_offset, y_inc,
Buffer<T>(a_buffer), a_offset, a_ld);
}
template StatusCode Her2<float2>(const Layout, const Triangle,
const size_t,
@ -985,14 +1054,24 @@ template StatusCode Her2<double2>(const Layout, const Triangle,
// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2
template <typename T>
StatusCode Hpr2(const Layout, const Triangle,
const size_t,
const T,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t,
cl_command_queue*, cl_event*) {
return StatusCode::kNotImplemented;
StatusCode Hpr2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem ap_buffer, const size_t ap_offset,
cl_command_queue* queue, cl_event* event) {
auto queue_cpp = Queue(*queue);
auto event_cpp = Event(*event);
auto routine = Xhpr2<T>(queue_cpp, event_cpp);
auto status = routine.SetUp();
if (status != StatusCode::kSuccess) { return status; }
return routine.DoHpr2(layout, triangle,
n,
alpha,
Buffer<T>(x_buffer), x_offset, x_inc,
Buffer<T>(y_buffer), y_offset, y_inc,
Buffer<T>(ap_buffer), ap_offset);
}
template StatusCode Hpr2<float2>(const Layout, const Triangle,
const size_t,
@ -1011,13 +1090,22 @@ template StatusCode Hpr2<double2>(const Layout, const Triangle,
// Symmetric rank-1 matrix update: SSYR/DSYR
template <typename T>
StatusCode Syr(const Layout, const Triangle,
const size_t,
const T,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*) {
return StatusCode::kNotImplemented;
StatusCode Syr(const Layout layout, const Triangle triangle,
               const size_t n,
               const T alpha,
               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
               cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
               cl_command_queue* queue, cl_event* event) {
  // Wrap the raw OpenCL handles in their C++ counterparts.
  auto queue_cc = Queue(*queue);
  auto event_cc = Event(*event);
  // Construct the routine object and prepare it before running.
  auto routine = Xsyr<T>(queue_cc, event_cc);
  const auto setup_status = routine.SetUp();
  if (setup_status != StatusCode::kSuccess) { return setup_status; }
  // Forward the arguments, wrapping the raw buffers in typed Buffer objects.
  return routine.DoSyr(layout, triangle, n, alpha,
                       Buffer<T>(x_buffer), x_offset, x_inc,
                       Buffer<T>(a_buffer), a_offset, a_ld);
}
template StatusCode Syr<float>(const Layout, const Triangle,
const size_t,
@ -1034,13 +1122,22 @@ template StatusCode Syr<double>(const Layout, const Triangle,
// Symmetric packed rank-1 matrix update: SSPR/DSPR
template <typename T>
StatusCode Spr(const Layout, const Triangle,
const size_t,
const T,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t,
cl_command_queue*, cl_event*) {
return StatusCode::kNotImplemented;
StatusCode Spr(const Layout layout, const Triangle triangle,
               const size_t n,
               const T alpha,
               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
               cl_mem ap_buffer, const size_t ap_offset,
               cl_command_queue* queue, cl_event* event) {
  // Wrap the raw OpenCL handles in their C++ counterparts.
  auto queue_cc = Queue(*queue);
  auto event_cc = Event(*event);
  // Construct the routine object and prepare it before running.
  auto routine = Xspr<T>(queue_cc, event_cc);
  const auto setup_status = routine.SetUp();
  if (setup_status != StatusCode::kSuccess) { return setup_status; }
  // Forward the arguments, wrapping the raw buffers in typed Buffer objects.
  return routine.DoSpr(layout, triangle, n, alpha,
                       Buffer<T>(x_buffer), x_offset, x_inc,
                       Buffer<T>(ap_buffer), ap_offset);
}
template StatusCode Spr<float>(const Layout, const Triangle,
const size_t,
@ -1057,14 +1154,24 @@ template StatusCode Spr<double>(const Layout, const Triangle,
// Symmetric rank-2 matrix update: SSYR2/DSYR2
template <typename T>
StatusCode Syr2(const Layout, const Triangle,
const size_t,
const T,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*) {
return StatusCode::kNotImplemented;
StatusCode Syr2(const Layout layout, const Triangle triangle,
                const size_t n,
                const T alpha,
                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                cl_command_queue* queue, cl_event* event) {
  // Wrap the raw OpenCL handles in their C++ counterparts.
  auto queue_cc = Queue(*queue);
  auto event_cc = Event(*event);
  // Construct the routine object and prepare it before running.
  auto routine = Xsyr2<T>(queue_cc, event_cc);
  const auto setup_status = routine.SetUp();
  if (setup_status != StatusCode::kSuccess) { return setup_status; }
  // Forward the arguments, wrapping the raw buffers in typed Buffer objects.
  return routine.DoSyr2(layout, triangle, n, alpha,
                        Buffer<T>(x_buffer), x_offset, x_inc,
                        Buffer<T>(y_buffer), y_offset, y_inc,
                        Buffer<T>(a_buffer), a_offset, a_ld);
}
template StatusCode Syr2<float>(const Layout, const Triangle,
const size_t,
@ -1083,14 +1190,24 @@ template StatusCode Syr2<double>(const Layout, const Triangle,
// Symmetric packed rank-2 matrix update: SSPR2/DSPR2
template <typename T>
StatusCode Spr2(const Layout, const Triangle,
const size_t,
const T,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t,
cl_command_queue*, cl_event*) {
return StatusCode::kNotImplemented;
StatusCode Spr2(const Layout layout, const Triangle triangle,
                const size_t n,
                const T alpha,
                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                cl_mem ap_buffer, const size_t ap_offset,
                cl_command_queue* queue, cl_event* event) {
  // Wrap the raw OpenCL handles in their C++ counterparts.
  auto queue_cc = Queue(*queue);
  auto event_cc = Event(*event);
  // Construct the routine object and prepare it before running.
  auto routine = Xspr2<T>(queue_cc, event_cc);
  const auto setup_status = routine.SetUp();
  if (setup_status != StatusCode::kSuccess) { return setup_status; }
  // Forward the arguments, wrapping the raw buffers in typed Buffer objects.
  return routine.DoSpr2(layout, triangle, n, alpha,
                        Buffer<T>(x_buffer), x_offset, x_inc,
                        Buffer<T>(y_buffer), y_offset, y_inc,
                        Buffer<T>(ap_buffer), ap_offset);
}
template StatusCode Spr2<float>(const Layout, const Triangle,
const size_t,

View file

@ -15,6 +15,7 @@
#include "internal/database/xaxpy.h"
#include "internal/database/xdot.h"
#include "internal/database/xgemv.h"
#include "internal/database/xger.h"
#include "internal/database/xgemm.h"
#include "internal/database/copy.h"
#include "internal/database/pad.h"
@ -31,11 +32,12 @@ const std::vector<Database::DatabaseEntry> Database::database = {
XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble,
XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble,
XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble,
XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble,
XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble,
CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble,
PadSingle, PadDouble, PadComplexSingle, PadComplexDouble,
TraSingle, TraDouble, TraComplexSingle, TraComplexDouble,
PadTraSingle, PadTraDouble, PadTraComplexSingle, PadTraComplexDouble
TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble,
PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble
};
// =================================================================================================
@ -77,19 +79,29 @@ Database::Parameters Database::Search(const std::string &this_kernel,
const std::string &this_vendor,
const std::string &this_device,
const Precision this_precision) const {
// Set the short vendor name
auto this_short_vendor = this_vendor;
for (auto &combination : kVendorNames) {
if (this_vendor == combination.first) {
this_short_vendor = combination.second;
}
}
// Selects the right kernel
for (auto &db: database) {
if (db.kernel == this_kernel && db.precision == this_precision) {
// Searches for the right vendor and device type, or selects the default if unavailable. This
// assumes that the default vendor / device type is last in the database.
for (auto &vendor: db.vendors) {
if ((vendor.name == this_vendor || vendor.name == kDeviceVendorAll) &&
(vendor.type == this_type || vendor.type == kDeviceTypeAll)) {
if ((vendor.name == this_short_vendor || vendor.name == kDeviceVendorAll) &&
(vendor.type == this_type || vendor.type == kDeviceTypeAll)) {
// Searches for the right device. If the current device is unavailable, selects the vendor
// default parameters. This assumes the default is last in the database.
for (auto &device: vendor.devices) {
if (device.name == this_device || device.name == kDefaultDevice) {
if (device.name == this_device || device.name == "default") {
// Sets the parameters accordingly
return device.parameters;

View file

@ -0,0 +1,158 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains common functions for matrix update kernels (Xger, Xher).
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// =================================================================================================
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
// this kernel file is used outside of the CLBlast library.
#ifndef WGS1
#define WGS1 8 // The local work-group size in first dimension
#endif
#ifndef WGS2
#define WGS2 8 // The local work-group size in second dimension
#endif
#ifndef WPT
#define WPT 1 // The amount of work-per-thread in both dimensions
#endif
// =================================================================================================
// Returns an element from a vector
// Fetches element 'id' from a strided vector, returning zero when 'id' is out
// of range (id >= max). When 'do_conjugate' is set AND this kernel file is
// compiled for one of the conjugating routines (GERC/HER/HPR/HER2/HPR2), the
// loaded value is complex-conjugated before being returned.
inline real LoadVector(const int id, const int max,
                       __global real* gm, const int offset, const int inc,
                       const int do_conjugate) {
  real result;
  SetToZero(result);
  if (id < max) {
    result = gm[id*inc + offset];
    if (do_conjugate) {
      #if defined(ROUTINE_GERC) || defined(ROUTINE_HER) || defined(ROUTINE_HPR) || defined(ROUTINE_HER2) || defined(ROUTINE_HPR2)
        COMPLEX_CONJUGATE(result);
      #endif
    }
  }
  return result;
}
// Performs the rank-1 matrix update
// Performs the rank-1 update A[id1,id2] = alpha * xvalue * yvalue + A[id1,id2] in-place.
// Threads whose (id1, id2) fall outside (max1, max2) do nothing. For the packed
// routines (SPR/HPR) the packed-triangle index is computed instead of a dense index;
// note the packed formula uses 'a_ld' where the matrix order would normally appear --
// presumably the caller passes n as a_ld for packed storage; TODO confirm at call site.
inline void MatrixUpdate(const int id1, const int id2, const int max1, const int max2,
__global real* agm, const int a_offset, const int a_ld,
const real alpha, const real xvalue, const real yvalue,
const int is_upper) {
// Bounds check: only update elements inside the matrix
if (id1 < max1 && id2 < max2) {
// Packed storage (SPR/HPR): map (id1, id2) onto the packed triangular index
#if defined(ROUTINE_SPR) || defined(ROUTINE_HPR)
int a_index;
if (is_upper) {
a_index = (id1 <= id2) ? ((id2+1)*id2)/2 + id1 : ((id1+1)*id1)/2 + id2;
}
else {
a_index = (id1 >= id2) ? ((2*a_ld-(id2+1))*id2)/2 + id1 : ((2*a_ld-(id1+1))*id1)/2 + id2;
}
a_index += a_offset;
#else
// Dense storage: regular column-indexed offset
const int a_index = id2*a_ld + id1 + a_offset;
#endif
// Loads the current value of the A matrix
const real avalue = agm[a_index];
// Computes result = alpha * x[i] * y[j] + a[i][j]
#if PRECISION == 3232 || PRECISION == 6464
// Complex precisions: expand the complex multiplications component-wise
real ax;
ax.x = MulReal(alpha, xvalue);
ax.y = MulImag(alpha, xvalue);
real result;
result.x = MulReal(ax, yvalue) + avalue.x;
result.y = MulImag(ax, yvalue) + avalue.y;
#else
real result = alpha * xvalue * yvalue + avalue;
#endif
// For Hermitian matrices: force the diagonal to be real-valued
#if defined(ROUTINE_HER) || defined(ROUTINE_HPR)
if (id1 == id2) { result.y = ZERO; }
#endif
// Stores the final result
agm[a_index] = result;
}
}
// Performs the rank-2 matrix update
// Performs the rank-2 update A[id1,id2] += alpha1*xvalue*yvalue + alpha2*xtvalue*ytvalue
// in-place, with the same bounds-check and packed/dense indexing scheme as MatrixUpdate.
// NOTE(review): the complex branch pairs alpha2 with xvalue and alpha1 with xtvalue,
// while the real branch pairs alpha1 with xvalue -- for real data callers likely pass
// equal alphas so this is harmless, but for HER2/HPR2 the pairing matters; confirm
// against the host-side DoHer2/DoHpr2 argument order.
inline void MatrixUpdate2(const int id1, const int id2, const int max1, const int max2,
__global real* agm, const int a_offset, const int a_ld,
const real alpha1, const real xvalue, const real yvalue,
const real alpha2, const real xtvalue, const real ytvalue,
const int is_upper) {
// Bounds check: only update elements inside the matrix
if (id1 < max1 && id2 < max2) {
// Packed storage (SPR2/HPR2): map (id1, id2) onto the packed triangular index
#if defined(ROUTINE_SPR2) || defined(ROUTINE_HPR2)
int a_index;
if (is_upper) {
a_index = (id1 <= id2) ? ((id2+1)*id2)/2 + id1 : ((id1+1)*id1)/2 + id2;
}
else {
a_index = (id1 >= id2) ? ((2*a_ld-(id2+1))*id2)/2 + id1 : ((2*a_ld-(id1+1))*id1)/2 + id2;
}
a_index += a_offset;
#else
// Dense storage: regular column-indexed offset
const int a_index = id2*a_ld + id1 + a_offset;
#endif
// Loads the current value of the A matrix
const real avalue = agm[a_index];
// Computes result = alpha * x[i] * y[j] + alpha * x[j] * y[i] + a[i][j]
#if PRECISION == 3232 || PRECISION == 6464
// Complex precisions: expand the two complex products component-wise
real ax;
ax.x = MulReal(alpha2, xvalue);
ax.y = MulImag(alpha2, xvalue);
real atx;
atx.x = MulReal(alpha1, xtvalue);
atx.y = MulImag(alpha1, xtvalue);
real result;
result.x = MulReal(ax, yvalue) + MulReal(atx, ytvalue) + avalue.x;
result.y = MulImag(ax, yvalue) + MulImag(atx, ytvalue) + avalue.y;
#else
real result = alpha1 * xvalue * yvalue + alpha2 * xtvalue * ytvalue + avalue;
#endif
// For Hermitian matrices: force the diagonal to be real-valued
#if defined(ROUTINE_HER2) || defined(ROUTINE_HPR2)
if (id1 == id2) { result.y = ZERO; }
#endif
// Stores the final result
agm[a_index] = result;
}
}
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View file

@ -7,7 +7,7 @@
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains the Xgemv kernel for matrix-vector multiplication.
// This file contains the Xgemv kernel (generic version) for matrix-vector multiplication.
//
// =================================================================================================
@ -27,56 +27,11 @@ R"(
#ifndef WPT1
#define WPT1 1 // The amount of work-per-thread
#endif
// 2: For the fast version
#ifndef WGS2
#define WGS2 64 // The local work-group size
#endif
#ifndef WPT2
#define WPT2 1 // The amount of work-per-thread
#endif
#ifndef VW2
#define VW2 1 // Vector width of matrix A loads
#ifndef UNROLL1
#define UNROLL1 32 // Unroll factor (must be a divider of WGS1)
#endif
// 3: For the fast rotated version
#ifndef WGS3
#define WGS3 64 // The local work-group size
#endif
#ifndef WPT3
#define WPT3 1 // The amount of work-per-thread
#endif
#ifndef VW3
#define VW3 1 // Vector width of matrix A loads
#endif
// =================================================================================================
// Data-widths for the 'fast' kernel
#if VW2 == 1
typedef real realVF;
#elif VW2 == 2
typedef real2 realVF;
#elif VW2 == 4
typedef real4 realVF;
#elif VW2 == 8
typedef real8 realVF;
#elif VW2 == 16
typedef real16 realVF;
#endif
// Data-widths for the 'fast' kernel with rotated matrix
#if VW3 == 1
typedef real realVFR;
#elif VW3 == 2
typedef real2 realVFR;
#elif VW3 == 4
typedef real4 realVFR;
#elif VW3 == 8
typedef real8 realVFR;
#elif VW3 == 16
typedef real16 realVFR;
#endif
// 2 and 3: For the fast versions, see 'xgemv_fast.opencl'
// =================================================================================================
@ -252,18 +207,6 @@ inline real LoadMatrixA(const __global real* restrict agm, const int x, const in
return result;
}
// Loads a vector input value (1/2)
inline realVF LoadMatrixAVF(const __global realVF* restrict agm, const int x, const int y,
const int a_ld) {
return agm[a_ld*y + x];
}
// Loads a vector input value (2/2): as before, but different data-type
inline realVFR LoadMatrixAVFR(const __global realVFR* restrict agm, const int x, const int y,
const int a_ld) {
return agm[a_ld*y + x];
}
// =================================================================================================
// Full version of the kernel
@ -301,28 +244,31 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
barrier(CLK_LOCAL_MEM_FENCE);
// Loops over the work per thread, and checks whether in bounds
#pragma unroll
for (int w=0; w<WPT1; ++w) {
const int gid = w*get_global_size(0) + get_global_id(0);
if (gid < m) {
// The multiply-add function for the main part (divisible by WGS1)
if (a_rotated == 0) { // Not rotated
#pragma unroll
for (int kloop=0; kloop<WGS1; ++kloop) {
const int k = kwg + kloop;
real value = LoadMatrixA(agm, gid, k, a_ld, a_offset, parameter, kl, ku);
if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
MultiplyAdd(acc[w], xlm[kloop], value);
for (int kloop=0; kloop<WGS1; kloop+=UNROLL1) {
#pragma unroll
for (int kunroll=0; kunroll<UNROLL1; ++kunroll) {
const int k = kwg + kloop + kunroll;
real value = LoadMatrixA(agm, gid, k, a_ld, a_offset, parameter, kl, ku);
if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
MultiplyAdd(acc[w], xlm[kloop + kunroll], value);
}
}
}
else { // Transposed
#pragma unroll
for (int kloop=0; kloop<WGS1; ++kloop) {
const int k = kwg + kloop;
real value = LoadMatrixA(agm, k, gid, a_ld, a_offset, parameter, kl, ku);
if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
MultiplyAdd(acc[w], xlm[kloop], value);
for (int kloop=0; kloop<WGS1; kloop+=UNROLL1) {
#pragma unroll
for (int kunroll=0; kunroll<UNROLL1; ++kunroll) {
const int k = kwg + kloop + kunroll;
real value = LoadMatrixA(agm, k, gid, a_ld, a_offset, parameter, kl, ku);
if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
MultiplyAdd(acc[w], xlm[kloop + kunroll], value);
}
}
}
}
@ -365,200 +311,6 @@ __kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
// =================================================================================================
// Faster version of the kernel, assuming that:
// --> 'm' and 'n' are multiples of WGS2
// --> 'a_offset' is 0
// --> 'a_ld' is a multiple of VW2
// --> 'a_rotated' is 0
// --> 'do_conjugate' is 0
__attribute__((reqd_work_group_size(WGS2, 1, 1)))
__kernel void XgemvFast(const int m, const int n, const real alpha, const real beta,
const int a_rotated,
const __global realVF* restrict agm, const int a_offset, const int a_ld,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc,
const int do_conjugate, const int parameter,
const int kl, const int ku) {
// Local memory for the vector X
__local real xlm[WGS2];
// Initializes the accumulation register
real acc[WPT2];
#pragma unroll
for (int w=0; w<WPT2; ++w) {
SetToZero(acc[w]);
}
// Loops over work-group sized portions of the work
for (int kwg=0; kwg<n; kwg+=WGS2) {
// Loads the vector X into local memory
const int lid = get_local_id(0);
xlm[lid] = xgm[(kwg + lid)*x_inc + x_offset];
// Synchronizes all threads in a workgroup
barrier(CLK_LOCAL_MEM_FENCE);
// The multiply-add function (not rotated)
#pragma unroll
for (int kl=0; kl<WGS2; ++kl) {
const int k = kwg + kl;
#pragma unroll
for (int w=0; w<WPT2/VW2; ++w) {
const int gid = (WPT2/VW2)*get_global_id(0) + w;
realVF avec = LoadMatrixAVF(agm, gid, k, a_ld/VW2);
#if VW2 == 1
MultiplyAdd(acc[VW2*w+0], xlm[kl], avec);
#elif VW2 == 2
MultiplyAdd(acc[VW2*w+0], xlm[kl], avec.x);
MultiplyAdd(acc[VW2*w+1], xlm[kl], avec.y);
#elif VW2 == 4
MultiplyAdd(acc[VW2*w+0], xlm[kl], avec.x);
MultiplyAdd(acc[VW2*w+1], xlm[kl], avec.y);
MultiplyAdd(acc[VW2*w+2], xlm[kl], avec.z);
MultiplyAdd(acc[VW2*w+3], xlm[kl], avec.w);
#elif VW2 == 8
MultiplyAdd(acc[VW2*w+0], xlm[kl], avec.s0);
MultiplyAdd(acc[VW2*w+1], xlm[kl], avec.s1);
MultiplyAdd(acc[VW2*w+2], xlm[kl], avec.s2);
MultiplyAdd(acc[VW2*w+3], xlm[kl], avec.s3);
MultiplyAdd(acc[VW2*w+4], xlm[kl], avec.s4);
MultiplyAdd(acc[VW2*w+5], xlm[kl], avec.s5);
MultiplyAdd(acc[VW2*w+6], xlm[kl], avec.s6);
MultiplyAdd(acc[VW2*w+7], xlm[kl], avec.s7);
#elif VW2 == 16
MultiplyAdd(acc[VW2*w+0], xlm[kl], avec.s0);
MultiplyAdd(acc[VW2*w+1], xlm[kl], avec.s1);
MultiplyAdd(acc[VW2*w+2], xlm[kl], avec.s2);
MultiplyAdd(acc[VW2*w+3], xlm[kl], avec.s3);
MultiplyAdd(acc[VW2*w+4], xlm[kl], avec.s4);
MultiplyAdd(acc[VW2*w+5], xlm[kl], avec.s5);
MultiplyAdd(acc[VW2*w+6], xlm[kl], avec.s6);
MultiplyAdd(acc[VW2*w+7], xlm[kl], avec.s7);
MultiplyAdd(acc[VW2*w+8], xlm[kl], avec.s8);
MultiplyAdd(acc[VW2*w+9], xlm[kl], avec.s9);
MultiplyAdd(acc[VW2*w+10], xlm[kl], avec.sA);
MultiplyAdd(acc[VW2*w+11], xlm[kl], avec.sB);
MultiplyAdd(acc[VW2*w+12], xlm[kl], avec.sC);
MultiplyAdd(acc[VW2*w+13], xlm[kl], avec.sD);
MultiplyAdd(acc[VW2*w+14], xlm[kl], avec.sE);
MultiplyAdd(acc[VW2*w+15], xlm[kl], avec.sF);
#endif
}
}
// Synchronizes all threads in a workgroup
barrier(CLK_LOCAL_MEM_FENCE);
}
// Stores the final result
#pragma unroll
for (int w=0; w<WPT2; ++w) {
const int gid = WPT2*get_global_id(0) + w;
real yval = ygm[gid*y_inc + y_offset];
AXPBY(ygm[gid*y_inc + y_offset], alpha, acc[w], beta, yval);
}
}
// =================================================================================================
// Faster version of the kernel, assuming that:
// --> 'm' and 'n' are multiples of WGS3
// --> 'a_offset' is 0
// --> 'a_ld' is a multiple of VW3
// --> 'a_rotated' is 1
// --> 'do_conjugate' is 0
__attribute__((reqd_work_group_size(WGS3, 1, 1)))
__kernel void XgemvFastRot(const int m, const int n, const real alpha, const real beta,
const int a_rotated,
const __global realVFR* restrict agm, const int a_offset, const int a_ld,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc,
const int do_conjugate, const int parameter,
const int kl, const int ku) {
// Local memory for the vector X
__local real xlm[WGS3];
// Initializes the accumulation register
real acc[WPT3];
#pragma unroll
for (int w=0; w<WPT3; ++w) {
SetToZero(acc[w]);
}
// Loops over work-group sized portions of the work
for (int kwg=0; kwg<n; kwg+=WGS3) {
// Loads the vector X into local memory
const int lid = get_local_id(0);
xlm[lid] = xgm[(kwg + lid)*x_inc + x_offset];
// Synchronizes all threads in a workgroup
barrier(CLK_LOCAL_MEM_FENCE);
// The multiply-add function (rotated)
#pragma unroll
for (int kl=0; kl<WGS3/VW3; ++kl) {
const int k = (kwg/VW3) + kl;
#pragma unroll
for (int w=0; w<WPT3; ++w) {
const int gid = WPT3*get_global_id(0) + w;
realVFR avec = LoadMatrixAVFR(agm, k, gid, a_ld/VW3);
#if VW3 == 1
MultiplyAdd(acc[w], xlm[VW3*kl+0], avec);
#elif VW3 == 2
MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.x);
MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.y);
#elif VW3 == 4
MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.x);
MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.y);
MultiplyAdd(acc[w], xlm[VW3*kl+2], avec.z);
MultiplyAdd(acc[w], xlm[VW3*kl+3], avec.w);
#elif VW3 == 8
MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.s0);
MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.s1);
MultiplyAdd(acc[w], xlm[VW3*kl+2], avec.s2);
MultiplyAdd(acc[w], xlm[VW3*kl+3], avec.s3);
MultiplyAdd(acc[w], xlm[VW3*kl+4], avec.s4);
MultiplyAdd(acc[w], xlm[VW3*kl+5], avec.s5);
MultiplyAdd(acc[w], xlm[VW3*kl+6], avec.s6);
MultiplyAdd(acc[w], xlm[VW3*kl+7], avec.s7);
#elif VW3 == 16
MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.s0);
MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.s1);
MultiplyAdd(acc[w], xlm[VW3*kl+2], avec.s2);
MultiplyAdd(acc[w], xlm[VW3*kl+3], avec.s3);
MultiplyAdd(acc[w], xlm[VW3*kl+4], avec.s4);
MultiplyAdd(acc[w], xlm[VW3*kl+5], avec.s5);
MultiplyAdd(acc[w], xlm[VW3*kl+6], avec.s6);
MultiplyAdd(acc[w], xlm[VW3*kl+7], avec.s7);
MultiplyAdd(acc[w], xlm[VW3*kl+8], avec.s8);
MultiplyAdd(acc[w], xlm[VW3*kl+9], avec.s9);
MultiplyAdd(acc[w], xlm[VW3*kl+10], avec.sA);
MultiplyAdd(acc[w], xlm[VW3*kl+11], avec.sB);
MultiplyAdd(acc[w], xlm[VW3*kl+12], avec.sC);
MultiplyAdd(acc[w], xlm[VW3*kl+13], avec.sD);
MultiplyAdd(acc[w], xlm[VW3*kl+14], avec.sE);
MultiplyAdd(acc[w], xlm[VW3*kl+15], avec.sF);
#endif
}
}
// Synchronizes all threads in a workgroup
barrier(CLK_LOCAL_MEM_FENCE);
}
// Stores the final result
#pragma unroll
for (int w=0; w<WPT3; ++w) {
const int gid = WPT3*get_global_id(0) + w;
real yval = ygm[gid*y_inc + y_offset];
AXPBY(ygm[gid*y_inc + y_offset], alpha, acc[w], beta, yval);
}
}
// =================================================================================================
// End of the C++11 raw string literal
)"

View file

@ -0,0 +1,288 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains the Xgemv kernel (fast versions) for matrix-vector multiplication.
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// =================================================================================================
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
// this kernel file is used outside of the CLBlast library.
// 1: For the full version, see 'xgemv.opencl'
// 2: For the fast version
#ifndef WGS2
#define WGS2 64 // The local work-group size
#endif
#ifndef WPT2
#define WPT2 1 // The amount of work-per-thread
#endif
#ifndef VW2
#define VW2 1 // Vector width of matrix A loads
#endif
// 3: For the fast rotated version
#ifndef WGS3
#define WGS3 64 // The local work-group size
#endif
#ifndef WPT3
#define WPT3 1 // The amount of work-per-thread
#endif
#ifndef VW3
#define VW3 1 // Vector width of matrix A loads
#endif
// =================================================================================================
// Data-widths for the 'fast' kernel
#if VW2 == 1
typedef real realVF;
#elif VW2 == 2
typedef real2 realVF;
#elif VW2 == 4
typedef real4 realVF;
#elif VW2 == 8
typedef real8 realVF;
#elif VW2 == 16
typedef real16 realVF;
#endif
// Data-widths for the 'fast' kernel with rotated matrix
#if VW3 == 1
typedef real realVFR;
#elif VW3 == 2
typedef real2 realVFR;
#elif VW3 == 4
typedef real4 realVFR;
#elif VW3 == 8
typedef real8 realVFR;
#elif VW3 == 16
typedef real16 realVFR;
#endif
// =================================================================================================
// Loads a vector input value (1/2)
// Loads one vector-width element of matrix A at coordinates (x, y), where
// 'a_ld' is the leading dimension expressed in realVF units.
inline realVF LoadMatrixAVF(const __global realVF* restrict agm, const int x, const int y,
                            const int a_ld) {
  const int index = a_ld*y + x;
  return agm[index];
}
// Loads a vector input value (2/2): as before, but different data-type
// Loads one vector-width element of matrix A at coordinates (x, y): same as
// LoadMatrixAVF, but for the rotated-kernel data type realVFR.
inline realVFR LoadMatrixAVFR(const __global realVFR* restrict agm, const int x, const int y,
                              const int a_ld) {
  const int index = a_ld*y + x;
  return agm[index];
}
// =================================================================================================
// Faster version of the kernel, assuming that:
// --> 'm' and 'n' are multiples of WGS2
// --> 'a_offset' is 0
// --> 'a_ld' is a multiple of VW2
// --> 'a_rotated' is 0
// --> 'do_conjugate' is 0
// Fast GEMV kernel for the non-rotated case: y = alpha * A * x + beta * y.
// Loads A through the vectorized realVF type (VW2 elements at a time) and stages
// x in local memory one work-group-sized tile at a time. The 'parameter', 'kl'
// and 'ku' arguments are not referenced in this kernel body -- presumably kept
// so all three Xgemv kernel variants share one host-side argument list; confirm.
__attribute__((reqd_work_group_size(WGS2, 1, 1)))
__kernel void XgemvFast(const int m, const int n, const real alpha, const real beta,
const int a_rotated,
const __global realVF* restrict agm, const int a_offset, const int a_ld,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc,
const int do_conjugate, const int parameter,
const int kl, const int ku) {
// Local memory for the vector X
__local real xlm[WGS2];
// Initializes the accumulation register
real acc[WPT2];
#pragma unroll
for (int w=0; w<WPT2; ++w) {
SetToZero(acc[w]);
}
// Loops over work-group sized portions of the work
for (int kwg=0; kwg<n; kwg+=WGS2) {
// Loads the vector X into local memory
const int lid = get_local_id(0);
xlm[lid] = xgm[(kwg + lid)*x_inc + x_offset];
// Synchronizes all threads in a workgroup
barrier(CLK_LOCAL_MEM_FENCE);
// The multiply-add function (not rotated)
// NOTE(review): the loop variable 'kl' shadows the kernel argument 'kl'
#pragma unroll
for (int kl=0; kl<WGS2; ++kl) {
const int k = kwg + kl;
#pragma unroll
for (int w=0; w<WPT2/VW2; ++w) {
const int gid = (WPT2/VW2)*get_global_id(0) + w;
realVF avec = LoadMatrixAVF(agm, gid, k, a_ld/VW2);
// Unrolled multiply-add over the VW2 components of the loaded vector
#if VW2 == 1
MultiplyAdd(acc[VW2*w+0], xlm[kl], avec);
#elif VW2 == 2
MultiplyAdd(acc[VW2*w+0], xlm[kl], avec.x);
MultiplyAdd(acc[VW2*w+1], xlm[kl], avec.y);
#elif VW2 == 4
MultiplyAdd(acc[VW2*w+0], xlm[kl], avec.x);
MultiplyAdd(acc[VW2*w+1], xlm[kl], avec.y);
MultiplyAdd(acc[VW2*w+2], xlm[kl], avec.z);
MultiplyAdd(acc[VW2*w+3], xlm[kl], avec.w);
#elif VW2 == 8
MultiplyAdd(acc[VW2*w+0], xlm[kl], avec.s0);
MultiplyAdd(acc[VW2*w+1], xlm[kl], avec.s1);
MultiplyAdd(acc[VW2*w+2], xlm[kl], avec.s2);
MultiplyAdd(acc[VW2*w+3], xlm[kl], avec.s3);
MultiplyAdd(acc[VW2*w+4], xlm[kl], avec.s4);
MultiplyAdd(acc[VW2*w+5], xlm[kl], avec.s5);
MultiplyAdd(acc[VW2*w+6], xlm[kl], avec.s6);
MultiplyAdd(acc[VW2*w+7], xlm[kl], avec.s7);
#elif VW2 == 16
MultiplyAdd(acc[VW2*w+0], xlm[kl], avec.s0);
MultiplyAdd(acc[VW2*w+1], xlm[kl], avec.s1);
MultiplyAdd(acc[VW2*w+2], xlm[kl], avec.s2);
MultiplyAdd(acc[VW2*w+3], xlm[kl], avec.s3);
MultiplyAdd(acc[VW2*w+4], xlm[kl], avec.s4);
MultiplyAdd(acc[VW2*w+5], xlm[kl], avec.s5);
MultiplyAdd(acc[VW2*w+6], xlm[kl], avec.s6);
MultiplyAdd(acc[VW2*w+7], xlm[kl], avec.s7);
MultiplyAdd(acc[VW2*w+8], xlm[kl], avec.s8);
MultiplyAdd(acc[VW2*w+9], xlm[kl], avec.s9);
MultiplyAdd(acc[VW2*w+10], xlm[kl], avec.sA);
MultiplyAdd(acc[VW2*w+11], xlm[kl], avec.sB);
MultiplyAdd(acc[VW2*w+12], xlm[kl], avec.sC);
MultiplyAdd(acc[VW2*w+13], xlm[kl], avec.sD);
MultiplyAdd(acc[VW2*w+14], xlm[kl], avec.sE);
MultiplyAdd(acc[VW2*w+15], xlm[kl], avec.sF);
#endif
}
}
// Synchronizes all threads in a workgroup
barrier(CLK_LOCAL_MEM_FENCE);
}
// Stores the final result: y = alpha * acc + beta * y
#pragma unroll
for (int w=0; w<WPT2; ++w) {
const int gid = WPT2*get_global_id(0) + w;
real yval = ygm[gid*y_inc + y_offset];
AXPBY(ygm[gid*y_inc + y_offset], alpha, acc[w], beta, yval);
}
}
// =================================================================================================
// Faster version of the kernel, assuming that:
// --> 'm' and 'n' are multiples of WGS3
// --> 'a_offset' is 0
// --> 'a_ld' is a multiple of VW3
// --> 'a_rotated' is 1
// --> 'do_conjugate' is 0
// Fast GEMV kernel for the rotated (transposed-storage) case: loads A through the
// vectorized realVFR type so that VW3 consecutive elements along the reduction
// dimension come from one load. Stages x in local memory one tile at a time.
// The 'parameter', 'kl' and 'ku' arguments are not referenced in this kernel
// body -- presumably kept for a uniform Xgemv kernel signature; confirm.
__attribute__((reqd_work_group_size(WGS3, 1, 1)))
__kernel void XgemvFastRot(const int m, const int n, const real alpha, const real beta,
const int a_rotated,
const __global realVFR* restrict agm, const int a_offset, const int a_ld,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc,
const int do_conjugate, const int parameter,
const int kl, const int ku) {
// Local memory for the vector X
__local real xlm[WGS3];
// Initializes the accumulation register
real acc[WPT3];
#pragma unroll
for (int w=0; w<WPT3; ++w) {
SetToZero(acc[w]);
}
// Loops over work-group sized portions of the work
for (int kwg=0; kwg<n; kwg+=WGS3) {
// Loads the vector X into local memory
const int lid = get_local_id(0);
xlm[lid] = xgm[(kwg + lid)*x_inc + x_offset];
// Synchronizes all threads in a workgroup
barrier(CLK_LOCAL_MEM_FENCE);
// The multiply-add function (rotated)
// NOTE(review): the loop variable 'kl' shadows the kernel argument 'kl'
#pragma unroll
for (int kl=0; kl<WGS3/VW3; ++kl) {
const int k = (kwg/VW3) + kl;
#pragma unroll
for (int w=0; w<WPT3; ++w) {
const int gid = WPT3*get_global_id(0) + w;
realVFR avec = LoadMatrixAVFR(agm, k, gid, a_ld/VW3);
// Unrolled multiply-add over the VW3 components of the loaded vector
#if VW3 == 1
MultiplyAdd(acc[w], xlm[VW3*kl+0], avec);
#elif VW3 == 2
MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.x);
MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.y);
#elif VW3 == 4
MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.x);
MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.y);
MultiplyAdd(acc[w], xlm[VW3*kl+2], avec.z);
MultiplyAdd(acc[w], xlm[VW3*kl+3], avec.w);
#elif VW3 == 8
MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.s0);
MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.s1);
MultiplyAdd(acc[w], xlm[VW3*kl+2], avec.s2);
MultiplyAdd(acc[w], xlm[VW3*kl+3], avec.s3);
MultiplyAdd(acc[w], xlm[VW3*kl+4], avec.s4);
MultiplyAdd(acc[w], xlm[VW3*kl+5], avec.s5);
MultiplyAdd(acc[w], xlm[VW3*kl+6], avec.s6);
MultiplyAdd(acc[w], xlm[VW3*kl+7], avec.s7);
#elif VW3 == 16
MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.s0);
MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.s1);
MultiplyAdd(acc[w], xlm[VW3*kl+2], avec.s2);
MultiplyAdd(acc[w], xlm[VW3*kl+3], avec.s3);
MultiplyAdd(acc[w], xlm[VW3*kl+4], avec.s4);
MultiplyAdd(acc[w], xlm[VW3*kl+5], avec.s5);
MultiplyAdd(acc[w], xlm[VW3*kl+6], avec.s6);
MultiplyAdd(acc[w], xlm[VW3*kl+7], avec.s7);
MultiplyAdd(acc[w], xlm[VW3*kl+8], avec.s8);
MultiplyAdd(acc[w], xlm[VW3*kl+9], avec.s9);
MultiplyAdd(acc[w], xlm[VW3*kl+10], avec.sA);
MultiplyAdd(acc[w], xlm[VW3*kl+11], avec.sB);
MultiplyAdd(acc[w], xlm[VW3*kl+12], avec.sC);
MultiplyAdd(acc[w], xlm[VW3*kl+13], avec.sD);
MultiplyAdd(acc[w], xlm[VW3*kl+14], avec.sE);
MultiplyAdd(acc[w], xlm[VW3*kl+15], avec.sF);
#endif
}
}
// Synchronizes all threads in a workgroup
barrier(CLK_LOCAL_MEM_FENCE);
}
// Stores the final result: y = alpha * acc + beta * y
#pragma unroll
for (int w=0; w<WPT3; ++w) {
const int gid = WPT3*get_global_id(0) + w;
real yval = ygm[gid*y_inc + y_offset];
AXPBY(ygm[gid*y_inc + y_offset], alpha, acc[w], beta, yval);
}
}
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View file

@ -0,0 +1,106 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains the Xger kernels for rank-1 matrix update.
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// =================================================================================================
// Regular version of the rank-1 matrix update kernel (GER, GERU, GERC)
// Rank-1 matrix update kernel (GER/GERU/GERC): A += alpha * x * y^T (or y^H).
// Each thread updates a WPT x WPT grid of elements. The row-major and col-major
// branches differ only in which global dimension indexes x versus y; in both,
// LoadVector is called with do_conjugate=true for the y-vector, which only
// takes effect when compiled for a conjugating routine (see LoadVector).
__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
__kernel void Xger(const int max1, const int max2, const real alpha,
const __global real* restrict xgm, const int x_offset, const int x_inc,
const __global real* ygm, const int y_offset, const int y_inc,
__global real* restrict agm, const int a_offset, const int a_ld,
const int is_rowmajor) {
// Register storage for X and Y
real xvalues[WPT];
real yvalues[WPT];
// Row-major version: x runs along dimension 1, y along dimension 0
if (is_rowmajor) {
// Loads the X-vector
#pragma unroll
for (int w=0; w<WPT; ++w) {
const int id2 = w*get_global_size(1) + get_global_id(1);
xvalues[w] = LoadVector(id2, max2, xgm, x_offset, x_inc, false);
}
// Loads the Y-vector (conjugated for the conjugating routines)
#pragma unroll
for (int w=0; w<WPT; ++w) {
const int id1 = w*get_global_size(0) + get_global_id(0);
yvalues[w] = LoadVector(id1, max1, ygm, y_offset, y_inc, true);
}
// Loops over the work per thread twice
#pragma unroll
for (int w1=0; w1<WPT; ++w1) {
#pragma unroll
for (int w2=0; w2<WPT; ++w2) {
// Global thread IDs
const int id1 = w1*get_global_size(0) + get_global_id(0);
const int id2 = w2*get_global_size(1) + get_global_id(1);
// Loads A, performs the operation, and stores the result into A
MatrixUpdate(id1, id2, max1, max2, agm, a_offset, a_ld,
alpha, xvalues[w2], yvalues[w1], false);
}
}
}
// Col-major version: x runs along dimension 0, y along dimension 1
else {
// Loads the X-vector
#pragma unroll
for (int w=0; w<WPT; ++w) {
const int id1 = w*get_global_size(0) + get_global_id(0);
xvalues[w] = LoadVector(id1, max1, xgm, x_offset, x_inc, false);
}
// Loads the Y-vector (conjugated for the conjugating routines)
#pragma unroll
for (int w=0; w<WPT; ++w) {
const int id2 = w*get_global_size(1) + get_global_id(1);
yvalues[w] = LoadVector(id2, max2, ygm, y_offset, y_inc, true);
}
// Loops over the work per thread twice
#pragma unroll
for (int w1=0; w1<WPT; ++w1) {
#pragma unroll
for (int w2=0; w2<WPT; ++w2) {
// Global thread IDs
const int id1 = w1*get_global_size(0) + get_global_id(0);
const int id2 = w2*get_global_size(1) + get_global_id(1);
// Loads A, performs the operation, and stores the result into A
MatrixUpdate(id1, id2, max1, max2, agm, a_offset, a_ld,
alpha, xvalues[w1], yvalues[w2], false);
}
}
}
}
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View file

@ -0,0 +1,73 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains the Xher kernels for rank-1 matrix update.
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// =================================================================================================
// Symmetric/Hermitian version of the rank-1 matrix update kernel
// (HER, HPR, SYR, SPR): only the triangle selected by 'is_upper' is updated.
__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
__kernel void Xher(const int n, const real alpha,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* restrict agm, const int a_offset, const int a_ld,
const int is_upper, const int is_rowmajor) {
// Register storage for X and XT (the same vector loaded twice, with opposite
// conjugation flags, for the x * x^T / x * x^H product)
real xvalues[WPT];
real xtvalues[WPT];
// Loads the X-vector (the last LoadVector argument presumably toggles complex
// conjugation for the Hermitian variants -- confirm against level2.opencl)
#pragma unroll
for (int w=0; w<WPT; ++w) {
const int id2 = w*get_global_size(1) + get_global_id(1);
xvalues[w] = LoadVector(id2, n, xgm, x_offset, x_inc, !is_rowmajor);
}
// Loads the X-transposed-vector (opposite conjugation flag of the above)
#pragma unroll
for (int w=0; w<WPT; ++w) {
const int id1 = w*get_global_size(0) + get_global_id(0);
xtvalues[w] = LoadVector(id1, n, xgm, x_offset, x_inc, is_rowmajor);
}
// Loops over the work per thread twice: each work-item handles a WPT x WPT
// grid of matrix elements
#pragma unroll
for (int w1=0; w1<WPT; ++w1) {
#pragma unroll
for (int w2=0; w2<WPT; ++w2) {
// Global thread IDs
const int id1 = w1*get_global_size(0) + get_global_id(0);
const int id2 = w2*get_global_size(1) + get_global_id(1);
// Skip these threads if they do not contain threads contributing to the matrix-triangle
if ((is_upper && (id1 > id2)) || (!is_upper && (id2 > id1))) {
// Do nothing: this element lies in the untouched triangle of A
}
// Loads A, performs the operation, and stores the result into A
else {
MatrixUpdate(id1, id2, n, n, agm, a_offset, a_ld, alpha, xvalues[w2], xtvalues[w1], is_upper);
}
}
}
}
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View file

@ -0,0 +1,104 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains the Xher2 kernels for rank-2 matrix update.
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// =================================================================================================
// Symmetric/Hermitian version of the rank-2 matrix update kernel
// (HER2, HPR2, SYR2, SPR2): updates one triangle of A with the two rank-1
// contributions alpha * x * y^T and (conjugated) alpha * y * x^T.
__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
__kernel void Xher2(const int n, const real alpha,
const __global real* restrict xgm, const int x_offset, const int x_inc,
const __global real* restrict ygm, const int y_offset, const int y_inc,
__global real* restrict agm, const int a_offset, const int a_ld,
const int is_upper, const int is_rowmajor) {
// Register storage for X and Y plus their 'transposed' counterparts (same
// vectors loaded with opposite conjugation flags)
real xvalues[WPT];
real yvalues[WPT];
real xtvalues[WPT];
real ytvalues[WPT];
// Loads the X-vector (the last LoadVector argument presumably toggles complex
// conjugation -- confirm against the LoadVector helper in level2.opencl)
#pragma unroll
for (int w=0; w<WPT; ++w) {
const int id2 = w*get_global_size(1) + get_global_id(1);
xvalues[w] = LoadVector(id2, n, xgm, x_offset, x_inc, !is_rowmajor);
}
// Loads the X-transposed-vector (opposite conjugation flag)
#pragma unroll
for (int w=0; w<WPT; ++w) {
const int id1 = w*get_global_size(0) + get_global_id(0);
xtvalues[w] = LoadVector(id1, n, xgm, x_offset, x_inc, is_rowmajor);
}
// Loads the Y-vector
#pragma unroll
for (int w=0; w<WPT; ++w) {
const int id1 = w*get_global_size(0) + get_global_id(0);
yvalues[w] = LoadVector(id1, n, ygm, y_offset, y_inc, is_rowmajor);
}
// Loads the Y-transposed-vector
#pragma unroll
for (int w=0; w<WPT; ++w) {
const int id2 = w*get_global_size(1) + get_global_id(1);
ytvalues[w] = LoadVector(id2, n, ygm, y_offset, y_inc, !is_rowmajor);
}
// Sets the proper value of alpha in case conjugation is needed: for the
// complex Hermitian routines (HER2/HPR2) one of the two rank-1 contributions
// uses the conjugate of alpha, chosen based on the storage layout
real alpha1 = alpha;
real alpha2 = alpha;
#if defined(ROUTINE_HER2) || defined(ROUTINE_HPR2)
if (is_rowmajor) {
COMPLEX_CONJUGATE(alpha1);
}
else {
COMPLEX_CONJUGATE(alpha2);
}
#endif
// Loops over the work per thread twice: each work-item handles a WPT x WPT
// grid of matrix elements
#pragma unroll
for (int w1=0; w1<WPT; ++w1) {
#pragma unroll
for (int w2=0; w2<WPT; ++w2) {
// Global thread IDs
const int id1 = w1*get_global_size(0) + get_global_id(0);
const int id2 = w2*get_global_size(1) + get_global_id(1);
// Skip these threads if they do not contain threads contributing to the matrix-triangle
if ((is_upper && (id1 > id2)) || (!is_upper && (id2 > id1))) {
// Do nothing: this element lies in the untouched triangle of A
}
// Loads A, performs the operation, and stores the result into A
else {
MatrixUpdate2(id1, id2, n, n, agm, a_offset, a_ld,
alpha1, xvalues[w2], yvalues[w1],
alpha2, xtvalues[w1], ytvalues[w2], is_upper);
}
}
}
}
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View file

@ -0,0 +1,329 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains an optimized matrix-multiplication kernel according to the paper by Matsumoto
// et al. and the tutorial on http://www.cedricnugteren.nl/tutorial.php. It is fully configurable
// (and tunable!) using more or less the same parameters/naming conventions as in the paper. It
// supports single and double precision (SGEMM/DGEMM) through a pre-processor define.
//
// Matrices are accessed as follows:
// A: [k*M + m], with 'k' ranging from 0:K and 'm' from 0:M (m,k,m)
// B: [k*N + n], with 'k' ranging from 0:K and 'n' from 0:N (n,k,n)
// C: [n*M + m], with 'n' ranging from 0:N and 'm' from 0:M (m,n,m)
//
// Or as an image (assuming column-major)
// K
// o-------o
// | |
// N | [B^T] |
// | |
// o-------o
// K N
// o-------o o-----o
// M | [A] | M | [C] |
// | | | |
// o-------o o-----o
//
//
// This kernel is separated into two files. This is part 1 out of 2.
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// =================================================================================================
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
// this kernel file is used outside of the CLBlast library.
#ifndef MWG
#define MWG 8 // Tile-size in dimension M (e.g. 64, 128)
#endif
#ifndef NWG
#define NWG 8 // Tile-size in dimension N (e.g. 64, 128)
#endif
#ifndef KWG
#define KWG 8 // Tile-size in dimension K (e.g. 8, 16)
#endif
#ifndef MDIMC
#define MDIMC 8 // Threads per workgroup in M-dimension (e.g. 8, 16, 32)
#endif
#ifndef NDIMC
#define NDIMC 8 // Threads per workgroup in N-dimension (e.g. 8, 16, 32)
#endif
#ifndef MDIMA
#define MDIMA 8 // Re-shaped tile dimension of matrix A: KDIMA * MDIMA
#endif
#ifndef NDIMB
#define NDIMB 8 // Re-shaped tile dimension of matrix B: KDIMB * NDIMB
#endif
#ifndef KWI
#define KWI 1 // Unroll factor of the KWG loop (smaller or equal than KWG)
#endif
#ifndef VWM
#define VWM 1 // Vector width of matrices A and C
#endif
#ifndef VWN
#define VWN 1 // Vector width of matrix B
#endif
#ifndef STRM
#define STRM 0 // Use strided access within a thread in the M-dimension (1) or not (0)
#endif
#ifndef STRN
#define STRN 0 // Use strided access within a thread in the N-dimension (1) or not (0)
#endif
#ifndef SA
#define SA 0 // Use local/shared memory to cache matrix A (1) or not (0)
#endif
#ifndef SB
#define SB 0 // Use local/shared memory to cache matrix B (1) or not (0)
#endif
// Helper parameters based on the above tuning parameters
#define MWI (MWG/MDIMC) // Work per work-item (M-dimension)
#define NWI (NWG/NDIMC) // Work per work-item (N-dimension)
#define KDIMA ((MDIMC*NDIMC)/(MDIMA)) // Re-shaped tile dimension of matrix A: KDIMA * MDIMA
#define KDIMB ((MDIMC*NDIMC)/(NDIMB)) // Re-shaped tile dimension of matrix B: KDIMB * NDIMB
#define MWA (MWG/MDIMA) // Amount of loads-per-thread for matrix A (M-dimension)
#define KWA (KWG/KDIMA) // Amount of loads-per-thread for matrix A (K-dimension)
#define KWB (KWG/KDIMB) // Amount of loads-per-thread for matrix B (K-dimension)
#define NWB (NWG/NDIMB) // Amount of loads-per-thread for matrix B (N-dimension)
// Settings
#define USE_VECTOR_MAD 0      // Use the vector multiply-add directly (1) or unroll it into per-component MADs (0)
// =================================================================================================
// Data-widths in dimension M
#if VWM == 1
typedef real realM;
#elif VWM == 2
typedef real2 realM;
#elif VWM == 4
typedef real4 realM;
#elif VWM == 8
typedef real8 realM;
#elif VWM == 16
typedef real16 realM;
#endif
// Data-widths in dimension N
#if VWN == 1
typedef real realN;
#elif VWN == 2
typedef real2 realN;
#elif VWN == 4
typedef real4 realN;
#elif VWN == 8
typedef real8 realN;
#elif VWN == 16
typedef real16 realN;
#endif
// =================================================================================================
// Clears the accumulation registers 'cpm'. The OpenCL vector data-types have
// no indexable loop over their components, so each component is zeroed
// explicitly for the configured vector width VWM.
inline void InitAccRegisters(realM cpm[NWI][MWI/VWM]) {
  #pragma unroll
  for (int nidx=0; nidx<NWI; ++nidx) {
    #pragma unroll
    for (int mvec=0; mvec<MWI/VWM; ++mvec) {
      #if VWM == 1
        SetToZero(cpm[nidx][mvec]);
      #elif VWM == 2
        SetToZero(cpm[nidx][mvec].x);
        SetToZero(cpm[nidx][mvec].y);
      #elif VWM == 4
        SetToZero(cpm[nidx][mvec].x);
        SetToZero(cpm[nidx][mvec].y);
        SetToZero(cpm[nidx][mvec].z);
        SetToZero(cpm[nidx][mvec].w);
      #elif VWM == 8
        SetToZero(cpm[nidx][mvec].s0);
        SetToZero(cpm[nidx][mvec].s1);
        SetToZero(cpm[nidx][mvec].s2);
        SetToZero(cpm[nidx][mvec].s3);
        SetToZero(cpm[nidx][mvec].s4);
        SetToZero(cpm[nidx][mvec].s5);
        SetToZero(cpm[nidx][mvec].s6);
        SetToZero(cpm[nidx][mvec].s7);
      #elif VWM == 16
        SetToZero(cpm[nidx][mvec].s0);
        SetToZero(cpm[nidx][mvec].s1);
        SetToZero(cpm[nidx][mvec].s2);
        SetToZero(cpm[nidx][mvec].s3);
        SetToZero(cpm[nidx][mvec].s4);
        SetToZero(cpm[nidx][mvec].s5);
        SetToZero(cpm[nidx][mvec].s6);
        SetToZero(cpm[nidx][mvec].s7);
        SetToZero(cpm[nidx][mvec].s8);
        SetToZero(cpm[nidx][mvec].s9);
        SetToZero(cpm[nidx][mvec].sA);
        SetToZero(cpm[nidx][mvec].sB);
        SetToZero(cpm[nidx][mvec].sC);
        SetToZero(cpm[nidx][mvec].sD);
        SetToZero(cpm[nidx][mvec].sE);
        SetToZero(cpm[nidx][mvec].sF);
      #endif
    }
  }
}
// =================================================================================================
// Caches a tile of the off-chip (global) A matrix into on-chip __local memory.
// Each work-item copies an (MWA/VWM) x KWA patch of vectorised elements; only
// compiled when local-memory caching of A is enabled (SA == 1).
#if SA == 1
inline void GlobalToLocalA(const __global realM* restrict agm, __local realM* alm,
                           const int kSizeM, const int tid, const int kwg) {
  // Decomposes the flat thread id into the re-shaped MDIMA x KDIMA layout
  const int thread_m = tid % MDIMA;
  const int thread_k = tid / MDIMA;
  #pragma unroll
  for (int am=0; am<MWA/VWM; ++am) {
    #pragma unroll
    for (int ak=0; ak<KWA; ++ak) {
      // Strided or non-strided thread-to-data mapping in the M-dimension
      #if STRM == 0
        int m_local = am + thread_m*(MWA/VWM);
      #elif STRM == 1
        int m_local = thread_m + am*MDIMA;
      #endif
      // Indices into the global A matrix (kwg is the current K-tile offset)
      int k_local = ak + thread_k*KWA;
      int m_global = m_local + get_group_id(0)*(MWG/VWM);
      int k_global = k_local + kwg;
      // Copies one vector from global memory (not transposed) into local memory
      alm[k_local*(MWG/VWM) + m_local] = agm[k_global*(kSizeM/VWM) + m_global];
    }
  }
}
#endif
// Same as GlobalToLocalA, but for the B input matrix (transposed access),
// compiled only when local-memory caching of B is enabled (SB == 1).
#if SB == 1
inline void GlobalToLocalB(const __global realN* restrict bgm, __local realN* blm,
                           const int kSizeN, const int tid, const int kwg) {
  // Decomposes the flat thread id into the re-shaped NDIMB x KDIMB layout
  const int thread_n = tid % NDIMB;
  const int thread_k = tid / NDIMB;
  #pragma unroll
  for (int bk=0; bk<KWB; ++bk) {
    #pragma unroll
    for (int bn=0; bn<NWB/VWN; ++bn) {
      // Strided or non-strided thread-to-data mapping in the N-dimension
      #if STRN == 0
        int n_local = bn + thread_n*(NWB/VWN);
      #elif STRN == 1
        int n_local = thread_n + bn*NDIMB;
      #endif
      // Indices into the global B matrix (kwg is the current K-tile offset)
      int k_local = bk + thread_k*KWB;
      int n_global = n_local + get_group_id(1)*(NWG/VWN);
      int k_global = k_local + kwg;
      // Copies one vector from global memory (transposed) into local memory
      blm[k_local*(NWG/VWN) + n_local] = bgm[k_global*(kSizeN/VWN) + n_global];
    }
  }
}
#endif
// =================================================================================================
// Loads one K-slice of the A matrix straight from global memory into
// per-thread registers; used when A is not cached in local memory (SA == 0).
// Note: the 'kwg' argument is not referenced in this body; it is kept so the
// signature mirrors the SA == 1 code path.
#if SA == 0
inline void GlobalToPrivateA(const __global realM* restrict agm, realM apm[MWI/VWM],
                             const int kSizeM, const int idk, const int kwg) {
  #pragma unroll
  for (int mvec=0; mvec<MWI/VWM; ++mvec) {
    // Strided or non-strided thread-to-data mapping in the M-dimension
    #if STRM == 0
      int m_local = mvec + get_local_id(0)*(MWI/VWM);
    #elif STRM == 1
      int m_local = get_local_id(0) + mvec*MDIMC;
    #endif
    // Index into the global A matrix (not transposed)
    int m_global = m_local + get_group_id(0)*(MWG/VWM);
    apm[mvec] = agm[idk*(kSizeM/VWM) + m_global];
  }
}
#endif
// Same as GlobalToPrivateA, but for the B input matrix (transposed access);
// used when B is not cached in local memory (SB == 0).
#if SB == 0
inline void GlobalToPrivateB(const __global realN* restrict bgm, realN bpm[NWI/VWN],
                             const int kSizeN, const int idk) {
  #pragma unroll
  for (int nvec=0; nvec<NWI/VWN; ++nvec) {
    // Strided or non-strided thread-to-data mapping in the N-dimension
    #if STRN == 0
      int n_local = nvec + get_local_id(1)*(NWI/VWN);
    #elif STRN == 1
      int n_local = get_local_id(1) + nvec*NDIMC;
    #endif
    // Index into the global B matrix (transposed)
    int n_global = n_local + get_group_id(1)*(NWG/VWN);
    bpm[nvec] = bgm[idk*(kSizeN/VWN) + n_global];
  }
}
#endif
// =================================================================================================
// Copies one K-row of the locally-cached A tile into per-thread registers;
// only compiled when local-memory caching of A is enabled (SA == 1).
#if SA == 1
inline void LocalToPrivateA(__local realM* alm, realM apm[MWI/VWM], const int kg) {
  #pragma unroll
  for (int mvec=0; mvec<MWI/VWM; ++mvec) {
    // Strided or non-strided thread-to-data mapping in the M-dimension
    #if STRM == 0
      int m_local = mvec + get_local_id(0)*(MWI/VWM);
    #elif STRM == 1
      int m_local = get_local_id(0) + mvec*MDIMC;
    #endif
    apm[mvec] = alm[kg*(MWG/VWM) + m_local];
  }
}
#endif
// Same as LocalToPrivateA, but for the locally-cached B tile (SB == 1).
#if SB == 1
inline void LocalToPrivateB(__local realN* blm, realN bpm[NWI/VWN], const int kg) {
  #pragma unroll
  for (int nvec=0; nvec<NWI/VWN; ++nvec) {
    // Strided or non-strided thread-to-data mapping in the N-dimension
    #if STRN == 0
      int n_local = nvec + get_local_id(1)*(NWI/VWN);
    #elif STRN == 1
      int n_local = get_local_id(1) + nvec*NDIMC;
    #endif
    bpm[nvec] = blm[kg*(NWG/VWN) + n_local];
  }
}
#endif
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View file

@ -7,29 +7,7 @@
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains an optimized matrix-multiplication kernel according to the paper by Matsumoto
// et al. and the tutorial on http://www.cedricnugteren.nl/tutorial.php. It is fully configurable
// (and tunable!) using more or less the same parameters/naming conventions as in the paper. It
// supports single and double precision (SGEMM/DGEMM) through a pre-processor define.
//
// Matrices are accessed as follows:
// A: [k*M + m], with 'k' ranging from 0:K and 'm' from 0:M (m,k,m)
// B: [k*N + n], with 'k' ranging from 0:K and 'n' from 0:N (n,k,n)
// C: [n*M + m], with 'n' ranging from 0:N and 'm' from 0:M (m,n,m)
//
// Or as an image (assuming column-major)
// K
// o-------o
// | |
// N | [B^T] |
// | |
// o-------o
// K N
// o-------o o-----o
// M | [A] | M | [C] |
// | | | |
// o-------o o-----o
//
// This is part 2 of 2 of the GEMM kernel. See part 1 for more information.
//
// =================================================================================================
@ -39,288 +17,6 @@ R"(
// =================================================================================================
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
// this kernel file is used outside of the CLBlast library.
#ifndef MWG
#define MWG 8 // Tile-size in dimension M (e.g. 64, 128)
#endif
#ifndef NWG
#define NWG 8 // Tile-size in dimension N (e.g. 64, 128)
#endif
#ifndef KWG
#define KWG 8 // Tile-size in dimension K (e.g. 8, 16)
#endif
#ifndef MDIMC
#define MDIMC 8 // Threads per workgroup in M-dimension (e.g. 8, 16, 32)
#endif
#ifndef NDIMC
#define NDIMC 8 // Threads per workgroup in N-dimension (e.g. 8, 16, 32)
#endif
#ifndef MDIMA
#define MDIMA 8 // Re-shaped tile dimension of matrix A: KDIMA * MDIMA
#endif
#ifndef NDIMB
#define NDIMB 8 // Re-shaped tile dimension of matrix B: KDIMB * NDIMB
#endif
#ifndef KWI
#define KWI 1 // Unroll factor of the KWG loop (smaller or equal than KWG)
#endif
#ifndef VWM
#define VWM 1 // Vector width of matrices A and C
#endif
#ifndef VWN
#define VWN 1 // Vector width of matrix B
#endif
#ifndef STRM
#define STRM 0 // Use strided access within a thread in the M-dimension (1) or not (0)
#endif
#ifndef STRN
#define STRN 0 // Use strided access within a thread in the N-dimension (1) or not (0)
#endif
#ifndef SA
#define SA 0 // Use local/shared memory to cache matrix A (1) or not (0)
#endif
#ifndef SB
#define SB 0 // Use local/shared memory to cache matrix B (1) or not (0)
#endif
// Helper parameters based on the above tuning parameters
#define MWI (MWG/MDIMC) // Work per work-item (M-dimension)
#define NWI (NWG/NDIMC) // Work per work-item (N-dimension)
#define KDIMA ((MDIMC*NDIMC)/(MDIMA)) // Re-shaped tile dimension of matrix A: KDIMA * MDIMA
#define KDIMB ((MDIMC*NDIMC)/(NDIMB)) // Re-shaped tile dimension of matrix B: KDIMB * NDIMB
#define MWA (MWG/MDIMA) // Amount of loads-per-thread for matrix A (M-dimension)
#define KWA (KWG/KDIMA) // Amount of loads-per-thread for matrix A (K-dimension)
#define KWB (KWG/KDIMB) // Amount of loads-per-thread for matrix B (K-dimension)
#define NWB (NWG/NDIMB) // Amount of loads-per-thread for matrix B (N-dimension)
// Settings
#define USE_VECTOR_MAD 0 // Unroll (0) or don't (1) unroll the vector MAD manually
// =================================================================================================
// Data-widths in dimension M
#if VWM == 1
typedef real realM;
#elif VWM == 2
typedef real2 realM;
#elif VWM == 4
typedef real4 realM;
#elif VWM == 8
typedef real8 realM;
#elif VWM == 16
typedef real16 realM;
#endif
// Data-widths in dimension N
#if VWN == 1
typedef real realN;
#elif VWN == 2
typedef real2 realN;
#elif VWN == 4
typedef real4 realN;
#elif VWN == 8
typedef real8 realN;
#elif VWN == 16
typedef real16 realN;
#endif
// =================================================================================================
// Initializes the accumulation registers to zero
inline void InitAccRegisters(realM cpm[NWI][MWI/VWM]) {
#pragma unroll
for (int mi=0; mi<MWI/VWM; ++mi) {
#pragma unroll
for (int ni=0; ni<NWI; ++ni) {
#if VWM == 1
SetToZero(cpm[ni][mi]);
#elif VWM == 2
SetToZero(cpm[ni][mi].x);
SetToZero(cpm[ni][mi].y);
#elif VWM == 4
SetToZero(cpm[ni][mi].x);
SetToZero(cpm[ni][mi].y);
SetToZero(cpm[ni][mi].z);
SetToZero(cpm[ni][mi].w);
#elif VWM == 8
SetToZero(cpm[ni][mi].s0);
SetToZero(cpm[ni][mi].s1);
SetToZero(cpm[ni][mi].s2);
SetToZero(cpm[ni][mi].s3);
SetToZero(cpm[ni][mi].s4);
SetToZero(cpm[ni][mi].s5);
SetToZero(cpm[ni][mi].s6);
SetToZero(cpm[ni][mi].s7);
#elif VWM == 16
SetToZero(cpm[ni][mi].s0);
SetToZero(cpm[ni][mi].s1);
SetToZero(cpm[ni][mi].s2);
SetToZero(cpm[ni][mi].s3);
SetToZero(cpm[ni][mi].s4);
SetToZero(cpm[ni][mi].s5);
SetToZero(cpm[ni][mi].s6);
SetToZero(cpm[ni][mi].s7);
SetToZero(cpm[ni][mi].s8);
SetToZero(cpm[ni][mi].s9);
SetToZero(cpm[ni][mi].sA);
SetToZero(cpm[ni][mi].sB);
SetToZero(cpm[ni][mi].sC);
SetToZero(cpm[ni][mi].sD);
SetToZero(cpm[ni][mi].sE);
SetToZero(cpm[ni][mi].sF);
#endif
}
}
}
// =================================================================================================
// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
// caching the A input matrix.
#if SA == 1
inline void GlobalToLocalA(const __global realM* restrict agm, __local realM* alm,
const int kSizeM, const int tid, const int kwg) {
const int la0 = tid % MDIMA;
const int la1 = tid / MDIMA;
#pragma unroll
for (int mia=0; mia<MWA/VWM; ++mia) {
#pragma unroll
for (int kia=0; kia<KWA; ++kia) {
// Computes the indices based on strided/non-strided access
#if STRM == 0
int mg = mia + la0*(MWA/VWM);
#elif STRM == 1
int mg = la0 + mia*MDIMA;
#endif
// Computes the indices for the global memory
int kg = kia + la1*KWA;
int idm = mg + get_group_id(0)*(MWG/VWM);
int idk = kg + kwg;
// Loads the data from global memory (not transposed) into the local memory
alm[kg*(MWG/VWM) + mg] = agm[idk*(kSizeM/VWM) + idm];
}
}
}
#endif
// Same as above, but now for the B input matrix
#if SB == 1
inline void GlobalToLocalB(const __global realN* restrict bgm, __local realN* blm,
const int kSizeN, const int tid, const int kwg) {
const int lb0 = tid % NDIMB;
const int lb1 = tid / NDIMB;
#pragma unroll
for (int kib=0; kib<KWB; ++kib) {
#pragma unroll
for (int nib=0; nib<NWB/VWN; ++nib) {
// Computes the indices based on strided/non-strided access
#if STRN == 0
int ng = nib + lb0*(NWB/VWN);
#elif STRN == 1
int ng = lb0 + nib*NDIMB;
#endif
// Computes the indices for the global memory
int kg = kib + lb1*KWB;
int idn = ng + get_group_id(1)*(NWG/VWN);
int idk = kg + kwg;
// Loads the data from global memory (transposed) into the local memory
blm[kg*(NWG/VWN) + ng] = bgm[idk*(kSizeN/VWN) + idn];
}
}
}
#endif
// =================================================================================================
// Caches global off-chip memory directly into per-thread private memory (registers). This function
// is specific for caching the A input matrix.
#if SA == 0
inline void GlobalToPrivateA(const __global realM* restrict agm, realM apm[MWI/VWM],
const int kSizeM, const int idk, const int kwg) {
#pragma unroll
for (int mi=0; mi<MWI/VWM; ++mi) {
// Computes the indices based on strided/non-strided access
#if STRM == 0
int mg = mi + get_local_id(0)*(MWI/VWM);
#elif STRM == 1
int mg = get_local_id(0) + mi*MDIMC;
#endif
// Computes the indices for the global memory
int idm = mg + get_group_id(0)*(MWG/VWM);
// Loads the data from global memory (not transposed) and stores into registers
apm[mi] = agm[idk*(kSizeM/VWM) + idm];
}
}
#endif
// Same as above, but now for the B input matrix
#if SB == 0
inline void GlobalToPrivateB(const __global realN* restrict bgm, realN bpm[NWI/VWN],
const int kSizeN, const int idk) {
#pragma unroll
for (int ni=0; ni<NWI/VWN; ++ni) {
// Computes the indices based on strided/non-strided access
#if STRN == 0
int ng = ni + get_local_id(1)*(NWI/VWN);
#elif STRN == 1
int ng = get_local_id(1) + ni*NDIMC;
#endif
// Computes the indices for the global memory
int idn = ng + get_group_id(1)*(NWG/VWN);
// Loads the data from global memory (transposed) and stores into registers
bpm[ni] = bgm[idk*(kSizeN/VWN) + idn];
}
}
#endif
// =================================================================================================
// Caches on-chip local memory into per-thread private memory (registers). This function is specific
// for caching the A input matrix.
#if SA == 1
inline void LocalToPrivateA(__local realM* alm, realM apm[MWI/VWM], const int kg) {
#pragma unroll
for (int mi=0; mi<MWI/VWM; ++mi) {
#if STRM == 0
int mg = mi + get_local_id(0)*(MWI/VWM);
#elif STRM == 1
int mg = get_local_id(0) + mi*MDIMC;
#endif
apm[mi] = alm[kg*(MWG/VWM) + mg];
}
}
#endif
// Same as above, but now for the B input matrix
#if SB == 1
inline void LocalToPrivateB(__local realN* blm, realN bpm[NWI/VWN], const int kg) {
#pragma unroll
for (int ni=0; ni<NWI/VWN; ++ni) {
#if STRN == 0
int ng = ni + get_local_id(1)*(NWI/VWN);
#elif STRN == 1
int ng = get_local_id(1) + ni*NDIMC;
#endif
bpm[ni] = blm[kg*(NWG/VWN) + ng];
}
}
#endif
// =================================================================================================
// The vectorised multiply-add function
inline realM MultiplyAddVector(realM cvec, const realM avec, const real bval) {
#if USE_VECTOR_MAD == 1

View file

@ -14,7 +14,6 @@
#include "internal/routines/level1/xdotu.h"
#include <string>
#include <vector>
namespace clblast {
// =================================================================================================

View file

@ -33,6 +33,7 @@ Xgemv<T>::Xgemv(Queue &queue, Event &event, const std::string &name):
Routine<T>(queue, event, name, {"Pad", "Xgemv"}, precision_) {
source_string_ =
#include "../../kernels/level2/xgemv.opencl"
#include "../../kernels/level2/xgemv_fast.opencl"
;
}

112
src/routines/level2/xger.cc Normal file
View file

@ -0,0 +1,112 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xger class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level2/xger.h"
#include <string>
#include <vector>
namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument:
// sets the static 'precision_' member for each of the four supported data-types
template <> const Precision Xger<float>::precision_ = Precision::kSingle;
template <> const Precision Xger<double>::precision_ = Precision::kDouble;
template <> const Precision Xger<float2>::precision_ = Precision::kComplexSingle;
template <> const Precision Xger<double2>::precision_ = Precision::kComplexDouble;
// =================================================================================================
// Constructor: forwards to base class constructor. The kernel source string is
// built by concatenating the included .opencl files: each of them expands to a
// C++11 raw string literal, so the two #includes below form one expression.
template <typename T>
Xger<T>::Xger(Queue &queue, Event &event, const std::string &name):
Routine<T>(queue, event, name, {"Xger"}, precision_) {
source_string_ =
#include "../../kernels/level2/level2.opencl"
#include "../../kernels/level2/xger.opencl"
;
}
// =================================================================================================
// The main routine
// The main routine: performs the rank-1 matrix update A := alpha*x*y^T + A
// (BLAS xGER) on the device. Returns a StatusCode; never throws to the caller.
template <typename T>
StatusCode Xger<T>::DoGer(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
// Makes sure all dimensions are larger than zero
if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; }
// Computes whether or not the matrix has an alternative layout (row or
// column-major): for row-major storage the two dimensions are swapped before
// being passed to the kernel
const auto a_is_rowmajor = (layout == Layout::kRowMajor);
const auto a_one = (a_is_rowmajor) ? n : m;
const auto a_two = (a_is_rowmajor) ? m : n;
// Tests the matrix and the vectors for validity
auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
if (ErrorIn(status)) { return status; }
status = TestVectorX(m, x_buffer, x_offset, x_inc, sizeof(T));
if (ErrorIn(status)) { return status; }
status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
if (ErrorIn(status)) { return status; }
// Retrieves the Xger kernel from the compiled binary
try {
auto& program = GetProgramFromCache();
auto kernel = Kernel(program, "Xger");
// Sets the kernel arguments; the order must match the Xger kernel signature
// in xger.opencl
kernel.SetArgument(0, static_cast<int>(a_one));
kernel.SetArgument(1, static_cast<int>(a_two));
kernel.SetArgument(2, alpha);
kernel.SetArgument(3, x_buffer());
kernel.SetArgument(4, static_cast<int>(x_offset));
kernel.SetArgument(5, static_cast<int>(x_inc));
kernel.SetArgument(6, y_buffer());
kernel.SetArgument(7, static_cast<int>(y_offset));
kernel.SetArgument(8, static_cast<int>(y_inc));
kernel.SetArgument(9, a_buffer());
kernel.SetArgument(10, static_cast<int>(a_offset));
kernel.SetArgument(11, static_cast<int>(a_ld));
kernel.SetArgument(12, static_cast<int>(a_is_rowmajor));
// Launches the kernel: each work-item handles WPT x WPT elements, so the
// global size is the matrix size divided by WPT, ceiled to the workgroup size
auto a_one_ceiled = Ceil(CeilDiv(a_one, db_["WPT"]), db_["WGS1"]);
auto a_two_ceiled = Ceil(CeilDiv(a_two, db_["WPT"]), db_["WGS2"]);
auto global = std::vector<size_t>{a_one_ceiled, a_two_ceiled};
auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
status = RunKernel(kernel, global, local);
if (ErrorIn(status)) { return status; }
// Waits for all kernels to finish
queue_.Finish();
// Successfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
}
// =================================================================================================
// Explicit template instantiations for all four supported precisions, so the definitions in this
// translation unit are emitted for each element type.
template class Xger<float>;
template class Xger<double>;
template class Xger<float2>;
template class Xger<double2>;
// =================================================================================================
} // namespace clblast

View file

@ -0,0 +1,53 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xgerc class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level2/xgerc.h"
#include <string>
namespace clblast {
// =================================================================================================
// Constructor: forwards directly to the Xger base class. Xgerc shares the Xger kernel; the extra
// conjugation is enabled at kernel-compile time (see DoGerc below).
template <typename T>
Xgerc<T>::Xgerc(Queue &queue, Event &event, const std::string &name):
Xger<T>(queue, event, name) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xgerc<T>::DoGerc(const Layout layout,
                            const size_t m, const size_t n,
                            const T alpha,
                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {

  // A conjugated rank-1 update is a plain Ger on complex data: the conjugation of the 'y' vector
  // happens inside the OpenCL kernel, guarded by the ROUTINE_GERC define.
  const auto ger_status = DoGer(layout, m, n, alpha,
                                x_buffer, x_offset, x_inc,
                                y_buffer, y_offset, y_inc,
                                a_buffer, a_offset, a_ld);
  return ger_status;
}

// =================================================================================================

// Explicit template instantiations for the supported complex precisions
template class Xgerc<float2>;
template class Xgerc<double2>;

// =================================================================================================
} // namespace clblast

View file

@ -0,0 +1,52 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xgeru class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level2/xgeru.h"
#include <string>
namespace clblast {
// =================================================================================================
// Constructor: forwards directly to the Xger base class. Xgeru shares the Xger kernel and state;
// no additional set-up is required.
template <typename T>
Xgeru<T>::Xgeru(Queue &queue, Event &event, const std::string &name):
Xger<T>(queue, event, name) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xgeru<T>::DoGeru(const Layout layout,
                            const size_t m, const size_t n,
                            const T alpha,
                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {

  // The unconjugated complex rank-1 update is identical to the regular Ger operation, so this
  // simply delegates to the base-class implementation.
  const auto ger_status = DoGer(layout, m, n, alpha,
                                x_buffer, x_offset, x_inc,
                                y_buffer, y_offset, y_inc,
                                a_buffer, a_offset, a_ld);
  return ger_status;
}

// =================================================================================================

// Explicit template instantiations for the supported complex precisions
template class Xgeru<float2>;
template class Xgeru<double2>;

// =================================================================================================
} // namespace clblast

122
src/routines/level2/xher.cc Normal file
View file

@ -0,0 +1,122 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xher class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level2/xher.h"
#include <string>
namespace clblast {
// =================================================================================================
// Maps each (T,U) template pair onto the precision enum used to select the device kernels
template <> const Precision Xher<float, float>::precision_ = Precision::kSingle;
template <> const Precision Xher<double, double>::precision_ = Precision::kDouble;
template <> const Precision Xher<float2, float>::precision_ = Precision::kComplexSingle;
template <> const Precision Xher<double2, double>::precision_ = Precision::kComplexDouble;
// =================================================================================================
// Constructor: forwards to the base Routine class, registering the 'Xger' tuning parameters.
// The kernel source is built from stringified OpenCL files concatenated as adjacent string
// literals; keep the include order as-is, since the files are concatenated in this order.
template <typename T, typename U>
Xher<T,U>::Xher(Queue &queue, Event &event, const std::string &name):
Routine<T>(queue, event, name, {"Xger"}, precision_) {
source_string_ =
#include "../../kernels/level2/level2.opencl"
#include "../../kernels/level2/xher.opencl"
;
}
// =================================================================================================
// Specializations to create an alpha of element-type 'T' from the real-valued input 'U': the
// complex variants embed alpha as the real part with a zero imaginary part.
template <> float2 Xher<float2,float>::GetAlpha(const float alpha) { return float2{alpha, 0.0f}; }
template <> double2 Xher<double2,double>::GetAlpha(const double alpha) { return double2{alpha, 0.0}; }
template <> float Xher<float,float>::GetAlpha(const float alpha) { return alpha; }
template <> double Xher<double,double>::GetAlpha(const double alpha) { return alpha; }
// =================================================================================================
// The main routine: launches the rank-1 update kernel on an n-by-n triangular matrix. Also serves
// the packed variants (see the 'packed' argument) and, through kernel-side defines, the symmetric
// Xsyr/Xspr routines.
template <typename T, typename U>
StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
const size_t n,
const U alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const bool packed) {
// Makes sure the dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
// Determines the triangle as seen by the kernel: a row-major layout flips upper/lower relative
// to column-major storage.
const auto is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
(triangle == Triangle::kLower && layout == Layout::kRowMajor));
const auto is_rowmajor = (layout == Layout::kRowMajor);
// Converts the real-valued alpha into the matrix' element type (see GetAlpha above)
const auto matching_alpha = GetAlpha(alpha);
// Tests the matrix and the vector for validity; packed matrices have no leading dimension
auto status = StatusCode::kSuccess;
if (packed) { status = TestMatrixAP(n, a_buffer, a_offset, sizeof(T)); }
else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld, sizeof(T)); }
if (ErrorIn(status)) { return status; }
status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
if (ErrorIn(status)) { return status; }
// Quick return: with a zero alpha the matrix stays unchanged, so no kernel launch is needed
if (alpha == U{0}) { return StatusCode::kSuccess; }
// Retrieves the Xher kernel from the compiled binary
try {
auto& program = GetProgramFromCache();
auto kernel = Kernel(program, "Xher");
// Sets the kernel arguments; the argument indices must match the kernel's signature order
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, matching_alpha);
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, static_cast<int>(x_offset));
kernel.SetArgument(4, static_cast<int>(x_inc));
kernel.SetArgument(5, a_buffer());
kernel.SetArgument(6, static_cast<int>(a_offset));
kernel.SetArgument(7, static_cast<int>(a_ld));
kernel.SetArgument(8, static_cast<int>(is_upper));
kernel.SetArgument(9, static_cast<int>(is_rowmajor));
// Launches the kernel: a 2D n-by-n thread space, divided by the work-per-thread factor (WPT)
// and rounded up to multiples of the tuned work-group sizes (WGS1, WGS2)
auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]);
auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
auto global = std::vector<size_t>{global_one, global_two};
auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
status = RunKernel(kernel, global, local);
if (ErrorIn(status)) { return status; }
// Waits for all kernels to finish
queue_.Finish();
// Successfully finished the computation
return StatusCode::kSuccess;
// NOTE(review): the catch-all maps every exception to kInvalidKernel, so unrelated failures are
// indistinguishable from kernel-retrieval errors -- consider more specific status codes.
} catch (...) { return StatusCode::kInvalidKernel; }
}
// =================================================================================================
// Explicit template instantiations for all supported precisions
template class Xher<float, float>;
template class Xher<double, double>;
template class Xher<float2, float>;
template class Xher<double2, double>;
// =================================================================================================
} // namespace clblast

View file

@ -0,0 +1,114 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xher2 class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level2/xher2.h"
#include <string>
namespace clblast {
// =================================================================================================
// Maps each template argument onto the precision enum used to select the device kernels
template <> const Precision Xher2<float>::precision_ = Precision::kSingle;
template <> const Precision Xher2<double>::precision_ = Precision::kDouble;
template <> const Precision Xher2<float2>::precision_ = Precision::kComplexSingle;
template <> const Precision Xher2<double2>::precision_ = Precision::kComplexDouble;
// =================================================================================================
// Constructor: forwards to the base Routine class, registering the 'Xger' tuning parameters.
// The kernel source is built from stringified OpenCL files concatenated as adjacent string
// literals; keep the include order as-is, since the files are concatenated in this order.
template <typename T>
Xher2<T>::Xher2(Queue &queue, Event &event, const std::string &name):
Routine<T>(queue, event, name, {"Xger"}, precision_) {
source_string_ =
#include "../../kernels/level2/level2.opencl"
#include "../../kernels/level2/xher2.opencl"
;
}
// =================================================================================================
// The main routine: launches the rank-2 update kernel using the 'x' and 'y' vectors on an n-by-n
// triangular matrix. Also serves the packed variants (see the 'packed' argument) and, through
// kernel-side defines, the symmetric Xsyr2/Xspr2 routines.
template <typename T>
StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const bool packed) {
// Makes sure the dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
// Determines the triangle as seen by the kernel: a row-major layout flips upper/lower relative
// to column-major storage.
const auto is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
(triangle == Triangle::kLower && layout == Layout::kRowMajor));
const auto is_rowmajor = (layout == Layout::kRowMajor);
// Tests the matrix and the vectors for validity; packed matrices have no leading dimension
auto status = StatusCode::kSuccess;
if (packed) { status = TestMatrixAP(n, a_buffer, a_offset, sizeof(T)); }
else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld, sizeof(T)); }
if (ErrorIn(status)) { return status; }
status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
if (ErrorIn(status)) { return status; }
status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
if (ErrorIn(status)) { return status; }
// NOTE(review): unlike DoHer there is no alpha == 0 quick return here -- consider adding one if
// the comparison is well-defined for all element types T.
// Retrieves the Xher2 kernel from the compiled binary
try {
auto& program = GetProgramFromCache();
auto kernel = Kernel(program, "Xher2");
// Sets the kernel arguments; the argument indices must match the kernel's signature order
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, alpha);
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, static_cast<int>(x_offset));
kernel.SetArgument(4, static_cast<int>(x_inc));
kernel.SetArgument(5, y_buffer());
kernel.SetArgument(6, static_cast<int>(y_offset));
kernel.SetArgument(7, static_cast<int>(y_inc));
kernel.SetArgument(8, a_buffer());
kernel.SetArgument(9, static_cast<int>(a_offset));
kernel.SetArgument(10, static_cast<int>(a_ld));
kernel.SetArgument(11, static_cast<int>(is_upper));
kernel.SetArgument(12, static_cast<int>(is_rowmajor));
// Launches the kernel: a 2D n-by-n thread space, divided by the work-per-thread factor (WPT)
// and rounded up to multiples of the tuned work-group sizes (WGS1, WGS2)
auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]);
auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
auto global = std::vector<size_t>{global_one, global_two};
auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
status = RunKernel(kernel, global, local);
if (ErrorIn(status)) { return status; }
// Waits for all kernels to finish
queue_.Finish();
// Successfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
}
// =================================================================================================
// Explicit template instantiations for all supported precisions
template class Xher2<float>;
template class Xher2<double>;
template class Xher2<float2>;
template class Xher2<double2>;
// =================================================================================================
} // namespace clblast

View file

@ -0,0 +1,51 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xhpr class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level2/xhpr.h"
#include <string>
namespace clblast {
// =================================================================================================
// Constructor: forwards directly to the Xher base class; Xhpr re-uses the Her kernel with its
// packed-specific behaviour selected in DoHpr below.
template <typename T, typename U>
Xhpr<T,U>::Xhpr(Queue &queue, Event &event, const std::string &name):
Xher<T,U>(queue, event, name) {
}
// =================================================================================================
// The main routine
template <typename T, typename U>
StatusCode Xhpr<T,U>::DoHpr(const Layout layout, const Triangle triangle,
                            const size_t n,
                            const U alpha,
                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                            const Buffer<T> &ap_buffer, const size_t ap_offset) {

  // Delegates to the Her routine with packed storage enabled. The value 'n' stands in for the
  // leading dimension, which packed storage does not use; Xhpr-specific behaviour is implemented
  // in the kernel through defines.
  const auto packed_matrix = true;
  return DoHer(layout, triangle, n, alpha,
               x_buffer, x_offset, x_inc,
               ap_buffer, ap_offset, n,
               packed_matrix);
}

// =================================================================================================

// Explicit template instantiations for the supported complex precisions
template class Xhpr<float2, float>;
template class Xhpr<double2, double>;

// =================================================================================================
} // namespace clblast

View file

@ -0,0 +1,53 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xhpr2 class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level2/xhpr2.h"
#include <string>
namespace clblast {
// =================================================================================================
// Constructor: forwards directly to the Xher2 base class; Xhpr2 re-uses the Her2 kernel with its
// packed-specific behaviour selected in DoHpr2 below.
template <typename T>
Xhpr2<T>::Xhpr2(Queue &queue, Event &event, const std::string &name):
Xher2<T>(queue, event, name) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xhpr2<T>::DoHpr2(const Layout layout, const Triangle triangle,
                            const size_t n,
                            const T alpha,
                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                            const Buffer<T> &ap_buffer, const size_t ap_offset) {

  // Runs the Her2 routine in packed mode: 'n' stands in for the leading dimension, which packed
  // storage does not use. Xhpr2-specific behaviour is implemented in the kernel through defines.
  const auto packed_matrix = true;
  return DoHer2(layout, triangle, n, alpha,
                x_buffer, x_offset, x_inc,
                y_buffer, y_offset, y_inc,
                ap_buffer, ap_offset, n,
                packed_matrix);
}

// =================================================================================================

// Explicit template instantiations for the supported complex precisions
template class Xhpr2<float2>;
template class Xhpr2<double2>;

// =================================================================================================
} // namespace clblast

View file

@ -0,0 +1,51 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xspr class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level2/xspr.h"
#include <string>
namespace clblast {
// =================================================================================================
// Constructor: forwards directly to the Xher<T,T> base class (real-valued alpha equals the
// element type); Xspr-specific behaviour is selected in DoSpr below.
template <typename T>
Xspr<T>::Xspr(Queue &queue, Event &event, const std::string &name):
Xher<T,T>(queue, event, name) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xspr<T>::DoSpr(const Layout layout, const Triangle triangle,
                          const size_t n,
                          const T alpha,
                          const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                          const Buffer<T> &ap_buffer, const size_t ap_offset) {

  // Packed symmetric rank-1 update: forwards to the Her implementation with packed storage
  // enabled ('n' is passed as a placeholder for the unused leading dimension). Xspr-specific
  // behaviour is implemented in the kernel through defines.
  const auto packed_matrix = true;
  return DoHer(layout, triangle, n, alpha,
               x_buffer, x_offset, x_inc,
               ap_buffer, ap_offset, n,
               packed_matrix);
}

// =================================================================================================

// Explicit template instantiations for the supported real precisions
template class Xspr<float>;
template class Xspr<double>;

// =================================================================================================
} // namespace clblast

View file

@ -0,0 +1,53 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xspr2 class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level2/xspr2.h"
#include <string>
namespace clblast {
// =================================================================================================
// Constructor: forwards directly to the Xher2 base class; Xspr2-specific behaviour is selected
// in DoSpr2 below.
template <typename T>
Xspr2<T>::Xspr2(Queue &queue, Event &event, const std::string &name):
Xher2<T>(queue, event, name) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xspr2<T>::DoSpr2(const Layout layout, const Triangle triangle,
                            const size_t n,
                            const T alpha,
                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                            const Buffer<T> &ap_buffer, const size_t ap_offset) {

  // Packed symmetric rank-2 update: forwards to the Her2 implementation with packed storage
  // enabled ('n' is passed as a placeholder for the unused leading dimension). Xspr2-specific
  // behaviour is implemented in the kernel through defines.
  const auto packed_matrix = true;
  return DoHer2(layout, triangle, n, alpha,
                x_buffer, x_offset, x_inc,
                y_buffer, y_offset, y_inc,
                ap_buffer, ap_offset, n,
                packed_matrix);
}

// =================================================================================================

// Explicit template instantiations for the supported real precisions
template class Xspr2<float>;
template class Xspr2<double>;

// =================================================================================================
} // namespace clblast

View file

@ -0,0 +1,50 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xsyr class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level2/xsyr.h"
#include <string>
namespace clblast {
// =================================================================================================
// Constructor: forwards directly to the Xher<T,T> base class (real-valued alpha equals the
// element type); Xsyr-specific behaviour is selected in DoSyr below.
template <typename T>
Xsyr<T>::Xsyr(Queue &queue, Event &event, const std::string &name):
Xher<T,T>(queue, event, name) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xsyr<T>::DoSyr(const Layout layout, const Triangle triangle,
                          const size_t n,
                          const T alpha,
                          const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                          const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {

  // The symmetric rank-1 update re-uses the Her implementation; Xsyr-specific behaviour is
  // implemented in the kernel through defines.
  const auto her_status = DoHer(layout, triangle, n, alpha,
                                x_buffer, x_offset, x_inc,
                                a_buffer, a_offset, a_ld);
  return her_status;
}

// =================================================================================================

// Explicit template instantiations for the supported real precisions
template class Xsyr<float>;
template class Xsyr<double>;

// =================================================================================================
} // namespace clblast

View file

@ -0,0 +1,52 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xsyr2 class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level2/xsyr2.h"
#include <string>
namespace clblast {
// =================================================================================================
// Constructor: forwards directly to the Xher2 base class; Xsyr2-specific behaviour is selected
// in DoSyr2 below.
template <typename T>
Xsyr2<T>::Xsyr2(Queue &queue, Event &event, const std::string &name):
Xher2<T>(queue, event, name) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xsyr2<T>::DoSyr2(const Layout layout, const Triangle triangle,
                            const size_t n,
                            const T alpha,
                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                            const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {

  // The symmetric rank-2 update re-uses the Her2 implementation; Xsyr2-specific behaviour is
  // implemented in the kernel through defines.
  const auto her2_status = DoHer2(layout, triangle, n, alpha,
                                  x_buffer, x_offset, x_inc,
                                  y_buffer, y_offset, y_inc,
                                  a_buffer, a_offset, a_ld);
  return her2_status;
}

// =================================================================================================

// Explicit template instantiations for the supported real precisions
template class Xsyr2<float>;
template class Xsyr2<double>;

// =================================================================================================
} // namespace clblast

View file

@ -30,13 +30,14 @@ template <> const Precision Xgemm<double2>::precision_ = Precision::kComplexDoub
// Constructor: forwards to base class constructor
template <typename T>
Xgemm<T>::Xgemm(Queue &queue, Event &event, const std::string &name):
Routine<T>(queue, event, name, {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) {
source_string_ =
#include "../../kernels/level3/copy.opencl"
#include "../../kernels/level3/pad.opencl"
#include "../../kernels/level3/transpose.opencl"
#include "../../kernels/level3/padtranspose.opencl"
#include "../../kernels/level3/xgemm.opencl"
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
;
}

View file

@ -28,13 +28,14 @@ template <> const Precision Xher2k<double2,double>::precision_ = Precision::kCom
// Constructor: forwards to base class constructor
template <typename T, typename U>
Xher2k<T,U>::Xher2k(Queue &queue, Event &event, const std::string &name):
Routine<T>(queue, event, name, {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) {
source_string_ =
#include "../../kernels/level3/copy.opencl"
#include "../../kernels/level3/pad.opencl"
#include "../../kernels/level3/transpose.opencl"
#include "../../kernels/level3/padtranspose.opencl"
#include "../../kernels/level3/xgemm.opencl"
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
;
}

View file

@ -28,13 +28,14 @@ template <> const Precision Xherk<double2,double>::precision_ = Precision::kComp
// Constructor: forwards to base class constructor
template <typename T, typename U>
Xherk<T,U>::Xherk(Queue &queue, Event &event, const std::string &name):
Routine<T>(queue, event, name, {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) {
source_string_ =
#include "../../kernels/level3/copy.opencl"
#include "../../kernels/level3/pad.opencl"
#include "../../kernels/level3/transpose.opencl"
#include "../../kernels/level3/padtranspose.opencl"
#include "../../kernels/level3/xgemm.opencl"
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
;
}

View file

@ -30,13 +30,14 @@ template <> const Precision Xsyr2k<double2>::precision_ = Precision::kComplexDou
// Constructor: forwards to base class constructor
template <typename T>
Xsyr2k<T>::Xsyr2k(Queue &queue, Event &event, const std::string &name):
Routine<T>(queue, event, name, {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) {
source_string_ =
#include "../../kernels/level3/copy.opencl"
#include "../../kernels/level3/pad.opencl"
#include "../../kernels/level3/transpose.opencl"
#include "../../kernels/level3/padtranspose.opencl"
#include "../../kernels/level3/xgemm.opencl"
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
;
}

View file

@ -30,13 +30,14 @@ template <> const Precision Xsyrk<double2>::precision_ = Precision::kComplexDoub
// Constructor: forwards to base class constructor
template <typename T>
Xsyrk<T>::Xsyrk(Queue &queue, Event &event, const std::string &name):
Routine<T>(queue, event, name, {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) {
source_string_ =
#include "../../kernels/level3/copy.opencl"
#include "../../kernels/level3/pad.opencl"
#include "../../kernels/level3/transpose.opencl"
#include "../../kernels/level3/padtranspose.opencl"
#include "../../kernels/level3/xgemm.opencl"
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
;
}

View file

@ -31,7 +31,8 @@ class TuneXgemm {
static std::string GetSources() {
return
#include "../src/kernels/common.opencl"
#include "../src/kernels/level3/xgemm.opencl"
#include "../src/kernels/level3/xgemm_part1.opencl"
#include "../src/kernels/level3/xgemm_part2.opencl"
;
}

View file

@ -35,6 +35,7 @@ class TuneXgemv {
return
#include "../src/kernels/common.opencl"
#include "../src/kernels/level2/xgemv.opencl"
#include "../src/kernels/level2/xgemv_fast.opencl"
;
}
@ -60,8 +61,8 @@ class TuneXgemv {
// Sets the tuning parameters and their possible values
static void SetParameters(cltune::Tuner &tuner, const size_t id) {
tuner.AddParameter(id, "WGS"+std::to_string(V), {64, 128, 256, 512, 1024, 1536, 2048});
tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4, 8});
tuner.AddParameter(id, "WGS"+std::to_string(V), {64, 128, 256});
tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4});
if (V==2 || V==3) { tuner.AddParameter(id, "VW"+std::to_string(V), {1, 2, 4, 8}); }
}
@ -72,7 +73,10 @@ class TuneXgemv {
tuner.AddConstraint(id, MultipleOfX, {"WPT"+std::to_string(V), "VW"+std::to_string(V)});
}
}
static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { }
static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
auto LocalMemorySize = [args] (std::vector<size_t> v) { return v[0]*GetBytes(args.precision); };
tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGS"+std::to_string(V)});
}
// Sets the base thread configuration
static std::vector<size_t> GlobalSize(const Arguments<T> &args) { return {args.m}; }
@ -108,6 +112,9 @@ class TuneXgemv {
tuner.AddArgumentScalar(0);
tuner.AddArgumentScalar(1);
tuner.AddArgumentScalar(0); // Conjugate transpose
tuner.AddArgumentScalar(0); // Additional parameter
tuner.AddArgumentScalar(0); // Banded 'kl'
tuner.AddArgumentScalar(0); // Banded 'ku'
}
// Describes how to compute the performance metrics

129
src/tuning/xger.cc Normal file
View file

@ -0,0 +1,129 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file uses the CLTune auto-tuner to tune the xger OpenCL kernels.
//
// =================================================================================================
#include <string>
#include <vector>
#include "internal/utilities.h"
#include "internal/tuning.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
template <typename T>
class TuneXger {
public:
// The representative kernel and the source code
static std::string KernelFamily() { return "xger"; }
static std::string KernelName() { return "Xger"; }
static std::string GetSources() {
return
#include "../src/kernels/common.opencl"
#include "../src/kernels/level2/level2.opencl"
#include "../src/kernels/level2/xger.opencl"
;
}
// The list of arguments relevant for this routine
static std::vector<std::string> GetOptions() { return {kArgN, kArgM, kArgAlpha}; }
// Tests for valid arguments
static void TestValidArguments(const Arguments<T> &) { }
// Sets the default values for the arguments
static size_t DefaultM() { return 1024; }
static size_t DefaultN() { return 1024; }
static size_t DefaultK() { return 1; } // N/A for this kernel
static double DefaultFraction() { return 1.0; } // N/A for this kernel
// Describes how to obtain the sizes of the buffers
static size_t GetSizeX(const Arguments<T> &args) { return args.m; }
static size_t GetSizeY(const Arguments<T> &args) { return args.n; }
static size_t GetSizeA(const Arguments<T> &args) { return args.m * args.n; }
static size_t GetSizeB(const Arguments<T> &) { return 1; } // N/A for this kernel
static size_t GetSizeC(const Arguments<T> &) { return 1; } // N/A for this kernel
static size_t GetSizeTemp(const Arguments<T> &) { return 1; } // N/A for this kernel
// Sets the tuning parameters and their possible values
static void SetParameters(cltune::Tuner &tuner, const size_t id) {
tuner.AddParameter(id, "WGS1", {4, 8, 16, 32, 64, 128, 256, 512});
tuner.AddParameter(id, "WGS2", {1, 2, 4, 8, 16, 32, 64, 128, 256});
tuner.AddParameter(id, "WPT", {1, 2, 4});
}
// Sets the constraints and local memory size
static void SetConstraints(cltune::Tuner &, const size_t) { }
static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { }
// Sets the base thread configuration
static std::vector<size_t> GlobalSize(const Arguments<T> &args) { return {args.m, args.n}; }
static std::vector<size_t> GlobalSizeRef(const Arguments<T> &args) { return GlobalSize(args); }
static std::vector<size_t> LocalSize() { return {1, 1}; }
static std::vector<size_t> LocalSizeRef() { return {8, 8}; }
// Transforms the thread configuration based on the parameters
using TransformVector = std::vector<std::vector<std::string>>;
static TransformVector MulLocal() { return {{"WGS1", "WGS2"}}; }
static TransformVector DivLocal() { return {}; }
static TransformVector MulGlobal() { return {}; }
static TransformVector DivGlobal() { return {{"WPT", "WPT"}}; }
// Sets the kernel's arguments
static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
std::vector<T> &x_vec, std::vector<T> &y_vec,
std::vector<T> &a_mat, std::vector<T> &, std::vector<T> &,
std::vector<T> &) {
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentScalar(static_cast<int>(args.n));
tuner.AddArgumentScalar(args.alpha);
tuner.AddArgumentInput(x_vec);
tuner.AddArgumentScalar(0); // x_offset
tuner.AddArgumentScalar(1); // x_increment
tuner.AddArgumentInput(y_vec);
tuner.AddArgumentScalar(0); // y_offset
tuner.AddArgumentScalar(1); // y_increment
tuner.AddArgumentOutput(a_mat);
tuner.AddArgumentScalar(0); // a_offset
tuner.AddArgumentScalar(static_cast<int>(args.m)); // a_ld
tuner.AddArgumentScalar(0); // a_is_rowmajor
}
// Describes how to compute the performance metrics
static size_t GetMetric(const Arguments<T> &args) {
return (2*args.m*args.n + args.m + args.n) * GetBytes(args.precision);
}
static std::string PerformanceUnit() { return "GB/s"; }
};
// =================================================================================================
} // namespace clblast
// Shortcuts to the clblast namespace
using float2 = clblast::float2;
using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneXger<float>, float>(argc, argv); break;
case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneXger<double>, double>(argc, argv); break;
case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneXger<float2>, float2>(argc, argv); break;
case clblast::Precision::kComplexDouble: clblast::Tuner<clblast::TuneXger<double2>, double2>(argc, argv); break;
}
return 0;
}
// =================================================================================================

View file

@ -103,7 +103,13 @@ std::string ToString(Precision value) {
// both the real and imaginary parts.
template <typename T>
T ConvertArgument(const char* value) {
return static_cast<T>(std::stod(value));
return static_cast<T>(std::stoi(value));
}
template <> float ConvertArgument(const char* value) {
return static_cast<float>(std::stod(value));
}
template <> double ConvertArgument(const char* value) {
return static_cast<double>(std::stod(value));
}
template <> float2 ConvertArgument(const char* value) {
auto val = static_cast<float>(std::stod(value));
@ -139,7 +145,6 @@ T GetArgument(const int argc, char *argv[], std::string &help,
}
// Compiles the above function
template bool GetArgument<bool>(const int, char **, std::string&, const std::string&, const bool);
template int GetArgument<int>(const int, char **, std::string&, const std::string&, const int);
template size_t GetArgument<size_t>(const int, char **, std::string&, const std::string&, const size_t);
template float GetArgument<float>(const int, char **, std::string&, const std::string&, const float);
@ -156,9 +161,9 @@ template Precision GetArgument<Precision>(const int, char **, std::string&, cons
// =================================================================================================
// Returns only the precision argument
Precision GetPrecision(const int argc, char *argv[]) {
Precision GetPrecision(const int argc, char *argv[], const Precision default_precision) {
auto dummy = std::string{};
return GetArgument(argc, argv, dummy, kArgPrecision, Precision::kSingle);
return GetArgument(argc, argv, dummy, kArgPrecision, default_precision);
}
// =================================================================================================

View file

@ -35,7 +35,7 @@ TestBlas<T,U>::TestBlas(int argc, char *argv[], const bool silent,
const Routine run_routine, const Routine run_reference,
const ResultGet get_result, const ResultIndex get_index,
const ResultIterator get_id1, const ResultIterator get_id2):
Tester<T,U>{argc, argv, silent, name, options},
Tester<T,U>(argc, argv, silent, name, options),
run_routine_(run_routine),
run_reference_(run_reference),
get_result_(get_result),

View file

@ -80,11 +80,11 @@ template <typename T, typename U>
Tester<T,U>::~Tester() {
if (PrecisionSupported<T>(device_)) {
fprintf(stdout, "* Completed all test-cases for this routine. Results:\n");
fprintf(stdout, " %lu test(s) passed\n", tests_passed_);
fprintf(stdout, " %zu test(s) passed\n", tests_passed_);
if (tests_skipped_ > 0) { fprintf(stdout, "%s", kPrintWarning.c_str()); }
fprintf(stdout, " %lu test(s) skipped%s\n", tests_skipped_, kPrintEnd.c_str());
fprintf(stdout, " %zu test(s) skipped%s\n", tests_skipped_, kPrintEnd.c_str());
if (tests_failed_ > 0) { fprintf(stdout, "%s", kPrintError.c_str()); }
fprintf(stdout, " %lu test(s) failed%s\n", tests_failed_, kPrintEnd.c_str());
fprintf(stdout, " %zu test(s) failed%s\n", tests_failed_, kPrintEnd.c_str());
}
fprintf(stdout, "\n");
clblasTeardown();
@ -129,29 +129,29 @@ void Tester<T,U>::TestEnd() {
fprintf(stdout, " Status code %d (expected %d): ", entry.status_found, entry.status_expect);
}
for (auto &o: options_) {
if (o == kArgM) { fprintf(stdout, "%s=%lu ", kArgM, entry.args.m); }
if (o == kArgN) { fprintf(stdout, "%s=%lu ", kArgN, entry.args.n); }
if (o == kArgK) { fprintf(stdout, "%s=%lu ", kArgK, entry.args.k); }
if (o == kArgKU) { fprintf(stdout, "%s=%lu ", kArgKU, entry.args.ku); }
if (o == kArgKL) { fprintf(stdout, "%s=%lu ", kArgKL, entry.args.kl); }
if (o == kArgM) { fprintf(stdout, "%s=%zu ", kArgM, entry.args.m); }
if (o == kArgN) { fprintf(stdout, "%s=%zu ", kArgN, entry.args.n); }
if (o == kArgK) { fprintf(stdout, "%s=%zu ", kArgK, entry.args.k); }
if (o == kArgKU) { fprintf(stdout, "%s=%zu ", kArgKU, entry.args.ku); }
if (o == kArgKL) { fprintf(stdout, "%s=%zu ", kArgKL, entry.args.kl); }
if (o == kArgLayout) { fprintf(stdout, "%s=%d ", kArgLayout, entry.args.layout);}
if (o == kArgATransp) { fprintf(stdout, "%s=%d ", kArgATransp, entry.args.a_transpose);}
if (o == kArgBTransp) { fprintf(stdout, "%s=%d ", kArgBTransp, entry.args.b_transpose);}
if (o == kArgSide) { fprintf(stdout, "%s=%d ", kArgSide, entry.args.side);}
if (o == kArgTriangle) { fprintf(stdout, "%s=%d ", kArgTriangle, entry.args.triangle);}
if (o == kArgDiagonal) { fprintf(stdout, "%s=%d ", kArgDiagonal, entry.args.diagonal);}
if (o == kArgXInc) { fprintf(stdout, "%s=%lu ", kArgXInc, entry.args.x_inc);}
if (o == kArgYInc) { fprintf(stdout, "%s=%lu ", kArgYInc, entry.args.y_inc);}
if (o == kArgXOffset) { fprintf(stdout, "%s=%lu ", kArgXOffset, entry.args.x_offset);}
if (o == kArgYOffset) { fprintf(stdout, "%s=%lu ", kArgYOffset, entry.args.y_offset);}
if (o == kArgALeadDim) { fprintf(stdout, "%s=%lu ", kArgALeadDim, entry.args.a_ld);}
if (o == kArgBLeadDim) { fprintf(stdout, "%s=%lu ", kArgBLeadDim, entry.args.b_ld);}
if (o == kArgCLeadDim) { fprintf(stdout, "%s=%lu ", kArgCLeadDim, entry.args.c_ld);}
if (o == kArgAOffset) { fprintf(stdout, "%s=%lu ", kArgAOffset, entry.args.a_offset);}
if (o == kArgBOffset) { fprintf(stdout, "%s=%lu ", kArgBOffset, entry.args.b_offset);}
if (o == kArgCOffset) { fprintf(stdout, "%s=%lu ", kArgCOffset, entry.args.c_offset);}
if (o == kArgAPOffset) { fprintf(stdout, "%s=%lu ", kArgAPOffset, entry.args.ap_offset);}
if (o == kArgDotOffset){ fprintf(stdout, "%s=%lu ", kArgDotOffset, entry.args.dot_offset);}
if (o == kArgXInc) { fprintf(stdout, "%s=%zu ", kArgXInc, entry.args.x_inc);}
if (o == kArgYInc) { fprintf(stdout, "%s=%zu ", kArgYInc, entry.args.y_inc);}
if (o == kArgXOffset) { fprintf(stdout, "%s=%zu ", kArgXOffset, entry.args.x_offset);}
if (o == kArgYOffset) { fprintf(stdout, "%s=%zu ", kArgYOffset, entry.args.y_offset);}
if (o == kArgALeadDim) { fprintf(stdout, "%s=%zu ", kArgALeadDim, entry.args.a_ld);}
if (o == kArgBLeadDim) { fprintf(stdout, "%s=%zu ", kArgBLeadDim, entry.args.b_ld);}
if (o == kArgCLeadDim) { fprintf(stdout, "%s=%zu ", kArgCLeadDim, entry.args.c_ld);}
if (o == kArgAOffset) { fprintf(stdout, "%s=%zu ", kArgAOffset, entry.args.a_offset);}
if (o == kArgBOffset) { fprintf(stdout, "%s=%zu ", kArgBOffset, entry.args.b_offset);}
if (o == kArgCOffset) { fprintf(stdout, "%s=%zu ", kArgCOffset, entry.args.c_offset);}
if (o == kArgAPOffset) { fprintf(stdout, "%s=%zu ", kArgAPOffset, entry.args.ap_offset);}
if (o == kArgDotOffset){ fprintf(stdout, "%s=%zu ", kArgDotOffset, entry.args.dot_offset);}
}
fprintf(stdout, "\n");
}
@ -159,18 +159,18 @@ void Tester<T,U>::TestEnd() {
// Prints a test summary
auto pass_rate = 100*num_passed_ / static_cast<float>(num_passed_ + num_skipped_ + num_failed_);
fprintf(stdout, " Pass rate %s%5.1lf%%%s:", kPrintMessage.c_str(), pass_rate, kPrintEnd.c_str());
fprintf(stdout, " %lu passed /", num_passed_);
fprintf(stdout, " %zu passed /", num_passed_);
if (num_skipped_ != 0) {
fprintf(stdout, " %s%lu skipped%s /", kPrintWarning.c_str(), num_skipped_, kPrintEnd.c_str());
fprintf(stdout, " %s%zu skipped%s /", kPrintWarning.c_str(), num_skipped_, kPrintEnd.c_str());
}
else {
fprintf(stdout, " %lu skipped /", num_skipped_);
fprintf(stdout, " %zu skipped /", num_skipped_);
}
if (num_failed_ != 0) {
fprintf(stdout, " %s%lu failed%s\n", kPrintError.c_str(), num_failed_, kPrintEnd.c_str());
fprintf(stdout, " %s%zu failed%s\n", kPrintError.c_str(), num_failed_, kPrintEnd.c_str());
}
else {
fprintf(stdout, " %lu failed\n", num_failed_);
fprintf(stdout, " %zu failed\n", num_failed_);
}
}
@ -280,21 +280,21 @@ bool TestSimilarity(const T val1, const T val2) {
const auto difference = std::fabs(val1 - val2);
// Set the allowed error margin for floating-point comparisons
constexpr auto kErrorMarginRelative = 1.0e-2;
constexpr auto kErrorMarginAbsolute = 1.0e-10;
constexpr auto kErrorMarginRelative = T{0.025};
constexpr auto kErrorMarginAbsolute = T{1.0e-6};
// Shortcut, handles infinities
if (val1 == val2) {
return true;
}
// The values are zero or very small: the relative error is less meaningful
else if (val1 == 0 || val2 == 0 || difference < static_cast<T>(kErrorMarginAbsolute)) {
return (difference < static_cast<T>(kErrorMarginAbsolute));
else if (val1 == 0 || val2 == 0 || difference < kErrorMarginAbsolute) {
return (difference < kErrorMarginAbsolute);
}
// Use relative error
else {
const auto absolute_sum = std::fabs(val1) + std::fabs(val2);
return (difference / absolute_sum) < static_cast<T>(kErrorMarginRelative);
return (difference / absolute_sum) < kErrorMarginRelative;
}
}

View file

@ -15,6 +15,7 @@
#include <string>
#include <vector>
#include <utility>
#include <algorithm>
#include <chrono>
@ -48,11 +49,11 @@ Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const GetMetric
for (auto &o: options_) {
// Data-sizes
if (o == kArgM) { args.m = GetArgument(argc, argv, help, kArgM, 512UL); }
if (o == kArgN) { args.n = GetArgument(argc, argv, help, kArgN, 512UL); }
if (o == kArgK) { args.k = GetArgument(argc, argv, help, kArgK, 512UL); }
if (o == kArgKU) { args.ku = GetArgument(argc, argv, help, kArgKU, 128UL); }
if (o == kArgKL) { args.kl = GetArgument(argc, argv, help, kArgKL, 128UL); }
if (o == kArgM) { args.m = GetArgument(argc, argv, help, kArgM, size_t{512}); }
if (o == kArgN) { args.n = GetArgument(argc, argv, help, kArgN, size_t{512}); }
if (o == kArgK) { args.k = GetArgument(argc, argv, help, kArgK, size_t{512}); }
if (o == kArgKU) { args.ku = GetArgument(argc, argv, help, kArgKU, size_t{128}); }
if (o == kArgKL) { args.kl = GetArgument(argc, argv, help, kArgKL, size_t{128}); }
// Data-layouts
if (o == kArgLayout) { args.layout = GetArgument(argc, argv, help, kArgLayout, Layout::kRowMajor); }
@ -89,7 +90,7 @@ Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const GetMetric
args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0});
args.device_id = GetArgument(argc, argv, help, kArgDevice, size_t{0});
args.precision = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle);
args.compare_clblas = GetArgument(argc, argv, help, kArgCompareclblas, true);
args.compare_clblas = GetArgument(argc, argv, help, kArgCompareclblas, 1);
args.step = GetArgument(argc, argv, help, kArgStepSize, size_t{1});
args.num_steps = GetArgument(argc, argv, help, kArgNumSteps, size_t{0});
args.num_runs = GetArgument(argc, argv, help, kArgNumRuns, size_t{10});
@ -112,7 +113,7 @@ template <typename T, typename U>
void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes) {
// Prints the header of the output table
PrintTableHeader(args.silent, options_);
PrintTableHeader(args);
// Initializes OpenCL and the libraries
auto platform = Platform(args.platform_id);
@ -162,11 +163,16 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes)
auto buffers = Buffers<T>{x_vec, y_vec, a_mat, b_mat, c_mat, ap_mat, dot};
// Runs the routines and collects the timings
auto timings = std::vector<std::pair<std::string, double>>();
auto ms_clblast = TimedExecution(args.num_runs, args, buffers, queue, run_routine_, "CLBlast");
auto ms_clblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference_, "clBLAS");
timings.push_back(std::pair<std::string, double>("CLBlast", ms_clblast));
if (args.compare_clblas) {
auto ms_clblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference_, "clBLAS");
timings.push_back(std::pair<std::string, double>("clBLAS", ms_clblas));
}
// Prints the performance of both libraries
PrintTableRow(args, ms_clblast, ms_clblas);
// Prints the performance of the tested libraries
PrintTableRow(args, timings);
// Makes the jump to the next step
++s;
@ -213,20 +219,27 @@ double Client<T,U>::TimedExecution(const size_t num_runs, const Arguments<U> &ar
// Prints the header of the performance table
template <typename T, typename U>
void Client<T,U>::PrintTableHeader(const bool silent, const std::vector<std::string> &args) {
if (!silent) {
for (auto i=size_t{0}; i<args.size(); ++i) { fprintf(stdout, "%9s ", ""); }
fprintf(stdout, " | <-- CLBlast --> | <-- clBLAS --> |\n");
void Client<T,U>::PrintTableHeader(const Arguments<U>& args) {
// First line (optional)
if (!args.silent) {
for (auto i=size_t{0}; i<options_.size(); ++i) { fprintf(stdout, "%9s ", ""); }
fprintf(stdout, " | <-- CLBlast -->");
if (args.compare_clblas) { fprintf(stdout, " | <-- clBLAS -->"); }
fprintf(stdout, " |\n");
}
for (auto &argument: args) { fprintf(stdout, "%9s;", argument.c_str()); }
fprintf(stdout, "%9s;%9s;%9s;%9s;%9s;%9s\n",
"ms_1", "GFLOPS_1", "GBs_1", "ms_2", "GFLOPS_2", "GBs_2");
// Second line
for (auto &option: options_) { fprintf(stdout, "%9s;", option.c_str()); }
fprintf(stdout, "%9s;%9s;%9s", "ms_1", "GFLOPS_1", "GBs_1");
if (args.compare_clblas) { fprintf(stdout, ";%9s;%9s;%9s", "ms_2", "GFLOPS_2", "GBs_2"); }
fprintf(stdout, "\n");
}
// Print a performance-result row
template <typename T, typename U>
void Client<T,U>::PrintTableRow(const Arguments<U>& args, const double ms_clblast,
const double ms_clblas) {
void Client<T,U>::PrintTableRow(const Arguments<U>& args,
const std::vector<std::pair<std::string, double>>& timings) {
// Creates a vector of relevant variables
auto integers = std::vector<size_t>{};
@ -261,34 +274,36 @@ void Client<T,U>::PrintTableRow(const Arguments<U>& args, const double ms_clblas
else if (o == kArgBeta) { strings.push_back(ToString(args.beta)); }
}
// Computes the GFLOPS and GB/s metrics
auto flops = get_flops_(args);
auto bytes = get_bytes_(args);
auto gflops_clblast = (ms_clblast != 0.0) ? (flops*1e-6)/ms_clblast : 0;
auto gflops_clblas = (ms_clblas != 0.0) ? (flops*1e-6)/ms_clblas: 0;
auto gbs_clblast = (ms_clblast != 0.0) ? (bytes*1e-6)/ms_clblast : 0;
auto gbs_clblas = (ms_clblas != 0.0) ? (bytes*1e-6)/ms_clblas: 0;
// Outputs the argument values
for (auto &argument: integers) {
if (!args.no_abbrv && argument >= 1024*1024 && IsMultiple(argument, 1024*1024)) {
fprintf(stdout, "%8luM;", argument/(1024*1024));
fprintf(stdout, "%8zuM;", argument/(1024*1024));
}
else if (!args.no_abbrv && argument >= 1024 && IsMultiple(argument, 1024)) {
fprintf(stdout, "%8luK;", argument/1024);
fprintf(stdout, "%8zuK;", argument/1024);
}
else {
fprintf(stdout, "%9lu;", argument);
fprintf(stdout, "%9zu;", argument);
}
}
for (auto &argument: strings) {
fprintf(stdout, "%9s;", argument.c_str());
}
// Outputs the performance numbers
fprintf(stdout, "%9.2lf;%9.1lf;%9.1lf;%9.2lf;%9.1lf;%9.1lf\n",
ms_clblast, gflops_clblast, gbs_clblast,
ms_clblas, gflops_clblas, gbs_clblas);
// Loops over all tested libraries
for (const auto& timing : timings) {
// Computes the GFLOPS and GB/s metrics
auto flops = get_flops_(args);
auto bytes = get_bytes_(args);
auto gflops = (timing.second != 0.0) ? (flops*1e-6)/timing.second : 0;
auto gbs = (timing.second != 0.0) ? (bytes*1e-6)/timing.second : 0;
// Outputs the performance numbers
if (timing.first != "CLBlast") { fprintf(stdout, ";"); }
fprintf(stdout, "%9.2lf;%9.1lf;%9.1lf", timing.second, gflops, gbs);
}
fprintf(stdout, "\n");
}
// =================================================================================================

View file

@ -23,6 +23,7 @@
#include <string>
#include <vector>
#include <utility>
// The libraries to test
#include <clBLAS.h>
@ -64,10 +65,11 @@ class Client {
Queue &queue, Routine run_blas, const std::string &library_name);
// Prints the header of a performance-data table
void PrintTableHeader(const bool silent, const std::vector<std::string> &args);
void PrintTableHeader(const Arguments<U>& args);
// Prints a row of performance data, including results of two libraries
void PrintTableRow(const Arguments<U>& args, const double ms_clblast, const double ms_clblas);
void PrintTableRow(const Arguments<U>& args,
const std::vector<std::pair<std::string, double>>& timings);
// The routine-specific functions passed to the tester
const Routine run_routine_;

View file

@ -63,7 +63,7 @@ main <- function(routine_name, precision, test_names, test_values,
if (precision == 64) { display_name <- gsub("^X","D",display_name); }
if (precision == 3232) { display_name <- gsub("^X","C",display_name); }
if (precision == 6464) { display_name <- gsub("^X","Z",display_name); }
executable <- paste("./client_", routine_name, sep="")
executable <- paste("./clblast_client_", routine_name, sep="")
# Configures the outputfile
pdf(paste(display_name, ".pdf", sep=""), height=8, width=13)

View file

@ -18,7 +18,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kSingle:
clblast::RunClient<clblast::TestXaxpy<float>, float, float>(argc, argv); break;

View file

@ -18,7 +18,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kSingle:
clblast::RunClient<clblast::TestXcopy<float>, float, float>(argc, argv); break;

View file

@ -18,7 +18,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kSingle:
clblast::RunClient<clblast::TestXdot<float>, float, float>(argc, argv); break;

View file

@ -18,7 +18,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");

View file

@ -18,7 +18,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");

View file

@ -18,7 +18,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kSingle:
clblast::RunClient<clblast::TestXscal<float>, float, float>(argc, argv); break;

View file

@ -18,7 +18,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kSingle:
clblast::RunClient<clblast::TestXswap<float>, float, float>(argc, argv); break;

View file

@ -18,7 +18,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kSingle:
clblast::RunClient<clblast::TestXgbmv<float>, float, float>(argc, argv); break;

View file

@ -18,7 +18,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kSingle:
clblast::RunClient<clblast::TestXgemv<float>, float, float>(argc, argv); break;

View file

@ -18,7 +18,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kSingle:
clblast::RunClient<clblast::TestXger<float>, float, float>(argc, argv); break;

View file

@ -18,7 +18,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");

View file

@ -18,7 +18,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");

View file

@ -18,7 +18,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");

View file

@ -18,7 +18,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");

View file

@ -18,7 +18,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");

View file

@ -18,7 +18,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
switch(clblast::GetPrecision(argc, argv, clblast::Precision::kComplexSingle)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kSingle: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kDouble: throw std::runtime_error("Unsupported precision mode");

Some files were not shown because too many files have changed in this diff Show more