Merge pull request #58 from CNugteren/development

Update to version 0.7.0
This commit is contained in:
Cedric Nugteren 2016-05-08 21:25:50 +02:00
commit d91356a6b7
187 changed files with 12794 additions and 3069 deletions

3
.gitignore vendored
View file

@ -2,4 +2,5 @@ build
stash
.*
*.pyc
*.db
*.db
cl.hpp

View file

@ -1,4 +1,23 @@
Version 0.7.0
- Added exports to be able to create a DLL on Windows (thanks to Marco Hutter)
- Made the library thread-safe
- Performance and correctness tests can now (on top of clBLAS) be performed against CPU BLAS libraries
- Fixed the use of events within the library
- Changed the enum parameters to match the raw values of the cblas standard
- Fixed the cache of previously compiled binaries and added a function to fill or clear it
- Various minor fixes and enhancements
- Added a preliminary version of the API documentation
- Added additional sample programs
- Added tuned parameters for various devices (see README)
- Added level-1 routines:
* SNRM2/DNRM2/ScNRM2/DzNRM2
* SASUM/DASUM/ScASUM/DzASUM
* SSUM/DSUM/ScSUM/DzSUM (non-absolute version of the above xASUM BLAS routines)
* iSAMAX/iDAMAX/iCAMAX/iZAMAX
* iSMAX/iDMAX/iCMAX/iZMAX (non-absolute version of the above ixAMAX BLAS routines)
* iSMIN/iDMIN/iCMIN/iZMIN (non-absolute minimum version of the above ixAMAX BLAS routines)
Version 0.6.0
- Added support for MSVC (Visual Studio) 2015
- Added tuned parameters for various devices (see README)

View file

@ -13,7 +13,7 @@
cmake_minimum_required(VERSION 2.8.10)
project("clblast" C CXX)
set(clblast_VERSION_MAJOR 0)
set(clblast_VERSION_MINOR 6)
set(clblast_VERSION_MINOR 7)
set(clblast_VERSION_PATCH 0)
# Options and their default values
@ -66,13 +66,22 @@ else ()
set(FLAGS "${FLAGS} -Wno-attributes -Wno-unused-variable")
endif()
elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
set(FLAGS "${FLAGS} -Weverything -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded")
set(FLAGS "${FLAGS} -Wextra -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded")
set(FLAGS "${FLAGS} -Wno-missing-prototypes -Wno-float-equal -Wno-switch-enum -Wno-switch")
set(FLAGS "${FLAGS} -Wno-exit-time-destructors -Wno-global-constructors -Wno-missing-noreturn")
set(FLAGS "${FLAGS} -Wno-deprecated-declarations")
endif()
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}")
# C compiler settings (for the sample)
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
set(CFLAGS "/Ox")
else ()
set(CFLAGS "-O3 -std=c99")
endif()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CFLAGS}")
# ==================================================================================================
# Package scripts location
@ -90,11 +99,13 @@ if(TUNERS)
endif()
endif()
# Locates the clBLAS library in case the tests need to be compiled. "FindclBLAS.cmake" is included.
# Locates the reference BLAS libraries in case the tests need to be compiled. The "FindclBLAS.cmake"
# and "FindCBLAS.cmake" are included.
if(TESTS)
find_package(clBLAS)
if(NOT CLBLAS_FOUND)
message(STATUS "Could NOT find clBLAS, disabling the compilation of the tests")
find_package(CBLAS)
if(NOT CLBLAS_FOUND AND NOT CBLAS_FOUND)
message(STATUS "Could NOT find clBLAS nor a CPU BLAS, disabling the compilation of the tests")
set(TESTS OFF)
endif()
endif()
@ -109,8 +120,8 @@ include_directories(${clblast_SOURCE_DIR}/include ${OPENCL_INCLUDE_DIRS})
# Sets the supported routines and the used kernels. New routines and kernels should be added here.
set(KERNELS copy pad transpose padtranspose xaxpy xdot xger xgemm xgemv)
set(SAMPLE_PROGRAMS_CPP sgemm)
set(SAMPLE_PROGRAMS_C sgemm)
set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc)
set(SAMPLE_PROGRAMS_C sasum dgemv sgemm cache)
set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv
xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm)
@ -120,7 +131,8 @@ set(PRECISIONS 32 64 3232 6464)
# ==================================================================================================
# Gathers all source-files
set(SOURCES src/clblast.cc src/database.cc src/routine.cc src/utilities.cc src/clblast_c.cc)
set(SOURCES src/clblast.cc src/database.cc src/routine.cc src/cache.cc
src/utilities.cc src/clblast_c.cc)
foreach(ROUTINE ${LEVEL1_ROUTINES})
set(SOURCES ${SOURCES} src/routines/level1/${ROUTINE}.cc)
endforeach()
@ -156,6 +168,9 @@ endif()
# This section contains all the code related to the examples
if(SAMPLES)
# Downloads the cl.hpp file from Khronos
file(DOWNLOAD https://www.khronos.org/registry/cl/api/1.1/cl.hpp ${clblast_SOURCE_DIR}/samples/cl.hpp)
# Adds sample programs (C++)
foreach(SAMPLE ${SAMPLE_PROGRAMS_CPP})
add_executable(clblast_sample_${SAMPLE} samples/${SAMPLE}.cc)
@ -204,11 +219,33 @@ endif()
# ==================================================================================================
# Down from here is all test (performance and correctness) related. Note that these tests require
# the presence of the clBLAS library to act as a reference.
# the presence of clBLAS and/or a BLAS library to act as a reference.
if(TESTS)
# Adds new include directories for the reference clBLAS
include_directories(${clblast_SOURCE_DIR}/test ${CLBLAS_INCLUDE_DIRS})
# Sets the specifics for the reference BLAS libraries
set(REF_INCLUDES )
set(REF_LIBRARIES )
if(CLBLAS_FOUND)
set(REF_INCLUDES ${REF_INCLUDES} ${CLBLAS_INCLUDE_DIRS})
set(REF_LIBRARIES ${REF_LIBRARIES} ${CLBLAS_LIBRARIES})
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
add_definitions(" /DCLBLAST_REF_CLBLAS")
else()
add_definitions(" -DCLBLAST_REF_CLBLAS")
endif()
endif()
if(CBLAS_FOUND)
set(REF_INCLUDES ${REF_INCLUDES} ${CBLAS_INCLUDE_DIRS})
set(REF_LIBRARIES ${REF_LIBRARIES} ${CBLAS_LIBRARIES})
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
add_definitions(" /DCLBLAST_REF_CBLAS")
else()
add_definitions(" -DCLBLAST_REF_CBLAS")
endif()
endif()
# Sets the include directories
include_directories(${clblast_SOURCE_DIR}/test ${REF_INCLUDES})
# Creates the common correctness-tests objects (requires CMake 2.8.8)
add_library(test_correctness_common OBJECT
@ -228,7 +265,7 @@ if(TESTS)
test/correctness/routines/level3/${ROUTINE}.cc)
endforeach()
foreach(ROUTINE ${ROUTINES})
target_link_libraries(clblast_test_${ROUTINE} clblast ${CLBLAS_LIBRARIES} ${OPENCL_LIBRARIES})
target_link_libraries(clblast_test_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
install(TARGETS clblast_test_${ROUTINE} DESTINATION bin)
endforeach()
@ -258,7 +295,7 @@ if(TESTS)
test/performance/routines/level3/${ROUTINE}.cc)
endforeach()
foreach(ROUTINE ${ROUTINES})
target_link_libraries(clblast_client_${ROUTINE} clblast ${CLBLAS_LIBRARIES} ${OPENCL_LIBRARIES})
target_link_libraries(clblast_client_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
install(TARGETS clblast_client_${ROUTINE} DESTINATION bin)
endforeach()

155
README.md
View file

@ -52,6 +52,14 @@ The pre-requisites for compilation of CLBlast are:
- Intel OpenCL
- Beignet
Furthermore, to build the (optional) correctness and performance tests, another BLAS library is needed to serve as a reference. This can be either:
* The OpenCL BLAS library [clBLAS](http://github.com/clMathLibraries/clBLAS) (maintained by AMD)
* A regular CPU Netlib BLAS library, e.g.:
- OpenBLAS
- BLIS
- Accelerate
An example of an out-of-source build (starting from the root of the CLBlast folder):
mkdir build
@ -76,7 +84,7 @@ Or alternatively the plain C version:
#include <clblast_c.h>
Afterwards, any of CLBlast's routines can be called directly: there is no need to initialize the library. The available routines and the required arguments are described in the `clblast.h` include file. Additionally, a couple of stand-alone example programs are included in `samples/`.
Afterwards, any of CLBlast's routines can be called directly: there is no need to initialize the library. The available routines and the required arguments are described in the `clblast.h` include file and the included [API documentation](doc/clblast.md). Additionally, a couple of stand-alone example programs are included in `samples/`.
Using the tuners (optional)
@ -95,6 +103,8 @@ The CLBlast library will be tuned in the future for the most commonly used OpenC
- Tesla K40m
* AMD GPUs:
- Tahiti
- Hawaii
- Pitcairn
- R9 M370X
* Intel GPUs:
- Iris
@ -128,16 +138,16 @@ In summary, tuning the entire library for your device can be done as follows (st
make
Compiling the tests (optional)
Compiling the correctness and performance tests (optional)
-------------
To make sure CLBlast is working correctly on your device (recommended), compile with the tests enabled:
cmake -DTESTS=ON ..
Afterwards, executables in the form of `clblast_test_xxxxx` are available, in which `xxxxx` is the name of a routine (e.g. `xgemm`). Note that CLBlast is tested against [clBLAS](http://github.com/clMathLibraries/clBLAS) for correctness. The library clBLAS is therefore required to be installed on your system for the CLBlast tests.
Afterwards, executables in the form of `clblast_test_xxxxx` are available, in which `xxxxx` is the name of a routine (e.g. `xgemm`). Note that CLBlast is best tested against [clBLAS](http://github.com/clMathLibraries/clBLAS) for correctness. If the library clBLAS is not installed on your system, it will use a regular CPU BLAS library to test against. If both are present, setting the command-line option `-clblas 1` or `-cblas 1` will select the library to test against for the `clblast_test_xxxxx` executables.
With the `-DTESTS=ON` flag, additional performance tests are compiled. These come in the form of client executables named `clblast_client_xxxxx`, in which `xxxxx` is the name of a routine (e.g. `xgemm`). These clients take a bunch of configuration options and directly run both CLBlast and clBLAS in a head-to-head performance test.
With the `-DTESTS=ON` flag, additional performance tests are compiled. These come in the form of client executables named `clblast_client_xxxxx`, in which `xxxxx` is the name of a routine (e.g. `xgemm`). These clients take a bunch of configuration options and directly run CLBlast in a head-to-head performance test against clBLAS and/or a CPU BLAS library.
Performance remarks
@ -161,64 +171,77 @@ These graphs can be generated automatically on your own device. First, compile C
Supported routines
-------------
CLBlast is in active development but already supports almost all the BLAS routines. The currently supported routines are marked with '✔' in the following tables. Empty boxes represent routines that still need to be implemented in a future release, whereas routines marked with '-' are not part of BLAS at all.
CLBlast is in active development but already supports almost all the BLAS routines. The supported routines are marked with '✔' in the following tables. Routines marked with '-' do not exist: they are not part of BLAS at all.
| Level-1 | S | D | C | Z | Notes |
| ---------|---|---|---|---|---------|
| xROTG | | | - | - | |
| xROTMG | | | - | - | |
| xROT | | | - | - | |
| xROTM | | | - | - | |
| xSWAP | ✔ | ✔ | ✔ | ✔ | |
| xSCAL | ✔ | ✔ | ✔ | ✔ | +CS +ZD |
| xCOPY | ✔ | ✔ | ✔ | ✔ | |
| xAXPY | ✔ | ✔ | ✔ | ✔ | |
| xDOT | ✔ | ✔ | - | - | |
| xDOTU | - | - | ✔ | ✔ | |
| xDOTC | - | - | ✔ | ✔ | |
| xNRM2 | | | - | - | +SC +DZ |
| xASUM | | | - | - | +SC +DZ |
| IxAMAX | | | | | |
| Level-1 | S | D | C | Z |
| ---------|---|---|---|---|
| xSWAP | ✔ | ✔ | ✔ | ✔ |
| xSCAL | ✔ | ✔ | ✔ | ✔ |
| xCOPY | ✔ | ✔ | ✔ | ✔ |
| xAXPY | ✔ | ✔ | ✔ | ✔ |
| xDOT | ✔ | ✔ | - | - |
| xDOTU | - | - | ✔ | ✔ |
| xDOTC | - | - | ✔ | ✔ |
| xNRM2 | ✔ | ✔ | ✔ | ✔ |
| xASUM | ✔ | ✔ | ✔ | ✔ |
| IxAMAX | ✔ | ✔ | ✔ | ✔ |
| Level-2 | S | D | C | Z | Notes |
| ---------|---|---|---|---|---------|
| xGEMV | ✔ | ✔ | ✔ | ✔ | |
| xGBMV | ✔ | ✔ | ✔ | ✔ | |
| xHEMV | - | - | ✔ | ✔ | |
| xHBMV | - | - | ✔ | ✔ | |
| xHPMV | - | - | ✔ | ✔ | |
| xSYMV | ✔ | ✔ | - | - | |
| xSBMV | ✔ | ✔ | - | - | |
| xSPMV | ✔ | ✔ | - | - | |
| xTRMV | ✔ | ✔ | ✔ | ✔ | |
| xTBMV | ✔ | ✔ | ✔ | ✔ | |
| xTPMV | ✔ | ✔ | ✔ | ✔ | |
| xTRSV | | | | | |
| xTBSV | | | | | |
| xTPSV | | | | | |
| xGER | ✔ | ✔ | - | - | |
| xGERU | - | - | ✔ | ✔ | |
| xGERC | - | - | ✔ | ✔ | |
| xHER | - | - | ✔ | ✔ | |
| xHPR | - | - | ✔ | ✔ | |
| xHER2 | - | - | ✔ | ✔ | |
| xHPR2 | - | - | ✔ | ✔ | |
| xSYR | ✔ | ✔ | - | - | |
| xSPR | ✔ | ✔ | - | - | |
| xSYR2 | ✔ | ✔ | - | - | |
| xSPR2 | ✔ | ✔ | - | - | |
| Level-2 | S | D | C | Z |
| ---------|---|---|---|---|
| xGEMV | ✔ | ✔ | ✔ | ✔ |
| xGBMV | ✔ | ✔ | ✔ | ✔ |
| xHEMV | - | - | ✔ | ✔ |
| xHBMV | - | - | ✔ | ✔ |
| xHPMV | - | - | ✔ | ✔ |
| xSYMV | ✔ | ✔ | - | - |
| xSBMV | ✔ | ✔ | - | - |
| xSPMV | ✔ | ✔ | - | - |
| xTRMV | ✔ | ✔ | ✔ | ✔ |
| xTBMV | ✔ | ✔ | ✔ | ✔ |
| xTPMV | ✔ | ✔ | ✔ | ✔ |
| xGER | ✔ | ✔ | - | - |
| xGERU | - | - | ✔ | ✔ |
| xGERC | - | - | ✔ | ✔ |
| xHER | - | - | ✔ | ✔ |
| xHPR | - | - | ✔ | ✔ |
| xHER2 | - | - | ✔ | ✔ |
| xHPR2 | - | - | ✔ | ✔ |
| xSYR | ✔ | ✔ | - | - |
| xSPR | ✔ | ✔ | - | - |
| xSYR2 | ✔ | ✔ | - | - |
| xSPR2 | ✔ | ✔ | - | - |
| Level-3 | S | D | C | Z | Notes |
| ---------|---|---|---|---|---------|
| xGEMM | ✔ | ✔ | ✔ | ✔ | |
| xSYMM | ✔ | ✔ | ✔ | ✔ | |
| xHEMM | - | - | ✔ | ✔ | |
| xSYRK | ✔ | ✔ | ✔ | ✔ | |
| xHERK | - | - | ✔ | ✔ | |
| xSYR2K | ✔ | ✔ | ✔ | ✔ | |
| xHER2K | - | - | ✔ | ✔ | |
| xTRMM | ✔ | ✔ | ✔ | ✔ | |
| xTRSM | | | | | |
| Level-3 | S | D | C | Z |
| ---------|---|---|---|---|
| xGEMM | ✔ | ✔ | ✔ | ✔ |
| xSYMM | ✔ | ✔ | ✔ | ✔ |
| xHEMM | - | - | ✔ | ✔ |
| xSYRK | ✔ | ✔ | ✔ | ✔ |
| xHERK | - | - | ✔ | ✔ |
| xSYR2K | ✔ | ✔ | ✔ | ✔ |
| xHER2K | - | - | ✔ | ✔ |
| xTRMM | ✔ | ✔ | ✔ | ✔ |
In addition, some non-BLAS routines are also supported by CLBlast. They are experimental and should be used with care:
| Additional | S | D | C | Z |
| -----------|---|---|---|---|
| xSUM | ✔ | ✔ | ✔ | ✔ |
| IxMAX | ✔ | ✔ | ✔ | ✔ |
| IxMIN | ✔ | ✔ | ✔ | ✔ |
Some BLAS routines are not supported yet by CLBlast. They are shown in the following table:
| Unsupported | S | D | C | Z |
| ------------|---|---|---|---|
| xROTG | | | - | - |
| xROTMG | | | - | - |
| xROT | | | - | - |
| xROTM | | | - | - |
| xTRSV | | | | |
| xTBSV | | | | |
| xTPSV | | | | |
| xTRSM | | | | |
Contributing
@ -226,28 +249,28 @@ Contributing
Contributions are welcome in the form of tuning results for OpenCL devices previously untested. Furthermore, merge requests are welcome as long as they contain unit additions or modifications. Such contributions should follow the CLBlast coding style, which is based on the [Google C++ style guide](https://google-styleguide.googlecode.com/svn/trunk/cppguide.html) and the Effective C++ books by Scott Meyers.
The contributing authors so far are:
The contributing authors (code, pull requests, testing) so far are:
* [Cedric Nugteren](http://www.cedricnugteren.nl)
* [Anton Lokhmotov](https://github.com/psyhtest)
* [Dragan Djuric](https://github.com/blueberry)
* [Hugh Perkins](https://github.com/hughperkins)
Tuning and testing on a variety of OpenCL devices was made possible by:
* [TU/e ES research group](http://www.es.ele.tue.nl/)
* [ASCI DAS4 and DAS5](http://www.cs.vu.nl/das4/)
* [Dividiti](http://www.dividiti.com)
* [dividiti](http://www.dividiti.com)
* [SURFsara HPC center](http://www.surfsara.com)
Support us
-------------
This project started in March 2015 as an evenings and weekends free-time project next to a full-time job. If you are in the position to support the project by OpenCL-hardware donations or otherwise, please find contact information on the [website of the main author](http://www.cedricnugteren.nl).
This project started in March 2015 as an evenings and weekends free-time project next to a full-time job for Cedric Nugteren. If you are in the position to support the project by OpenCL-hardware donations or otherwise, please find contact information on the [website of the main author](http://www.cedricnugteren.nl).
To-do list before release of version 1.0
-------------
- Support all routines supported by clBLAS
- Allow the user control over events and synchronization
- Add half-precision routines (e.g. HGEMM)
- Enable correctness and performance testing against a CPU-based BLAS library
- Test in multi-threaded environments
- Add API documentation

View file

@ -0,0 +1,75 @@
# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
# width of 100 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
#
# ==================================================================================================
#
# Defines the following variables:
# CBLAS_FOUND Boolean holding whether or not the Netlib BLAS library was found
# CBLAS_INCLUDE_DIRS The Netlib BLAS include directory
# CBLAS_LIBRARIES The Netlib BLAS library
#
# In case BLAS is not installed in the default directory, set the CBLAS_ROOT variable to point to
# the root of BLAS, such that 'cblas.h' can be found in $CBLAS_ROOT/include. This can either be
# done using an environmental variable (e.g. export CBLAS_ROOT=/path/to/BLAS) or using a CMake
# variable (e.g. cmake -DCBLAS_ROOT=/path/to/BLAS ..).
#
# ==================================================================================================
# Sets the possible install locations
set(CBLAS_HINTS
${CBLAS_ROOT}
$ENV{CBLAS_ROOT}
)
set(CBLAS_PATHS
/usr
/usr/local
/usr/local/opt
/System/Library/Frameworks
)
# Finds the include directories
find_path(CBLAS_INCLUDE_DIRS
NAMES cblas.h
HINTS ${CBLAS_HINTS}
PATH_SUFFIXES
include inc include/x86_64 include/x64
openblas/include include/blis blis/include blis/include/blis
Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Versions/Current/Headers
PATHS ${CBLAS_PATHS}
DOC "Netlib BLAS include header cblas.h"
)
mark_as_advanced(CBLAS_INCLUDE_DIRS)
# Finds the library
find_library(CBLAS_LIBRARIES
NAMES cblas blas mkl blis openblas accelerate
HINTS ${CBLAS_HINTS}
PATH_SUFFIXES
lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 lib/import lib64/import
openblas/lib blis/lib lib/atlas-base
PATHS ${CBLAS_PATHS}
DOC "Netlib BLAS library"
)
mark_as_advanced(CBLAS_LIBRARIES)
# ==================================================================================================
# Notification messages
if(NOT CBLAS_INCLUDE_DIRS)
message(STATUS "Could NOT find 'cblas.h', install a CPU Netlib BLAS or set CBLAS_ROOT")
endif()
if(NOT CBLAS_LIBRARIES)
message(STATUS "Could NOT find a CPU Netlib BLAS library, install it or set CBLAS_ROOT")
endif()
# Determines whether or not BLAS was found
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(CBLAS DEFAULT_MSG CBLAS_INCLUDE_DIRS CBLAS_LIBRARIES)
# ==================================================================================================

2434
doc/clblast.md Normal file

File diff suppressed because it is too large Load diff

View file

@ -73,11 +73,11 @@ enum class StatusCode {
};
// Matrix layout and transpose types
enum class Layout { kRowMajor, kColMajor };
enum class Transpose { kNo, kYes, kConjugate };
enum class Side { kLeft, kRight };
enum class Triangle { kUpper, kLower };
enum class Diagonal { kUnit, kNonUnit };
enum class Layout { kRowMajor = 101, kColMajor = 102 };
enum class Transpose { kNo = 111, kYes = 112, kConjugate = 113 };
enum class Triangle { kUpper = 121, kLower = 122 };
enum class Diagonal { kNonUnit = 131, kUnit = 132 };
enum class Side { kLeft = 141, kRight = 142 };
// Precision scoped enum (values in bits)
enum class Precision { kHalf = 16, kSingle = 32, kDouble = 64,
@ -87,26 +87,60 @@ enum class Precision { kHalf = 16, kSingle = 32, kDouble = 64,
// BLAS level-1 (vector-vector) routines
// =================================================================================================
// Generate givens plane rotation: SROTG/DROTG
template <typename T>
StatusCode Rotg(cl_mem sa_buffer, const size_t sa_offset,
cl_mem sb_buffer, const size_t sb_offset,
cl_mem sc_buffer, const size_t sc_offset,
cl_mem ss_buffer, const size_t ss_offset,
cl_command_queue* queue, cl_event* event = nullptr);
// Generate modified givens plane rotation: SROTMG/DROTMG
template <typename T>
StatusCode Rotmg(cl_mem sd1_buffer, const size_t sd1_offset,
cl_mem sd2_buffer, const size_t sd2_offset,
cl_mem sx1_buffer, const size_t sx1_offset,
const cl_mem sy1_buffer, const size_t sy1_offset,
cl_mem sparam_buffer, const size_t sparam_offset,
cl_command_queue* queue, cl_event* event = nullptr);
// Apply givens plane rotation: SROT/DROT
template <typename T>
StatusCode Rot(const size_t n,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
const T cos,
const T sin,
cl_command_queue* queue, cl_event* event = nullptr);
// Apply modified givens plane rotation: SROTM/DROTM
template <typename T>
StatusCode Rotm(const size_t n,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem sparam_buffer, const size_t sparam_offset,
cl_command_queue* queue, cl_event* event = nullptr);
// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP
template <typename T>
StatusCode Swap(const size_t n,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL
template <typename T>
StatusCode Scal(const size_t n,
const T alpha,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY
template <typename T>
StatusCode Copy(const size_t n,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY
template <typename T>
@ -114,7 +148,7 @@ StatusCode Axpy(const size_t n,
const T alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Dot product of two vectors: SDOT/DDOT
template <typename T>
@ -122,7 +156,7 @@ StatusCode Dot(const size_t n,
cl_mem dot_buffer, const size_t dot_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Dot product of two complex vectors: CDOTU/ZDOTU
template <typename T>
@ -130,7 +164,7 @@ StatusCode Dotu(const size_t n,
cl_mem dot_buffer, const size_t dot_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC
template <typename T>
@ -138,7 +172,49 @@ StatusCode Dotc(const size_t n,
cl_mem dot_buffer, const size_t dot_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Euclidean norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2
template <typename T>
StatusCode Nrm2(const size_t n,
cl_mem nrm2_buffer, const size_t nrm2_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event = nullptr);
// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM
template <typename T>
StatusCode Asum(const size_t n,
cl_mem asum_buffer, const size_t asum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event = nullptr);
// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM
template <typename T>
StatusCode Sum(const size_t n,
cl_mem sum_buffer, const size_t sum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event = nullptr);
// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX
template <typename T>
StatusCode Amax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event = nullptr);
// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX
template <typename T>
StatusCode Max(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event = nullptr);
// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN
template <typename T>
StatusCode Min(const size_t n,
cl_mem imin_buffer, const size_t imin_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event = nullptr);
// =================================================================================================
// BLAS level-2 (matrix-vector) routines
@ -153,7 +229,7 @@ StatusCode Gemv(const Layout layout, const Transpose a_transpose,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV
template <typename T>
@ -164,7 +240,7 @@ StatusCode Gbmv(const Layout layout, const Transpose a_transpose,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Hermitian matrix-vector multiplication: CHEMV/ZHEMV
template <typename T>
@ -175,7 +251,7 @@ StatusCode Hemv(const Layout layout, const Triangle triangle,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV
template <typename T>
@ -186,7 +262,7 @@ StatusCode Hbmv(const Layout layout, const Triangle triangle,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV
template <typename T>
@ -197,7 +273,7 @@ StatusCode Hpmv(const Layout layout, const Triangle triangle,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Symmetric matrix-vector multiplication: SSYMV/DSYMV
template <typename T>
@ -208,7 +284,7 @@ StatusCode Symv(const Layout layout, const Triangle triangle,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV
template <typename T>
@ -219,7 +295,7 @@ StatusCode Sbmv(const Layout layout, const Triangle triangle,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV
template <typename T>
@ -230,7 +306,7 @@ StatusCode Spmv(const Layout layout, const Triangle triangle,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV
template <typename T>
@ -238,7 +314,7 @@ StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_
const size_t n,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV
template <typename T>
@ -246,7 +322,7 @@ StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_
const size_t n, const size_t k,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV
template <typename T>
@ -254,7 +330,7 @@ StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_
const size_t n,
const cl_mem ap_buffer, const size_t ap_offset,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV
template <typename T>
@ -262,7 +338,7 @@ StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_
const size_t n,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV
template <typename T>
@ -270,7 +346,7 @@ StatusCode Tbsv(const Layout layout, const Triangle triangle, const Transpose a_
const size_t n, const size_t k,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV
template <typename T>
@ -278,7 +354,7 @@ StatusCode Tpsv(const Layout layout, const Triangle triangle, const Transpose a_
const size_t n,
const cl_mem ap_buffer, const size_t ap_offset,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// General rank-1 matrix update: SGER/DGER
template <typename T>
@ -288,7 +364,7 @@ StatusCode Ger(const Layout layout,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// General rank-1 complex matrix update: CGERU/ZGERU
template <typename T>
@ -298,7 +374,7 @@ StatusCode Geru(const Layout layout,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// General rank-1 complex conjugated matrix update: CGERC/ZGERC
template <typename T>
@ -308,7 +384,7 @@ StatusCode Gerc(const Layout layout,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Hermitian rank-1 matrix update: CHER/ZHER
template <typename T>
@ -317,7 +393,7 @@ StatusCode Her(const Layout layout, const Triangle triangle,
const T alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Hermitian packed rank-1 matrix update: CHPR/ZHPR
template <typename T>
@ -326,7 +402,7 @@ StatusCode Hpr(const Layout layout, const Triangle triangle,
const T alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem ap_buffer, const size_t ap_offset,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Hermitian rank-2 matrix update: CHER2/ZHER2
template <typename T>
@ -336,7 +412,7 @@ StatusCode Her2(const Layout layout, const Triangle triangle,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2
template <typename T>
@ -346,7 +422,7 @@ StatusCode Hpr2(const Layout layout, const Triangle triangle,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem ap_buffer, const size_t ap_offset,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Symmetric rank-1 matrix update: SSYR/DSYR
template <typename T>
@ -355,7 +431,7 @@ StatusCode Syr(const Layout layout, const Triangle triangle,
const T alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Symmetric packed rank-1 matrix update: SSPR/DSPR
template <typename T>
@ -364,7 +440,7 @@ StatusCode Spr(const Layout layout, const Triangle triangle,
const T alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem ap_buffer, const size_t ap_offset,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Symmetric rank-2 matrix update: SSYR2/DSYR2
template <typename T>
@ -374,7 +450,7 @@ StatusCode Syr2(const Layout layout, const Triangle triangle,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Symmetric packed rank-2 matrix update: SSPR2/DSPR2
template <typename T>
@ -384,7 +460,7 @@ StatusCode Spr2(const Layout layout, const Triangle triangle,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem ap_buffer, const size_t ap_offset,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// =================================================================================================
// BLAS level-3 (matrix-matrix) routines
@ -399,7 +475,7 @@ StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpos
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM
template <typename T>
@ -410,7 +486,7 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM
template <typename T>
@ -421,7 +497,7 @@ StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle,
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK
template <typename T>
@ -431,7 +507,7 @@ StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Rank-K update of a hermitian matrix: CHERK/ZHERK
template <typename T>
@ -441,7 +517,7 @@ StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K
template <typename T>
@ -452,7 +528,7 @@ StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose a
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K
template <typename T, typename U>
@ -463,7 +539,7 @@ StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose a
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
const U beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM
template <typename T>
@ -472,7 +548,7 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, c
const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
template <typename T>
@ -481,9 +557,20 @@ StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, c
const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event);
cl_command_queue* queue, cl_event* event = nullptr);
// =================================================================================================
// CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on
// for the same device. This cache can be cleared to free up system memory or in case of debugging.
StatusCode ClearCache();
// The cache can also be pre-initialized for a specific device with all possible CLBlast kernels.
// Further CLBlast routine calls will then run at maximum speed.
StatusCode FillCache(const cl_device_id device);
// =================================================================================================
} // namespace clblast
// CLBLAST_CLBLAST_H_

File diff suppressed because it is too large Load diff

100
include/internal/cache.h Normal file
View file

@ -0,0 +1,100 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the caching functionality of compiled binaries and programs.
//
// =================================================================================================
#ifndef CLBLAST_CACHE_H_
#define CLBLAST_CACHE_H_
#include <string>
#include <vector>
#include <mutex>
#include "internal/utilities.h"
namespace clblast {
namespace cache {
// =================================================================================================
// The cache of compiled OpenCL binaries, along with some meta-data
struct BinaryCache {
  std::string binary;        // compiled program binary, stored as an opaque byte-string
  std::string device_name;   // name of the OpenCL device this binary was built for
  Precision precision;       // floating-point precision the kernel was compiled with
  std::string routine_name_; // name of the CLBlast routine this binary implements

  // Finds out whether the properties match. Declared 'const' since it only
  // inspects the stored meta-data and never modifies the cache entry.
  bool MatchInCache(const std::string &ref_device, const Precision &ref_precision,
                    const std::string &ref_routine) const {
    return (device_name == ref_device &&
            precision == ref_precision &&
            routine_name_ == ref_routine);
  }
};
// The actual cache, implemented as a vector of the above data-type, and its mutex
// NOTE(review): 'static' at namespace scope in a header gives every translation unit
// that includes this file its OWN copy of the cache and mutex. That is only correct
// if this header is included from a single .cc file -- verify against the build.
static std::vector<BinaryCache> binary_cache_;
static std::mutex binary_cache_mutex_;
// =================================================================================================
// The cache of compiled OpenCL programs, along with some meta-data
struct ProgramCache {
  Program program;            // the compiled OpenCL program object
  ContextPointer context_ptr; // pointer identifying the context the program belongs to
  Precision precision;        // floating-point precision the program was compiled with
  std::string routine_name_;  // name of the CLBlast routine this program implements

  // Finds out whether the properties match. Declared 'const' since it only
  // inspects the stored meta-data and never modifies the cache entry.
  bool MatchInCache(const ContextPointer ref_context, const Precision &ref_precision,
                    const std::string &ref_routine) const {
    return (context_ptr == ref_context &&
            precision == ref_precision &&
            routine_name_ == ref_routine);
  }
};
// The actual cache, implemented as a vector of the above data-type, and its mutex
// NOTE(review): as with the binary cache above, 'static' in a header means one copy
// per translation unit -- safe only if a single .cc file includes this header.
static std::vector<ProgramCache> program_cache_;
static std::mutex program_cache_mutex_;
// =================================================================================================
// Stores the compiled binary or program in the cache
void StoreBinaryToCache(const std::string &binary, const std::string &device_name,
const Precision &precision, const std::string &routine_name);
void StoreProgramToCache(const Program &program, const Context &context,
const Precision &precision, const std::string &routine_name);
// Queries the cache and retrieves a matching binary or program. Assumes that the match is
// available, throws otherwise.
const std::string& GetBinaryFromCache(const std::string &device_name, const Precision &precision,
const std::string &routine_name);
const Program& GetProgramFromCache(const Context &context, const Precision &precision,
const std::string &routine_name);
// Queries the cache to see whether or not the compiled kernel is already there
bool BinaryIsInCache(const std::string &device_name, const Precision &precision,
const std::string &routine_name);
bool ProgramIsInCache(const Context &context, const Precision &precision,
const std::string &routine_name);
// =================================================================================================
// Clears the cache of stored binaries
StatusCode ClearCache();
// =================================================================================================
} // namespace cache
} // namespace clblast
// CLBLAST_CACHE_H_
#endif

View file

@ -78,11 +78,16 @@ class Event {
// Regular constructor
explicit Event(): event_(nullptr) { }
// Waits for completion of this event.
// Blocks the calling thread via clWaitForEvents until the event has finished;
// any OpenCL error is converted into an exception by CheckError.
void WaitForCompletion() const {
CheckError(clWaitForEvents(1, &event_));
}
// Retrieves the elapsed time of the last recorded event. Note that no error checking is done on
// the 'clGetEventProfilingInfo' function, since there is a bug in Apple's OpenCL implementation:
// http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx
float GetElapsedTime() const {
CheckError(clWaitForEvents(1, &event_));
WaitForCompletion();
auto bytes = size_t{0};
clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes);
auto time_start = size_t{0};
@ -95,10 +100,14 @@ class Event {
// Accessor to the private data-member
cl_event& operator()() { return event_; }
cl_event* pointer() { return &event_; }
private:
cl_event event_;
};
// Pointer to an OpenCL event
using EventPointer = cl_event*;
// =================================================================================================
// C++11 version of 'cl_platform_id'
@ -260,10 +269,14 @@ class Context {
// Accessor to the private data-member
const cl_context& operator()() const { return *context_; }
cl_context* pointer() const { return &(*context_); }
private:
std::shared_ptr<cl_context> context_;
};
// Pointer to an OpenCL context
using ContextPointer = cl_context*;
// =================================================================================================
// Enumeration of build statuses of the run-time compilation process
@ -274,7 +287,7 @@ class Program {
public:
// Note that there is no constructor based on the regular OpenCL data-type because of extra state
// Regular constructor with memory management
// Source-based constructor with memory management
explicit Program(const Context &context, std::string source):
program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }),
length_(source.length()),
@ -285,6 +298,22 @@ class Program {
CheckError(status);
}
// Binary-based constructor with memory management.
// Builds a Program directly from a previously compiled device binary, so the
// expensive source-compilation step can be skipped for cached kernels.
explicit Program(const Device &device, const Context &context, const std::string& binary):
program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }),
length_(binary.length()),
source_(binary), // 'source_' doubles as the binary storage in this code path
source_ptr_(&source_[0]) {
auto status1 = CL_SUCCESS; // per-device binary-load status (binary_status argument)
auto status2 = CL_SUCCESS; // overall error code of the create call (errcode_ret)
const cl_device_id dev = device();
*program_ = clCreateProgramWithBinary(context(), 1, &dev, &length_,
reinterpret_cast<const unsigned char**>(&source_ptr_),
&status1, &status2);
CheckError(status1);
CheckError(status2);
}
// Compiles the device program and returns whether or not there were any warnings/errors
BuildStatus Build(const Device &device, std::vector<std::string> &options) {
auto options_string = std::accumulate(options.begin(), options.end(), std::string{" "});
@ -313,7 +342,7 @@ class Program {
return result;
}
// Retrieves an intermediate representation of the compiled program
// Retrieves a binary or an intermediate representation of the compiled program
std::string GetIR() const {
auto bytes = size_t{0};
CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bytes, nullptr));
@ -329,7 +358,7 @@ class Program {
private:
std::shared_ptr<cl_program> program_;
size_t length_;
std::string source_;
std::string source_; // Note: the source can also be a binary or IR
const char* source_ptr_;
};
@ -468,31 +497,33 @@ class Buffer {
}
// Copies from device to host: reading the device buffer a-synchronously
void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) {
void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
if (access_ == BufferAccess::kWriteOnly) { Error("reading from a write-only buffer"); }
CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
host, 0, nullptr, nullptr));
}
void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host,
const size_t offset = 0) {
const size_t offset = 0) const {
if (host.size() < size) { Error("target host buffer is too small"); }
ReadAsync(queue, size, host.data(), offset);
}
void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host,
const size_t offset = 0) {
const size_t offset = 0) const {
if (host.size() < size) { Error("target host buffer is too small"); }
ReadAsync(queue, size, host.data(), offset);
}
// Copies from device to host: reading the device buffer
void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) {
void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
ReadAsync(queue, size, host, offset);
queue.Finish();
}
void Read(const Queue &queue, const size_t size, std::vector<T> &host, const size_t offset = 0) {
void Read(const Queue &queue, const size_t size, std::vector<T> &host,
const size_t offset = 0) const {
Read(queue, size, host.data(), offset);
}
void Read(const Queue &queue, const size_t size, BufferHost<T> &host, const size_t offset = 0) {
void Read(const Queue &queue, const size_t size, BufferHost<T> &host,
const size_t offset = 0) const {
Read(queue, size, host.data(), offset);
}
@ -601,17 +632,37 @@ class Kernel {
// Launches a kernel onto the specified queue
void Launch(const Queue &queue, const std::vector<size_t> &global,
const std::vector<size_t> &local, Event &event) {
const std::vector<size_t> &local, EventPointer event) {
CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
nullptr, global.data(), local.data(),
0, nullptr, &(event())));
0, nullptr, event));
}
// As above, but with an event waiting list
void Launch(const Queue &queue, const std::vector<size_t> &global,
const std::vector<size_t> &local, EventPointer event,
std::vector<Event>& waitForEvents) {
if (waitForEvents.size() == 0) { return Launch(queue, global, local, event); }
// Builds a plain version of the events waiting list
auto waitForEventsPlain = std::vector<cl_event>();
for (auto &waitEvent : waitForEvents) {
waitForEventsPlain.push_back(waitEvent());
}
// Launches the kernel while waiting for other events
CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
nullptr, global.data(), local.data(),
static_cast<cl_uint>(waitForEventsPlain.size()),
waitForEventsPlain.data(),
event));
}
// As above, but with the default local workgroup size
void Launch(const Queue &queue, const std::vector<size_t> &global, Event &event) {
void Launch(const Queue &queue, const std::vector<size_t> &global, EventPointer event) {
CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
nullptr, global.data(), nullptr,
0, nullptr, &(event())));
0, nullptr, event));
}
// Accessor to the private data-member

View file

@ -19,8 +19,10 @@ const Database::DatabaseEntry Database::CopySingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "Tahiti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
}
},
{ // ARM GPUs
@ -78,6 +80,8 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
@ -129,6 +133,8 @@ const Database::DatabaseEntry Database::CopyDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Pitcairn", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
@ -181,8 +187,10 @@ const Database::DatabaseEntry Database::CopyComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",8} } },
{ "Pitcairn", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
{ // ARM GPUs

View file

@ -19,6 +19,8 @@ const Database::DatabaseEntry Database::PadSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
{ "Pitcairn", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
@ -78,8 +80,10 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tahiti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // ARM GPUs
@ -124,7 +128,7 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
}
@ -137,8 +141,10 @@ const Database::DatabaseEntry Database::PadDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // ARM GPUs
@ -189,6 +195,8 @@ const Database::DatabaseEntry Database::PadComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Pitcairn", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}

View file

@ -19,6 +19,8 @@ const Database::DatabaseEntry Database::PadtransposeSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Hawaii", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
}
@ -78,8 +80,10 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
}
},
{ // ARM GPUs
@ -137,8 +141,10 @@ const Database::DatabaseEntry Database::PadtransposeDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
}
},
{ // ARM GPUs
@ -189,6 +195,8 @@ const Database::DatabaseEntry Database::PadtransposeComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
}

View file

@ -19,8 +19,10 @@ const Database::DatabaseEntry Database::TransposeSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
{ "Hawaii", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
{ "Pitcairn", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
}
},
{ // ARM GPUs
@ -78,6 +80,8 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Hawaii", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Pitcairn", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
}
@ -131,8 +135,10 @@ const Database::DatabaseEntry Database::TransposeDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Hawaii", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
}
},
{ // ARM GPUs
@ -183,6 +189,8 @@ const Database::DatabaseEntry Database::TransposeComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Hawaii", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
}

View file

@ -19,6 +19,8 @@ const Database::DatabaseEntry Database::XaxpySingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Hawaii", { {"VW",2}, {"WGS",64}, {"WPT",2} } },
{ "Pitcairn", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "Tahiti", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
}
@ -78,6 +80,8 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",2}, {"WGS",64}, {"WPT",8} } },
{ "Hawaii", { {"VW",1}, {"WGS",128}, {"WPT",2} } },
{ "Pitcairn", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
}
@ -137,6 +141,8 @@ const Database::DatabaseEntry Database::XaxpyDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "Hawaii", { {"VW",1}, {"WGS",64}, {"WPT",2} } },
{ "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
}
@ -171,12 +177,12 @@ const Database::DatabaseEntry Database::XaxpyDouble = {
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
{ "Tesla K20m", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "Tesla K40m", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
}
},
}
@ -189,8 +195,10 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Hawaii", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Tahiti", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
}
},
{ // ARM GPUs

View file

@ -18,54 +18,38 @@ const Database::DatabaseEntry Database::XdotSingle = {
"Xdot", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
{ "Tahiti", { {"VW",1}, {"WGS1",256}, {"WGS2",256} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
}
},
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"VW",1}, {"WGS1",128}, {"WGS2",256} } },
{ "default", { {"VW",1}, {"WGS1",128}, {"WGS2",256} } },
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WGS2",32} } },
{ "Hawaii", { {"WGS1",256}, {"WGS2",32} } },
{ "Pitcairn", { {"WGS1",128}, {"WGS2",32} } },
{ "Tahiti", { {"WGS1",128}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",32} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",1024}, {"WGS2",32} } },
{ "default", { {"WGS1",1024}, {"WGS2",32} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Iris", { {"VW",1}, {"WGS1",512}, {"WGS2",32} } },
{ "Iris Pro", { {"VW",1}, {"WGS1",128}, {"WGS2",512} } },
{ "default", { {"VW",1}, {"WGS1",128}, {"WGS2",32} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "Iris Pro", { {"WGS1",512}, {"WGS2",64} } },
{ "default", { {"WGS1",512}, {"WGS2",64} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"VW",1}, {"WGS1",256}, {"WGS2",128} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS1",128}, {"WGS2",128} } },
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "Tesla K20m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "Tesla K40m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
{ "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",128} } },
{ "GeForce GTX 980", { {"WGS1",256}, {"WGS2",32} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",1024}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",32} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",32} } },
}
},
}
@ -77,54 +61,38 @@ const Database::DatabaseEntry Database::XdotComplexSingle = {
"Xdot", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
{ "Tahiti", { {"VW",1}, {"WGS1",64}, {"WGS2",256} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
}
},
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"VW",1}, {"WGS1",128}, {"WGS2",512} } },
{ "default", { {"VW",1}, {"WGS1",128}, {"WGS2",512} } },
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",32} } },
{ "Hawaii", { {"WGS1",256}, {"WGS2",32} } },
{ "Pitcairn", { {"WGS1",256}, {"WGS2",32} } },
{ "Tahiti", { {"WGS1",64}, {"WGS2",32} } },
{ "default", { {"WGS1",64}, {"WGS2",32} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",1024}, {"WGS2",32} } },
{ "default", { {"WGS1",1024}, {"WGS2",32} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Iris", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
{ "Iris Pro", { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "Iris Pro", { {"WGS1",32}, {"WGS2",32} } },
{ "default", { {"WGS1",32}, {"WGS2",32} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"VW",1}, {"WGS1",512}, {"WGS2",512} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS1",256}, {"WGS2",32} } },
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS1",128}, {"WGS2",32} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "Tesla K20m", { {"VW",1}, {"WGS1",256}, {"WGS2",512} } },
{ "Tesla K40m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } },
{ "GeForce GTX 980", { {"WGS1",256}, {"WGS2",64} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",32} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
{ "default", { {"WGS1",32}, {"WGS2",32} } },
}
},
}
@ -136,47 +104,32 @@ const Database::DatabaseEntry Database::XdotDouble = {
"Xdot", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
{ "Tahiti", { {"VW",1}, {"WGS1",64}, {"WGS2",256} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
}
},
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"VW",1}, {"WGS1",64}, {"WGS2",512} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",512} } },
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",128} } },
{ "Hawaii", { {"WGS1",256}, {"WGS2",32} } },
{ "Pitcairn", { {"WGS1",128}, {"WGS2",32} } },
{ "Tahiti", { {"WGS1",256}, {"WGS2",32} } },
{ "default", { {"WGS1",64}, {"WGS2",32} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS1",512}, {"WGS2",512} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS1",1024}, {"WGS2",512} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",512} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",64} } },
{ "default", { {"WGS1",512}, {"WGS2",64} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS1",32}, {"WGS2",512} } },
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS1",128}, {"WGS2",128} } },
{ "Tesla K20m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "Tesla K40m", { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",32}, {"WGS2",128} } },
{ "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } },
{ "GeForce GTX 980", { {"WGS1",128}, {"WGS2",32} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",32} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW",1}, {"WGS1",32}, {"WGS2",128} } },
{ "default", { {"WGS1",64}, {"WGS2",32} } },
}
},
}
@ -188,47 +141,32 @@ const Database::DatabaseEntry Database::XdotComplexDouble = {
"Xdot", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
{ "Tahiti", { {"VW",1}, {"WGS1",64}, {"WGS2",256} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
}
},
{ // ARM GPUs
kDeviceTypeGPU, "ARM", {
{ "Mali-T628", { {"VW",1}, {"WGS1",32}, {"WGS2",64} } },
{ "default", { {"VW",1}, {"WGS1",32}, {"WGS2",64} } },
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",32} } },
{ "Hawaii", { {"WGS1",256}, {"WGS2",32} } },
{ "Pitcairn", { {"WGS1",256}, {"WGS2",32} } },
{ "Tahiti", { {"WGS1",256}, {"WGS2",32} } },
{ "default", { {"WGS1",64}, {"WGS2",32} } },
}
},
{ // Intel CPUs
kDeviceTypeCPU, "Intel", {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
}
},
{ // Intel accelerators
kDeviceTypeAccelerator, "Intel", {
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS1",32}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",32}, {"WGS2",1024} } },
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",1024}, {"WGS2",32} } },
{ "default", { {"WGS1",1024}, {"WGS2",32} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 480", { {"VW",1}, {"WGS1",512}, {"WGS2",512} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS1",256}, {"WGS2",64} } },
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS1",32}, {"WGS2",64} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS1",32}, {"WGS2",128} } },
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS1",128}, {"WGS2",512} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS1",128}, {"WGS2",128} } },
{ "Tesla K20m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "Tesla K40m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
{ "default", { {"VW",1}, {"WGS1",32}, {"WGS2",64} } },
{ "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } },
{ "GeForce GTX 680", { {"WGS1",256}, {"WGS2",64} } },
{ "GeForce GTX 980", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX TITAN X", { {"WGS1",128}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",128}, {"WGS2",32} } },
{ "default", { {"WGS1",64}, {"WGS2",32} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW",1}, {"WGS1",32}, {"WGS2",32} } },
{ "default", { {"WGS1",64}, {"WGS2",32} } },
}
},
}

View file

@ -19,8 +19,10 @@ const Database::DatabaseEntry Database::XgemmSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",1} } },
{ "Hawaii", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } },
{ "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
{ // ARM GPUs
@ -60,12 +62,12 @@ const Database::DatabaseEntry Database::XgemmSingle = {
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } },
{ "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
}
@ -78,8 +80,10 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
{ "Hawaii", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } },
{ "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
{ // ARM GPUs
@ -100,7 +104,7 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
kDeviceTypeGPU, "Intel", {
{ "Iris", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Iris Pro", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
{ // Intel accelerators
@ -119,12 +123,12 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
{ "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "Tesla K40m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
}
@ -137,8 +141,10 @@ const Database::DatabaseEntry Database::XgemmDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
{ "Hawaii", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
}
},
{ // ARM GPUs
@ -171,12 +177,12 @@ const Database::DatabaseEntry Database::XgemmDouble = {
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K40m", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
}
@ -189,8 +195,10 @@ const Database::DatabaseEntry Database::XgemmComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "Hawaii", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
{ // ARM GPUs
@ -222,12 +230,12 @@ const Database::DatabaseEntry Database::XgemmComplexDouble = {
{ "GeForce GTX TITAN X", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
}

View file

@ -19,6 +19,8 @@ const Database::DatabaseEntry Database::XgemvSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Hawaii", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
@ -71,8 +73,10 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",256}, {"WPT2",2}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
{ "Hawaii", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Pitcairn", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
{ // Intel CPUs
@ -119,6 +123,8 @@ const Database::DatabaseEntry Database::XgemvDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
{ "Hawaii", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
{ "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
@ -164,6 +170,8 @@ const Database::DatabaseEntry Database::XgemvComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
{ "Hawaii", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}

View file

@ -19,8 +19,10 @@ const Database::DatabaseEntry Database::XgerSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
{ "Hawaii", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
{ "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
{ "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
{ "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
}
},
{ // ARM GPUs
@ -65,8 +67,10 @@ const Database::DatabaseEntry Database::XgerComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
{ "Hawaii", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } },
{ "Pitcairn", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
{ "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
{ "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
}
},
{ // ARM GPUs
@ -111,8 +115,10 @@ const Database::DatabaseEntry Database::XgerDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
{ "Hawaii", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
{ "default", { {"WGS1",32}, {"WGS2",2}, {"WPT",1} } },
{ "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } },
}
},
{ // ARM GPUs
@ -138,7 +144,7 @@ const Database::DatabaseEntry Database::XgerDouble = {
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",16}, {"WGS2",2}, {"WPT",1} } },
{ "default", { {"WGS1",16}, {"WGS2",1}, {"WPT",1} } },
}
},
}
@ -151,6 +157,8 @@ const Database::DatabaseEntry Database::XgerComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "Hawaii", { {"WGS1",128}, {"WGS2",1}, {"WPT",1} } },
{ "Pitcairn", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
{ "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } },
}

View file

@ -0,0 +1,34 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file provides macro's to define the public API. This is needed when building a Windows DLL.
// Note: this is only used for the C++ interface, the C interface has its own definition included in
// the header file itself.
//
// =================================================================================================
#ifndef CLBLAST_PUBLIC_API_H_
#define CLBLAST_PUBLIC_API_H_
namespace clblast {
// =================================================================================================
// Exports library functions under Windows when building a DLL. See also:
// https://msdn.microsoft.com/en-us/library/a90k134d.aspx
#ifdef _WIN32
#define PUBLIC_API __declspec(dllexport)
#else
#define PUBLIC_API
#endif
// =================================================================================================
} // namespace clblast
// CLBLAST_PUBLIC_API_H_
#endif

View file

@ -19,6 +19,7 @@
#include <string>
#include <vector>
#include "internal/cache.h"
#include "internal/utilities.h"
#include "internal/database.h"
@ -30,30 +31,11 @@ template <typename T>
class Routine {
public:
// The cache of compiled OpenCL programs, along with some meta-data
struct ProgramCache {
Program program;
std::string device_name;
Precision precision;
std::string routine_name_;
// Finds out whether the properties match
bool MatchInCache(const std::string &ref_device, const Precision &ref_precision,
const std::string &ref_routine) {
return (device_name == ref_device &&
precision == ref_precision &&
routine_name_ == ref_routine);
}
};
// The actual cache, implemented as a vector of the above data-type
static std::vector<ProgramCache> program_cache_;
// Helper functions which check for errors in the status code
static constexpr bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); }
// Base class constructor
explicit Routine(Queue &queue, Event &event, const std::string &name,
explicit Routine(Queue &queue, EventPointer event, const std::string &name,
const std::vector<std::string> &routines, const Precision precision);
// Set-up phase of the kernel
@ -63,7 +45,12 @@ class Routine {
// Runs a kernel given the global and local thread sizes
StatusCode RunKernel(Kernel &kernel, std::vector<size_t> &global,
const std::vector<size_t> &local);
const std::vector<size_t> &local, EventPointer event,
std::vector<Event>& waitForEvents);
// As above, but without an event waiting list
StatusCode RunKernel(Kernel &kernel, std::vector<size_t> &global,
const std::vector<size_t> &local, EventPointer event);
// Tests for valid inputs of matrices A, B, and C
StatusCode TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer,
@ -75,17 +62,22 @@ class Routine {
StatusCode TestMatrixAP(const size_t n, const Buffer<T> &buffer,
const size_t offset, const size_t data_size);
// Tests for valid inputs of vectors X and Y
// Tests for valid inputs of vector X and Y
StatusCode TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset,
const size_t inc, const size_t data_size);
StatusCode TestVectorY(const size_t n, const Buffer<T> &buffer, const size_t offset,
const size_t inc, const size_t data_size);
// Tests for valid inputs of other vectors
StatusCode TestVectorDot(const size_t n, const Buffer<T> &buffer, const size_t offset,
const size_t data_size);
StatusCode TestVectorIndex(const size_t n, const Buffer<unsigned int> &buffer,
const size_t offset, const size_t data_size);
// Copies/transposes a matrix and padds/unpads it with zeroes. This method is also able to write
// to symmetric and triangular matrices through optional arguments.
StatusCode PadCopyTransposeMatrix(const size_t src_one, const size_t src_two,
StatusCode PadCopyTransposeMatrix(EventPointer event, std::vector<Event>& waitForEvents,
const size_t src_one, const size_t src_two,
const size_t src_ld, const size_t src_offset,
const Buffer<T> &src,
const size_t dest_one, const size_t dest_two,
@ -95,12 +87,30 @@ class Routine {
const bool do_transpose, const bool do_conjugate,
const bool upper = false, const bool lower = false,
const bool diagonal_imag_zero = false);
// Queries the cache and retrieve either a matching program or a boolean whether a match exists.
// The first assumes that the program is available in the cache and will throw an exception
// otherwise.
const Program& GetProgramFromCache() const;
bool ProgramIsInCache() const;
// Stores a newly compiled binary/program into the cache
void StoreBinaryToCache(const std::string& binary) const {
cache::StoreBinaryToCache(binary, device_name_, precision_, routine_name_);
}
void StoreProgramToCache(const Program& program) const {
cache::StoreProgramToCache(program, context_, precision_, routine_name_);
}
// Queries the cache and retrieve either a matching binary/program or a boolean whether a match
// exists. The first assumes that the binary/program is available in the cache and will throw an
// exception otherwise.
std::string GetBinaryFromCache() const {
return cache::GetBinaryFromCache(device_name_, precision_, routine_name_);
}
Program GetProgramFromCache() const {
return cache::GetProgramFromCache(context_, precision_, routine_name_);
}
bool BinaryIsInCache() const {
return cache::BinaryIsInCache(device_name_, precision_, routine_name_);
}
bool ProgramIsInCache() const {
return cache::ProgramIsInCache(context_, precision_, routine_name_);
}
// Non-static variable for the precision. Note that the same variable (but static) might exist in
// a derived class.
@ -112,7 +122,7 @@ class Routine {
// The OpenCL objects, accessible only from derived classes
Queue queue_;
Event event_;
EventPointer event_;
const Context context_;
const Device device_;

View file

@ -0,0 +1,56 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xamax routine. The precision is implemented using a template argument.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XAMAX_H_
#define CLBLAST_ROUTINES_XAMAX_H_
#include "internal/routine.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
// Xamax: level-1 routine computing the index of the element with the maximum
// absolute value (BLAS ixAMAX). Declaration only: the implementation lives in
// the corresponding source file, not visible here.
template <typename T>
class Xamax: public Routine<T> {
public:
// Members and methods from the base class (re-exposed because the base is a
// dependent template and its names are otherwise not visible unqualified)
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorIndex;
using Routine<T>::RunKernel;
using Routine<T>::ErrorIn;
// Constructor; 'event' is the completion event handed to the base Routine
Xamax(Queue &queue, EventPointer event, const std::string &name = "AMAX");
// Templated-precision implementation of the routine. Writes the resulting
// index into 'imax_buffer' at 'imax_offset'; reads n strided elements of x.
StatusCode DoAmax(const size_t n,
const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
private:
// Static variable to get the precision (defined per template instantiation
// in the implementation file)
const static Precision precision_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XAMAX_H_
#endif

View file

@ -0,0 +1,56 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xasum routine. The precision is implemented using a template argument.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XASUM_H_
#define CLBLAST_ROUTINES_XASUM_H_
#include "internal/routine.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
// Xasum: level-1 routine computing the sum of absolute values of a vector
// (BLAS xASUM). Declaration only: the implementation lives in the
// corresponding source file, not visible here.
template <typename T>
class Xasum: public Routine<T> {
public:
// Members and methods from the base class (re-exposed because the base is a
// dependent template and its names are otherwise not visible unqualified)
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorDot;
using Routine<T>::RunKernel;
using Routine<T>::ErrorIn;
// Constructor; 'event' is the completion event handed to the base Routine
Xasum(Queue &queue, EventPointer event, const std::string &name = "ASUM");
// Templated-precision implementation of the routine. Writes the scalar
// result into 'asum_buffer' at 'asum_offset'; reads n strided elements of x.
StatusCode DoAsum(const size_t n,
const Buffer<T> &asum_buffer, const size_t asum_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
private:
// Static variable to get the precision (defined per template instantiation
// in the implementation file)
const static Precision precision_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XASUM_H_
#endif

View file

@ -28,6 +28,7 @@ class Xaxpy: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorY;
@ -35,7 +36,7 @@ class Xaxpy: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
Xaxpy(Queue &queue, Event &event, const std::string &name = "AXPY");
Xaxpy(Queue &queue, EventPointer event, const std::string &name = "AXPY");
// Templated-precision implementation of the routine
StatusCode DoAxpy(const size_t n, const T alpha,

View file

@ -28,6 +28,7 @@ class Xcopy: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorY;
@ -35,7 +36,7 @@ class Xcopy: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
Xcopy(Queue &queue, Event &event, const std::string &name = "COPY");
Xcopy(Queue &queue, EventPointer event, const std::string &name = "COPY");
// Templated-precision implementation of the routine
StatusCode DoCopy(const size_t n,

View file

@ -28,6 +28,7 @@ class Xdot: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
@ -37,7 +38,7 @@ class Xdot: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
Xdot(Queue &queue, Event &event, const std::string &name = "DOT");
Xdot(Queue &queue, EventPointer event, const std::string &name = "DOT");
// Templated-precision implementation of the routine
StatusCode DoDot(const size_t n,

View file

@ -28,7 +28,7 @@ class Xdotc: public Xdot<T> {
using Xdot<T>::DoDot;
// Constructor
Xdotc(Queue &queue, Event &event, const std::string &name = "DOTC");
Xdotc(Queue &queue, EventPointer event, const std::string &name = "DOTC");
// Templated-precision implementation of the routine
StatusCode DoDotc(const size_t n,

View file

@ -28,7 +28,7 @@ class Xdotu: public Xdot<T> {
using Xdot<T>::DoDot;
// Constructor
Xdotu(Queue &queue, Event &event, const std::string &name = "DOTU");
Xdotu(Queue &queue, EventPointer event, const std::string &name = "DOTU");
// Templated-precision implementation of the routine
StatusCode DoDotu(const size_t n,

View file

@ -0,0 +1,49 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xmax routine. The precision is implemented using a template argument.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XMAX_H_
#define CLBLAST_ROUTINES_XMAX_H_
#include "internal/routine.h"
#include "internal/routines/level1/xamax.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
// Xmax: the non-absolute variant of the xAMAX routine, locating the index of
// the (signed) maximum element. It re-uses the Xamax machinery entirely.
template <typename T>
class Xmax: public Xamax<T> {
public:
// Implementation inherited from the absolute-value base class
using Xamax<T>::DoAmax;
// Constructor: forwards to Xamax, differing only in the routine name
Xmax(Queue &queue, EventPointer event, const std::string &name = "MAX"):
    Xamax<T>(queue, event, name) {
}
// Delegates directly to DoAmax; the non-absolute behaviour is selected inside
// the OpenCL kernel by a pre-processor macro derived from the routine name.
StatusCode DoMax(const size_t n,
                 const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
                 const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
  return DoAmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc);
}
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XMAX_H_
#endif

View file

@ -0,0 +1,49 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xmin routine. The precision is implemented using a template argument.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XMIN_H_
#define CLBLAST_ROUTINES_XMIN_H_
#include "internal/routine.h"
#include "internal/routines/level1/xamax.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
// Xmin: the non-absolute minimum variant of the xAMAX routine, locating the
// index of the (signed) minimum element. It re-uses the Xamax machinery.
template <typename T>
class Xmin: public Xamax<T> {
public:
// Implementation inherited from the absolute-value base class
using Xamax<T>::DoAmax;
// Constructor: forwards to Xamax, differing only in the routine name
Xmin(Queue &queue, EventPointer event, const std::string &name = "MIN"):
    Xamax<T>(queue, event, name) {
}
// Delegates directly to DoAmax; the minimum-seeking behaviour is selected
// inside the OpenCL kernel by a pre-processor macro derived from the name.
StatusCode DoMin(const size_t n,
                 const Buffer<unsigned int> &imin_buffer, const size_t imin_offset,
                 const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
  return DoAmax(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc);
}
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XMIN_H_
#endif

View file

@ -0,0 +1,56 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xnrm2 routine. The precision is implemented using a template argument.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XNRM2_H_
#define CLBLAST_ROUTINES_XNRM2_H_
#include "internal/routine.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
// Xnrm2: level-1 routine computing the Euclidean norm of a vector
// (BLAS xNRM2). Declaration only: the implementation lives in the
// corresponding source file, not visible here.
template <typename T>
class Xnrm2: public Routine<T> {
public:
// Members and methods from the base class (re-exposed because the base is a
// dependent template and its names are otherwise not visible unqualified)
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorDot;
using Routine<T>::RunKernel;
using Routine<T>::ErrorIn;
// Constructor; 'event' is the completion event handed to the base Routine
Xnrm2(Queue &queue, EventPointer event, const std::string &name = "NRM2");
// Templated-precision implementation of the routine. Writes the scalar
// result into 'nrm2_buffer' at 'nrm2_offset'; reads n strided elements of x.
StatusCode DoNrm2(const size_t n,
const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
private:
// Static variable to get the precision (defined per template instantiation
// in the implementation file)
const static Precision precision_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XNRM2_H_
#endif

View file

@ -28,13 +28,14 @@ class Xscal: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::RunKernel;
using Routine<T>::ErrorIn;
// Constructor
Xscal(Queue &queue, Event &event, const std::string &name = "SCAL");
Xscal(Queue &queue, EventPointer event, const std::string &name = "SCAL");
// Templated-precision implementation of the routine
StatusCode DoScal(const size_t n, const T alpha,

View file

@ -0,0 +1,49 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xsum routine. The precision is implemented using a template argument.
//
// =================================================================================================
#ifndef CLBLAST_ROUTINES_XSUM_H_
#define CLBLAST_ROUTINES_XSUM_H_
#include "internal/routine.h"
#include "internal/routines/level1/xasum.h"
namespace clblast {
// =================================================================================================
// See comment at top of file for a description of the class
// Xsum: the non-absolute variant of the xASUM routine, summing the elements of
// a vector without taking absolute values. It re-uses the Xasum machinery.
template <typename T>
class Xsum: public Xasum<T> {
public:
// Implementation inherited from the absolute-value base class
using Xasum<T>::DoAsum;
// Constructor: forwards to Xasum, differing only in the routine name
Xsum(Queue &queue, EventPointer event, const std::string &name = "SUM"):
    Xasum<T>(queue, event, name) {
}
// Delegates directly to DoAsum; the non-absolute behaviour is selected inside
// the OpenCL kernel by a pre-processor macro derived from the routine name.
StatusCode DoSum(const size_t n,
                 const Buffer<T> &sum_buffer, const size_t sum_offset,
                 const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
  return DoAsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc);
}
};
// =================================================================================================
} // namespace clblast
// CLBLAST_ROUTINES_XSUM_H_
#endif

View file

@ -28,6 +28,7 @@ class Xswap: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorY;
@ -35,7 +36,7 @@ class Xswap: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
Xswap(Queue &queue, Event &event, const std::string &name = "SWAP");
Xswap(Queue &queue, EventPointer event, const std::string &name = "SWAP");
// Templated-precision implementation of the routine
StatusCode DoSwap(const size_t n,

View file

@ -30,7 +30,7 @@ class Xgbmv: public Xgemv<T> {
using Xgemv<T>::MatVec;
// Constructor
Xgbmv(Queue &queue, Event &event, const std::string &name = "GBMV");
Xgbmv(Queue &queue, EventPointer event, const std::string &name = "GBMV");
// Templated-precision implementation of the routine
StatusCode DoGbmv(const Layout layout, const Transpose a_transpose,

View file

@ -28,6 +28,7 @@ class Xgemv: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorY;
@ -37,7 +38,7 @@ class Xgemv: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
Xgemv(Queue &queue, Event &event, const std::string &name = "GEMV");
Xgemv(Queue &queue, EventPointer event, const std::string &name = "GEMV");
// Templated-precision implementation of the routine
StatusCode DoGemv(const Layout layout, const Transpose a_transpose,

View file

@ -28,6 +28,7 @@ class Xger: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorY;
@ -36,7 +37,7 @@ class Xger: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
Xger(Queue &queue, Event &event, const std::string &name = "GER");
Xger(Queue &queue, EventPointer event, const std::string &name = "GER");
// Templated-precision implementation of the routine
StatusCode DoGer(const Layout layout,

View file

@ -28,7 +28,7 @@ class Xgerc: public Xger<T> {
using Xger<T>::DoGer;
// Constructor
Xgerc(Queue &queue, Event &event, const std::string &name = "GERC");
Xgerc(Queue &queue, EventPointer event, const std::string &name = "GERC");
// Templated-precision implementation of the routine
StatusCode DoGerc(const Layout layout,

View file

@ -28,7 +28,7 @@ class Xgeru: public Xger<T> {
using Xger<T>::DoGer;
// Constructor
Xgeru(Queue &queue, Event &event, const std::string &name = "GERU");
Xgeru(Queue &queue, EventPointer event, const std::string &name = "GERU");
// Templated-precision implementation of the routine
StatusCode DoGeru(const Layout layout,

View file

@ -30,7 +30,7 @@ class Xhbmv: public Xgemv<T> {
using Xgemv<T>::MatVec;
// Constructor
Xhbmv(Queue &queue, Event &event, const std::string &name = "HBMV");
Xhbmv(Queue &queue, EventPointer event, const std::string &name = "HBMV");
// Templated-precision implementation of the routine
StatusCode DoHbmv(const Layout layout, const Triangle triangle,

View file

@ -30,7 +30,7 @@ class Xhemv: public Xgemv<T> {
using Xgemv<T>::MatVec;
// Constructor
Xhemv(Queue &queue, Event &event, const std::string &name = "HEMV");
Xhemv(Queue &queue, EventPointer event, const std::string &name = "HEMV");
// Templated-precision implementation of the routine
StatusCode DoHemv(const Layout layout, const Triangle triangle,

View file

@ -28,6 +28,7 @@ class Xher: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestMatrixA;
@ -36,7 +37,7 @@ class Xher: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
Xher(Queue &queue, Event &event, const std::string &name = "HER");
Xher(Queue &queue, EventPointer event, const std::string &name = "HER");
// Translates alpha of type 'U' into type 'T'
T GetAlpha(const U alpha);

View file

@ -28,6 +28,7 @@ class Xher2: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorY;
@ -37,7 +38,7 @@ class Xher2: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
Xher2(Queue &queue, Event &event, const std::string &name = "HER2");
Xher2(Queue &queue, EventPointer event, const std::string &name = "HER2");
// Templated-precision implementation of the routine
StatusCode DoHer2(const Layout layout, const Triangle triangle,

View file

@ -30,7 +30,7 @@ class Xhpmv: public Xgemv<T> {
using Xgemv<T>::MatVec;
// Constructor
Xhpmv(Queue &queue, Event &event, const std::string &name = "HPMV");
Xhpmv(Queue &queue, EventPointer event, const std::string &name = "HPMV");
// Templated-precision implementation of the routine
StatusCode DoHpmv(const Layout layout, const Triangle triangle,

View file

@ -28,7 +28,7 @@ class Xhpr: public Xher<T,U> {
using Xher<T,U>::DoHer;
// Constructor
Xhpr(Queue &queue, Event &event, const std::string &name = "HPR");
Xhpr(Queue &queue, EventPointer event, const std::string &name = "HPR");
// Templated-precision implementation of the routine
StatusCode DoHpr(const Layout layout, const Triangle triangle,

View file

@ -28,7 +28,7 @@ class Xhpr2: public Xher2<T> {
using Xher2<T>::DoHer2;
// Constructor
Xhpr2(Queue &queue, Event &event, const std::string &name = "HPR2");
Xhpr2(Queue &queue, EventPointer event, const std::string &name = "HPR2");
// Templated-precision implementation of the routine
StatusCode DoHpr2(const Layout layout, const Triangle triangle,

View file

@ -30,7 +30,7 @@ class Xsbmv: public Xgemv<T> {
using Xgemv<T>::MatVec;
// Constructor
Xsbmv(Queue &queue, Event &event, const std::string &name = "SBMV");
Xsbmv(Queue &queue, EventPointer event, const std::string &name = "SBMV");
// Templated-precision implementation of the routine
StatusCode DoSbmv(const Layout layout, const Triangle triangle,

View file

@ -30,7 +30,7 @@ class Xspmv: public Xgemv<T> {
using Xgemv<T>::MatVec;
// Constructor
Xspmv(Queue &queue, Event &event, const std::string &name = "SPMV");
Xspmv(Queue &queue, EventPointer event, const std::string &name = "SPMV");
// Templated-precision implementation of the routine
StatusCode DoSpmv(const Layout layout, const Triangle triangle,

View file

@ -28,7 +28,7 @@ class Xspr: public Xher<T,T> {
using Xher<T,T>::DoHer;
// Constructor
Xspr(Queue &queue, Event &event, const std::string &name = "SPR");
Xspr(Queue &queue, EventPointer event, const std::string &name = "SPR");
// Templated-precision implementation of the routine
StatusCode DoSpr(const Layout layout, const Triangle triangle,

View file

@ -28,7 +28,7 @@ class Xspr2: public Xher2<T> {
using Xher2<T>::DoHer2;
// Constructor
Xspr2(Queue &queue, Event &event, const std::string &name = "SPR2");
Xspr2(Queue &queue, EventPointer event, const std::string &name = "SPR2");
// Templated-precision implementation of the routine
StatusCode DoSpr2(const Layout layout, const Triangle triangle,

View file

@ -30,7 +30,7 @@ class Xsymv: public Xgemv<T> {
using Xgemv<T>::MatVec;
// Constructor
Xsymv(Queue &queue, Event &event, const std::string &name = "SYMV");
Xsymv(Queue &queue, EventPointer event, const std::string &name = "SYMV");
// Templated-precision implementation of the routine
StatusCode DoSymv(const Layout layout, const Triangle triangle,

View file

@ -28,7 +28,7 @@ class Xsyr: public Xher<T,T> {
using Xher<T,T>::DoHer;
// Constructor
Xsyr(Queue &queue, Event &event, const std::string &name = "SYR");
Xsyr(Queue &queue, EventPointer event, const std::string &name = "SYR");
// Templated-precision implementation of the routine
StatusCode DoSyr(const Layout layout, const Triangle triangle,

View file

@ -28,7 +28,7 @@ class Xsyr2: public Xher2<T> {
using Xher2<T>::DoHer2;
// Constructor
Xsyr2(Queue &queue, Event &event, const std::string &name = "SYR2");
Xsyr2(Queue &queue, EventPointer event, const std::string &name = "SYR2");
// Templated-precision implementation of the routine
StatusCode DoSyr2(const Layout layout, const Triangle triangle,

View file

@ -34,7 +34,7 @@ class Xtbmv: public Xgemv<T> {
using Xgemv<T>::MatVec;
// Constructor
Xtbmv(Queue &queue, Event &event, const std::string &name = "TBMV");
Xtbmv(Queue &queue, EventPointer event, const std::string &name = "TBMV");
// Templated-precision implementation of the routine
StatusCode DoTbmv(const Layout layout, const Triangle triangle,

View file

@ -34,7 +34,7 @@ class Xtpmv: public Xgemv<T> {
using Xgemv<T>::MatVec;
// Constructor
Xtpmv(Queue &queue, Event &event, const std::string &name = "TPMV");
Xtpmv(Queue &queue, EventPointer event, const std::string &name = "TPMV");
// Templated-precision implementation of the routine
StatusCode DoTpmv(const Layout layout, const Triangle triangle,

View file

@ -34,7 +34,7 @@ class Xtrmv: public Xgemv<T> {
using Xgemv<T>::MatVec;
// Constructor
Xtrmv(Queue &queue, Event &event, const std::string &name = "TRMV");
Xtrmv(Queue &queue, EventPointer event, const std::string &name = "TRMV");
// Templated-precision implementation of the routine
StatusCode DoTrmv(const Layout layout, const Triangle triangle,

View file

@ -28,6 +28,7 @@ class Xgemm: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::PadCopyTransposeMatrix;
@ -38,7 +39,7 @@ class Xgemm: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
Xgemm(Queue &queue, Event &event, const std::string &name = "GEMM");
Xgemm(Queue &queue, EventPointer event, const std::string &name = "GEMM");
// Templated-precision implementation of the routine
StatusCode DoGemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,

View file

@ -37,7 +37,7 @@ class Xhemm: public Xgemm<T> {
using Xgemm<T>::DoGemm;
// Constructor
Xhemm(Queue &queue, Event &event, const std::string &name = "HEMM");
Xhemm(Queue &queue, EventPointer event, const std::string &name = "HEMM");
// Templated-precision implementation of the routine
StatusCode DoHemm(const Layout layout, const Side side, const Triangle triangle,

View file

@ -30,6 +30,7 @@ class Xher2k: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::PadCopyTransposeMatrix;
@ -40,7 +41,7 @@ class Xher2k: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
Xher2k(Queue &queue, Event &event, const std::string &name = "HER2K");
Xher2k(Queue &queue, EventPointer event, const std::string &name = "HER2K");
// Templated-precision implementation of the routine
StatusCode DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,

View file

@ -30,6 +30,7 @@ class Xherk: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::PadCopyTransposeMatrix;
@ -39,7 +40,7 @@ class Xherk: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
Xherk(Queue &queue, Event &event, const std::string &name = "HERK");
Xherk(Queue &queue, EventPointer event, const std::string &name = "HERK");
// Templated-precision implementation of the routine
StatusCode DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,

View file

@ -39,7 +39,7 @@ class Xsymm: public Xgemm<T> {
using Xgemm<T>::DoGemm;
// Constructor
Xsymm(Queue &queue, Event &event, const std::string &name = "SYMM");
Xsymm(Queue &queue, EventPointer event, const std::string &name = "SYMM");
// Templated-precision implementation of the routine
StatusCode DoSymm(const Layout layout, const Side side, const Triangle triangle,

View file

@ -30,6 +30,7 @@ class Xsyr2k: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::PadCopyTransposeMatrix;
@ -40,7 +41,7 @@ class Xsyr2k: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
Xsyr2k(Queue &queue, Event &event, const std::string &name = "SYR2K");
Xsyr2k(Queue &queue, EventPointer event, const std::string &name = "SYR2K");
// Templated-precision implementation of the routine
StatusCode DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,

View file

@ -32,6 +32,7 @@ class Xsyrk: public Routine<T> {
using Routine<T>::db_;
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::PadCopyTransposeMatrix;
@ -41,7 +42,7 @@ class Xsyrk: public Routine<T> {
using Routine<T>::ErrorIn;
// Constructor
Xsyrk(Queue &queue, Event &event, const std::string &name = "SYRK");
Xsyrk(Queue &queue, EventPointer event, const std::string &name = "SYRK");
// Templated-precision implementation of the routine
StatusCode DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,

View file

@ -38,7 +38,7 @@ class Xtrmm: public Xgemm<T> {
using Xgemm<T>::DoGemm;
// Constructor
Xtrmm(Queue &queue, Event &event, const std::string &name = "TRMM");
Xtrmm(Queue &queue, EventPointer event, const std::string &name = "TRMM");
// Templated-precision implementation of the routine
StatusCode DoTrmm(const Layout layout, const Side side, const Triangle triangle,

View file

@ -35,6 +35,9 @@ using double2 = std::complex<double>;
const std::string kKhronosHalfPrecision = "cl_khr_fp16";
const std::string kKhronosDoublePrecision = "cl_khr_fp64";
// Caught an unknown error
constexpr auto kUnknownError = -999;
// =================================================================================================
// The routine-specific arguments in string form
@ -61,6 +64,9 @@ constexpr auto kArgBOffset = "offb";
constexpr auto kArgCOffset = "offc";
constexpr auto kArgAPOffset = "offap";
constexpr auto kArgDotOffset = "offdot";
constexpr auto kArgNrm2Offset = "offnrm2";
constexpr auto kArgAsumOffset = "offasum";
constexpr auto kArgImaxOffset = "offimax";
constexpr auto kArgAlpha = "alpha";
constexpr auto kArgBeta = "beta";
@ -69,12 +75,14 @@ constexpr auto kArgFraction = "fraction";
// The client-specific arguments in string form
constexpr auto kArgCompareclblas = "clblas";
constexpr auto kArgComparecblas = "cblas";
constexpr auto kArgStepSize = "step";
constexpr auto kArgNumSteps = "num_steps";
constexpr auto kArgNumRuns = "runs";
// The client-specific arguments in string form
constexpr auto kArgFullTest = "full_test";
constexpr auto kArgVerbose = "verbose";
// The common arguments in string form
constexpr auto kArgPlatform = "platform";
@ -113,6 +121,9 @@ struct Arguments {
size_t c_offset = 0;
size_t ap_offset = 0;
size_t dot_offset = 0;
size_t nrm2_offset = 0;
size_t asum_offset = 0;
size_t imax_offset = 0;
T alpha = T{1.0};
T beta = T{1.0};
size_t x_size = 1;
@ -121,16 +132,15 @@ struct Arguments {
size_t b_size = 1;
size_t c_size = 1;
size_t ap_size = 1;
size_t dot_size = 1;
size_t scalar_size = 1;
// Tuner-specific arguments
double fraction = 1.0;
// Client-specific arguments
int compare_clblas = 1;
int compare_cblas = 1;
size_t step = 1;
size_t num_steps = 0;
size_t num_runs = 10;
// Tester-specific arguments
bool full_test = false;
// Common arguments
size_t platform_id = 0;
size_t device_id = 0;
@ -149,7 +159,7 @@ struct Buffers {
Buffer<T> b_mat;
Buffer<T> c_mat;
Buffer<T> ap_mat;
Buffer<T> dot;
Buffer<T> scalar;
};
// =================================================================================================

133
samples/cache.c Normal file
View file

@ -0,0 +1,133 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file demonstrates the CLBlast kernel cache, which stores compiled OpenCL binaries for faster
// repeated kernel execution. The cache can be pre-initialized or cleared.
//
// Note that this example is meant for illustration purposes only. CLBlast provides other programs
// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
//
// =================================================================================================
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
// Includes the CLBlast library (C interface)
#include <clblast_c.h>
// Forward declaration
void run_example_routine(const cl_device_id device);
// =================================================================================================
// Example use of the CLBlast kernel cache
int main(void) {

  // OpenCL platform/device settings
  const size_t platform_id = 0;
  const size_t device_id = 0;

  // Initializes the OpenCL platform
  cl_uint num_platforms;
  clGetPlatformIDs(0, NULL, &num_platforms);
  if (num_platforms == 0) {  // robustness fix: avoid indexing an empty platform list
    fprintf(stderr, "No OpenCL platforms found\n");
    return 1;
  }
  cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
  if (platforms == NULL) { return 1; }  // robustness fix: malloc was unchecked
  clGetPlatformIDs(num_platforms, platforms, NULL);
  cl_platform_id platform = platforms[platform_id];

  // Initializes the OpenCL device
  cl_uint num_devices;
  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
  if (num_devices == 0) {  // robustness fix: avoid indexing an empty device list
    fprintf(stderr, "No OpenCL devices found\n");
    free(platforms);
    return 1;
  }
  cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
  if (devices == NULL) { free(platforms); return 1; }  // robustness fix: malloc was unchecked
  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
  cl_device_id device = devices[device_id];

  // Run the routine multiple times in a row: after the first time the binary is already in the
  // cache and compilation is no longer needed.
  printf("Starting caching sample with an empty cache\n");
  run_example_routine(device);
  run_example_routine(device);
  run_example_routine(device);

  // Clearing the cache makes CLBlast re-compile the kernel once
  printf("Clearing cache\n");
  CLBlastClearCache();
  run_example_routine(device);
  run_example_routine(device);

  // When the cache is empty, it can be pre-initialized with compiled kernels for all routines by
  // calling the CLBlastFillCache function, such that all other CLBlast calls can benefit from
  // pre-compiled kernels and thus execute at maximum speed.
  printf("Clearing cache\n");
  CLBlastClearCache();
  printf("Filling cache (this might take a while)\n");
  CLBlastFillCache(device);
  run_example_routine(device);

  // Clean-up
  free(platforms);
  free(devices);
  return 0;
}
// =================================================================================================
// Runs an example routine and reports the time
// Runs one SASUM call on the given device and reports the wall-clock time, demonstrating that
// repeated calls are faster once the compiled binary is in the CLBlast cache.
void run_example_routine(const cl_device_id device) {

  // Example SASUM arguments
  const size_t n = 1024*128;

  // Creates the OpenCL context, queue, and an event
  cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
  cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL);
  cl_event event = NULL;

  // Populate host data structures with some example data
  float* host_input = (float*)malloc(sizeof(float)*n);
  float* host_output = (float*)malloc(sizeof(float)*1);
  for (size_t i=0; i<n; ++i) { host_input[i] = -1.5f; }
  for (size_t i=0; i<1; ++i) { host_output[i] = 0.0f; }

  // Copy the data-structures to the device
  cl_mem device_input = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(float), NULL, NULL);
  cl_mem device_output = clCreateBuffer(context, CL_MEM_READ_WRITE, 1*sizeof(float), NULL, NULL);
  clEnqueueWriteBuffer(queue, device_input, CL_TRUE, 0, n*sizeof(float), host_input, 0, NULL, NULL);
  clEnqueueWriteBuffer(queue, device_output, CL_TRUE, 0, 1*sizeof(float), host_output, 0, NULL, NULL);

  // Start the timer
  clock_t start = clock();

  // Calls an example routine
  StatusCode status = CLBlastSasum(n,
                                   device_output, 0,
                                   device_input, 0, 1,
                                   &queue, &event);

  // Wait for completion. Fix: only wait when the routine was enqueued successfully (0 -> success);
  // on failure the event may never have been set. Also release the event afterwards: it was
  // previously leaked on every invocation of this function.
  if (status == 0 && event != NULL) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }

  // Retrieves the execution time
  clock_t diff = clock() - start;
  double time_ms = diff * 1000.0f / (double)CLOCKS_PER_SEC;

  // Routine completed. See "clblast_c.h" for status codes (0 -> success).
  printf("Completed routine with status %d in %.3lf ms\n", status, time_ms);

  // Clean-up
  free(host_input);
  free(host_output);
  clReleaseMemObject(device_input);
  clReleaseMemObject(device_output);
  clReleaseCommandQueue(queue);
  clReleaseContext(context);
}
// =================================================================================================

106
samples/dgemv.c Normal file
View file

@ -0,0 +1,106 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file demonstrates the use of the DGEMV routine. It is pure C99 and demonstrates the use of
// the C API to the CLBlast library.
//
// Note that this example is meant for illustration purposes only. CLBlast provides other programs
// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
//
// =================================================================================================
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
// Includes the CLBlast library (C interface)
#include <clblast_c.h>
// =================================================================================================
// Example use of the double-precision routine DGEMV
int main(void) {

  // OpenCL platform/device settings
  const size_t platform_id = 0;
  const size_t device_id = 0;

  // Example DGEMV arguments: y = alpha * A * x + beta * y, with A an m-by-n row-major matrix
  const size_t m = 128;
  const size_t n = 289;
  const double alpha = 0.7;
  const double beta = 0.0;
  const size_t a_ld = n;  // row-major: leading dimension equals the number of columns

  // Initializes the OpenCL platform
  cl_uint num_platforms;
  clGetPlatformIDs(0, NULL, &num_platforms);
  cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
  clGetPlatformIDs(num_platforms, platforms, NULL);
  cl_platform_id platform = platforms[platform_id];

  // Initializes the OpenCL device
  cl_uint num_devices;
  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
  cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
  cl_device_id device = devices[device_id];

  // Creates the OpenCL context, queue, and an event
  cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
  cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL);
  cl_event event = NULL;

  // Populate host data structures with some example data
  double* host_a = (double*)malloc(sizeof(double)*m*n);
  double* host_x = (double*)malloc(sizeof(double)*n);
  double* host_y = (double*)malloc(sizeof(double)*m);
  for (size_t i=0; i<m*n; ++i) { host_a[i] = 12.193; }
  for (size_t i=0; i<n; ++i) { host_x[i] = -8.199; }
  for (size_t i=0; i<m; ++i) { host_y[i] = 0.0; }

  // Copy the data-structures to the device
  cl_mem device_a = clCreateBuffer(context, CL_MEM_READ_WRITE, m*n*sizeof(double), NULL, NULL);
  cl_mem device_x = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(double), NULL, NULL);
  cl_mem device_y = clCreateBuffer(context, CL_MEM_READ_WRITE, m*sizeof(double), NULL, NULL);
  clEnqueueWriteBuffer(queue, device_a, CL_TRUE, 0, m*n*sizeof(double), host_a, 0, NULL, NULL);
  clEnqueueWriteBuffer(queue, device_x, CL_TRUE, 0, n*sizeof(double), host_x, 0, NULL, NULL);
  clEnqueueWriteBuffer(queue, device_y, CL_TRUE, 0, m*sizeof(double), host_y, 0, NULL, NULL);

  // Call the DGEMV routine.
  StatusCode status = CLBlastDgemv(kRowMajor, kNo,
                                   m, n,
                                   alpha,
                                   device_a, 0, a_ld,
                                   device_x, 0, 1,
                                   beta,
                                   device_y, 0, 1,
                                   &queue, &event);

  // Wait for completion. Fix: only wait when the routine was enqueued successfully (0 -> success);
  // on failure the event may never have been set. Also release the event, which was leaked before.
  if (status == 0 && event != NULL) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }

  // Example completed. See "clblast_c.h" for status codes (0 -> success).
  printf("Completed DGEMV with status %d\n", status);

  // Clean-up
  free(platforms);
  free(devices);
  free(host_a);
  free(host_x);
  free(host_y);
  clReleaseMemObject(device_a);
  clReleaseMemObject(device_x);
  clReleaseMemObject(device_y);
  clReleaseCommandQueue(queue);
  clReleaseContext(context);
  return 0;
}
// =================================================================================================

96
samples/sasum.c Normal file
View file

@ -0,0 +1,96 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file demonstrates the use of the SASUM routine. It is pure C99 and demonstrates the use of
// the C API to the CLBlast library.
//
// Note that this example is meant for illustration purposes only. CLBlast provides other programs
// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
//
// =================================================================================================
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
// Includes the CLBlast library (C interface)
#include <clblast_c.h>
// =================================================================================================
// Example use of the single-precision routine SASUM
int main(void) {

  // OpenCL platform/device settings
  const size_t platform_id = 0;
  const size_t device_id = 0;

  // Example SASUM arguments: sums the absolute values of the n input elements
  const size_t n = 1000;
  const float input_value = -1.5f;

  // Initializes the OpenCL platform
  cl_uint num_platforms;
  clGetPlatformIDs(0, NULL, &num_platforms);
  cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
  clGetPlatformIDs(num_platforms, platforms, NULL);
  cl_platform_id platform = platforms[platform_id];

  // Initializes the OpenCL device
  cl_uint num_devices;
  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
  cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
  cl_device_id device = devices[device_id];

  // Creates the OpenCL context, queue, and an event
  cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
  cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL);
  cl_event event = NULL;

  // Populate host data structures with some example data
  float* host_input = (float*)malloc(sizeof(float)*n);
  float* host_output = (float*)malloc(sizeof(float)*1);
  for (size_t i=0; i<n; ++i) { host_input[i] = input_value; }
  for (size_t i=0; i<1; ++i) { host_output[i] = 0.0f; }

  // Copy the data-structures to the device
  cl_mem device_input = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(float), NULL, NULL);
  cl_mem device_output = clCreateBuffer(context, CL_MEM_READ_WRITE, 1*sizeof(float), NULL, NULL);
  clEnqueueWriteBuffer(queue, device_input, CL_TRUE, 0, n*sizeof(float), host_input, 0, NULL, NULL);
  clEnqueueWriteBuffer(queue, device_output, CL_TRUE, 0, 1*sizeof(float), host_output, 0, NULL, NULL);

  // Call the SASUM routine.
  StatusCode status = CLBlastSasum(n,
                                   device_output, 0,
                                   device_input, 0, 1,
                                   &queue, &event);

  // Wait for completion. Fix: only wait when the routine was enqueued successfully (0 -> success);
  // on failure the event may never have been set. Also release the event, which was leaked before.
  if (status == 0 && event != NULL) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }

  // Copies the result back to the host (host_output stays 0.0f when the routine failed)
  clEnqueueReadBuffer(queue, device_output, CL_TRUE, 0, 1*sizeof(float), host_output, 0, NULL, NULL);

  // Example completed. See "clblast_c.h" for status codes (0 -> success).
  printf("Completed SASUM with status %d: %zu * |%.1lf| = %.1lf\n", status, n, input_value, host_output[0]);

  // Clean-up
  free(platforms);
  free(devices);
  free(host_input);
  free(host_output);
  clReleaseMemObject(device_input);
  clReleaseMemObject(device_output);
  clReleaseCommandQueue(queue);
  clReleaseContext(context);
  return 0;
}
// =================================================================================================

View file

@ -15,6 +15,7 @@
//
// =================================================================================================
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
@ -47,11 +48,11 @@ int main(void) {
clGetPlatformIDs(num_platforms, platforms, NULL);
cl_platform_id platform = platforms[platform_id];
// Initializes the OpenCL device (note: example for GPU devices only)
// Initializes the OpenCL device
cl_uint num_devices;
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, num_devices, devices, NULL);
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
cl_device_id device = devices[device_id];
// Creates the OpenCL context, queue, and an event
@ -89,7 +90,7 @@ int main(void) {
clWaitForEvents(1, &event);
// Example completed. See "clblast_c.h" for status codes (0 -> success).
printf("Completed with status %d\n", status);
printf("Completed SGEMM with status %d\n", status);
// Clean-up
free(platforms);

View file

@ -8,8 +8,8 @@
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file demonstrates the use of the SGEMM routine. It is a stand-alone example, but it does
// requires the Khronos C++ OpenCL API header file (not included). The example uses C++ features,
// but CLBlast can also be used using the regular C-style OpenCL API.
// require the Khronos C++ OpenCL API header file (downloaded by CMake). The example uses C++
// features, but CLBlast can also be used using the regular C-style OpenCL API.
//
// Note that this example is meant for illustration purposes only. CLBlast provides other programs
// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
@ -22,7 +22,7 @@
// Includes the C++ OpenCL API. If not yet available, it can be found here:
// https://www.khronos.org/registry/cl/api/1.1/cl.hpp
#include <cl.hpp>
#include "cl.hpp"
// Includes the CLBlast library
#include <clblast.h>
@ -52,16 +52,16 @@ int main() {
if (platforms.size() == 0 || platform_id >= platforms.size()) { return 1; }
auto platform = platforms[platform_id];
// Initializes the OpenCL device (note: example for GPU devices only)
// Initializes the OpenCL device
auto devices = std::vector<cl::Device>();
platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
if (devices.size() == 0 || device_id >= devices.size()) { return 1; }
auto device = devices[device_id];
// Creates the OpenCL context, queue, and an event
auto context = cl::Context({device});
auto queue = cl::CommandQueue(context, device);
auto event = cl::Event();
auto event = cl_event{nullptr};
// Populate host matrices with some example data
auto host_a = std::vector<float>(m*k);
@ -84,24 +84,23 @@ int main() {
// Call the SGEMM routine. Note that the type of alpha and beta (float) determine the precision.
auto queue_plain = queue();
auto event_plain = event();
auto status = Gemm(clblast::Layout::kRowMajor,
clblast::Transpose::kNo, clblast::Transpose::kNo,
m, n, k,
alpha,
device_a(), 0, a_ld,
device_b(), 0, b_ld,
beta,
device_c(), 0, c_ld,
&queue_plain, &event_plain);
auto status = clblast::Gemm(clblast::Layout::kRowMajor,
clblast::Transpose::kNo, clblast::Transpose::kNo,
m, n, k,
alpha,
device_a(), 0, a_ld,
device_b(), 0, b_ld,
beta,
device_c(), 0, c_ld,
&queue_plain, &event);
// Record the execution time
event.wait();
clWaitForEvents(1, &event);
auto elapsed_time = std::chrono::steady_clock::now() - start_time;
auto time_ms = std::chrono::duration<double,std::milli>(elapsed_time).count();
// Example completed. See "clblast.h" for status codes (0 -> success).
printf("Completed in %.3lf ms with status %d\n", time_ms, status);
printf("Completed SGEMM in %.3lf ms with status %d\n", time_ms, status);
return 0;
}

View file

@ -34,9 +34,9 @@ DEVICENAME_DEFAULT = "default"
# Attributes
DEVICETYPE_ATTRIBUTES = ["device_vendor", "device_type"]
DEVICE_ATTRIBUTES = ["device", "device_core_clock", "device_compute_units"]
KERNEL_ATTRIBUTES = ["precision", "kernel_family",
"arg_m", "arg_n", "arg_k", "arg_alpha", "arg_beta"]
ATTRIBUTES = DEVICE_ATTRIBUTES + DEVICETYPE_ATTRIBUTES + KERNEL_ATTRIBUTES
KERNEL_ATTRIBUTES = ["precision", "kernel_family"]
ARGUMENT_ATTRIBUTES = ["arg_m", "arg_n", "arg_k", "arg_alpha", "arg_beta"]
ATTRIBUTES = DEVICE_ATTRIBUTES + DEVICETYPE_ATTRIBUTES + KERNEL_ATTRIBUTES + ARGUMENT_ATTRIBUTES
# OpenCL vendor names and their short name
VENDOR_NAMES = { "device_vendor": {
@ -95,9 +95,18 @@ def RemoveDuplicates(df):
# Drops every database row whose "device" column equals the given device name.
def RemoveEntriesByDevice(df, devicename):
    keep_rows = df["device"] != devicename
    return df[keep_rows]
# Drops every database row whose "kernel_family" column equals the given family name.
def RemoveEntriesByKernelFamily(df, familyname):
    keep_rows = df["kernel_family"] != familyname
    return df[keep_rows]
# Selects the database rows for which the given field equals the given value.
def GetEntriesByField(df, field, value):
    selection = df[field] == value
    return df[selection]
# Updates the database in place: for every row matching the boolean 'condition' mask,
# the column 'field' is overwritten with 'value'. Returns the same (mutated) DataFrame
# object so the call can be written in assignment style.
# Example usage:
# df = UpdateDatabase(df, (df["kernel_family"] == "xdot") & (df["arg_n"] == "67108864"), "arg_n", "2097152")
def UpdateDatabase(df, condition, field, value):
    df.loc[condition, field] = value
    return df
# Fixes the problem that some vendors use multiple different names
def SanitizeVendorNames(df):
df = df.replace(VENDOR_NAMES)
@ -120,7 +129,7 @@ def CalculateDefaults(df):
dfdefault = pd.DataFrame()
# Defaults per type/vendor
groups = df.groupby(DEVICETYPE_ATTRIBUTES+KERNEL_ATTRIBUTES+["kernel"])
groups = df.groupby(DEVICETYPE_ATTRIBUTES+KERNEL_ATTRIBUTES+ARGUMENT_ATTRIBUTES+["kernel"])
for name, dfgroup in groups:
default_values = dfgroup.min(axis=0)
default_values["device"] = DEVICENAME_DEFAULT
@ -129,8 +138,14 @@ def CalculateDefaults(df):
default_values["time"] = 0.0
dfdefault = dfdefault.append(default_values, ignore_index=True)
# Checks for mis-matched arguments
groups = dfdefault.groupby(DEVICETYPE_ATTRIBUTES+KERNEL_ATTRIBUTES+["kernel"])
for name, dfgroup in groups:
if len(dfgroup) != 1:
print("[WARNING] Entries for a single kernel with multiple argument values")
# Defaults in general
groups = df.groupby(KERNEL_ATTRIBUTES+["kernel"])
groups = df.groupby(KERNEL_ATTRIBUTES+ARGUMENT_ATTRIBUTES+["kernel"])
for name, dfgroup in groups:
default_values = dfgroup.min(axis=0)
default_values["device_vendor"] = VENDOR_DEFAULT
@ -273,7 +288,6 @@ for file_json in glob.glob(glob_json):
new_size = len(database.index)
print("with "+str(new_size-old_size)+" new items")
# Stores the modified database back to disk
if len(glob.glob(glob_json)) >= 1:
print("## Storing the database to disk...")

View file

@ -22,7 +22,8 @@ D2CL = "cl_double2"
# Structure holding data-type and precision information
class DataType():
def __init__(self, name, template, scalars, buffertype):
def __init__(self, precision_name, name, template, scalars, buffertype):
self.precision_name = precision_name
self.name = name
self.template = template
self.alpha_cpp = scalars[0]
@ -57,5 +58,10 @@ class DataType():
return "<"+self.buffertype+","+self.beta_cpp+">, "+self.buffertype+", "+self.beta_cpp
return "<"+self.buffertype+">, "+self.buffertype+", "+self.beta_cpp
# Current scalar is complex
# Returns whether the requested scalar ("alpha" or "beta") uses a complex C++ type;
# any other scalar name yields False.
def IsComplex(self, scalar):
    complex_types = [FLT2, DBL2]
    if scalar == "alpha":
        return self.alpha_cpp in complex_types
    if scalar == "beta":
        return self.beta_cpp in complex_types
    return False
# ==================================================================================================

View file

@ -8,15 +8,17 @@
# Cedric Nugteren <www.cedricnugteren.nl>
#
# This script automatically generates the bodies of the following files, creating the full CLBlast
# API interface and implementation (C, C++, and clBLAS wrapper):
# API interface and implementation (C, C++, and reference BLAS wrappers):
# clblast.h
# clblast.cc
# clblast_c.h
# clblast_c.cc
# wrapper_clblas.h
# wrapper_cblas.h
# It also generates the main functions for the correctness and performance tests as found in
# test/correctness/routines/levelX/xYYYY.cc
# test/performance/routines/levelX/xYYYY.cc
# It also produces the API documentation found in doc/clblast.md
#
# ==================================================================================================
@ -31,75 +33,89 @@ from datatype import DataType, FLT, DBL, FLT2, DBL2, F2CL, D2CL
# ==================================================================================================
# Regular data-types
S = DataType("S", FLT, [FLT, FLT, FLT, FLT], FLT ) # single (32)
D = DataType("D", DBL, [DBL, DBL, DBL, DBL], DBL ) # double (64)
C = DataType("C", FLT2, [FLT2, FLT2, F2CL, F2CL], FLT2) # single-complex (3232)
Z = DataType("Z", DBL2, [DBL2, DBL2, D2CL, D2CL], DBL2) # double-complex (6464)
S = DataType("S", "S", FLT, [FLT, FLT, FLT, FLT], FLT ) # single (32)
D = DataType("D", "D", DBL, [DBL, DBL, DBL, DBL], DBL ) # double (64)
C = DataType("C", "C", FLT2, [FLT2, FLT2, F2CL, F2CL], FLT2) # single-complex (3232)
Z = DataType("Z", "Z", DBL2, [DBL2, DBL2, D2CL, D2CL], DBL2) # double-complex (6464)
# Special cases
Css = DataType("C", FLT, [FLT, FLT, FLT, FLT], FLT2) # As C, but with constants from S
Zdd = DataType("Z", DBL, [DBL, DBL, DBL, DBL], DBL2) # As Z, but with constants from D
Ccs = DataType("C", FLT2+","+FLT, [FLT2, FLT, F2CL, FLT], FLT2) # As C, but with one constant from S
Zzd = DataType("Z", DBL2+","+DBL, [DBL2, DBL, D2CL, DBL], DBL2) # As Z, but with one constant from D
Sc = DataType("C", "Sc", FLT2, [FLT2, FLT2, FLT2, FLT2], FLT2) # As C, but with real output
Dz = DataType("Z", "Dz", DBL2, [DBL2, DBL2, DBL2, DBL2], DBL2) # As Z, but with real output
iS = DataType("S", "iS", FLT, [FLT, FLT, FLT, FLT], FLT ) # As S, but with integer output
iD = DataType("D", "iD", DBL, [DBL, DBL, DBL, DBL], DBL ) # As D, but with integer output
iC = DataType("C", "iC", FLT2, [FLT2, FLT2, F2CL, F2CL], FLT2) # As C, but with integer output
iZ = DataType("Z", "iZ", DBL2, [DBL2, DBL2, D2CL, D2CL], DBL2) # As Z, but with integer output
Css = DataType("C", "C", FLT, [FLT, FLT, FLT, FLT], FLT2) # As C, but with constants from S
Zdd = DataType("Z", "Z", DBL, [DBL, DBL, DBL, DBL], DBL2) # As Z, but with constants from D
Ccs = DataType("C", "C", FLT2+","+FLT, [FLT2, FLT, F2CL, FLT], FLT2) # As C, but with one constant from S
Zzd = DataType("Z", "Z", DBL2+","+DBL, [DBL2, DBL, D2CL, DBL], DBL2) # As Z, but with one constant from D
# C++ template data-types
T = DataType("typename T", "T", ["T", "T", "T", "T"], "T") # regular routine
Tc = DataType("typename T", "std::complex<T>,T", ["T", "T", "T", "T"], "std::complex<T>") # for herk
TU = DataType("typename T, typename U", "T,U", ["T", "U", "T", "U"], "T") # for her2k
T = DataType("T", "typename T", "T", ["T", "T", "T", "T"], "T") # regular routine
Tc = DataType("Tc", "typename T", "std::complex<T>,T", ["T", "T", "T", "T"], "std::complex<T>") # for herk
TU = DataType("TU", "typename T, typename U", "T,U", ["T", "U", "T", "U"], "T") # for her2k
# ==================================================================================================
# Populates a list of routines
routines = [
[ # Level 1: vector-vector
#Routine(False, "1", "rotg", T, [S,D], [], [], [], [], ["a","b","c","s"], False, "Generate plane rotation"),
#Routine(False, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["c","s"], False, "Apply plane rotation"),
Routine(True, "1", "swap", T, [S,D,C,Z], ["n"], [], [], ["x","y"], [], False, "Swap two vectors"),
Routine(True, "1", "scal", T, [S,D,C,Z], ["n"], [], [], ["x"], ["alpha"], False, "Vector scaling"),
Routine(True, "1", "copy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], [], False, "Vector copy"),
Routine(True, "1", "axpy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], ["alpha"], False, "Vector-times-constant plus vector"),
Routine(True, "1", "dot", T, [S,D], ["n"], [], ["x","y"], ["dot"], [], True, "Dot product of two vectors"),
Routine(True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], True, "Dot product of two complex vectors"),
Routine(True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], True, "Dot product of two complex vectors, one conjugated"),
Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation", "", []),
Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], [], "", "Generate modified givens plane rotation", "", []),
Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation", "", []),
Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [], "", "Apply modified givens plane rotation", "", []),
Routine(True, True, "1", "swap", T, [S,D,C,Z], ["n"], [], [], ["x","y"], [], "", "Swap two vectors", "Interchanges the contents of vectors x and y.", []),
Routine(True, True, "1", "scal", T, [S,D,C,Z], ["n"], [], [], ["x"], ["alpha"], "", "Vector scaling", "Multiplies all elements of vector x by a scalar constant alpha.", []),
Routine(True, True, "1", "copy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], [], "", "Vector copy", "Copies the contents of vector x into vector y.", []),
Routine(True, True, "1", "axpy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation y = alpha * x + y, in which x and y are vectors and alpha is a scalar constant.", []),
Routine(True, True, "1", "dot", T, [S,D], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two vectors", "Multiplies the vectors x and y element-wise and accumulates the results. The sum is stored in the dot buffer.", []),
Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []),
Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []),
Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz],["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of each element in the x vector and takes the square root. The resulting L2 norm is stored in the nrm2 buffer.", []),
Routine(True, True, "1", "asum", T, [S,D,Sc,Dz],["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of each element in the x vector. The results are stored in the asum buffer.", []),
Routine(True, False, "1", "sum", T, [S,D,Sc,Dz],["n"], [], ["x"], ["sum"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of each element in the x vector. The results are stored in the sum buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []),
Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the x vector. The resulting integer index is stored in the imax buffer.", []),
Routine(True, False, "1", "max", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the x vector. The resulting integer index is stored in the imax buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []),
Routine(True, False, "1", "min", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imin"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the x vector. The resulting integer index is stored in the imin buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []),
],
[ # Level 2: matrix-vector
Routine(True, "2a", "gemv", T, [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], False, "General matrix-vector multiplication"),
Routine(True, "2a", "gbmv", T, [S,D,C,Z], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], False, "General banded matrix-vector multiplication"),
Routine(True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Hermitian matrix-vector multiplication"),
Routine(True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Hermitian banded matrix-vector multiplication"),
Routine(True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], False, "Hermitian packed matrix-vector multiplication"),
Routine(True, "2a", "symv", T, [S,D], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Symmetric matrix-vector multiplication"),
Routine(True, "2a", "sbmv", T, [S,D], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Symmetric banded matrix-vector multiplication"),
Routine(True, "2a", "spmv", T, [S,D], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], False, "Symmetric packed matrix-vector multiplication"),
Routine(True, "2a", "trmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], True, "Triangular matrix-vector multiplication"),
Routine(True, "2a", "tbmv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], True, "Triangular banded matrix-vector multiplication"),
Routine(True, "2a", "tpmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], True, "Triangular packed matrix-vector multiplication"),
Routine(False, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], False, "Solves a triangular system of equations"),
Routine(False, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], False, "Solves a banded triangular system of equations"),
Routine(False, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], False, "Solves a packed triangular system of equations"),
Routine(True, True, "2a", "gemv", T, [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation y = alpha * A * x + beta * y, in which x is an input vector, y is an input and output vector, A is an input matrix, and alpha and beta are scalars. The matrix A can optionally be transposed before performing the operation.", []),
Routine(True, True, "2a", "gbmv", T, [S,D,C,Z], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is banded instead.", []),
Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix A is an Hermitian matrix instead.", []),
Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is an Hermitian banded matrix instead.", []),
Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix A is an Hermitian packed matrix instead and represented as AP.", []),
Routine(True, True, "2a", "symv", T, [S,D], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix A is symmetric instead.", []),
Routine(True, True, "2a", "sbmv", T, [S,D], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is symmetric and banded instead.", []),
Routine(True, True, "2a", "spmv", T, [S,D], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix A is a symmetric packed matrix instead and represented as AP.", []),
Routine(True, True, "2a", "trmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix A is triangular instead.", []),
Routine(True, True, "2a", "tbmv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is triangular and banded instead.", []),
Routine(True, True, "2a", "tpmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix A is a triangular packed matrix instead and repreented as AP.", []),
Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a triangular system of equations", "", []),
Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a banded triangular system of equations", "", []),
Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "", "Solves a packed triangular system of equations", "", []),
# Level 2: matrix update
Routine(True, "2b", "ger", T, [S,D], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 matrix update"),
Routine(True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 complex matrix update"),
Routine(True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 complex conjugated matrix update"),
Routine(True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], False, "Hermitian rank-1 matrix update"),
Routine(True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], False, "Hermitian packed rank-1 matrix update"),
Routine(True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], False, "Hermitian rank-2 matrix update"),
Routine(True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], False, "Hermitian packed rank-2 matrix update"),
Routine(True, "2b", "syr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], False, "Symmetric rank-1 matrix update"),
Routine(True, "2b", "spr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], False, "Symmetric packed rank-1 matrix update"),
Routine(True, "2b", "syr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], False, "Symmetric rank-2 matrix update"),
Routine(True, "2b", "spr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], False, "Symmetric packed rank-2 matrix update"),
Routine(True, True, "2b", "ger", T, [S,D], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 matrix update", "", []),
Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex matrix update", "", []),
Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex conjugated matrix update", "", []),
Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Hermitian rank-1 matrix update", "", []),
Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Hermitian packed rank-1 matrix update", "", []),
Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Hermitian rank-2 matrix update", "", []),
Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Hermitian packed rank-2 matrix update", "", []),
Routine(True, True, "2b", "syr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Symmetric rank-1 matrix update", "", []),
Routine(True, True, "2b", "spr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Symmetric packed rank-1 matrix update", "", []),
Routine(True, True, "2b", "syr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Symmetric rank-2 matrix update", "", []),
Routine(True, True, "2b", "spr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Symmetric packed rank-2 matrix update", "", []),
],
[ # Level 3: matrix-matrix
Routine(True, "3", "gemm", T, [S,D,C,Z], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], False, "General matrix-matrix multiplication"),
Routine(True, "3", "symm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], False, "Symmetric matrix-matrix multiplication"),
Routine(True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], False, "Hermitian matrix-matrix multiplication"),
Routine(True, "3", "syrk", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], False, "Rank-K update of a symmetric matrix"),
Routine(True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], False, "Rank-K update of a hermitian matrix"),
Routine(True, "3", "syr2k", T, [S,D,C,Z], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], False, "Rank-2K update of a symmetric matrix"),
Routine(True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], False, "Rank-2K update of a hermitian matrix"),
Routine(True, "3", "trmm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], False, "Triangular matrix-matrix multiplication"),
Routine(False, "3", "trsm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], False, "Solves a triangular system of equations"),
Routine(True, True, "3", "gemm", T, [S,D,C,Z], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "General matrix-matrix multiplication", "", []),
Routine(True, True, "3", "symm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "", []),
Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "", []),
Routine(True, True, "3", "syrk", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "", []),
Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "", []),
Routine(True, True, "3", "syr2k", T, [S,D,C,Z], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "", []),
Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "", []),
Routine(True, True, "3", "trmm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Triangular matrix-matrix multiplication", "", []),
Routine(False, True, "3", "trsm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Solves a triangular system of equations", "", []),
]]
# ==================================================================================================
@ -151,7 +167,7 @@ def clblast_h(routines):
result = ""
for routine in routines:
result += "\n// "+routine.description+": "+routine.ShortNames()+"\n"
result += routine.RoutineHeaderCPP(12)+";\n"
result += routine.RoutineHeaderCPP(12, " = nullptr")+";\n"
return result
# The C++ API implementation (.cc)
@ -161,10 +177,9 @@ def clblast_cc(routines):
indent1 = " "*(20 + routine.Length())
result += "\n// "+routine.description+": "+routine.ShortNames()+"\n"
if routine.implemented:
result += routine.RoutineHeaderCPP(12)+" {\n"
result += routine.RoutineHeaderCPP(12, "")+" {\n"
result += " auto queue_cpp = Queue(*queue);\n"
result += " auto event_cpp = Event(*event);\n"
result += " auto routine = X"+routine.name+"<"+routine.template.template+">(queue_cpp, event_cpp);\n"
result += " auto routine = X"+routine.name+"<"+routine.template.template+">(queue_cpp, event);\n"
result += " auto status = routine.SetUp();\n"
result += " if (status != StatusCode::kSuccess) { return status; }\n"
result += " return routine.Do"+routine.name.capitalize()+"("
@ -175,8 +190,8 @@ def clblast_cc(routines):
result += " return StatusCode::kNotImplemented;\n"
result += "}\n"
for flavour in routine.flavours:
indent2 = " "*(23 + routine.Length() + len(flavour.template))
result += "template StatusCode "+routine.name.capitalize()+"<"+flavour.template+">("
indent2 = " "*(34 + routine.Length() + len(flavour.template))
result += "template StatusCode PUBLIC_API "+routine.name.capitalize()+"<"+flavour.template+">("
result += (",\n"+indent2).join([a for a in routine.ArgumentsType(flavour)])
result += ",\n"+indent2+"cl_command_queue*, cl_event*);\n"
return result
@ -189,7 +204,7 @@ def clblast_c_h(routines):
for routine in routines:
result += "\n// "+routine.description+": "+routine.ShortNames()+"\n"
for flavour in routine.flavours:
result += routine.RoutineHeaderC(flavour, 20)+";\n"
result += routine.RoutineHeaderC(flavour, 31, " PUBLIC_API")+";\n"
return result
# The C API implementation (.cc)
@ -200,7 +215,7 @@ def clblast_c_cc(routines):
for flavour in routine.flavours:
template = "<"+flavour.template+">" if routine.NoScalars() else ""
indent = " "*(26 + routine.Length() + len(template))
result += routine.RoutineHeaderC(flavour, 20)+" {\n"
result += routine.RoutineHeaderC(flavour, 20, "")+" {\n"
result += " auto status = clblast::"+routine.name.capitalize()+template+"("
result += (",\n"+indent).join([a for a in routine.ArgumentsCast(flavour, indent)])
result += ",\n"+indent+"queue, event);"
@ -213,22 +228,68 @@ def clblast_c_cc(routines):
def wrapper_clblas(routines):
result = ""
for routine in routines:
result += "\n// Forwards the clBLAS calls for %s\n" % (routine.ShortNames())
if routine.NoScalars():
result += routine.RoutineHeaderWrapper(routine.template, True, 21)+";\n"
for flavour in routine.flavours:
indent = " "*(17 + routine.Length())
result += routine.RoutineHeaderWrapper(flavour, False, 21)+" {\n"
arguments = routine.ArgumentsWrapper(flavour)
if routine.scratch:
result += " auto queue = Queue(queues[0]);\n"
result += " auto context = queue.GetContext();\n"
result += " auto scratch_buffer = Buffer<"+flavour.template+">(context, n*x_inc + x_offset);\n"
arguments += ["scratch_buffer()"]
result += " return clblas"+flavour.name+routine.name+"("
result += (",\n"+indent).join([a for a in arguments])
result += ",\n"+indent+"num_queues, queues, num_wait_events, wait_events, events);"
result += "\n}\n"
if routine.has_tests:
result += "\n// Forwards the clBLAS calls for %s\n" % (routine.ShortNames())
if routine.NoScalars():
result += routine.RoutineHeaderWrapperCL(routine.template, True, 21)+";\n"
for flavour in routine.flavours:
indent = " "*(17 + routine.Length())
result += routine.RoutineHeaderWrapperCL(flavour, False, 21)+" {\n"
arguments = routine.ArgumentsWrapperCL(flavour)
if routine.scratch:
result += " auto queue = Queue(queues[0]);\n"
result += " auto context = queue.GetContext();\n"
result += " auto scratch_buffer = Buffer<"+flavour.template+">(context, "+routine.scratch+");\n"
arguments += ["scratch_buffer()"]
result += " return clblas"+flavour.name+routine.name+"("
result += (",\n"+indent).join([a for a in arguments])
result += ",\n"+indent+"num_queues, queues, num_wait_events, wait_events, events);"
result += "\n}\n"
return result
# The wrapper to the reference CBLAS routines (for performance/correctness testing)
def wrapper_cblas(routines):
result = ""
for routine in routines:
if routine.has_tests:
result += "\n// Forwards the Netlib BLAS calls for %s\n" % (routine.ShortNames())
for flavour in routine.flavours:
indent = " "*(10 + routine.Length())
result += routine.RoutineHeaderWrapperC(flavour, False, 12)+" {\n"
arguments = routine.ArgumentsWrapperC(flavour)
# Double-precision scalars
for scalar in routine.scalars:
if flavour.IsComplex(scalar):
result += " const auto "+scalar+"_array = std::vector<"+flavour.buffertype[:-1]+">{"+scalar+".real(), "+scalar+".imag()};\n"
# Special case for scalar outputs
assignment = ""
postfix = ""
endofline = ""
extra_argument = ""
for output_buffer in routine.outputs:
if output_buffer in routine.ScalarBuffersFirst():
if flavour in [C,Z]:
postfix += "_sub"
indent += " "
extra_argument += ",\n"+indent+"reinterpret_cast<return_pointer_"+flavour.buffertype[:-1]+">(&"+output_buffer+"_buffer["+output_buffer+"_offset])"
elif output_buffer in routine.IndexBuffers():
assignment = "((int*)&"+output_buffer+"_buffer[0])["+output_buffer+"_offset] = "
indent += " "*len(assignment)
else:
assignment = output_buffer+"_buffer["+output_buffer+"_offset]"
if (flavour.name in ["Sc","Dz"]):
assignment = assignment+".real("
endofline += ")"
else:
assignment = assignment+" = "
indent += " "*len(assignment)
result += " "+assignment+"cblas_"+flavour.name.lower()+routine.name+postfix+"("
result += (",\n"+indent).join([a for a in arguments])
result += extra_argument+endofline+");"
result += "\n}\n"
return result
# ==================================================================================================
@ -246,9 +307,10 @@ files = [
path_clblast+"/include/clblast_c.h",
path_clblast+"/src/clblast_c.cc",
path_clblast+"/test/wrapper_clblas.h",
path_clblast+"/test/wrapper_cblas.h",
]
header_lines = [84, 63, 80, 24, 22]
footer_lines = [6, 3, 5, 2, 6]
header_lines = [84, 71, 93, 22, 29, 41]
footer_lines = [17, 71, 19, 14, 6, 6]
# Checks whether the command-line arguments are valid; exists otherwise
for f in files:
@ -282,6 +344,8 @@ for i in xrange(0,len(files)):
body += clblast_c_cc(routines[level-1])
if i == 4:
body += wrapper_clblas(routines[level-1])
if i == 5:
body += wrapper_cblas(routines[level-1])
f.write("".join(file_header))
f.write(body)
f.write("".join(file_footer))
@ -291,57 +355,117 @@ for i in xrange(0,len(files)):
# Outputs all the correctness-test implementations
for level in [1,2,3]:
for routine in routines[level-1]:
filename = path_clblast+"/test/correctness/routines/level"+str(level)+"/x"+routine.name+".cc"
with open(filename, "w") as f:
body = ""
body += "#include \"correctness/testblas.h\"\n"
body += "#include \"routines/level"+str(level)+"/x"+routine.name+".h\"\n\n"
body += "// Shortcuts to the clblast namespace\n"
body += "using float2 = clblast::float2;\n"
body += "using double2 = clblast::double2;\n\n"
body += "// Main function (not within the clblast namespace)\n"
body += "int main(int argc, char *argv[]) {\n"
not_first = "false"
for flavour in routine.flavours:
body += " clblast::RunTests<clblast::TestX"+routine.name+flavour.TestTemplate()
body += ">(argc, argv, "+not_first+", \""+flavour.name+routine.name.upper()+"\");\n"
not_first = "true"
body += " return 0;\n"
body += "}\n"
f.write(header+"\n")
f.write(body)
f.write(footer)
if routine.has_tests:
filename = path_clblast+"/test/correctness/routines/level"+str(level)+"/x"+routine.name+".cc"
with open(filename, "w") as f:
body = ""
body += "#include \"correctness/testblas.h\"\n"
body += "#include \"routines/level"+str(level)+"/x"+routine.name+".h\"\n\n"
body += "// Shortcuts to the clblast namespace\n"
body += "using float2 = clblast::float2;\n"
body += "using double2 = clblast::double2;\n\n"
body += "// Main function (not within the clblast namespace)\n"
body += "int main(int argc, char *argv[]) {\n"
not_first = "false"
for flavour in routine.flavours:
body += " clblast::RunTests<clblast::TestX"+routine.name+flavour.TestTemplate()
body += ">(argc, argv, "+not_first+", \""+flavour.name+routine.name.upper()+"\");\n"
not_first = "true"
body += " return 0;\n"
body += "}\n"
f.write(header+"\n")
f.write(body)
f.write(footer)
# Outputs all the performance-test implementations
for level in [1,2,3]:
for routine in routines[level-1]:
filename = path_clblast+"/test/performance/routines/level"+str(level)+"/x"+routine.name+".cc"
with open(filename, "w") as f:
body = ""
body += "#include \"performance/client.h\"\n"
body += "#include \"routines/level"+str(level)+"/x"+routine.name+".h\"\n\n"
body += "// Shortcuts to the clblast namespace\n"
body += "using float2 = clblast::float2;\n"
body += "using double2 = clblast::double2;\n\n"
body += "// Main function (not within the clblast namespace)\n"
body += "int main(int argc, char *argv[]) {\n"
default = PrecisionToFullName(routine.flavours[0].name)
body += " switch(clblast::GetPrecision(argc, argv, clblast::Precision::k"+default+")) {\n"
for precision in ["H","S","D","C","Z"]:
body += " case clblast::Precision::k"+PrecisionToFullName(precision)+":"
found = False
for flavour in routine.flavours:
if flavour.name == precision:
body += "\n clblast::RunClient<clblast::TestX"+routine.name+flavour.TestTemplate()
body += ">(argc, argv); break;\n"
found = True
if not found:
body += " throw std::runtime_error(\"Unsupported precision mode\");\n"
body += " }\n"
body += " return 0;\n"
body += "}\n"
f.write(header+"\n")
f.write(body)
f.write(footer)
if routine.has_tests:
filename = path_clblast+"/test/performance/routines/level"+str(level)+"/x"+routine.name+".cc"
with open(filename, "w") as f:
body = ""
body += "#include \"performance/client.h\"\n"
body += "#include \"routines/level"+str(level)+"/x"+routine.name+".h\"\n\n"
body += "// Shortcuts to the clblast namespace\n"
body += "using float2 = clblast::float2;\n"
body += "using double2 = clblast::double2;\n\n"
body += "// Main function (not within the clblast namespace)\n"
body += "int main(int argc, char *argv[]) {\n"
default = PrecisionToFullName(routine.flavours[0].precision_name)
body += " switch(clblast::GetPrecision(argc, argv, clblast::Precision::k"+default+")) {\n"
for precision in ["H","S","D","C","Z"]:
body += " case clblast::Precision::k"+PrecisionToFullName(precision)+":"
found = False
for flavour in routine.flavours:
if flavour.precision_name == precision:
body += "\n clblast::RunClient<clblast::TestX"+routine.name+flavour.TestTemplate()
body += ">(argc, argv); break;\n"
found = True
if not found:
body += " throw std::runtime_error(\"Unsupported precision mode\");\n"
body += " }\n"
body += " return 0;\n"
body += "}\n"
f.write(header+"\n")
f.write(body)
f.write(footer)
# ==================================================================================================
# Outputs the API documentation
filename = path_clblast+"/doc/clblast.md"
with open(filename, "w") as f:
# Outputs the header
f.write("CLBlast: API reference\n")
f.write("================\n")
f.write("\n\n")
# Loops over the routines
for level in [1,2,3]:
for routine in routines[level-1]:
if routine.implemented:
# Routine header
f.write("x"+routine.name.upper()+": "+routine.description+"\n")
f.write("-------------\n")
f.write("\n")
f.write(routine.details+"\n")
f.write("\n")
# Routine API
f.write("C++ API:\n")
f.write("```\n")
f.write(routine.RoutineHeaderCPP(12, "")+"\n")
f.write("```\n")
f.write("\n")
f.write("C API:\n")
f.write("```\n")
for flavour in routine.flavours:
f.write(routine.RoutineHeaderC(flavour, 20, "")+"\n")
f.write("```\n")
f.write("\n")
# Routine arguments
f.write("Arguments to "+routine.name.upper()+":\n")
f.write("\n")
for argument in routine.ArgumentsDoc():
f.write("* "+argument+"\n")
f.write("* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.\n")
f.write("* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.\n")
f.write("\n")
# Routine requirements
if len(routine.RequirementsDoc()) > 0:
f.write("Requirements for "+routine.name.upper()+":\n")
f.write("\n")
for requirement in routine.RequirementsDoc():
f.write("* "+requirement+"\n")
f.write("\n")
# Routine footer
f.write("\n\n")
# ==================================================================================================

View file

@ -28,7 +28,7 @@ def OptionToCLBlast(x):
}[x]
# As above, but for clBLAS data-types
def OptionToWrapper(x):
def OptionToWrapperCL(x):
return {
'layout': "clblasOrder",
'a_transpose': "clblasTranspose",
@ -39,16 +39,38 @@ def OptionToWrapper(x):
'diagonal': "clblasDiag",
}[x]
# Buffers without 'ld' or 'inc' parameter
NO_LD_INC = ["dot","ap"]
# As above, but for CBLAS data-types
def OptionToWrapperC(x):
return {
'layout': "CBLAS_ORDER",
'a_transpose': "CBLAS_TRANSPOSE",
'b_transpose': "CBLAS_TRANSPOSE",
'ab_transpose': "CBLAS_TRANSPOSE",
'side': "CBLAS_SIDE",
'triangle': "CBLAS_UPLO",
'diagonal': "CBLAS_DIAG",
}[x]
# Translates an option name to a documentation string
def OptionToDoc(x):
return {
'layout': "Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.",
'a_transpose': "Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.",
'b_transpose': "Transposing the input matrix B, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.",
'ab_transpose': "Transposing the packed input matrix AP, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.",
'side': "The horizontal position of the triangular matrix, either `Side::kLeft` (141) or `Side::kRight` (142).",
'triangle': "The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).",
'diagonal': "The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for a non-unit values on the diagonal or `Diagonal::kUnit` (132) for a unit values on the diagonal.",
}[x]
# ==================================================================================================
# Class holding routine-specific information (e.g. name, which arguments, which precisions)
class Routine():
def __init__(self, implemented, level, name, template, flavours, sizes, options,
inputs, outputs, scalars, scratch, description):
def __init__(self, implemented, has_tests, level, name, template, flavours, sizes, options,
inputs, outputs, scalars, scratch, description, details, requirements):
self.implemented = implemented
self.has_tests = has_tests
self.level = level
self.name = name
self.template = template
@ -60,6 +82,26 @@ class Routine():
self.scalars = scalars
self.scratch = scratch # Scratch buffer (e.g. for xDOT)
self.description = description
self.details = details
self.requirements = requirements
# List of scalar buffers: single-element result buffers that appear before the
# alpha scalar in the argument order (reduction outputs such as dot products,
# norms, sums, and arg-max/arg-min indices).
def ScalarBuffersFirst(self):
return ["dot","nrm2","asum","sum","imax","imin"]
# Scalar buffers that appear at the end of the argument order; these are used
# by the rotation-generation routines (ROTG: sa/sb/sc/ss, ROTMG: sd1/sd2/sx1/
# sy1/sparam) -- see the corresponding C wrappers elsewhere in this commit.
def ScalarBuffersSecond(self):
return ["sa","sb","sc","ss","sd1","sd2","sx1","sy1","sparam"]
# List of scalars other than alpha and beta (the ROT routine's cos/sin).
def OtherScalars(self):
return ["cos","sin"]
# List of buffers with unsigned int type (index results of the ixAMAX/ixMAX/
# ixMIN routines) -- BufferCladuc uses this to pick the buffer element type.
def IndexBuffers(self):
return ["imax","imin"]
# List of buffers without 'inc' or 'ld': all scalar buffers plus 'ap', the
# packed-matrix buffer, which has no leading dimension.
def BuffersWithoutLdInc(self):
return self.ScalarBuffersFirst() + self.ScalarBuffersSecond() + ["ap"]
# Retrieves the number of characters in the routine's name
def Length(self):
@ -87,6 +129,12 @@ class Routine():
return ["ap","a","b","c"]
return ["y","c"]
# Distinguish between vectors and matrices
# Buffers that are mathematically vectors (documented with a stride/increment).
def BuffersVector(self):
return ["x","y"]
# Buffers that are mathematically matrices (documented with a leading
# dimension); 'ap' is the packed-matrix variant.
def BuffersMatrix(self):
return ["a","b","c","ap"]
# ==============================================================================================
# Retrieves a variable name for a specific input/output vector/matrix (e.g. 'x')
@ -94,7 +142,7 @@ class Routine():
if (name in self.inputs) or (name in self.outputs):
a = [name+"_buffer"]
b = [name+"_offset"]
c = [name+"_"+self.Postfix(name)] if (name not in NO_LD_INC) else []
c = [name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else []
return [", ".join(a+b+c)]
return []
@ -104,21 +152,32 @@ class Routine():
if (name in self.inputs) or (name in self.outputs):
a = [prefix+"cl_mem "+name+"_buffer"]
b = ["const size_t "+name+"_offset"]
c = ["const size_t "+name+"_"+self.Postfix(name)] if (name not in NO_LD_INC) else []
c = ["const size_t "+name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else []
return [", ".join(a+b+c)]
return []
# As above but as vectors
# Renders a buffer argument as a host-side std::vector reference, used by the
# CBLAS wrapper which operates on host memory instead of cl_mem. Input buffers
# get a 'const' prefix; the ld/inc argument is omitted for buffers listed in
# BuffersWithoutLdInc().
def BufferDefVector(self, name, flavour):
prefix = "const " if (name in self.inputs) else ""
if (name in self.inputs) or (name in self.outputs):
a = [prefix+"std::vector<"+flavour.buffertype+">& "+name+"_buffer"]
b = ["const size_t "+name+"_offset"]
c = ["const size_t "+name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else []
return [", ".join(a+b+c)]
return []
# As above but with Claduc buffers
def BufferCladuc(self, name):
if (name in self.inputs) or (name in self.outputs):
a = ["Buffer<"+self.template.buffertype+">("+name+"_buffer)"]
buffertype = "unsigned int" if (name in self.IndexBuffers()) else self.template.buffertype
a = ["Buffer<"+buffertype+">("+name+"_buffer)"]
b = [name+"_offset"]
c = [name+"_"+self.Postfix(name)] if (name not in NO_LD_INC) else []
c = [name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else []
return [", ".join(a+b+c)]
return []
# As above but with a static cast for clBLAS wrapper
def BufferWrapper(self, name):
def BufferWrapperCL(self, name):
if (name in self.inputs) or (name in self.outputs):
a = [name+"_buffer"]
b = [name+"_offset"]
@ -130,16 +189,47 @@ class Routine():
return [", ".join(a+b+c)]
return []
# As above but with a static cast for CBLAS wrapper
# Renders a buffer argument as CBLAS expects it: a raw host pointer with the
# offset folded in (CBLAS has no separate offset argument).
def BufferWrapperC(self, name, flavour):
prefix = "const " if (name in self.inputs) else ""
if (name in self.inputs) or (name in self.outputs):
# Special case: 'sy1' is passed by value (a single element), not by pointer.
if name == "sy1":
a = [name+"_buffer["+name+"_offset]"]
elif flavour.precision_name in ["C","Z"]:
# Complex data: reinterpret the element address as a pointer to the
# underlying real type; buffertype[:-1] strips a trailing character
# (presumably e.g. "float2" -> "float" -- TODO confirm buffertype naming).
a = ["reinterpret_cast<"+prefix+flavour.buffertype[:-1]+"*>(&"+name+"_buffer["+name+"_offset])"]
else:
# Plain data: address of the first element, offset applied here.
a = ["&"+name+"_buffer["+name+"_offset]"]
c = []
if (name in ["x","y"]):
# CBLAS increments are plain ints, while this codebase uses size_t.
c = ["static_cast<int>("+name+"_"+self.Postfix(name)+")"]
elif (name in ["a","b","c"]):
c = [name+"_"+self.Postfix(name)]
return [", ".join(a+c)]
return []
# As above, but only data-types
def BufferType(self, name):
prefix = "const " if (name in self.inputs) else ""
if (name in self.inputs) or (name in self.outputs):
a = [prefix+"cl_mem"]
b = ["const size_t"]
c = ["const size_t"] if (name not in NO_LD_INC) else []
c = ["const size_t"] if (name not in self.BuffersWithoutLdInc()) else []
return [", ".join(a+b+c)]
return []
# Retrieves the documentation of the buffers
# Produces markdown bullet lines for a buffer argument: the cl_mem handle, its
# offset, and (when applicable) its ld/inc argument. Matrices are documented
# with a leading dimension, vectors with a stride/increment.
def BufferDoc(self, name):
prefix = "const " if (name in self.inputs) else ""
inout = "input" if (name in self.inputs) else "output"
if (name in self.inputs) or (name in self.outputs):
math_name = name.upper()+" matrix" if (name in self.BuffersMatrix()) else name+" vector"
incld_description = "Leading dimension " if (name in self.BuffersMatrix()) else "Stride/increment "
a = ["`"+prefix+"cl_mem "+name+"_buffer`: OpenCL buffer to store the "+inout+" "+math_name+"."]
b = ["`const size_t "+name+"_offset`: The offset in elements from the start of the "+inout+" "+math_name+"."]
c = ["`const size_t "+name+"_"+self.Postfix(name)+"`: "+incld_description+"of the "+inout+" "+math_name+"."] if (name not in self.BuffersWithoutLdInc()) else []
return a+b+c
return []
# ==============================================================================================
# Retrieves the name of a scalar (alpha/beta)
@ -168,6 +258,14 @@ class Routine():
return [name]
return []
# Retrieves the use of a scalar for CBLAS (alpha/beta)
# Complex scalars are passed as a pointer via '<name>_array.data()' (the array
# is presumably prepared by the calling template code -- confirm at call site);
# real scalars are passed by value.
def ScalarUseWrapperC(self, name, flavour):
if name in self.scalars:
if flavour.IsComplex(name):
return [name+"_array.data()"]
return [name]
return []
# Retrieves the definition of a scalar (alpha/beta)
def ScalarDef(self, name, flavour):
if name in self.scalars:
@ -192,6 +290,14 @@ class Routine():
return ["const "+flavour.beta_cpp]
return []
# Retrieves the documentation of a scalar
# 'alpha' is documented with the template's alpha type; every other scalar
# (i.e. 'beta') falls through to the beta type.
def ScalarDoc(self, name):
if name in self.scalars:
if name == "alpha":
return ["`const "+self.template.alpha_cpp+" "+name+"`: Input scalar constant."]
return ["`const "+self.template.beta_cpp+" "+name+"`: Input scalar constant."]
return []
# ==============================================================================================
# Retrieves a list of comma-separated sizes (m, n, k)
@ -212,6 +318,13 @@ class Routine():
return [", ".join(["const size_t" for s in self.sizes])]
return []
# Retrieves the documentation of the sizes
# One markdown bullet per size argument (e.g. m/n/k), each documented as a
# plain integer size.
def SizesDoc(self):
if self.sizes:
definitions = ["`const size_t "+s+"`: Integer size argument." for s in self.sizes]
return definitions
return []
# ==============================================================================================
# Retrieves a list of options
@ -235,9 +348,16 @@ class Routine():
return []
# As above, but now using clBLAS data-types
def OptionsDefWrapper(self):
def OptionsDefWrapperCL(self):
if self.options:
definitions = ["const "+OptionToWrapper(o)+" "+o for o in self.options]
definitions = ["const "+OptionToWrapperCL(o)+" "+o for o in self.options]
return [", ".join(definitions)]
return []
# As above, but now using CBLAS data-types
# Renders the option arguments (layout, transposes, side, ...) using the CBLAS
# enum type names from OptionToWrapperC, joined into one comma-separated string.
def OptionsDefWrapperC(self):
if self.options:
definitions = ["const "+OptionToWrapperC(o)+" "+o for o in self.options]
return [", ".join(definitions)]
return []
@ -248,72 +368,129 @@ class Routine():
return [", ".join(definitions)]
return []
# Retrieves the documentation of the options
# Combines the CLBlast enum type name (OptionToCLBlast, defined elsewhere in
# this file) with the per-option description from OptionToDoc.
def OptionsDoc(self):
if self.options:
definitions = ["`const "+OptionToCLBlast(o)+"`: "+OptionToDoc(o) for o in self.options]
return definitions
return []
# ==============================================================================================
# Retrieves a combination of all the argument names, with Claduc casts
def ArgumentsCladuc(self, flavour, indent):
return (self.Options() + self.Sizes() + self.BufferCladuc("dot") +
return (self.Options() + self.Sizes() +
list(chain(*[self.BufferCladuc(b) for b in self.ScalarBuffersFirst()])) +
self.Scalar("alpha") +
list(chain(*[self.BufferCladuc(b) for b in self.BuffersFirst()])) +
self.Scalar("beta") +
list(chain(*[self.BufferCladuc(b) for b in self.BuffersSecond()])) +
list(chain(*[self.Scalar(s) for s in ["d1","d2","a","b","c","s"]])))
list(chain(*[self.BufferCladuc(b) for b in self.ScalarBuffersSecond()])) +
list(chain(*[self.Scalar(s) for s in self.OtherScalars()])))
# Retrieves a combination of all the argument names, with CLBlast casts
def ArgumentsCast(self, flavour, indent):
return (self.OptionsCast(indent) + self.Sizes() + self.Buffer("dot") +
return (self.OptionsCast(indent) + self.Sizes() +
list(chain(*[self.Buffer(b) for b in self.ScalarBuffersFirst()])) +
self.ScalarUse("alpha", flavour) +
list(chain(*[self.Buffer(b) for b in self.BuffersFirst()])) +
self.ScalarUse("beta", flavour) +
list(chain(*[self.Buffer(b) for b in self.BuffersSecond()])) +
list(chain(*[self.ScalarUse(s, flavour) for s in ["d1","d2","a","b","c","s"]])))
list(chain(*[self.Buffer(b) for b in self.ScalarBuffersSecond()])) +
list(chain(*[self.ScalarUse(s, flavour) for s in self.OtherScalars()])))
# As above, but for the clBLAS wrapper
def ArgumentsWrapper(self, flavour):
return (self.Options() + self.Sizes() + self.BufferWrapper("dot") +
def ArgumentsWrapperCL(self, flavour):
return (self.Options() + self.Sizes() +
list(chain(*[self.BufferWrapperCL(b) for b in self.ScalarBuffersFirst()])) +
self.ScalarUseWrapper("alpha", flavour) +
list(chain(*[self.BufferWrapper(b) for b in self.BuffersFirst()])) +
list(chain(*[self.BufferWrapperCL(b) for b in self.BuffersFirst()])) +
self.ScalarUseWrapper("beta", flavour) +
list(chain(*[self.BufferWrapper(b) for b in self.BuffersSecond()])) +
list(chain(*[self.ScalarUseWrapper(s, flavour) for s in ["d1","d2","a","b","c","s"]])))
list(chain(*[self.BufferWrapperCL(b) for b in self.BuffersSecond()])) +
list(chain(*[self.BufferWrapperCL(b) for b in self.ScalarBuffersSecond()])) +
list(chain(*[self.ScalarUseWrapper(s, flavour) for s in self.OtherScalars()])))
# As above, but for the CBLAS wrapper
# Note: unlike the clBLAS variant there are no leading scalar buffers here, and
# buffer offsets are folded into the host pointers by BufferWrapperC.
def ArgumentsWrapperC(self, flavour):
return (self.Options() + self.Sizes() +
self.ScalarUseWrapperC("alpha", flavour) +
list(chain(*[self.BufferWrapperC(b, flavour) for b in self.BuffersFirst()])) +
self.ScalarUseWrapperC("beta", flavour) +
list(chain(*[self.BufferWrapperC(b, flavour) for b in self.BuffersSecond()])) +
list(chain(*[self.BufferWrapperC(b, flavour) for b in self.ScalarBuffersSecond()])) +
list(chain(*[self.ScalarUseWrapperC(s, flavour) for s in self.OtherScalars()])))
# Retrieves a combination of all the argument definitions
def ArgumentsDef(self, flavour):
return (self.OptionsDef() + self.SizesDef() + self.BufferDef("dot") +
return (self.OptionsDef() + self.SizesDef() +
list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersFirst()])) +
self.ScalarDef("alpha", flavour) +
list(chain(*[self.BufferDef(b) for b in self.BuffersFirst()])) +
self.ScalarDef("beta", flavour) +
list(chain(*[self.BufferDef(b) for b in self.BuffersSecond()])) +
list(chain(*[self.ScalarDef(s, flavour) for s in ["d1","d2","a","b","c","s"]])))
list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersSecond()])) +
list(chain(*[self.ScalarDef(s, flavour) for s in self.OtherScalars()])))
# As above, but clBLAS wrapper plain datatypes
def ArgumentsDefWrapper(self, flavour):
return (self.OptionsDefWrapper() + self.SizesDef() + self.BufferDef("dot") +
def ArgumentsDefWrapperCL(self, flavour):
return (self.OptionsDefWrapperCL() + self.SizesDef() +
list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersFirst()])) +
self.ScalarDefPlain("alpha", flavour) +
list(chain(*[self.BufferDef(b) for b in self.BuffersFirst()])) +
self.ScalarDefPlain("beta", flavour) +
list(chain(*[self.BufferDef(b) for b in self.BuffersSecond()])) +
list(chain(*[self.ScalarDefPlain(s, flavour) for s in ["d1","d2","a","b","c","s"]])))
list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersSecond()])) +
list(chain(*[self.ScalarDefPlain(s, flavour) for s in self.OtherScalars()])))
# As above, but CBLAS wrapper plain datatypes
# Buffer arguments are declared as host-side std::vector references (see
# BufferDefVector), since the CBLAS wrapper works on host memory.
def ArgumentsDefWrapperC(self, flavour):
return (self.OptionsDefWrapperC() + self.SizesDef() +
list(chain(*[self.BufferDefVector(b, flavour) for b in self.ScalarBuffersFirst()])) +
self.ScalarDefPlain("alpha", flavour) +
list(chain(*[self.BufferDefVector(b, flavour) for b in self.BuffersFirst()])) +
self.ScalarDefPlain("beta", flavour) +
list(chain(*[self.BufferDefVector(b, flavour) for b in self.BuffersSecond()])) +
list(chain(*[self.BufferDefVector(b, flavour) for b in self.ScalarBuffersSecond()])) +
list(chain(*[self.ScalarDefPlain(s, flavour) for s in self.OtherScalars()])))
# Retrieves a combination of all the argument types
def ArgumentsType(self, flavour):
return (self.OptionsType() + self.SizesType() + self.BufferType("dot") +
return (self.OptionsType() + self.SizesType() +
list(chain(*[self.BufferType(b) for b in self.ScalarBuffersFirst()])) +
self.ScalarType("alpha", flavour) +
list(chain(*[self.BufferType(b) for b in self.BuffersFirst()])) +
self.ScalarType("beta", flavour) +
list(chain(*[self.BufferType(b) for b in self.BuffersSecond()])) +
list(chain(*[self.ScalarType(s, flavour) for s in ["d1","d2","a","b","c","s"]])))
list(chain(*[self.BufferType(b) for b in self.ScalarBuffersSecond()])) +
list(chain(*[self.ScalarType(s, flavour) for s in self.OtherScalars()])))
# Retrieves a combination of all the argument types
def ArgumentsDoc(self):
return (self.OptionsDoc() + self.SizesDoc() +
list(chain(*[self.BufferDoc(b) for b in self.ScalarBuffersFirst()])) +
list(chain(*[self.BufferDoc(b) for b in self.ScalarBuffersFirst()])) +
self.ScalarDoc("alpha") +
list(chain(*[self.BufferDoc(b) for b in self.BuffersFirst()])) +
self.ScalarDoc("beta") +
list(chain(*[self.BufferDoc(b) for b in self.BuffersSecond()])) +
list(chain(*[self.BufferDoc(b) for b in self.ScalarBuffersSecond()])) +
list(chain(*[self.ScalarDoc(s) for s in self.OtherScalars()])))
# ==============================================================================================
# Retrieves a list of routine requirements for documentation
# NOTE(review): always returns [] and does not consult self.requirements,
# which the constructor stores -- possibly a placeholder; confirm intent.
def RequirementsDoc(self):
return []
# ==============================================================================================
# Retrieves the C++ templated definition for a routine
def RoutineHeaderCPP(self, spaces):
def RoutineHeaderCPP(self, spaces, default_event):
indent = " "*(spaces + self.Length())
result = "template <"+self.template.name+">\n"
result += "StatusCode "+self.name.capitalize()+"("
result += (",\n"+indent).join([a for a in self.ArgumentsDef(self.template)])
result += ",\n"+indent+"cl_command_queue* queue, cl_event* event)"
result += ",\n"+indent+"cl_command_queue* queue, cl_event* event"+default_event+")"
return result
# As above, but now without variable names
@ -326,15 +503,15 @@ class Routine():
return result
# As above, but now for C
def RoutineHeaderC(self, flavour, spaces):
def RoutineHeaderC(self, flavour, spaces, extra_qualifier):
indent = " "*(spaces + self.Length())
result = "StatusCode CLBlast"+flavour.name+self.name+"("
result = "StatusCode"+extra_qualifier+" CLBlast"+flavour.name+self.name+"("
result += (",\n"+indent).join([a for a in self.ArgumentsDef(flavour)])
result += ",\n"+indent+"cl_command_queue* queue, cl_event* event)"
return result
# As above, but now for the clBLAS wrapper
def RoutineHeaderWrapper(self, flavour, def_only, spaces):
def RoutineHeaderWrapperCL(self, flavour, def_only, spaces):
template = "<"+flavour.template+">" if self.NoScalars() and not def_only else ""
indent = " "*(spaces + self.Length() + len(template))
result = ""
@ -344,9 +521,16 @@ class Routine():
result += flavour.name
result += ">\n"
result += "clblasStatus clblasX"+self.name+template+"("
result += (",\n"+indent).join([a for a in self.ArgumentsDefWrapper(flavour)])
result += (",\n"+indent).join([a for a in self.ArgumentsDefWrapperCL(flavour)])
result += ",\n"+indent+"cl_uint num_queues, cl_command_queue *queues"
result += ",\n"+indent+"cl_uint num_wait_events, const cl_event *wait_events, cl_event *events)"
return result
# As above, but now for the CBLAS wrapper
# Builds the 'void cblasX<name>(...)' header with arguments wrapped/indented
# to align under the opening parenthesis.
# NOTE(review): 'def_only' is accepted for signature symmetry with
# RoutineHeaderWrapperCL but is unused here -- confirm this is intentional.
def RoutineHeaderWrapperC(self, flavour, def_only, spaces):
indent = " "*(spaces + self.Length())
result = "void cblasX"+self.name+"("
result += (",\n"+indent).join([a for a in self.ArgumentsDefWrapperC(flavour)])+")"
return result
# ==================================================================================================

113
src/cache.cc Normal file
View file

@ -0,0 +1,113 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the caching functionality of compiled binaries and programs.
//
// =================================================================================================
#include <mutex>
#include <stdexcept>
#include <string>
#include <vector>

#include "internal/cache.h"
namespace clblast {
namespace cache {
// =================================================================================================
// Stores the compiled binary or IR in the cache, keyed on device name,
// precision, and routine name.
void StoreBinaryToCache(const std::string &binary, const std::string &device_name,
                        const Precision &precision, const std::string &routine_name) {
  // RAII lock: the mutex is released even if push_back throws (e.g.
  // std::bad_alloc), unlike the manual lock()/unlock() pair it replaces.
  std::lock_guard<std::mutex> lock(binary_cache_mutex_);
  binary_cache_.push_back(BinaryCache{binary, device_name, precision, routine_name});
}
// Stores the compiled program in the cache, keyed on the OpenCL context
// pointer, precision, and routine name.
void StoreProgramToCache(const Program &program, const Context &context,
                         const Precision &precision, const std::string &routine_name) {
  // RAII lock: released on every exit path, including a throwing push_back.
  std::lock_guard<std::mutex> lock(program_cache_mutex_);
  program_cache_.push_back(ProgramCache{program, context.pointer(), precision, routine_name});
}
// Queries the cache and retrieves a matching binary. Assumes that the match is available, throws
// otherwise.
const std::string& GetBinaryFromCache(const std::string &device_name, const Precision &precision,
                                      const std::string &routine_name) {
  // RAII lock replaces the manual lock()/unlock() pairs: it is released on
  // every exit path, including the throw below, and is exception-safe.
  std::lock_guard<std::mutex> lock(binary_cache_mutex_);
  for (auto &cached_binary: binary_cache_) {
    if (cached_binary.MatchInCache(device_name, precision, routine_name)) {
      // NOTE(review): as in the original code, the returned reference is only
      // valid while no other thread grows binary_cache_ afterwards.
      return cached_binary.binary;
    }
  }
  throw std::runtime_error("Internal CLBlast error: Expected binary in cache, but found none.");
}
// Queries the cache and retrieves a matching program. Assumes that the match is available, throws
// otherwise.
const Program& GetProgramFromCache(const Context &context, const Precision &precision,
                                   const std::string &routine_name) {
  // RAII lock: exception-safe replacement for the manual lock()/unlock() pairs.
  std::lock_guard<std::mutex> lock(program_cache_mutex_);
  for (auto &cached_program: program_cache_) {
    if (cached_program.MatchInCache(context.pointer(), precision, routine_name)) {
      // NOTE(review): as in the original code, the returned reference is only
      // valid while no other thread grows program_cache_ afterwards.
      return cached_program.program;
    }
  }
  throw std::runtime_error("Internal CLBlast error: Expected program in cache, but found none.");
}
// Queries the cache to see whether or not the compiled kernel is already there
bool BinaryIsInCache(const std::string &device_name, const Precision &precision,
                     const std::string &routine_name) {
  // RAII lock: one release point instead of an unlock on each return path.
  std::lock_guard<std::mutex> lock(binary_cache_mutex_);
  for (auto &cached_binary: binary_cache_) {
    if (cached_binary.MatchInCache(device_name, precision, routine_name)) {
      return true;
    }
  }
  return false;
}
// Queries the cache to see whether or not the compiled kernel is already there
bool ProgramIsInCache(const Context &context, const Precision &precision,
                      const std::string &routine_name) {
  // RAII lock: one release point instead of an unlock on each return path.
  std::lock_guard<std::mutex> lock(program_cache_mutex_);
  for (auto &cached_program: program_cache_) {
    if (cached_program.MatchInCache(context.pointer(), precision, routine_name)) {
      return true;
    }
  }
  return false;
}
// =================================================================================================
// Clears the cache of stored binaries and programs
StatusCode ClearCache() {
  { // Scoped so the two mutexes are never held at the same time (matches the
    // original lock ordering, but now exception-safe via RAII).
    std::lock_guard<std::mutex> lock(binary_cache_mutex_);
    binary_cache_.clear();
  }
  {
    std::lock_guard<std::mutex> lock(program_cache_mutex_);
    program_cache_.clear();
  }
  return StatusCode::kSuccess;
}
// =================================================================================================
} // namespace cache
} // namespace clblast

File diff suppressed because it is too large Load diff

View file

@ -13,9 +13,7 @@
#include <string>
extern "C" {
#include "clblast_c.h"
}
#include "clblast_c.h"
#include "clblast.h"
#include "internal/utilities.h"
@ -27,6 +25,118 @@ using double2 = clblast::double2;
// BLAS level-1 (vector-vector) routines
// =================================================================================================
// ROTG: generates the parameters of a Givens plane rotation. Each C function
// below is a thin ABI wrapper: it forwards to the templated C++ API and casts
// the returned clblast::StatusCode to the C-visible StatusCode enum.
StatusCode CLBlastSrotg(cl_mem sa_buffer, const size_t sa_offset,
cl_mem sb_buffer, const size_t sb_offset,
cl_mem sc_buffer, const size_t sc_offset,
cl_mem ss_buffer, const size_t ss_offset,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Rotg<float>(sa_buffer, sa_offset,
sb_buffer, sb_offset,
sc_buffer, sc_offset,
ss_buffer, ss_offset,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastDrotg(cl_mem sa_buffer, const size_t sa_offset,
cl_mem sb_buffer, const size_t sb_offset,
cl_mem sc_buffer, const size_t sc_offset,
cl_mem ss_buffer, const size_t ss_offset,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Rotg<double>(sa_buffer, sa_offset,
sb_buffer, sb_offset,
sc_buffer, sc_offset,
ss_buffer, ss_offset,
queue, event);
return static_cast<StatusCode>(status);
}
// ROTMG: generates the parameters of a modified Givens rotation (sparam).
StatusCode CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
cl_mem sd2_buffer, const size_t sd2_offset,
cl_mem sx1_buffer, const size_t sx1_offset,
const cl_mem sy1_buffer, const size_t sy1_offset,
cl_mem sparam_buffer, const size_t sparam_offset,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Rotmg<float>(sd1_buffer, sd1_offset,
sd2_buffer, sd2_offset,
sx1_buffer, sx1_offset,
sy1_buffer, sy1_offset,
sparam_buffer, sparam_offset,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastDrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
cl_mem sd2_buffer, const size_t sd2_offset,
cl_mem sx1_buffer, const size_t sx1_offset,
const cl_mem sy1_buffer, const size_t sy1_offset,
cl_mem sparam_buffer, const size_t sparam_offset,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Rotmg<double>(sd1_buffer, sd1_offset,
sd2_buffer, sd2_offset,
sx1_buffer, sx1_offset,
sy1_buffer, sy1_offset,
sparam_buffer, sparam_offset,
queue, event);
return static_cast<StatusCode>(status);
}
// ROT: applies a plane rotation with the given cos/sin to vectors x and y.
// (The template argument of clblast::Rot is deduced from the scalar types.)
StatusCode CLBlastSrot(const size_t n,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
const float cos,
const float sin,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Rot(n,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
cos,
sin,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastDrot(const size_t n,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
const double cos,
const double sin,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Rot(n,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
cos,
sin,
queue, event);
return static_cast<StatusCode>(status);
}
// ROTM: applies a modified Givens rotation (parameters in sparam) to x and y.
StatusCode CLBlastSrotm(const size_t n,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem sparam_buffer, const size_t sparam_offset,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Rotm<float>(n,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
sparam_buffer, sparam_offset,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastDrotm(const size_t n,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem sparam_buffer, const size_t sparam_offset,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Rotm<double>(n,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
sparam_buffer, sparam_offset,
queue, event);
return static_cast<StatusCode>(status);
}
// SWAP
StatusCode CLBlastSswap(const size_t n,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
@ -281,6 +391,258 @@ StatusCode CLBlastZdotc(const size_t n,
return static_cast<StatusCode>(status);
}
// NRM2: Euclidean norm of vector x, written to nrm2_buffer at nrm2_offset.
// Thin C-ABI wrappers around the templated clblast::Nrm2<T>; the S/D/Sc/Dz
// prefixes select float, double, float2, and double2 respectively.
StatusCode CLBlastSnrm2(const size_t n,
cl_mem nrm2_buffer, const size_t nrm2_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Nrm2<float>(n,
nrm2_buffer, nrm2_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastDnrm2(const size_t n,
cl_mem nrm2_buffer, const size_t nrm2_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Nrm2<double>(n,
nrm2_buffer, nrm2_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastScnrm2(const size_t n,
cl_mem nrm2_buffer, const size_t nrm2_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Nrm2<float2>(n,
nrm2_buffer, nrm2_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastDznrm2(const size_t n,
cl_mem nrm2_buffer, const size_t nrm2_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Nrm2<double2>(n,
nrm2_buffer, nrm2_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
// ASUM: sum of absolute values of vector x, written to asum_buffer.
StatusCode CLBlastSasum(const size_t n,
cl_mem asum_buffer, const size_t asum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Asum<float>(n,
asum_buffer, asum_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastDasum(const size_t n,
cl_mem asum_buffer, const size_t asum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Asum<double>(n,
asum_buffer, asum_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastScasum(const size_t n,
cl_mem asum_buffer, const size_t asum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Asum<float2>(n,
asum_buffer, asum_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastDzasum(const size_t n,
cl_mem asum_buffer, const size_t asum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Asum<double2>(n,
asum_buffer, asum_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
// SUM: non-absolute version of ASUM (plain sum of the elements of x), a
// CLBlast extension to the BLAS standard (see changelog). Thin C-ABI wrappers.
StatusCode CLBlastSsum(const size_t n,
cl_mem sum_buffer, const size_t sum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Sum<float>(n,
sum_buffer, sum_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastDsum(const size_t n,
cl_mem sum_buffer, const size_t sum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Sum<double>(n,
sum_buffer, sum_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastScsum(const size_t n,
cl_mem sum_buffer, const size_t sum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Sum<float2>(n,
sum_buffer, sum_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastDzsum(const size_t n,
cl_mem sum_buffer, const size_t sum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Sum<double2>(n,
sum_buffer, sum_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
// AMAX: index of the element of x with the largest absolute value, written
// to imax_buffer (an index result, not an element value).
StatusCode CLBlastiSamax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Amax<float>(n,
imax_buffer, imax_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastiDamax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Amax<double>(n,
imax_buffer, imax_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastiCamax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Amax<float2>(n,
imax_buffer, imax_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastiZamax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Amax<double2>(n,
imax_buffer, imax_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
// MAX
StatusCode CLBlastiSmax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Max<float>(n,
imax_buffer, imax_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastiDmax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Max<double>(n,
imax_buffer, imax_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastiCmax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Max<float2>(n,
imax_buffer, imax_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
StatusCode CLBlastiZmax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Max<double2>(n,
imax_buffer, imax_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
// MIN
StatusCode CLBlastiSmin(const size_t n,
cl_mem imin_buffer, const size_t imin_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event) {
auto status = clblast::Min<float>(n,
imin_buffer, imin_offset,
x_buffer, x_offset, x_inc,
queue, event);
return static_cast<StatusCode>(status);
}
// iDMIN: index of the minimum (non-absolute) element of a double-precision vector.
// Plain-C wrapper: forwards to the templated C++ routine and narrows the status code.
StatusCode CLBlastiDmin(const size_t n,
                        cl_mem imin_buffer, const size_t imin_offset,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(
      clblast::Min<double>(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc, queue, event));
}
// iCMIN: index of the minimum (non-absolute) element of a complex single-precision vector.
// Plain-C wrapper: forwards to the templated C++ routine and narrows the status code.
StatusCode CLBlastiCmin(const size_t n,
                        cl_mem imin_buffer, const size_t imin_offset,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(
      clblast::Min<float2>(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc, queue, event));
}
// iZMIN: index of the minimum (non-absolute) element of a complex double-precision vector.
// Plain-C wrapper: forwards to the templated C++ routine and narrows the status code.
StatusCode CLBlastiZmin(const size_t n,
                        cl_mem imin_buffer, const size_t imin_offset,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(
      clblast::Min<double2>(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc, queue, event));
}
// =================================================================================================
// BLAS level-2 (matrix-vector) routines
// =================================================================================================
@ -2022,3 +2384,15 @@ StatusCode CLBlastZtrsm(const Layout layout, const Side side, const Triangle tri
}
// =================================================================================================
// Clears the cache of stored binaries
// C wrapper that empties the library's cache of previously compiled binaries.
StatusCode CLBlastClearCache() {
  const auto result = clblast::ClearCache();
  return static_cast<StatusCode>(result);
}
// Fills the cache with binaries for a specific device
// C wrapper that fills the cache with binaries for the given device.
StatusCode CLBlastFillCache(const cl_device_id device) {
  const auto result = clblast::FillCache(device);
  return static_cast<StatusCode>(result);
}
// =================================================================================================

View file

@ -40,6 +40,7 @@ R"(
typedef float16 real16;
#define ZERO 0.0f
#define ONE 1.0f
#define SMALLEST -1.0e37f
// Double-precision
#elif PRECISION == 64
@ -50,6 +51,7 @@ R"(
typedef double16 real16;
#define ZERO 0.0
#define ONE 1.0
#define SMALLEST -1.0e37
// Complex single-precision
#elif PRECISION == 3232
@ -64,6 +66,7 @@ R"(
real sC; real sD; real sE; real sF;} real16;
#define ZERO 0.0f
#define ONE 1.0f
#define SMALLEST -1.0e37f
// Complex Double-precision
#elif PRECISION == 6464
@ -78,6 +81,16 @@ R"(
real sC; real sD; real sE; real sF;} real16;
#define ZERO 0.0
#define ONE 1.0
#define SMALLEST -1.0e37
#endif
// Single-element version of a complex number
#if PRECISION == 3232
typedef float singlereal;
#elif PRECISION == 6464
typedef double singlereal;
#else
typedef real singlereal;
#endif
// =================================================================================================
@ -109,6 +122,13 @@ R"(
#define SetToOne(a) a = ONE
#endif
// The absolute value (component-wise)
#if PRECISION == 3232 || PRECISION == 6464
#define AbsoluteValue(value) value.x = fabs(value.x); value.y = fabs(value.y)
#else
#define AbsoluteValue(value) value = fabs(value)
#endif
// Adds two complex variables
#if PRECISION == 3232 || PRECISION == 6464
#define Add(c, a, b) c.x = a.x + b.x; c.y = a.y + b.y

View file

@ -0,0 +1,140 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains the Xamax kernel. It implements an index of absolute max computation using
// reduction kernels. Reduction is split in two parts. In the first (main) kernel the X vector is
// loaded, followed by a per-thread and a per-workgroup reduction. The second (epilogue) kernel
// is executed with a single workgroup only, computing the final result.
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
// this kernel file is used outside of the CLBlast library.
#ifndef WGS1
#define WGS1 64 // The local work-group size of the main kernel
#endif
#ifndef WGS2
#define WGS2 64 // The local work-group size of the epilogue kernel
#endif
// =================================================================================================
// The main reduction kernel, performing the loading and the majority of the operation.
// Each thread strides over the vector collecting a running maximum and its index; the
// work-group then tree-reduces those candidates in local memory and emits one
// (value, index) pair per work-group into maxgm/imaxgm.
__attribute__((reqd_work_group_size(WGS1, 1, 1)))
__kernel void Xamax(const int n,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global singlereal* maxgm, __global unsigned int* imaxgm) {
__local singlereal maxlm[WGS1];
__local unsigned int imaxlm[WGS1];
const int lid = get_local_id(0);
const int wgid = get_group_id(0);
const int num_groups = get_num_groups(0);
// Performs loading and the first steps of the reduction. The non-absolute MAX/MIN variants
// start from SMALLEST so that negative inputs can still win; the absolute variant starts
// from ZERO since fabs(x) >= 0.
#if defined(ROUTINE_MAX) || defined(ROUTINE_MIN) // non-absolute version
singlereal max = SMALLEST;
#else
singlereal max = ZERO;
#endif
unsigned int imax = 0;
int id = wgid*WGS1 + lid;
while (id < n) {
const int x_index = id*x_inc + x_offset;
// For complex data only the real component is compared
#if PRECISION == 3232 || PRECISION == 6464
singlereal x = xgm[x_index].x;
#else
singlereal x = xgm[x_index];
#endif
#if defined(ROUTINE_MAX) // non-absolute maximum version
// nothing special here
#elif defined(ROUTINE_MIN) // non-absolute minimum version
x = -x; // negate so the maximum search yields the minimum
#else
x = fabs(x); // absolute (ixAMAX) version
#endif
// NOTE(review): '>=' makes ties resolve to the last candidate scanned, whereas BLAS
// ixAMAX specifies the first occurrence -- confirm this tie-breaking is intended.
if (x >= max) {
max = x;
// Stores the raw buffer index (offset and stride folded in), not the logical 0-based
// vector index -- presumably accounted for by the caller; verify.
imax = id*x_inc + x_offset;
}
id += WGS1*num_groups; // stride by the total number of threads in the launch
}
maxlm[lid] = max;
imaxlm[lid] = imax;
barrier(CLK_LOCAL_MEM_FENCE);
// Performs reduction in local memory: tree reduction, halving the stride each step
#pragma unroll
for (int s=WGS1/2; s>0; s=s>>1) {
if (lid < s) {
if (maxlm[lid + s] >= maxlm[lid]) {
maxlm[lid] = maxlm[lid + s];
imaxlm[lid] = imaxlm[lid + s];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// Stores the per-workgroup result
if (lid == 0) {
maxgm[wgid] = maxlm[0];
imaxgm[wgid] = imaxlm[0];
}
}
// =================================================================================================
// The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
// be launched with a single workgroup only, and expects maxgm/imaxgm to hold 2*WGS2 per-group
// (value, index) candidates produced by the main Xamax kernel.
__attribute__((reqd_work_group_size(WGS2, 1, 1)))
__kernel void XamaxEpilogue(const __global singlereal* restrict maxgm,
const __global unsigned int* restrict imaxgm,
__global unsigned int* imax, const int imax_offset) {
__local singlereal maxlm[WGS2];
__local unsigned int imaxlm[WGS2];
const int lid = get_local_id(0);
// Performs the first step of the reduction while loading the data: each thread keeps the
// larger of candidates lid and lid+WGS2
if (maxgm[lid + WGS2] >= maxgm[lid]) {
maxlm[lid] = maxgm[lid + WGS2];
imaxlm[lid] = imaxgm[lid + WGS2];
}
else {
maxlm[lid] = maxgm[lid];
imaxlm[lid] = imaxgm[lid];
}
barrier(CLK_LOCAL_MEM_FENCE);
// Performs reduction in local memory (tree reduction)
#pragma unroll
for (int s=WGS2/2; s>0; s=s>>1) {
if (lid < s) {
if (maxlm[lid + s] >= maxlm[lid]) {
maxlm[lid] = maxlm[lid + s];
imaxlm[lid] = imaxlm[lid + s];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// Stores the final result: only the winning index is written; the value itself is discarded
if (lid == 0) {
imax[imax_offset] = imaxlm[0];
}
}
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View file

@ -0,0 +1,111 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains the Xasum kernel. It implements an absolute sum computation using reduction
// kernels. Reduction is split in two parts. In the first (main) kernel the X vector is loaded,
// followed by a per-thread and a per-workgroup reduction. The second (epilogue) kernel
// is executed with a single workgroup only, computing the final result.
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
// this kernel file is used outside of the CLBlast library.
#ifndef WGS1
#define WGS1 64 // The local work-group size of the main kernel
#endif
#ifndef WGS2
#define WGS2 64 // The local work-group size of the epilogue kernel
#endif
// =================================================================================================
// The main reduction kernel, performing the loading and the majority of the operation.
// Each thread strides over the vector accumulating (absolute) values; the work-group then
// tree-reduces the partial sums in local memory and writes one value per group to 'output'.
__attribute__((reqd_work_group_size(WGS1, 1, 1)))
__kernel void Xasum(const int n,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* output) {
__local real lm[WGS1];
const int lid = get_local_id(0);
const int wgid = get_group_id(0);
const int num_groups = get_num_groups(0);
// Performs loading and the first steps of the reduction
real acc;
SetToZero(acc);
int id = wgid*WGS1 + lid;
while (id < n) {
real x = xgm[id*x_inc + x_offset];
#if defined(ROUTINE_SUM) // non-absolute version
#else
// xASUM semantics: take the component-wise absolute value before accumulating
AbsoluteValue(x);
#endif
Add(acc, acc, x);
id += WGS1*num_groups; // stride by the total number of threads in the launch
}
lm[lid] = acc;
barrier(CLK_LOCAL_MEM_FENCE);
// Performs reduction in local memory: tree reduction, halving the stride each step
#pragma unroll
for (int s=WGS1/2; s>0; s=s>>1) {
if (lid < s) {
Add(lm[lid], lm[lid], lm[lid + s]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// Stores the per-workgroup result
if (lid == 0) {
output[wgid] = lm[0];
}
}
// =================================================================================================
// The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
// be launched with a single workgroup only, and expects 'input' to hold 2*WGS2 per-group
// partial sums produced by the main Xasum kernel.
__attribute__((reqd_work_group_size(WGS2, 1, 1)))
__kernel void XasumEpilogue(const __global real* restrict input,
__global real* asum, const int asum_offset) {
__local real lm[WGS2];
const int lid = get_local_id(0);
// Performs the first step of the reduction while loading the data
Add(lm[lid], input[lid], input[lid + WGS2]);
barrier(CLK_LOCAL_MEM_FENCE);
// Performs reduction in local memory (tree reduction)
#pragma unroll
for (int s=WGS2/2; s>0; s=s>>1) {
if (lid < s) {
Add(lm[lid], lm[lid], lm[lid + s]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// Computes the absolute value and stores the final result. For complex data the partial sums
// hold sum(|Re|) in .x and sum(|Im|) in .y (AbsoluteValue/Add are component-wise), so adding
// them yields the BLAS |Re|+|Im| ASUM definition in the real component.
if (lid == 0) {
#if PRECISION == 3232 || PRECISION == 6464
asum[asum_offset].x = lm[0].x + lm[0].y; // the result is a non-complex number
#else
asum[asum_offset] = lm[0];
#endif
}
}
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View file

@ -30,7 +30,8 @@ __kernel void Xaxpy(const int n, const real alpha,
// Loops over the work that needs to be done (allows for an arbitrary number of threads)
#pragma unroll
for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
MultiplyAdd(ygm[id*y_inc + y_offset], alpha, xgm[id*x_inc + x_offset]);
real xvalue = xgm[id*x_inc + x_offset];
MultiplyAdd(ygm[id*y_inc + y_offset], alpha, xvalue);
}
}
@ -45,7 +46,9 @@ __kernel void XaxpyFast(const int n, const real alpha,
#pragma unroll
for (int w=0; w<WPT; ++w) {
const int id = w*get_global_size(0) + get_global_id(0);
ygm[id] = MultiplyAddVector(ygm[id], alpha, xgm[id]);
realV xvalue = xgm[id];
realV yvalue = ygm[id];
ygm[id] = MultiplyAddVector(yvalue, alpha, xvalue);
}
}

View file

@ -0,0 +1,109 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains the Xnrm2 kernel. It implements a squared norm computation using reduction
// kernels. Reduction is split in two parts. In the first (main) kernel the X vector is squared,
// followed by a per-thread and a per-workgroup reduction. The second (epilogue) kernel
// is executed with a single workgroup only, computing the final result.
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
// this kernel file is used outside of the CLBlast library.
#ifndef WGS1
#define WGS1 64 // The local work-group size of the main kernel
#endif
#ifndef WGS2
#define WGS2 64 // The local work-group size of the epilogue kernel
#endif
// =================================================================================================
// The main reduction kernel, performing the multiplication and the majority of the operation.
// Each thread accumulates x * conjugate(x) (the squared magnitude) over a strided slice of the
// vector; the work-group then tree-reduces the partial sums and writes one value per group.
// The square root is applied later, in the epilogue kernel.
__attribute__((reqd_work_group_size(WGS1, 1, 1)))
__kernel void Xnrm2(const int n,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* output) {
__local real lm[WGS1];
const int lid = get_local_id(0);
const int wgid = get_group_id(0);
const int num_groups = get_num_groups(0);
// Performs multiplication and the first steps of the reduction
real acc;
SetToZero(acc);
int id = wgid*WGS1 + lid;
while (id < n) {
real x1 = xgm[id*x_inc + x_offset];
real x2 = x1;
COMPLEX_CONJUGATE(x2);
MultiplyAdd(acc, x1, x2); // acc += x * conj(x)
id += WGS1*num_groups; // stride by the total number of threads in the launch
}
lm[lid] = acc;
barrier(CLK_LOCAL_MEM_FENCE);
// Performs reduction in local memory (tree reduction)
#pragma unroll
for (int s=WGS1/2; s>0; s=s>>1) {
if (lid < s) {
Add(lm[lid], lm[lid], lm[lid + s]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// Stores the per-workgroup result
if (lid == 0) {
output[wgid] = lm[0];
}
}
// =================================================================================================
// The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
// be launched with a single workgroup only, and expects 'input' to hold 2*WGS2 per-group
// partial sums of squares produced by the main Xnrm2 kernel.
__attribute__((reqd_work_group_size(WGS2, 1, 1)))
__kernel void Xnrm2Epilogue(const __global real* restrict input,
__global real* nrm2, const int nrm2_offset) {
__local real lm[WGS2];
const int lid = get_local_id(0);
// Performs the first step of the reduction while loading the data
Add(lm[lid], input[lid], input[lid + WGS2]);
barrier(CLK_LOCAL_MEM_FENCE);
// Performs reduction in local memory (tree reduction)
#pragma unroll
for (int s=WGS2/2; s>0; s=s>>1) {
if (lid < s) {
Add(lm[lid], lm[lid], lm[lid + s]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// Computes the square root and stores the final result. For complex data x*conj(x) is
// real-valued, so the squared norm lives entirely in the .x component.
if (lid == 0) {
#if PRECISION == 3232 || PRECISION == 6464
nrm2[nrm2_offset].x = sqrt(lm[0].x); // the result is a non-complex number
#else
nrm2[nrm2_offset] = sqrt(lm[0]);
#endif
}
}
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View file

@ -29,8 +29,9 @@ __kernel void Xscal(const int n, const real alpha,
// Loops over the work that needs to be done (allows for an arbitrary number of threads)
#pragma unroll
for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
real xvalue = xgm[id*x_inc + x_offset];
real result;
Multiply(result, alpha, xgm[id*x_inc + x_offset]);
Multiply(result, alpha, xvalue);
xgm[id*x_inc + x_offset] = result;
}
}
@ -45,8 +46,9 @@ __kernel void XscalFast(const int n, const real alpha,
#pragma unroll
for (int w=0; w<WPT; ++w) {
const int id = w*get_global_size(0) + get_global_id(0);
realV xvalue = xgm[id];
realV result;
result = MultiplyVector(result, alpha, xgm[id]);
result = MultiplyVector(result, alpha, xvalue);
xgm[id] = result;
}
}

View file

@ -11,18 +11,17 @@
//
// =================================================================================================
#include <string>
#include <vector>
#include "internal/routine.h"
namespace clblast {
// =================================================================================================
// The cache of compiled OpenCL programs
template <typename T>
std::vector<typename Routine<T>::ProgramCache> Routine<T>::program_cache_;
// Constructor: not much here, because no status codes can be returned
template <typename T>
Routine<T>::Routine(Queue &queue, Event &event, const std::string &name,
Routine<T>::Routine(Queue &queue, EventPointer event, const std::string &name,
const std::vector<std::string> &routines, const Precision precision):
precision_(precision),
routine_name_(name),
@ -43,65 +42,81 @@ Routine<T>::Routine(Queue &queue, Event &event, const std::string &name,
template <typename T>
StatusCode Routine<T>::SetUp() {
// Queries the cache to see whether or not the compiled kernel is already there. If not, it will
// be built and added to the cache.
if (!ProgramIsInCache()) {
// Queries the cache to see whether or not the program (context-specific) is already there
if (ProgramIsInCache()) { return StatusCode::kSuccess; }
// Inspects whether or not cl_khr_fp64 is supported in case of double precision
auto extensions = device_.Capabilities();
if (precision_ == Precision::kDouble || precision_ == Precision::kComplexDouble) {
if (extensions.find(kKhronosDoublePrecision) == std::string::npos) {
return StatusCode::kNoDoublePrecision;
}
}
// As above, but for cl_khr_fp16 (half precision)
if (precision_ == Precision::kHalf) {
if (extensions.find(kKhronosHalfPrecision) == std::string::npos) {
return StatusCode::kNoHalfPrecision;
}
}
// Loads the common header (typedefs and defines and such)
std::string common_header =
#include "kernels/common.opencl"
;
// Collects the parameters for this device in the form of defines, and adds the precision
auto defines = db_.GetDefines();
defines += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";
// Adds the name of the routine as a define
defines += "#define ROUTINE_"+routine_name_+"\n";
// For specific devices, use the non-IEEE-754-compliant OpenCL mad() instruction. This can improve
// performance, but might result in a reduced accuracy.
if (device_.Vendor() == "AMD") {
defines += "#define USE_CL_MAD 1\n";
}
// Combines everything together into a single source string
auto source_string = defines + common_header + source_string_;
// Compiles the kernel
// Queries the cache to see whether or not the binary (device-specific) is already there. If it
// is, a program is created and stored in the cache
if (BinaryIsInCache()) {
try {
auto program = Program(context_, source_string);
auto& binary = cache::GetBinaryFromCache(device_name_, precision_, routine_name_);
auto program = Program(device_, context_, binary);
auto options = std::vector<std::string>();
auto build_status = program.Build(device_, options);
// Checks for compiler crashes/errors/warnings
if (build_status == BuildStatus::kError) {
auto message = program.GetBuildInfo(device_);
fprintf(stdout, "OpenCL compiler error/warning: %s\n", message.c_str());
return StatusCode::kBuildProgramFailure;
}
if (build_status == BuildStatus::kInvalid) { return StatusCode::kInvalidBinary; }
// Store the compiled program in the cache
program_cache_.push_back({program, device_name_, precision_, routine_name_});
program.Build(device_, options);
StoreProgramToCache(program);
} catch (...) { return StatusCode::kBuildProgramFailure; }
return StatusCode::kSuccess;
}
// Otherwise, the kernel will be compiled and program will be built. Both the binary and the
// program will be added to the cache.
// Inspects whether or not cl_khr_fp64 is supported in case of double precision
auto extensions = device_.Capabilities();
if (precision_ == Precision::kDouble || precision_ == Precision::kComplexDouble) {
if (extensions.find(kKhronosDoublePrecision) == std::string::npos) {
return StatusCode::kNoDoublePrecision;
}
}
// As above, but for cl_khr_fp16 (half precision)
if (precision_ == Precision::kHalf) {
if (extensions.find(kKhronosHalfPrecision) == std::string::npos) {
return StatusCode::kNoHalfPrecision;
}
}
// Loads the common header (typedefs and defines and such)
std::string common_header =
#include "kernels/common.opencl"
;
// Collects the parameters for this device in the form of defines, and adds the precision
auto defines = db_.GetDefines();
defines += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";
// Adds the name of the routine as a define
defines += "#define ROUTINE_"+routine_name_+"\n";
// For specific devices, use the non-IEEE-754-compliant OpenCL mad() instruction. This can improve
// performance, but might result in a reduced accuracy.
if (device_.Vendor() == "AMD") {
defines += "#define USE_CL_MAD 1\n";
}
// Combines everything together into a single source string
auto source_string = defines + common_header + source_string_;
// Compiles the kernel
try {
auto program = Program(context_, source_string);
auto options = std::vector<std::string>();
auto build_status = program.Build(device_, options);
// Checks for compiler crashes/errors/warnings
if (build_status == BuildStatus::kError) {
auto message = program.GetBuildInfo(device_);
fprintf(stdout, "OpenCL compiler error/warning: %s\n", message.c_str());
return StatusCode::kBuildProgramFailure;
}
if (build_status == BuildStatus::kInvalid) { return StatusCode::kInvalidBinary; }
// Store the compiled binary and program in the cache
const auto binary = program.GetIR();
StoreBinaryToCache(binary);
StoreProgramToCache(program);
} catch (...) { return StatusCode::kBuildProgramFailure; }
// No errors, normal termination of this function
return StatusCode::kSuccess;
}
@ -111,7 +126,8 @@ StatusCode Routine<T>::SetUp() {
// Enqueues a kernel, waits for completion, and checks for errors
template <typename T>
StatusCode Routine<T>::RunKernel(Kernel &kernel, std::vector<size_t> &global,
const std::vector<size_t> &local) {
const std::vector<size_t> &local, EventPointer event,
std::vector<Event>& waitForEvents) {
// Tests for validity of the local thread sizes
if (local.size() > max_work_item_dimensions_) {
@ -135,18 +151,21 @@ StatusCode Routine<T>::RunKernel(Kernel &kernel, std::vector<size_t> &global,
// Launches the kernel (and checks for launch errors)
try {
kernel.Launch(queue_, global, local, event_);
kernel.Launch(queue_, global, local, event, waitForEvents);
} catch (...) { return StatusCode::kKernelLaunchError; }
// Waits for completion of the kernel
try {
queue_.Finish(event_);
} catch (...) { return StatusCode::kKernelRunError; }
// No errors, normal termination of this function
return StatusCode::kSuccess;
}
// Convenience overload of RunKernel: enqueues the kernel with an empty event waiting list,
// forwarding to the full overload above.
template <typename T>
StatusCode Routine<T>::RunKernel(Kernel &kernel, std::vector<size_t> &global,
                                 const std::vector<size_t> &local, EventPointer event) {
  std::vector<Event> no_dependencies;
  return RunKernel(kernel, global, local, event, no_dependencies);
}
// =================================================================================================
// Tests matrix A for validity: checks for a valid OpenCL buffer, a valid lead-dimension, and for a
@ -156,7 +175,7 @@ StatusCode Routine<T>::TestMatrixA(const size_t one, const size_t two, const Buf
const size_t offset, const size_t ld, const size_t data_size) {
if (ld < one) { return StatusCode::kInvalidLeadDimA; }
try {
auto required_size = (ld*two + offset)*data_size;
auto required_size = (ld*(two-1) + one + offset)*data_size;
auto buffer_size = buffer.GetSize();
if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryA; }
} catch (...) { return StatusCode::kInvalidMatrixA; }
@ -170,7 +189,7 @@ StatusCode Routine<T>::TestMatrixB(const size_t one, const size_t two, const Buf
const size_t offset, const size_t ld, const size_t data_size) {
if (ld < one) { return StatusCode::kInvalidLeadDimB; }
try {
auto required_size = (ld*two + offset)*data_size;
auto required_size = (ld*(two-1) + one + offset)*data_size;
auto buffer_size = buffer.GetSize();
if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryB; }
} catch (...) { return StatusCode::kInvalidMatrixB; }
@ -184,7 +203,7 @@ StatusCode Routine<T>::TestMatrixC(const size_t one, const size_t two, const Buf
const size_t offset, const size_t ld, const size_t data_size) {
if (ld < one) { return StatusCode::kInvalidLeadDimC; }
try {
auto required_size = (ld*two + offset)*data_size;
auto required_size = (ld*(two-1) + one + offset)*data_size;
auto buffer_size = buffer.GetSize();
if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryC; }
} catch (...) { return StatusCode::kInvalidMatrixC; }
@ -212,7 +231,7 @@ StatusCode Routine<T>::TestVectorX(const size_t n, const Buffer<T> &buffer, cons
const size_t inc, const size_t data_size) {
if (inc == 0) { return StatusCode::kInvalidIncrementX; }
try {
auto required_size = (n*inc + offset)*data_size;
auto required_size = ((n-1)*inc + 1 + offset)*data_size;
auto buffer_size = buffer.GetSize();
if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryX; }
} catch (...) { return StatusCode::kInvalidVectorX; }
@ -226,7 +245,7 @@ StatusCode Routine<T>::TestVectorY(const size_t n, const Buffer<T> &buffer, cons
const size_t inc, const size_t data_size) {
if (inc == 0) { return StatusCode::kInvalidIncrementY; }
try {
auto required_size = (n*inc + offset)*data_size;
auto required_size = ((n-1)*inc + 1 + offset)*data_size;
auto buffer_size = buffer.GetSize();
if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryY; }
} catch (...) { return StatusCode::kInvalidVectorY; }
@ -248,11 +267,25 @@ StatusCode Routine<T>::TestVectorDot(const size_t n, const Buffer<T> &buffer, co
return StatusCode::kSuccess;
}
// Tests an index vector for validity: checks for a valid OpenCL buffer and for a sufficient
// buffer size (n elements of data_size bytes, starting at 'offset' elements).
// NOTE(review): on failure this reuses the 'Dot' status codes (kInsufficientMemoryDot /
// kInvalidVectorDot); no index-specific codes appear to exist -- confirm this is intentional.
template <typename T>
StatusCode Routine<T>::TestVectorIndex(const size_t n, const Buffer<unsigned int> &buffer,
const size_t offset, const size_t data_size) {
try {
// Required size in bytes for the requested elements plus the leading offset
auto required_size = (n + offset)*data_size;
auto buffer_size = buffer.GetSize();
if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryDot; }
} catch (...) { return StatusCode::kInvalidVectorDot; }
return StatusCode::kSuccess;
}
// =================================================================================================
// Copies or transposes a matrix and pads/unpads it with zeros
template <typename T>
StatusCode Routine<T>::PadCopyTransposeMatrix(const size_t src_one, const size_t src_two,
StatusCode Routine<T>::PadCopyTransposeMatrix(EventPointer event, std::vector<Event>& waitForEvents,
const size_t src_one, const size_t src_two,
const size_t src_ld, const size_t src_offset,
const Buffer<T> &src,
const size_t dest_one, const size_t dest_two,
@ -334,13 +367,13 @@ StatusCode Routine<T>::PadCopyTransposeMatrix(const size_t src_one, const size_t
auto global = std::vector<size_t>{dest_one / db_["TRA_WPT"],
dest_two / db_["TRA_WPT"]};
auto local = std::vector<size_t>{db_["TRA_DIM"], db_["TRA_DIM"]};
status = RunKernel(kernel, global, local);
status = RunKernel(kernel, global, local, event, waitForEvents);
}
else {
auto global = std::vector<size_t>{Ceil(CeilDiv(dest_one, db_["PADTRA_WPT"]), db_["PADTRA_TILE"]),
Ceil(CeilDiv(dest_two, db_["PADTRA_WPT"]), db_["PADTRA_TILE"])};
auto local = std::vector<size_t>{db_["PADTRA_TILE"], db_["PADTRA_TILE"]};
status = RunKernel(kernel, global, local);
status = RunKernel(kernel, global, local, event, waitForEvents);
}
}
else {
@ -348,13 +381,13 @@ StatusCode Routine<T>::PadCopyTransposeMatrix(const size_t src_one, const size_t
auto global = std::vector<size_t>{dest_one / db_["COPY_VW"],
dest_two / db_["COPY_WPT"]};
auto local = std::vector<size_t>{db_["COPY_DIMX"], db_["COPY_DIMY"]};
status = RunKernel(kernel, global, local);
status = RunKernel(kernel, global, local, event, waitForEvents);
}
else {
auto global = std::vector<size_t>{Ceil(CeilDiv(dest_one, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
Ceil(CeilDiv(dest_two, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
status = RunKernel(kernel, global, local);
status = RunKernel(kernel, global, local, event, waitForEvents);
}
}
return status;
@ -363,29 +396,6 @@ StatusCode Routine<T>::PadCopyTransposeMatrix(const size_t src_one, const size_t
// =================================================================================================
// Queries the cache and retrieves a matching program. Assumes that the match is available, throws
// otherwise.
template <typename T>
const Program& Routine<T>::GetProgramFromCache() const {
for (auto &cached_program: program_cache_) {
if (cached_program.MatchInCache(device_name_, precision_, routine_name_)) {
return cached_program.program;
}
}
throw std::runtime_error("Internal CLBlast error: Expected program in cache, but found none.");
}
// Queries the cache to see whether or not the compiled kernel is already there
template <typename T>
bool Routine<T>::ProgramIsInCache() const {
for (auto &cached_program: program_cache_) {
if (cached_program.MatchInCache(device_name_, precision_, routine_name_)) { return true; }
}
return false;
}
// =================================================================================================
// Compiles the templated class
template class Routine<float>;
template class Routine<double>;

View file

@ -0,0 +1,112 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xamax class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level1/xamax.h"
#include <string>
#include <vector>
namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xamax<float>::precision_ = Precision::kSingle;
template <> const Precision Xamax<double>::precision_ = Precision::kDouble;
template <> const Precision Xamax<float2>::precision_ = Precision::kComplexSingle;
template <> const Precision Xamax<double2>::precision_ = Precision::kComplexDouble;
// =================================================================================================
// Constructor: forwards to base class constructor. The routine registers itself under the
// "Xdot" database entry, presumably sharing the reduction work-group parameters (WGS1/WGS2)
// with the dot-product routine -- TODO confirm against the tuning database.
template <typename T>
Xamax<T>::Xamax(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xdot"}, precision_) {
// The OpenCL kernel source is embedded at compile time via a C++11 raw string literal
source_string_ =
#include "../../kernels/level1/xamax.opencl"
;
}
// =================================================================================================
// The main routine: computes the index of the absolute-maximum element of vector x and
// stores it (as an unsigned int) at imax_offset in imax_buffer. Returns kSuccess on
// completion, a validation status code on bad arguments, or kInvalidKernel on any
// exception thrown during kernel creation or launch.
template <typename T>
StatusCode Xamax<T>::DoAmax(const size_t n,
                            const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {

  // Rejects empty vectors up-front
  if (n == 0) { return StatusCode::kInvalidDimension; }

  // Validates the input vector and the single-element output index buffer
  auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
  if (ErrorIn(status)) { return status; }
  status = TestVectorIndex(1, imax_buffer, imax_offset, sizeof(unsigned int));
  if (ErrorIn(status)) { return status; }

  // Runs a two-stage reduction: the main kernel produces intermediate value/index pairs,
  // and the epilogue kernel reduces those to the final result
  try {
    const auto program = GetProgramFromCache();
    auto main_kernel = Kernel(program, "Xamax");
    auto epilogue_kernel = Kernel(program, "XamaxEpilogue");

    // Temporary buffers holding the intermediate values and indices of the first stage
    auto partials_size = 2*db_["WGS2"];
    auto partial_values = Buffer<T>(context_, partials_size);
    auto partial_indices = Buffer<unsigned int>(context_, partials_size);

    // Arguments of the main reduction kernel
    main_kernel.SetArgument(0, static_cast<int>(n));
    main_kernel.SetArgument(1, x_buffer());
    main_kernel.SetArgument(2, static_cast<int>(x_offset));
    main_kernel.SetArgument(3, static_cast<int>(x_inc));
    main_kernel.SetArgument(4, partial_values());
    main_kernel.SetArgument(5, partial_indices());

    // Launches the main kernel and records its completion event, so that the epilogue
    // kernel can be made to wait on it
    auto wait_list = std::vector<Event>();
    auto main_global = std::vector<size_t>{db_["WGS1"]*partials_size};
    auto main_local = std::vector<size_t>{db_["WGS1"]};
    auto main_event = Event();
    status = RunKernel(main_kernel, main_global, main_local, main_event.pointer());
    if (ErrorIn(status)) { return status; }
    wait_list.push_back(main_event);

    // Arguments of the epilogue kernel
    epilogue_kernel.SetArgument(0, partial_values());
    epilogue_kernel.SetArgument(1, partial_indices());
    epilogue_kernel.SetArgument(2, imax_buffer());
    epilogue_kernel.SetArgument(3, static_cast<int>(imax_offset));

    // Launches the epilogue kernel; the routine-level event (event_) signals completion
    // of the whole operation to the caller
    auto epilogue_global = std::vector<size_t>{db_["WGS2"]};
    auto epilogue_local = std::vector<size_t>{db_["WGS2"]};
    status = RunKernel(epilogue_kernel, epilogue_global, epilogue_local, event_, wait_list);
    if (ErrorIn(status)) { return status; }

    // Successfully finished the computation
    return StatusCode::kSuccess;
  } catch (...) { return StatusCode::kInvalidKernel; }
}
// =================================================================================================
// Compiles the templated class
// Explicit instantiations for all supported data types, so the template definitions
// above can remain in this translation unit.
template class Xamax<float>;
template class Xamax<double>;
template class Xamax<float2>;
template class Xamax<double2>;
// =================================================================================================
} // namespace clblast

View file

@ -0,0 +1,109 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xasum class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level1/xasum.h"
#include <string>
#include <vector>
namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument.
// Maps each supported template instantiation onto the matching precision enum value.
template <> const Precision Xasum<float>::precision_ = Precision::kSingle;
template <> const Precision Xasum<double>::precision_ = Precision::kDouble;
template <> const Precision Xasum<float2>::precision_ = Precision::kComplexSingle;
template <> const Precision Xasum<double2>::precision_ = Precision::kComplexDouble;
// =================================================================================================
// Constructor: forwards to base class constructor. The {"Xdot"} list selects which set of
// tuning parameters to load from the database -- DoAsum below reads db_["WGS1"]/db_["WGS2"],
// i.e. this routine shares the Xdot reduction parameters.
template <typename T>
Xasum<T>::Xasum(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xdot"}, precision_) {
// Embeds the OpenCL kernel source at compile time: the included .opencl file is expected
// to expand to a string literal, so the '#include' must stay between the '=' and the ';'.
source_string_ =
#include "../../kernels/level1/xasum.opencl"
;
}
// =================================================================================================
// The main routine: computes the absolute-value sum of vector x and stores the scalar
// result at asum_offset in asum_buffer. Returns kSuccess on completion, a validation
// status code on bad arguments, or kInvalidKernel on any exception thrown during kernel
// creation or launch.
template <typename T>
StatusCode Xasum<T>::DoAsum(const size_t n,
                            const Buffer<T> &asum_buffer, const size_t asum_offset,
                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {

  // Rejects empty vectors up-front
  if (n == 0) { return StatusCode::kInvalidDimension; }

  // Validates the input vector and the single-element output buffer
  auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
  if (ErrorIn(status)) { return status; }
  status = TestVectorDot(1, asum_buffer, asum_offset, sizeof(T));
  if (ErrorIn(status)) { return status; }

  // Runs a two-stage reduction: the main kernel produces intermediate partial sums,
  // and the epilogue kernel reduces those to the final result
  try {
    const auto program = GetProgramFromCache();
    auto main_kernel = Kernel(program, "Xasum");
    auto epilogue_kernel = Kernel(program, "XasumEpilogue");

    // Temporary buffer holding the intermediate results of the first stage
    auto partials_size = 2*db_["WGS2"];
    auto partials_buffer = Buffer<T>(context_, partials_size);

    // Arguments of the main reduction kernel
    main_kernel.SetArgument(0, static_cast<int>(n));
    main_kernel.SetArgument(1, x_buffer());
    main_kernel.SetArgument(2, static_cast<int>(x_offset));
    main_kernel.SetArgument(3, static_cast<int>(x_inc));
    main_kernel.SetArgument(4, partials_buffer());

    // Launches the main kernel and records its completion event, so that the epilogue
    // kernel can be made to wait on it
    auto wait_list = std::vector<Event>();
    auto main_global = std::vector<size_t>{db_["WGS1"]*partials_size};
    auto main_local = std::vector<size_t>{db_["WGS1"]};
    auto main_event = Event();
    status = RunKernel(main_kernel, main_global, main_local, main_event.pointer());
    if (ErrorIn(status)) { return status; }
    wait_list.push_back(main_event);

    // Arguments of the epilogue kernel
    epilogue_kernel.SetArgument(0, partials_buffer());
    epilogue_kernel.SetArgument(1, asum_buffer());
    epilogue_kernel.SetArgument(2, static_cast<int>(asum_offset));

    // Launches the epilogue kernel; the routine-level event (event_) signals completion
    // of the whole operation to the caller
    auto epilogue_global = std::vector<size_t>{db_["WGS2"]};
    auto epilogue_local = std::vector<size_t>{db_["WGS2"]};
    status = RunKernel(epilogue_kernel, epilogue_global, epilogue_local, event_, wait_list);
    if (ErrorIn(status)) { return status; }

    // Successfully finished the computation
    return StatusCode::kSuccess;
  } catch (...) { return StatusCode::kInvalidKernel; }
}
// =================================================================================================
// Compiles the templated class
// Explicit instantiations for all supported data types, so the template definitions
// above can remain in this translation unit.
template class Xasum<float>;
template class Xasum<double>;
template class Xasum<float2>;
template class Xasum<double2>;
// =================================================================================================
} // namespace clblast

View file

@ -29,7 +29,7 @@ template <> const Precision Xaxpy<double2>::precision_ = Precision::kComplexDoub
// Constructor: forwards to base class constructor
template <typename T>
Xaxpy<T>::Xaxpy(Queue &queue, Event &event, const std::string &name):
Xaxpy<T>::Xaxpy(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xaxpy"}, precision_) {
source_string_ =
#include "../../kernels/level1/level1.opencl"
@ -64,7 +64,7 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
// Retrieves the Xaxpy kernel from the compiled binary
try {
auto& program = GetProgramFromCache();
const auto program = GetProgramFromCache();
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
@ -89,19 +89,16 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, global, local);
status = RunKernel(kernel, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, global, local);
status = RunKernel(kernel, global, local, event_);
}
if (ErrorIn(status)) { return status; }
// Waits for all kernels to finish
queue_.Finish();
// Succesfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }

View file

@ -29,7 +29,7 @@ template <> const Precision Xcopy<double2>::precision_ = Precision::kComplexDoub
// Constructor: forwards to base class constructor
template <typename T>
Xcopy<T>::Xcopy(Queue &queue, Event &event, const std::string &name):
Xcopy<T>::Xcopy(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xaxpy"}, precision_) {
source_string_ =
#include "../../kernels/level1/level1.opencl"
@ -64,7 +64,7 @@ StatusCode Xcopy<T>::DoCopy(const size_t n,
// Retrieves the Xcopy kernel from the compiled binary
try {
auto& program = GetProgramFromCache();
const auto program = GetProgramFromCache();
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
@ -87,19 +87,16 @@ StatusCode Xcopy<T>::DoCopy(const size_t n,
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, global, local);
status = RunKernel(kernel, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, global, local);
status = RunKernel(kernel, global, local, event_);
}
if (ErrorIn(status)) { return status; }
// Waits for all kernels to finish
queue_.Finish();
// Succesfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }

View file

@ -29,7 +29,7 @@ template <> const Precision Xdot<double2>::precision_ = Precision::kComplexDoubl
// Constructor: forwards to base class constructor
template <typename T>
Xdot<T>::Xdot(Queue &queue, Event &event, const std::string &name):
Xdot<T>::Xdot(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xdot"}, precision_) {
source_string_ =
#include "../../kernels/level1/xdot.opencl"
@ -59,7 +59,7 @@ StatusCode Xdot<T>::DoDot(const size_t n,
// Retrieves the Xdot kernels from the compiled binary
try {
auto& program = GetProgramFromCache();
const auto program = GetProgramFromCache();
auto kernel1 = Kernel(program, "Xdot");
auto kernel2 = Kernel(program, "XdotEpilogue");
@ -78,11 +78,16 @@ StatusCode Xdot<T>::DoDot(const size_t n,
kernel1.SetArgument(7, temp_buffer());
kernel1.SetArgument(8, static_cast<int>(do_conjugate));
// Event waiting list
auto eventWaitList = std::vector<Event>();
// Launches the main kernel
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
status = RunKernel(kernel1, global1, local1);
auto kernelEvent = Event();
status = RunKernel(kernel1, global1, local1, kernelEvent.pointer());
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(kernelEvent);
// Sets the arguments for the epilogue kernel
kernel2.SetArgument(0, temp_buffer());
@ -92,12 +97,9 @@ StatusCode Xdot<T>::DoDot(const size_t n,
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
status = RunKernel(kernel2, global2, local2);
status = RunKernel(kernel2, global2, local2, event_, eventWaitList);
if (ErrorIn(status)) { return status; }
// Waits for all kernels to finish
queue_.Finish();
// Succesfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }

View file

@ -21,7 +21,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xdotc<T>::Xdotc(Queue &queue, Event &event, const std::string &name):
Xdotc<T>::Xdotc(Queue &queue, EventPointer event, const std::string &name):
Xdot<T>(queue, event, name) {
}

View file

@ -20,7 +20,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xdotu<T>::Xdotu(Queue &queue, Event &event, const std::string &name):
Xdotu<T>::Xdotu(Queue &queue, EventPointer event, const std::string &name):
Xdot<T>(queue, event, name) {
}

View file

@ -0,0 +1,109 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the Xnrm2 class (see the header for information about the class).
//
// =================================================================================================
#include "internal/routines/level1/xnrm2.h"
#include <string>
#include <vector>
namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument.
// Maps each supported template instantiation onto the matching precision enum value.
template <> const Precision Xnrm2<float>::precision_ = Precision::kSingle;
template <> const Precision Xnrm2<double>::precision_ = Precision::kDouble;
template <> const Precision Xnrm2<float2>::precision_ = Precision::kComplexSingle;
template <> const Precision Xnrm2<double2>::precision_ = Precision::kComplexDouble;
// =================================================================================================
// Constructor: forwards to base class constructor. The {"Xdot"} list selects which set of
// tuning parameters to load from the database -- DoNrm2 below reads db_["WGS1"]/db_["WGS2"],
// i.e. this routine shares the Xdot reduction parameters.
template <typename T>
Xnrm2<T>::Xnrm2(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xdot"}, precision_) {
// Embeds the OpenCL kernel source at compile time: the included .opencl file is expected
// to expand to a string literal, so the '#include' must stay between the '=' and the ';'.
source_string_ =
#include "../../kernels/level1/xnrm2.opencl"
;
}
// =================================================================================================
// The main routine: computes the Euclidean norm of vector x and stores the scalar result
// at nrm2_offset in nrm2_buffer. Returns kSuccess on completion, a validation status code
// on bad arguments, or kInvalidKernel on any exception thrown during kernel creation or
// launch.
template <typename T>
StatusCode Xnrm2<T>::DoNrm2(const size_t n,
                            const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {

  // Rejects empty vectors up-front
  if (n == 0) { return StatusCode::kInvalidDimension; }

  // Validates the input vector and the single-element output buffer
  auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
  if (ErrorIn(status)) { return status; }
  status = TestVectorDot(1, nrm2_buffer, nrm2_offset, sizeof(T));
  if (ErrorIn(status)) { return status; }

  // Runs a two-stage reduction: the main kernel produces intermediate partial results,
  // and the epilogue kernel reduces those to the final norm
  try {
    const auto program = GetProgramFromCache();
    auto main_kernel = Kernel(program, "Xnrm2");
    auto epilogue_kernel = Kernel(program, "Xnrm2Epilogue");

    // Temporary buffer holding the intermediate results of the first stage
    auto partials_size = 2*db_["WGS2"];
    auto partials_buffer = Buffer<T>(context_, partials_size);

    // Arguments of the main reduction kernel
    main_kernel.SetArgument(0, static_cast<int>(n));
    main_kernel.SetArgument(1, x_buffer());
    main_kernel.SetArgument(2, static_cast<int>(x_offset));
    main_kernel.SetArgument(3, static_cast<int>(x_inc));
    main_kernel.SetArgument(4, partials_buffer());

    // Launches the main kernel and records its completion event, so that the epilogue
    // kernel can be made to wait on it
    auto wait_list = std::vector<Event>();
    auto main_global = std::vector<size_t>{db_["WGS1"]*partials_size};
    auto main_local = std::vector<size_t>{db_["WGS1"]};
    auto main_event = Event();
    status = RunKernel(main_kernel, main_global, main_local, main_event.pointer());
    if (ErrorIn(status)) { return status; }
    wait_list.push_back(main_event);

    // Arguments of the epilogue kernel
    epilogue_kernel.SetArgument(0, partials_buffer());
    epilogue_kernel.SetArgument(1, nrm2_buffer());
    epilogue_kernel.SetArgument(2, static_cast<int>(nrm2_offset));

    // Launches the epilogue kernel; the routine-level event (event_) signals completion
    // of the whole operation to the caller
    auto epilogue_global = std::vector<size_t>{db_["WGS2"]};
    auto epilogue_local = std::vector<size_t>{db_["WGS2"]};
    status = RunKernel(epilogue_kernel, epilogue_global, epilogue_local, event_, wait_list);
    if (ErrorIn(status)) { return status; }

    // Successfully finished the computation
    return StatusCode::kSuccess;
  } catch (...) { return StatusCode::kInvalidKernel; }
}
// =================================================================================================
// Compiles the templated class
// Explicit instantiations for all supported data types, so the template definitions
// above can remain in this translation unit.
template class Xnrm2<float>;
template class Xnrm2<double>;
template class Xnrm2<float2>;
template class Xnrm2<double2>;
// =================================================================================================
} // namespace clblast

View file

@ -29,7 +29,7 @@ template <> const Precision Xscal<double2>::precision_ = Precision::kComplexDoub
// Constructor: forwards to base class constructor
template <typename T>
Xscal<T>::Xscal(Queue &queue, Event &event, const std::string &name):
Xscal<T>::Xscal(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xaxpy"}, precision_) {
source_string_ =
#include "../../kernels/level1/level1.opencl"
@ -60,7 +60,7 @@ StatusCode Xscal<T>::DoScal(const size_t n, const T alpha,
// Retrieves the Xscal kernel from the compiled binary
try {
auto& program = GetProgramFromCache();
const auto program = GetProgramFromCache();
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
@ -81,19 +81,16 @@ StatusCode Xscal<T>::DoScal(const size_t n, const T alpha,
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, global, local);
status = RunKernel(kernel, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, global, local);
status = RunKernel(kernel, global, local, event_);
}
if (ErrorIn(status)) { return status; }
// Waits for all kernels to finish
queue_.Finish();
// Succesfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }

View file

@ -29,7 +29,7 @@ template <> const Precision Xswap<double2>::precision_ = Precision::kComplexDoub
// Constructor: forwards to base class constructor
template <typename T>
Xswap<T>::Xswap(Queue &queue, Event &event, const std::string &name):
Xswap<T>::Xswap(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xaxpy"}, precision_) {
source_string_ =
#include "../../kernels/level1/level1.opencl"
@ -64,7 +64,7 @@ StatusCode Xswap<T>::DoSwap(const size_t n,
// Retrieves the Xswap kernel from the compiled binary
try {
auto& program = GetProgramFromCache();
const auto program = GetProgramFromCache();
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
@ -87,19 +87,16 @@ StatusCode Xswap<T>::DoSwap(const size_t n,
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, global, local);
status = RunKernel(kernel, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, global, local);
status = RunKernel(kernel, global, local, event_);
}
if (ErrorIn(status)) { return status; }
// Waits for all kernels to finish
queue_.Finish();
// Succesfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }

View file

@ -21,7 +21,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xgbmv<T>::Xgbmv(Queue &queue, Event &event, const std::string &name):
Xgbmv<T>::Xgbmv(Queue &queue, EventPointer event, const std::string &name):
Xgemv<T>(queue, event, name) {
}

View file

@ -29,7 +29,7 @@ template <> const Precision Xgemv<double2>::precision_ = Precision::kComplexDoub
// Constructor: forwards to base class constructor
template <typename T>
Xgemv<T>::Xgemv(Queue &queue, Event &event, const std::string &name):
Xgemv<T>::Xgemv(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Pad", "Xgemv"}, precision_) {
source_string_ =
#include "../../kernels/level2/xgemv.opencl"
@ -136,7 +136,7 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
// Retrieves the Xgemv kernel from the compiled binary
try {
auto& program = GetProgramFromCache();
const auto program = GetProgramFromCache();
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
@ -162,12 +162,9 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
// Launches the kernel
auto global = std::vector<size_t>{global_size};
auto local = std::vector<size_t>{local_size};
status = RunKernel(kernel, global, local);
status = RunKernel(kernel, global, local, event_);
if (ErrorIn(status)) { return status; }
// Waits for all kernels to finish
queue_.Finish();
// Succesfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }

View file

@ -29,7 +29,7 @@ template <> const Precision Xger<double2>::precision_ = Precision::kComplexDoubl
// Constructor: forwards to base class constructor
template <typename T>
Xger<T>::Xger(Queue &queue, Event &event, const std::string &name):
Xger<T>::Xger(Queue &queue, EventPointer event, const std::string &name):
Routine<T>(queue, event, name, {"Xger"}, precision_) {
source_string_ =
#include "../../kernels/level2/level2.opencl"
@ -66,7 +66,7 @@ StatusCode Xger<T>::DoGer(const Layout layout,
// Retrieves the Xgemv kernel from the compiled binary
try {
auto& program = GetProgramFromCache();
const auto program = GetProgramFromCache();
auto kernel = Kernel(program, "Xger");
// Sets the kernel arguments
@ -89,12 +89,9 @@ StatusCode Xger<T>::DoGer(const Layout layout,
auto a_two_ceiled = Ceil(CeilDiv(a_two, db_["WPT"]), db_["WGS2"]);
auto global = std::vector<size_t>{a_one_ceiled, a_two_ceiled};
auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
status = RunKernel(kernel, global, local);
status = RunKernel(kernel, global, local, event_);
if (ErrorIn(status)) { return status; }
// Waits for all kernels to finish
queue_.Finish();
// Succesfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }

View file

@ -20,7 +20,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xgerc<T>::Xgerc(Queue &queue, Event &event, const std::string &name):
Xgerc<T>::Xgerc(Queue &queue, EventPointer event, const std::string &name):
Xger<T>(queue, event, name) {
}

View file

@ -20,7 +20,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xgeru<T>::Xgeru(Queue &queue, Event &event, const std::string &name):
Xgeru<T>::Xgeru(Queue &queue, EventPointer event, const std::string &name):
Xger<T>(queue, event, name) {
}

View file

@ -21,7 +21,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xhbmv<T>::Xhbmv(Queue &queue, Event &event, const std::string &name):
Xhbmv<T>::Xhbmv(Queue &queue, EventPointer event, const std::string &name):
Xgemv<T>(queue, event, name) {
}

Some files were not shown because too many files have changed in this diff Show more