mirror of
https://github.com/CNugteren/CLBlast.git
synced 2024-08-27 23:37:02 +02:00
Merge pull request #58 from CNugteren/development
Update to version 0.7.0
This commit is contained in:
commit
d91356a6b7
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -3,3 +3,4 @@ stash
|
||||||
.*
|
.*
|
||||||
*.pyc
|
*.pyc
|
||||||
*.db
|
*.db
|
||||||
|
cl.hpp
|
19
CHANGELOG
19
CHANGELOG
|
@ -1,4 +1,23 @@
|
||||||
|
|
||||||
|
Version 0.7.0
|
||||||
|
- Added exports to be able to create a DLL on Windows (thanks to Marco Hutter)
|
||||||
|
- Made the library thread-safe
|
||||||
|
- Performance and correctness tests can now (on top of clBLAS) be performed against CPU BLAS libraries
|
||||||
|
- Fixed the use of events within the library
|
||||||
|
- Changed the enum parameters to match the raw values of the cblas standard
|
||||||
|
- Fixed the cache of previously compiled binaries and added a function to fill or clear it
|
||||||
|
- Various minor fixes and enhancements
|
||||||
|
- Added a preliminary version of the API documentation
|
||||||
|
- Added additional sample programs
|
||||||
|
- Added tuned parameters for various devices (see README)
|
||||||
|
- Added level-1 routines:
|
||||||
|
* SNRM2/DNRM2/ScNRM2/DzNRM2
|
||||||
|
* SASUM/DASUM/ScASUM/DzASUM
|
||||||
|
* SSUM/DSUM/ScSUM/DzSUM (non-absolute version of the above xASUM BLAS routines)
|
||||||
|
* iSAMAX/iDAMAX/iCAMAX/iZAMAX
|
||||||
|
* iSMAX/iDMAX/iCMAX/iZMAX (non-absolute version of the above ixAMAX BLAS routines)
|
||||||
|
* iSMIN/iDMIN/iCMIN/iZMIN (non-absolute minimum version of the above ixAMAX BLAS routines)
|
||||||
|
|
||||||
Version 0.6.0
|
Version 0.6.0
|
||||||
- Added support for MSVC (Visual Studio) 2015
|
- Added support for MSVC (Visual Studio) 2015
|
||||||
- Added tuned parameters for various devices (see README)
|
- Added tuned parameters for various devices (see README)
|
||||||
|
|
|
@ -13,7 +13,7 @@
|
||||||
cmake_minimum_required(VERSION 2.8.10)
|
cmake_minimum_required(VERSION 2.8.10)
|
||||||
project("clblast" C CXX)
|
project("clblast" C CXX)
|
||||||
set(clblast_VERSION_MAJOR 0)
|
set(clblast_VERSION_MAJOR 0)
|
||||||
set(clblast_VERSION_MINOR 6)
|
set(clblast_VERSION_MINOR 7)
|
||||||
set(clblast_VERSION_PATCH 0)
|
set(clblast_VERSION_PATCH 0)
|
||||||
|
|
||||||
# Options and their default values
|
# Options and their default values
|
||||||
|
@ -66,13 +66,22 @@ else ()
|
||||||
set(FLAGS "${FLAGS} -Wno-attributes -Wno-unused-variable")
|
set(FLAGS "${FLAGS} -Wno-attributes -Wno-unused-variable")
|
||||||
endif()
|
endif()
|
||||||
elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
|
elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
|
||||||
set(FLAGS "${FLAGS} -Weverything -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded")
|
set(FLAGS "${FLAGS} -Wextra -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded")
|
||||||
set(FLAGS "${FLAGS} -Wno-missing-prototypes -Wno-float-equal -Wno-switch-enum -Wno-switch")
|
set(FLAGS "${FLAGS} -Wno-missing-prototypes -Wno-float-equal -Wno-switch-enum -Wno-switch")
|
||||||
set(FLAGS "${FLAGS} -Wno-exit-time-destructors -Wno-global-constructors -Wno-missing-noreturn")
|
set(FLAGS "${FLAGS} -Wno-exit-time-destructors -Wno-global-constructors -Wno-missing-noreturn")
|
||||||
|
set(FLAGS "${FLAGS} -Wno-deprecated-declarations")
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}")
|
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}")
|
||||||
|
|
||||||
|
# C compiler settings (for the sample)
|
||||||
|
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
|
||||||
|
set(CFLAGS "/Ox")
|
||||||
|
else ()
|
||||||
|
set(CFLAGS "-O3 -std=c99")
|
||||||
|
endif()
|
||||||
|
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CFLAGS}")
|
||||||
|
|
||||||
# ==================================================================================================
|
# ==================================================================================================
|
||||||
|
|
||||||
# Package scripts location
|
# Package scripts location
|
||||||
|
@ -90,11 +99,13 @@ if(TUNERS)
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
# Locates the clBLAS library in case the tests need to be compiled. "FindclBLAS.cmake" is included.
|
# Locates the reference BLAS libraries in case the tests need to be compiled. The "FindclBLAS.cmake"
|
||||||
|
# and "FindCBLAS.cmake" modules are included.
|
||||||
if(TESTS)
|
if(TESTS)
|
||||||
find_package(clBLAS)
|
find_package(clBLAS)
|
||||||
if(NOT CLBLAS_FOUND)
|
find_package(CBLAS)
|
||||||
message(STATUS "Could NOT find clBLAS, disabling the compilation of the tests")
|
if(NOT CLBLAS_FOUND AND NOT CBLAS_FOUND)
|
||||||
|
message(STATUS "Could NOT find clBLAS nor a CPU BLAS, disabling the compilation of the tests")
|
||||||
set(TESTS OFF)
|
set(TESTS OFF)
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
@ -109,8 +120,8 @@ include_directories(${clblast_SOURCE_DIR}/include ${OPENCL_INCLUDE_DIRS})
|
||||||
# Sets the supported routines and the used kernels. New routines and kernels should be added here.
|
# Sets the supported routines and the used kernels. New routines and kernels should be added here.
|
||||||
set(KERNELS copy pad transpose padtranspose xaxpy xdot xger xgemm xgemv)
|
set(KERNELS copy pad transpose padtranspose xaxpy xdot xger xgemm xgemv)
|
||||||
set(SAMPLE_PROGRAMS_CPP sgemm)
|
set(SAMPLE_PROGRAMS_CPP sgemm)
|
||||||
set(SAMPLE_PROGRAMS_C sgemm)
|
set(SAMPLE_PROGRAMS_C sasum dgemv sgemm cache)
|
||||||
set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc)
|
set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
|
||||||
set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv
|
set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv
|
||||||
xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
|
xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
|
||||||
set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm)
|
set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm)
|
||||||
|
@ -120,7 +131,8 @@ set(PRECISIONS 32 64 3232 6464)
|
||||||
# ==================================================================================================
|
# ==================================================================================================
|
||||||
|
|
||||||
# Gathers all source-files
|
# Gathers all source-files
|
||||||
set(SOURCES src/clblast.cc src/database.cc src/routine.cc src/utilities.cc src/clblast_c.cc)
|
set(SOURCES src/clblast.cc src/database.cc src/routine.cc src/cache.cc
|
||||||
|
src/utilities.cc src/clblast_c.cc)
|
||||||
foreach(ROUTINE ${LEVEL1_ROUTINES})
|
foreach(ROUTINE ${LEVEL1_ROUTINES})
|
||||||
set(SOURCES ${SOURCES} src/routines/level1/${ROUTINE}.cc)
|
set(SOURCES ${SOURCES} src/routines/level1/${ROUTINE}.cc)
|
||||||
endforeach()
|
endforeach()
|
||||||
|
@ -156,6 +168,9 @@ endif()
|
||||||
# This section contains all the code related to the examples
|
# This section contains all the code related to the examples
|
||||||
if(SAMPLES)
|
if(SAMPLES)
|
||||||
|
|
||||||
|
# Downloads the cl.hpp file from Khronos
|
||||||
|
file(DOWNLOAD https://www.khronos.org/registry/cl/api/1.1/cl.hpp ${clblast_SOURCE_DIR}/samples/cl.hpp)
|
||||||
|
|
||||||
# Adds sample programs (C++)
|
# Adds sample programs (C++)
|
||||||
foreach(SAMPLE ${SAMPLE_PROGRAMS_CPP})
|
foreach(SAMPLE ${SAMPLE_PROGRAMS_CPP})
|
||||||
add_executable(clblast_sample_${SAMPLE} samples/${SAMPLE}.cc)
|
add_executable(clblast_sample_${SAMPLE} samples/${SAMPLE}.cc)
|
||||||
|
@ -204,11 +219,33 @@ endif()
|
||||||
# ==================================================================================================
|
# ==================================================================================================
|
||||||
|
|
||||||
# Down from here is all test (performance and correctness) related. Note that these tests require
|
# Down from here is all test (performance and correctness) related. Note that these tests require
|
||||||
# the presence of the clBLAS library to act as a reference.
|
# the presence of clBLAS and/or a CPU BLAS library to act as a reference.
|
||||||
if(TESTS)
|
if(TESTS)
|
||||||
|
|
||||||
# Adds new include directories for the reference clBLAS
|
# Sets the specifics for the reference BLAS libraries
|
||||||
include_directories(${clblast_SOURCE_DIR}/test ${CLBLAS_INCLUDE_DIRS})
|
set(REF_INCLUDES )
|
||||||
|
set(REF_LIBRARIES )
|
||||||
|
if(CLBLAS_FOUND)
|
||||||
|
set(REF_INCLUDES ${REF_INCLUDES} ${CLBLAS_INCLUDE_DIRS})
|
||||||
|
set(REF_LIBRARIES ${REF_LIBRARIES} ${CLBLAS_LIBRARIES})
|
||||||
|
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
|
||||||
|
add_definitions(" /DCLBLAST_REF_CLBLAS")
|
||||||
|
else()
|
||||||
|
add_definitions(" -DCLBLAST_REF_CLBLAS")
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
if(CBLAS_FOUND)
|
||||||
|
set(REF_INCLUDES ${REF_INCLUDES} ${CBLAS_INCLUDE_DIRS})
|
||||||
|
set(REF_LIBRARIES ${REF_LIBRARIES} ${CBLAS_LIBRARIES})
|
||||||
|
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
|
||||||
|
add_definitions(" /DCLBLAST_REF_CBLAS")
|
||||||
|
else()
|
||||||
|
add_definitions(" -DCLBLAST_REF_CBLAS")
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
|
# Sets the include directories
|
||||||
|
include_directories(${clblast_SOURCE_DIR}/test ${REF_INCLUDES})
|
||||||
|
|
||||||
# Creates the common correctness-tests objects (requires CMake 2.8.8)
|
# Creates the common correctness-tests objects (requires CMake 2.8.8)
|
||||||
add_library(test_correctness_common OBJECT
|
add_library(test_correctness_common OBJECT
|
||||||
|
@ -228,7 +265,7 @@ if(TESTS)
|
||||||
test/correctness/routines/level3/${ROUTINE}.cc)
|
test/correctness/routines/level3/${ROUTINE}.cc)
|
||||||
endforeach()
|
endforeach()
|
||||||
foreach(ROUTINE ${ROUTINES})
|
foreach(ROUTINE ${ROUTINES})
|
||||||
target_link_libraries(clblast_test_${ROUTINE} clblast ${CLBLAS_LIBRARIES} ${OPENCL_LIBRARIES})
|
target_link_libraries(clblast_test_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
|
||||||
install(TARGETS clblast_test_${ROUTINE} DESTINATION bin)
|
install(TARGETS clblast_test_${ROUTINE} DESTINATION bin)
|
||||||
endforeach()
|
endforeach()
|
||||||
|
|
||||||
|
@ -258,7 +295,7 @@ if(TESTS)
|
||||||
test/performance/routines/level3/${ROUTINE}.cc)
|
test/performance/routines/level3/${ROUTINE}.cc)
|
||||||
endforeach()
|
endforeach()
|
||||||
foreach(ROUTINE ${ROUTINES})
|
foreach(ROUTINE ${ROUTINES})
|
||||||
target_link_libraries(clblast_client_${ROUTINE} clblast ${CLBLAS_LIBRARIES} ${OPENCL_LIBRARIES})
|
target_link_libraries(clblast_client_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
|
||||||
install(TARGETS clblast_client_${ROUTINE} DESTINATION bin)
|
install(TARGETS clblast_client_${ROUTINE} DESTINATION bin)
|
||||||
endforeach()
|
endforeach()
|
||||||
|
|
||||||
|
|
155
README.md
155
README.md
|
@ -52,6 +52,14 @@ The pre-requisites for compilation of CLBlast are:
|
||||||
- Intel OpenCL
|
- Intel OpenCL
|
||||||
- Beignet
|
- Beignet
|
||||||
|
|
||||||
|
Furthermore, to build the (optional) correctness and performance tests, another BLAS library is needed to serve as a reference. This can be either:
|
||||||
|
|
||||||
|
* The OpenCL BLAS library [clBLAS](http://github.com/clMathLibraries/clBLAS) (maintained by AMD)
|
||||||
|
* A regular CPU Netlib BLAS library, e.g.:
|
||||||
|
- OpenBLAS
|
||||||
|
- BLIS
|
||||||
|
- Accelerate
|
||||||
|
|
||||||
An example of an out-of-source build (starting from the root of the CLBlast folder):
|
An example of an out-of-source build (starting from the root of the CLBlast folder):
|
||||||
|
|
||||||
mkdir build
|
mkdir build
|
||||||
|
@ -76,7 +84,7 @@ Or alternatively the plain C version:
|
||||||
|
|
||||||
#include <clblast_c.h>
|
#include <clblast_c.h>
|
||||||
|
|
||||||
Afterwards, any of CLBlast's routines can be called directly: there is no need to initialize the library. The available routines and the required arguments are described in the `clblast.h` include file. Additionally, a couple of stand-alone example programs are included in `samples/`.
|
Afterwards, any of CLBlast's routines can be called directly: there is no need to initialize the library. The available routines and the required arguments are described in the `clblast.h` include file and the included [API documentation](doc/api.md). Additionally, a couple of stand-alone example programs are included in `samples/`.
|
||||||
|
|
||||||
|
|
||||||
Using the tuners (optional)
|
Using the tuners (optional)
|
||||||
|
@ -95,6 +103,8 @@ The CLBlast library will be tuned in the future for the most commonly used OpenC
|
||||||
- Tesla K40m
|
- Tesla K40m
|
||||||
* AMD GPUs:
|
* AMD GPUs:
|
||||||
- Tahiti
|
- Tahiti
|
||||||
|
- Hawaii
|
||||||
|
- Pitcairn
|
||||||
- R9 M370X
|
- R9 M370X
|
||||||
* Intel GPUs:
|
* Intel GPUs:
|
||||||
- Iris
|
- Iris
|
||||||
|
@ -128,16 +138,16 @@ In summary, tuning the entire library for your device can be done as follows (st
|
||||||
make
|
make
|
||||||
|
|
||||||
|
|
||||||
Compiling the tests (optional)
|
Compiling the correctness and performance tests (optional)
|
||||||
-------------
|
-------------
|
||||||
|
|
||||||
To make sure CLBlast is working correctly on your device (recommended), compile with the tests enabled:
|
To make sure CLBlast is working correctly on your device (recommended), compile with the tests enabled:
|
||||||
|
|
||||||
cmake -DTESTS=ON ..
|
cmake -DTESTS=ON ..
|
||||||
|
|
||||||
Afterwards, executables in the form of `clblast_test_xxxxx` are available, in which `xxxxx` is the name of a routine (e.g. `xgemm`). Note that CLBlast is tested against [clBLAS](http://github.com/clMathLibraries/clBLAS) for correctness. The library clBLAS is therefore required to be installed on your system for the CLBlast tests.
|
Afterwards, executables in the form of `clblast_test_xxxxx` are available, in which `xxxxx` is the name of a routine (e.g. `xgemm`). Note that CLBlast is best tested against [clBLAS](http://github.com/clMathLibraries/clBLAS) for correctness. If the library clBLAS is not installed on your system, it will use a regular CPU BLAS library to test against. If both are present, setting the command-line option `-clblas 1` or `-cblas 1` will select the library to test against for the `clblast_test_xxxxx` executables.
|
||||||
|
|
||||||
With the `-DTESTS=ON` flag, additional performance tests are compiled. These come in the form of client executables named `clblast_client_xxxxx`, in which `xxxxx` is the name of a routine (e.g. `xgemm`). These clients take a bunch of configuration options and directly run both CLBlast and clBLAS in a head-to-head performance test.
|
With the `-DTESTS=ON` flag, additional performance tests are compiled. These come in the form of client executables named `clblast_client_xxxxx`, in which `xxxxx` is the name of a routine (e.g. `xgemm`). These clients take a bunch of configuration options and directly run CLBlast in a head-to-head performance test against clBLAS and/or a CPU BLAS library.
|
||||||
|
|
||||||
|
|
||||||
Performance remarks
|
Performance remarks
|
||||||
|
@ -161,64 +171,77 @@ These graphs can be generated automatically on your own device. First, compile C
|
||||||
Supported routines
|
Supported routines
|
||||||
-------------
|
-------------
|
||||||
|
|
||||||
CLBlast is in active development but already supports almost all the BLAS routines. The currently supported routines are marked with '✔' in the following tables. Empty boxes represent routines that still need to be implemented in a future release, whereas routines marked with '-' are not part of BLAS at all.
|
CLBlast is in active development but already supports almost all the BLAS routines. The supported routines are marked with '✔' in the following tables. Routines marked with '-' do not exist: they are not part of BLAS at all.
|
||||||
|
|
||||||
| Level-1 | S | D | C | Z | Notes |
|
| Level-1 | S | D | C | Z |
|
||||||
| ---------|---|---|---|---|---------|
|
| ---------|---|---|---|---|
|
||||||
| xROTG | | | - | - | |
|
| xSWAP | ✔ | ✔ | ✔ | ✔ |
|
||||||
| xROTMG | | | - | - | |
|
| xSCAL | ✔ | ✔ | ✔ | ✔ |
|
||||||
| xROT | | | - | - | |
|
| xCOPY | ✔ | ✔ | ✔ | ✔ |
|
||||||
| xROTM | | | - | - | |
|
| xAXPY | ✔ | ✔ | ✔ | ✔ |
|
||||||
| xSWAP | ✔ | ✔ | ✔ | ✔ | |
|
| xDOT | ✔ | ✔ | - | - |
|
||||||
| xSCAL | ✔ | ✔ | ✔ | ✔ | +CS +ZD |
|
| xDOTU | - | - | ✔ | ✔ |
|
||||||
| xCOPY | ✔ | ✔ | ✔ | ✔ | |
|
| xDOTC | - | - | ✔ | ✔ |
|
||||||
| xAXPY | ✔ | ✔ | ✔ | ✔ | |
|
| xNRM2 | ✔ | ✔ | ✔ | ✔ |
|
||||||
| xDOT | ✔ | ✔ | - | - | |
|
| xASUM | ✔ | ✔ | ✔ | ✔ |
|
||||||
| xDOTU | - | - | ✔ | ✔ | |
|
| IxAMAX | ✔ | ✔ | ✔ | ✔ |
|
||||||
| xDOTC | - | - | ✔ | ✔ | |
|
|
||||||
| xNRM2 | | | - | - | +SC +DZ |
|
|
||||||
| xASUM | | | - | - | +SC +DZ |
|
|
||||||
| IxAMAX | | | | | |
|
|
||||||
|
|
||||||
| Level-2 | S | D | C | Z | Notes |
|
| Level-2 | S | D | C | Z |
|
||||||
| ---------|---|---|---|---|---------|
|
| ---------|---|---|---|---|
|
||||||
| xGEMV | ✔ | ✔ | ✔ | ✔ | |
|
| xGEMV | ✔ | ✔ | ✔ | ✔ |
|
||||||
| xGBMV | ✔ | ✔ | ✔ | ✔ | |
|
| xGBMV | ✔ | ✔ | ✔ | ✔ |
|
||||||
| xHEMV | - | - | ✔ | ✔ | |
|
| xHEMV | - | - | ✔ | ✔ |
|
||||||
| xHBMV | - | - | ✔ | ✔ | |
|
| xHBMV | - | - | ✔ | ✔ |
|
||||||
| xHPMV | - | - | ✔ | ✔ | |
|
| xHPMV | - | - | ✔ | ✔ |
|
||||||
| xSYMV | ✔ | ✔ | - | - | |
|
| xSYMV | ✔ | ✔ | - | - |
|
||||||
| xSBMV | ✔ | ✔ | - | - | |
|
| xSBMV | ✔ | ✔ | - | - |
|
||||||
| xSPMV | ✔ | ✔ | - | - | |
|
| xSPMV | ✔ | ✔ | - | - |
|
||||||
| xTRMV | ✔ | ✔ | ✔ | ✔ | |
|
| xTRMV | ✔ | ✔ | ✔ | ✔ |
|
||||||
| xTBMV | ✔ | ✔ | ✔ | ✔ | |
|
| xTBMV | ✔ | ✔ | ✔ | ✔ |
|
||||||
| xTPMV | ✔ | ✔ | ✔ | ✔ | |
|
| xTPMV | ✔ | ✔ | ✔ | ✔ |
|
||||||
| xTRSV | | | | | |
|
| xGER | ✔ | ✔ | - | - |
|
||||||
| xTBSV | | | | | |
|
| xGERU | - | - | ✔ | ✔ |
|
||||||
| xTPSV | | | | | |
|
| xGERC | - | - | ✔ | ✔ |
|
||||||
| xGER | ✔ | ✔ | - | - | |
|
| xHER | - | - | ✔ | ✔ |
|
||||||
| xGERU | - | - | ✔ | ✔ | |
|
| xHPR | - | - | ✔ | ✔ |
|
||||||
| xGERC | - | - | ✔ | ✔ | |
|
| xHER2 | - | - | ✔ | ✔ |
|
||||||
| xHER | - | - | ✔ | ✔ | |
|
| xHPR2 | - | - | ✔ | ✔ |
|
||||||
| xHPR | - | - | ✔ | ✔ | |
|
| xSYR | ✔ | ✔ | - | - |
|
||||||
| xHER2 | - | - | ✔ | ✔ | |
|
| xSPR | ✔ | ✔ | - | - |
|
||||||
| xHPR2 | - | - | ✔ | ✔ | |
|
| xSYR2 | ✔ | ✔ | - | - |
|
||||||
| xSYR | ✔ | ✔ | - | - | |
|
| xSPR2 | ✔ | ✔ | - | - |
|
||||||
| xSPR | ✔ | ✔ | - | - | |
|
|
||||||
| xSYR2 | ✔ | ✔ | - | - | |
|
|
||||||
| xSPR2 | ✔ | ✔ | - | - | |
|
|
||||||
|
|
||||||
| Level-3 | S | D | C | Z | Notes |
|
| Level-3 | S | D | C | Z |
|
||||||
| ---------|---|---|---|---|---------|
|
| ---------|---|---|---|---|
|
||||||
| xGEMM | ✔ | ✔ | ✔ | ✔ | |
|
| xGEMM | ✔ | ✔ | ✔ | ✔ |
|
||||||
| xSYMM | ✔ | ✔ | ✔ | ✔ | |
|
| xSYMM | ✔ | ✔ | ✔ | ✔ |
|
||||||
| xHEMM | - | - | ✔ | ✔ | |
|
| xHEMM | - | - | ✔ | ✔ |
|
||||||
| xSYRK | ✔ | ✔ | ✔ | ✔ | |
|
| xSYRK | ✔ | ✔ | ✔ | ✔ |
|
||||||
| xHERK | - | - | ✔ | ✔ | |
|
| xHERK | - | - | ✔ | ✔ |
|
||||||
| xSYR2K | ✔ | ✔ | ✔ | ✔ | |
|
| xSYR2K | ✔ | ✔ | ✔ | ✔ |
|
||||||
| xHER2K | - | - | ✔ | ✔ | |
|
| xHER2K | - | - | ✔ | ✔ |
|
||||||
| xTRMM | ✔ | ✔ | ✔ | ✔ | |
|
| xTRMM | ✔ | ✔ | ✔ | ✔ |
|
||||||
| xTRSM | | | | | |
|
|
||||||
|
In addition, some non-BLAS routines are also supported by CLBlast. They are experimental and should be used with care:
|
||||||
|
|
||||||
|
| Additional | S | D | C | Z |
|
||||||
|
| -----------|---|---|---|---|
|
||||||
|
| xSUM | ✔ | ✔ | ✔ | ✔ |
|
||||||
|
| IxMAX | ✔ | ✔ | ✔ | ✔ |
|
||||||
|
| IxMIN | ✔ | ✔ | ✔ | ✔ |
|
||||||
|
|
||||||
|
Some BLAS routines are not supported yet by CLBlast. They are shown in the following table:
|
||||||
|
|
||||||
|
| Unsupported | S | D | C | Z |
|
||||||
|
| ------------|---|---|---|---|
|
||||||
|
| xROTG | | | - | - |
|
||||||
|
| xROTMG | | | - | - |
|
||||||
|
| xROT | | | - | - |
|
||||||
|
| xROTM | | | - | - |
|
||||||
|
| xTRSV | | | | |
|
||||||
|
| xTBSV | | | | |
|
||||||
|
| xTPSV | | | | |
|
||||||
|
| xTRSM | | | | |
|
||||||
|
|
||||||
|
|
||||||
Contributing
|
Contributing
|
||||||
|
@ -226,28 +249,28 @@ Contributing
|
||||||
|
|
||||||
Contributions are welcome in the form of tuning results for OpenCL devices previously untested. Furthermore, merge requests are welcome as long as they contain unit additions or modifications. Furthermore, they should follow the CLBlast coding style, which is based on the [Google C++ style guide](https://google-styleguide.googlecode.com/svn/trunk/cppguide.html) and the Effective C++ books by Scott Meyers.
|
Contributions are welcome in the form of tuning results for OpenCL devices previously untested. Furthermore, merge requests are welcome as long as they contain unit additions or modifications. Furthermore, they should follow the CLBlast coding style, which is based on the [Google C++ style guide](https://google-styleguide.googlecode.com/svn/trunk/cppguide.html) and the Effective C++ books by Scott Meyers.
|
||||||
|
|
||||||
The contributing authors so far are:
|
The contributing authors (code, pull requests, testing) so far are:
|
||||||
|
|
||||||
* [Cedric Nugteren](http://www.cedricnugteren.nl)
|
* [Cedric Nugteren](http://www.cedricnugteren.nl)
|
||||||
|
* [Anton Lokhmotov](https://github.com/psyhtest)
|
||||||
|
* [Dragan Djuric](https://github.com/blueberry)
|
||||||
|
* [Hugh Perkins](https://github.com/hughperkins)
|
||||||
|
|
||||||
Tuning and testing on a variety of OpenCL devices was made possible by:
|
Tuning and testing on a variety of OpenCL devices was made possible by:
|
||||||
|
|
||||||
* [TU/e ES research group](http://www.es.ele.tue.nl/)
|
* [TU/e ES research group](http://www.es.ele.tue.nl/)
|
||||||
* [ASCI DAS4 and DAS5](http://www.cs.vu.nl/das4/)
|
* [ASCI DAS4 and DAS5](http://www.cs.vu.nl/das4/)
|
||||||
* [Dividiti](http://www.dividiti.com)
|
* [dividiti](http://www.dividiti.com)
|
||||||
* [SURFsara HPC center](http://www.surfsara.com)
|
* [SURFsara HPC center](http://www.surfsara.com)
|
||||||
|
|
||||||
Support us
|
Support us
|
||||||
-------------
|
-------------
|
||||||
|
|
||||||
This project started in March 2015 as an evenings and weekends free-time project next to a full-time job. If you are in the position to support the project by OpenCL-hardware donations or otherwise, please find contact information on the [website of the main author](http://www.cedricnugteren.nl).
|
This project started in March 2015 as an evenings and weekends free-time project next to a full-time job for Cedric Nugteren. If you are in the position to support the project by OpenCL-hardware donations or otherwise, please find contact information on the [website of the main author](http://www.cedricnugteren.nl).
|
||||||
|
|
||||||
|
|
||||||
To-do list before release of version 1.0
|
To-do list before release of version 1.0
|
||||||
-------------
|
-------------
|
||||||
|
|
||||||
- Support all routines supported by clBLAS
|
|
||||||
- Allow the user control over events and synchronization
|
|
||||||
- Add half-precision routines (e.g. HGEMM)
|
- Add half-precision routines (e.g. HGEMM)
|
||||||
- Enable correctness and performance testing against a CPU-based BLAS library
|
- Add API documentation
|
||||||
- Test in multi-threaded environments
|
|
||||||
|
|
75
cmake/Modules/FindCBLAS.cmake
Normal file
75
cmake/Modules/FindCBLAS.cmake
Normal file
|
@ -0,0 +1,75 @@
|
||||||
|
|
||||||
|
# ==================================================================================================
|
||||||
|
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
# project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
# width of 100 characters per line.
|
||||||
|
#
|
||||||
|
# Author(s):
|
||||||
|
# Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
#
|
||||||
|
# ==================================================================================================
|
||||||
|
#
|
||||||
|
# Defines the following variables:
|
||||||
|
# CBLAS_FOUND Boolean holding whether or not the Netlib BLAS library was found
|
||||||
|
# CBLAS_INCLUDE_DIRS The Netlib BLAS include directory
|
||||||
|
# CBLAS_LIBRARIES The Netlib BLAS library
|
||||||
|
#
|
||||||
|
# In case BLAS is not installed in the default directory, set the CBLAS_ROOT variable to point to
|
||||||
|
# the root of BLAS, such that 'cblas.h' can be found in $CBLAS_ROOT/include. This can either be
|
||||||
|
# done using an environment variable (e.g. export CBLAS_ROOT=/path/to/BLAS) or using a CMake
|
||||||
|
# variable (e.g. cmake -DCBLAS_ROOT=/path/to/BLAS ..).
|
||||||
|
#
|
||||||
|
# ==================================================================================================
|
||||||
|
|
||||||
|
# Sets the possible install locations
|
||||||
|
set(CBLAS_HINTS
|
||||||
|
${CBLAS_ROOT}
|
||||||
|
$ENV{CBLAS_ROOT}
|
||||||
|
)
|
||||||
|
set(CBLAS_PATHS
|
||||||
|
/usr
|
||||||
|
/usr/local
|
||||||
|
/usr/local/opt
|
||||||
|
/System/Library/Frameworks
|
||||||
|
)
|
||||||
|
|
||||||
|
# Finds the include directories
|
||||||
|
find_path(CBLAS_INCLUDE_DIRS
|
||||||
|
NAMES cblas.h
|
||||||
|
HINTS ${CBLAS_HINTS}
|
||||||
|
PATH_SUFFIXES
|
||||||
|
include inc include/x86_64 include/x64
|
||||||
|
openblas/include include/blis blis/include blis/include/blis
|
||||||
|
Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Versions/Current/Headers
|
||||||
|
PATHS ${CBLAS_PATHS}
|
||||||
|
DOC "Netlib BLAS include header cblas.h"
|
||||||
|
)
|
||||||
|
mark_as_advanced(CBLAS_INCLUDE_DIRS)
|
||||||
|
|
||||||
|
# Finds the library
|
||||||
|
find_library(CBLAS_LIBRARIES
|
||||||
|
NAMES cblas blas mkl blis openblas accelerate
|
||||||
|
HINTS ${CBLAS_HINTS}
|
||||||
|
PATH_SUFFIXES
|
||||||
|
lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 lib/import lib64/import
|
||||||
|
openblas/lib blis/lib lib/atlas-base
|
||||||
|
PATHS ${CBLAS_PATHS}
|
||||||
|
DOC "Netlib BLAS library"
|
||||||
|
)
|
||||||
|
mark_as_advanced(CBLAS_LIBRARIES)
|
||||||
|
|
||||||
|
# ==================================================================================================
|
||||||
|
|
||||||
|
# Notification messages
|
||||||
|
if(NOT CBLAS_INCLUDE_DIRS)
|
||||||
|
message(STATUS "Could NOT find 'cblas.h', install a CPU Netlib BLAS or set CBLAS_ROOT")
|
||||||
|
endif()
|
||||||
|
if(NOT CBLAS_LIBRARIES)
|
||||||
|
message(STATUS "Could NOT find a CPU Netlib BLAS library, install it or set CBLAS_ROOT")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
# Determines whether or not BLAS was found
|
||||||
|
include(FindPackageHandleStandardArgs)
|
||||||
|
find_package_handle_standard_args(CBLAS DEFAULT_MSG CBLAS_INCLUDE_DIRS CBLAS_LIBRARIES)
|
||||||
|
|
||||||
|
# ==================================================================================================
|
2434
doc/clblast.md
Normal file
2434
doc/clblast.md
Normal file
File diff suppressed because it is too large
Load diff
|
@ -73,11 +73,11 @@ enum class StatusCode {
|
||||||
};
|
};
|
||||||
|
|
||||||
// Matrix layout and transpose types
|
// Matrix layout and transpose types
|
||||||
enum class Layout { kRowMajor, kColMajor };
|
enum class Layout { kRowMajor = 101, kColMajor = 102 };
|
||||||
enum class Transpose { kNo, kYes, kConjugate };
|
enum class Transpose { kNo = 111, kYes = 112, kConjugate = 113 };
|
||||||
enum class Side { kLeft, kRight };
|
enum class Triangle { kUpper = 121, kLower = 122 };
|
||||||
enum class Triangle { kUpper, kLower };
|
enum class Diagonal { kNonUnit = 131, kUnit = 132 };
|
||||||
enum class Diagonal { kUnit, kNonUnit };
|
enum class Side { kLeft = 141, kRight = 142 };
|
||||||
|
|
||||||
// Precision scoped enum (values in bits)
|
// Precision scoped enum (values in bits)
|
||||||
enum class Precision { kHalf = 16, kSingle = 32, kDouble = 64,
|
enum class Precision { kHalf = 16, kSingle = 32, kDouble = 64,
|
||||||
|
@ -87,26 +87,60 @@ enum class Precision { kHalf = 16, kSingle = 32, kDouble = 64,
|
||||||
// BLAS level-1 (vector-vector) routines
|
// BLAS level-1 (vector-vector) routines
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Generate Givens plane rotation: SROTG/DROTG
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Rotg(cl_mem sa_buffer, const size_t sa_offset,
|
||||||
|
cl_mem sb_buffer, const size_t sb_offset,
|
||||||
|
cl_mem sc_buffer, const size_t sc_offset,
|
||||||
|
cl_mem ss_buffer, const size_t ss_offset,
|
||||||
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
|
// Generate modified Givens plane rotation: SROTMG/DROTMG
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Rotmg(cl_mem sd1_buffer, const size_t sd1_offset,
|
||||||
|
cl_mem sd2_buffer, const size_t sd2_offset,
|
||||||
|
cl_mem sx1_buffer, const size_t sx1_offset,
|
||||||
|
const cl_mem sy1_buffer, const size_t sy1_offset,
|
||||||
|
cl_mem sparam_buffer, const size_t sparam_offset,
|
||||||
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
|
// Apply Givens plane rotation: SROT/DROT
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Rot(const size_t n,
|
||||||
|
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
|
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
|
const T cos,
|
||||||
|
const T sin,
|
||||||
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
|
// Apply modified Givens plane rotation: SROTM/DROTM
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Rotm(const size_t n,
|
||||||
|
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
|
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
|
cl_mem sparam_buffer, const size_t sparam_offset,
|
||||||
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP
|
// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP
|
||||||
template <typename T>
|
template <typename T>
|
||||||
StatusCode Swap(const size_t n,
|
StatusCode Swap(const size_t n,
|
||||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL
|
// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL
|
||||||
template <typename T>
|
template <typename T>
|
||||||
StatusCode Scal(const size_t n,
|
StatusCode Scal(const size_t n,
|
||||||
const T alpha,
|
const T alpha,
|
||||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY
|
// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY
|
||||||
template <typename T>
|
template <typename T>
|
||||||
StatusCode Copy(const size_t n,
|
StatusCode Copy(const size_t n,
|
||||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY
|
// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -114,7 +148,7 @@ StatusCode Axpy(const size_t n,
|
||||||
const T alpha,
|
const T alpha,
|
||||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// Dot product of two vectors: SDOT/DDOT
|
// Dot product of two vectors: SDOT/DDOT
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -122,7 +156,7 @@ StatusCode Dot(const size_t n,
|
||||||
cl_mem dot_buffer, const size_t dot_offset,
|
cl_mem dot_buffer, const size_t dot_offset,
|
||||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// Dot product of two complex vectors: CDOTU/ZDOTU
|
// Dot product of two complex vectors: CDOTU/ZDOTU
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -130,7 +164,7 @@ StatusCode Dotu(const size_t n,
|
||||||
cl_mem dot_buffer, const size_t dot_offset,
|
cl_mem dot_buffer, const size_t dot_offset,
|
||||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC
|
// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -138,7 +172,49 @@ StatusCode Dotc(const size_t n,
|
||||||
cl_mem dot_buffer, const size_t dot_offset,
|
cl_mem dot_buffer, const size_t dot_offset,
|
||||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
|
// Euclidean norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Nrm2(const size_t n,
|
||||||
|
cl_mem nrm2_buffer, const size_t nrm2_offset,
|
||||||
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
|
// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Asum(const size_t n,
|
||||||
|
cl_mem asum_buffer, const size_t asum_offset,
|
||||||
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
|
// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Sum(const size_t n,
|
||||||
|
cl_mem sum_buffer, const size_t sum_offset,
|
||||||
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
|
// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Amax(const size_t n,
|
||||||
|
cl_mem imax_buffer, const size_t imax_offset,
|
||||||
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
|
// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Max(const size_t n,
|
||||||
|
cl_mem imax_buffer, const size_t imax_offset,
|
||||||
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
|
// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Min(const size_t n,
|
||||||
|
cl_mem imin_buffer, const size_t imin_offset,
|
||||||
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
// BLAS level-2 (matrix-vector) routines
|
// BLAS level-2 (matrix-vector) routines
|
||||||
|
@ -153,7 +229,7 @@ StatusCode Gemv(const Layout layout, const Transpose a_transpose,
|
||||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
const T beta,
|
const T beta,
|
||||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV
|
// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -164,7 +240,7 @@ StatusCode Gbmv(const Layout layout, const Transpose a_transpose,
|
||||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
const T beta,
|
const T beta,
|
||||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// Hermitian matrix-vector multiplication: CHEMV/ZHEMV
|
// Hermitian matrix-vector multiplication: CHEMV/ZHEMV
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -175,7 +251,7 @@ StatusCode Hemv(const Layout layout, const Triangle triangle,
|
||||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
const T beta,
|
const T beta,
|
||||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV
|
// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -186,7 +262,7 @@ StatusCode Hbmv(const Layout layout, const Triangle triangle,
|
||||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
const T beta,
|
const T beta,
|
||||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV
|
// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -197,7 +273,7 @@ StatusCode Hpmv(const Layout layout, const Triangle triangle,
|
||||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
const T beta,
|
const T beta,
|
||||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// Symmetric matrix-vector multiplication: SSYMV/DSYMV
|
// Symmetric matrix-vector multiplication: SSYMV/DSYMV
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -208,7 +284,7 @@ StatusCode Symv(const Layout layout, const Triangle triangle,
|
||||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
const T beta,
|
const T beta,
|
||||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV
|
// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -219,7 +295,7 @@ StatusCode Sbmv(const Layout layout, const Triangle triangle,
|
||||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
const T beta,
|
const T beta,
|
||||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV
|
// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -230,7 +306,7 @@ StatusCode Spmv(const Layout layout, const Triangle triangle,
|
||||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
const T beta,
|
const T beta,
|
||||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV
|
// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -238,7 +314,7 @@ StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_
|
||||||
const size_t n,
|
const size_t n,
|
||||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV
|
// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -246,7 +322,7 @@ StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_
|
||||||
const size_t n, const size_t k,
|
const size_t n, const size_t k,
|
||||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV
|
// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -254,7 +330,7 @@ StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_
|
||||||
const size_t n,
|
const size_t n,
|
||||||
const cl_mem ap_buffer, const size_t ap_offset,
|
const cl_mem ap_buffer, const size_t ap_offset,
|
||||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV
|
// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -262,7 +338,7 @@ StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_
|
||||||
const size_t n,
|
const size_t n,
|
||||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV
|
// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -270,7 +346,7 @@ StatusCode Tbsv(const Layout layout, const Triangle triangle, const Transpose a_
|
||||||
const size_t n, const size_t k,
|
const size_t n, const size_t k,
|
||||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV
|
// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -278,7 +354,7 @@ StatusCode Tpsv(const Layout layout, const Triangle triangle, const Transpose a_
|
||||||
const size_t n,
|
const size_t n,
|
||||||
const cl_mem ap_buffer, const size_t ap_offset,
|
const cl_mem ap_buffer, const size_t ap_offset,
|
||||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// General rank-1 matrix update: SGER/DGER
|
// General rank-1 matrix update: SGER/DGER
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -288,7 +364,7 @@ StatusCode Ger(const Layout layout,
|
||||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// General rank-1 complex matrix update: CGERU/ZGERU
|
// General rank-1 complex matrix update: CGERU/ZGERU
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -298,7 +374,7 @@ StatusCode Geru(const Layout layout,
|
||||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// General rank-1 complex conjugated matrix update: CGERC/ZGERC
|
// General rank-1 complex conjugated matrix update: CGERC/ZGERC
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -308,7 +384,7 @@ StatusCode Gerc(const Layout layout,
|
||||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// Hermitian rank-1 matrix update: CHER/ZHER
|
// Hermitian rank-1 matrix update: CHER/ZHER
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -317,7 +393,7 @@ StatusCode Her(const Layout layout, const Triangle triangle,
|
||||||
const T alpha,
|
const T alpha,
|
||||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// Hermitian packed rank-1 matrix update: CHPR/ZHPR
|
// Hermitian packed rank-1 matrix update: CHPR/ZHPR
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -326,7 +402,7 @@ StatusCode Hpr(const Layout layout, const Triangle triangle,
|
||||||
const T alpha,
|
const T alpha,
|
||||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
cl_mem ap_buffer, const size_t ap_offset,
|
cl_mem ap_buffer, const size_t ap_offset,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// Hermitian rank-2 matrix update: CHER2/ZHER2
|
// Hermitian rank-2 matrix update: CHER2/ZHER2
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -336,7 +412,7 @@ StatusCode Her2(const Layout layout, const Triangle triangle,
|
||||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2
|
// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -346,7 +422,7 @@ StatusCode Hpr2(const Layout layout, const Triangle triangle,
|
||||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
cl_mem ap_buffer, const size_t ap_offset,
|
cl_mem ap_buffer, const size_t ap_offset,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// Symmetric rank-1 matrix update: SSYR/DSYR
|
// Symmetric rank-1 matrix update: SSYR/DSYR
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -355,7 +431,7 @@ StatusCode Syr(const Layout layout, const Triangle triangle,
|
||||||
const T alpha,
|
const T alpha,
|
||||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// Symmetric packed rank-1 matrix update: SSPR/DSPR
|
// Symmetric packed rank-1 matrix update: SSPR/DSPR
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -364,7 +440,7 @@ StatusCode Spr(const Layout layout, const Triangle triangle,
|
||||||
const T alpha,
|
const T alpha,
|
||||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
cl_mem ap_buffer, const size_t ap_offset,
|
cl_mem ap_buffer, const size_t ap_offset,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// Symmetric rank-2 matrix update: SSYR2/DSYR2
|
// Symmetric rank-2 matrix update: SSYR2/DSYR2
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -374,7 +450,7 @@ StatusCode Syr2(const Layout layout, const Triangle triangle,
|
||||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// Symmetric packed rank-2 matrix update: SSPR2/DSPR2
|
// Symmetric packed rank-2 matrix update: SSPR2/DSPR2
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -384,7 +460,7 @@ StatusCode Spr2(const Layout layout, const Triangle triangle,
|
||||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||||
cl_mem ap_buffer, const size_t ap_offset,
|
cl_mem ap_buffer, const size_t ap_offset,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
// BLAS level-3 (matrix-matrix) routines
|
// BLAS level-3 (matrix-matrix) routines
|
||||||
|
@ -399,7 +475,7 @@ StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpos
|
||||||
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||||
const T beta,
|
const T beta,
|
||||||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM
|
// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -410,7 +486,7 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
|
||||||
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||||
const T beta,
|
const T beta,
|
||||||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM
|
// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -421,7 +497,7 @@ StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle,
|
||||||
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||||
const T beta,
|
const T beta,
|
||||||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK
|
// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -431,7 +507,7 @@ StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_
|
||||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
const T beta,
|
const T beta,
|
||||||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// Rank-K update of a hermitian matrix: CHERK/ZHERK
|
// Rank-K update of a hermitian matrix: CHERK/ZHERK
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -441,7 +517,7 @@ StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_
|
||||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
const T beta,
|
const T beta,
|
||||||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K
|
// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -452,7 +528,7 @@ StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose a
|
||||||
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||||
const T beta,
|
const T beta,
|
||||||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K
|
// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K
|
||||||
template <typename T, typename U>
|
template <typename T, typename U>
|
||||||
|
@ -463,7 +539,7 @@ StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose a
|
||||||
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||||
const U beta,
|
const U beta,
|
||||||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM
|
// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -472,7 +548,7 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, c
|
||||||
const T alpha,
|
const T alpha,
|
||||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
|
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -481,9 +557,20 @@ StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, c
|
||||||
const T alpha,
|
const T alpha,
|
||||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||||
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||||
cl_command_queue* queue, cl_event* event);
|
cl_command_queue* queue, cl_event* event = nullptr);
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
|
// CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on
|
||||||
|
// for the same device. This cache can be cleared to free up system memory or in case of debugging.
|
||||||
|
StatusCode ClearCache();
|
||||||
|
|
||||||
|
// The cache can also be pre-initialized for a specific device with all possible CLBlast kernels.
|
||||||
|
// Further CLBlast routine calls will then run at maximum speed.
|
||||||
|
StatusCode FillCache(const cl_device_id device);
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
} // namespace clblast
|
} // namespace clblast
|
||||||
|
|
||||||
// CLBLAST_CLBLAST_H_
|
// CLBLAST_CLBLAST_H_
|
||||||
|
|
File diff suppressed because it is too large
Load diff
100
include/internal/cache.h
Normal file
100
include/internal/cache.h
Normal file
|
@ -0,0 +1,100 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the caching functionality of compiled binaries and programs.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#ifndef CLBLAST_CACHE_H_
|
||||||
|
#define CLBLAST_CACHE_H_
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
#include <mutex>
|
||||||
|
|
||||||
|
#include "internal/utilities.h"
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
namespace cache {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// The cache of compiled OpenCL binaries, along with some meta-data
|
||||||
|
struct BinaryCache {
|
||||||
|
std::string binary;
|
||||||
|
std::string device_name;
|
||||||
|
Precision precision;
|
||||||
|
std::string routine_name_;
|
||||||
|
|
||||||
|
// Finds out whether the properties match
|
||||||
|
bool MatchInCache(const std::string &ref_device, const Precision &ref_precision,
|
||||||
|
const std::string &ref_routine) {
|
||||||
|
return (device_name == ref_device &&
|
||||||
|
precision == ref_precision &&
|
||||||
|
routine_name_ == ref_routine);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// The actual cache, implemented as a vector of the above data-type, and its mutex
|
||||||
|
// NOTE(review): 'static' at namespace scope inside a header gives every translation unit that
// includes this file its own private copy of this vector and of the mutex below. Confirm that only
// a single source file ever touches them; otherwise the cache is not actually shared.
static std::vector<BinaryCache> binary_cache_;
|
||||||
|
static std::mutex binary_cache_mutex_;
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// The cache of compiled OpenCL programs, along with some meta-data
|
||||||
|
struct ProgramCache {
|
||||||
|
Program program;
|
||||||
|
ContextPointer context_ptr;
|
||||||
|
Precision precision;
|
||||||
|
std::string routine_name_;
|
||||||
|
|
||||||
|
// Finds out whether the properties match
|
||||||
|
bool MatchInCache(const ContextPointer ref_context, const Precision &ref_precision,
|
||||||
|
const std::string &ref_routine) {
|
||||||
|
return (context_ptr == ref_context &&
|
||||||
|
precision == ref_precision &&
|
||||||
|
routine_name_ == ref_routine);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// The actual cache, implemented as a vector of the above data-type, and its mutex
|
||||||
|
// NOTE(review): 'static' at namespace scope inside a header gives every translation unit that
// includes this file its own private copy of this vector and of the mutex below. Confirm that only
// a single source file ever touches them; otherwise the cache is not actually shared.
static std::vector<ProgramCache> program_cache_;
|
||||||
|
static std::mutex program_cache_mutex_;
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Stores the compiled binary or program in the cache
|
||||||
|
void StoreBinaryToCache(const std::string &binary, const std::string &device_name,
|
||||||
|
const Precision &precision, const std::string &routine_name);
|
||||||
|
void StoreProgramToCache(const Program &program, const Context &context,
|
||||||
|
const Precision &precision, const std::string &routine_name);
|
||||||
|
|
||||||
|
// Queries the cache and retrieves a matching binary or program. Assumes that the match is
|
||||||
|
// available, throws otherwise.
|
||||||
|
const std::string& GetBinaryFromCache(const std::string &device_name, const Precision &precision,
|
||||||
|
const std::string &routine_name);
|
||||||
|
const Program& GetProgramFromCache(const Context &context, const Precision &precision,
|
||||||
|
const std::string &routine_name);
|
||||||
|
|
||||||
|
// Queries the cache to see whether or not the compiled kernel is already there
|
||||||
|
bool BinaryIsInCache(const std::string &device_name, const Precision &precision,
|
||||||
|
const std::string &routine_name);
|
||||||
|
bool ProgramIsInCache(const Context &context, const Precision &precision,
|
||||||
|
const std::string &routine_name);
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Clears the cache of stored binaries
|
||||||
|
StatusCode ClearCache();
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace cache
|
||||||
|
} // namespace clblast
|
||||||
|
|
||||||
|
// CLBLAST_CACHE_H_
|
||||||
|
#endif
|
|
@ -78,11 +78,16 @@ class Event {
|
||||||
// Regular constructor
|
// Regular constructor
|
||||||
explicit Event(): event_(nullptr) { }
|
explicit Event(): event_(nullptr) { }
|
||||||
|
|
||||||
|
// Waits for completion of this event
|
||||||
|
// Calls 'clWaitForEvents' on the single wrapped event and hands the returned status to
// 'CheckError'. Declared 'const' because the stored handle itself is not modified.
void WaitForCompletion() const {
|
||||||
|
CheckError(clWaitForEvents(1, &event_));
|
||||||
|
}
|
||||||
|
|
||||||
// Retrieves the elapsed time of the last recorded event. Note that no error checking is done on
|
// Retrieves the elapsed time of the last recorded event. Note that no error checking is done on
|
||||||
// the 'clGetEventProfilingInfo' function, since there is a bug in Apple's OpenCL implementation:
|
// the 'clGetEventProfilingInfo' function, since there is a bug in Apple's OpenCL implementation:
|
||||||
// http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx
|
// http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx
|
||||||
float GetElapsedTime() const {
|
float GetElapsedTime() const {
|
||||||
CheckError(clWaitForEvents(1, &event_));
|
WaitForCompletion();
|
||||||
auto bytes = size_t{0};
|
auto bytes = size_t{0};
|
||||||
clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes);
|
clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes);
|
||||||
auto time_start = size_t{0};
|
auto time_start = size_t{0};
|
||||||
|
@ -95,10 +100,14 @@ class Event {
|
||||||
|
|
||||||
// Accessor to the private data-member
|
// Accessor to the private data-member
|
||||||
cl_event& operator()() { return event_; }
|
cl_event& operator()() { return event_; }
|
||||||
|
// Returns the address of the wrapped handle, for calls that take the event as a 'cl_event*'
cl_event* pointer() { return &event_; }
|
||||||
private:
|
private:
|
||||||
cl_event event_;
|
cl_event event_;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Pointer to an OpenCL event
|
||||||
|
using EventPointer = cl_event*;
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
// C++11 version of 'cl_platform_id'
|
// C++11 version of 'cl_platform_id'
|
||||||
|
@ -260,10 +269,14 @@ class Context {
|
||||||
|
|
||||||
// Accessor to the private data-member
|
// Accessor to the private data-member
|
||||||
const cl_context& operator()() const { return *context_; }
|
const cl_context& operator()() const { return *context_; }
|
||||||
|
// Returns the address of the managed 'cl_context'. Uses 'get()' rather than '&(*context_)': it
// yields the same pointer but never dereferences the shared pointer, so it stays well-defined
// (returning a null pointer) should the shared pointer ever be empty.
cl_context* pointer() const { return context_.get(); }
|
||||||
private:
|
private:
|
||||||
std::shared_ptr<cl_context> context_;
|
std::shared_ptr<cl_context> context_;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Pointer to an OpenCL context
|
||||||
|
using ContextPointer = cl_context*;
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
// Enumeration of build statuses of the run-time compilation process
|
// Enumeration of build statuses of the run-time compilation process
|
||||||
|
@ -274,7 +287,7 @@ class Program {
|
||||||
public:
|
public:
|
||||||
// Note that there is no constructor based on the regular OpenCL data-type because of extra state
|
// Note that there is no constructor based on the regular OpenCL data-type because of extra state
|
||||||
|
|
||||||
// Regular constructor with memory management
|
// Source-based constructor with memory management
|
||||||
explicit Program(const Context &context, std::string source):
|
explicit Program(const Context &context, std::string source):
|
||||||
program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }),
|
program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }),
|
||||||
length_(source.length()),
|
length_(source.length()),
|
||||||
|
@ -285,6 +298,22 @@ class Program {
|
||||||
CheckError(status);
|
CheckError(status);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Binary-based constructor with memory management
|
||||||
|
explicit Program(const Device &device, const Context &context, const std::string& binary):
|
||||||
|
program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }),
|
||||||
|
length_(binary.length()),
|
||||||
|
source_(binary),
|
||||||
|
source_ptr_(&source_[0]) {
|
||||||
|
auto status1 = CL_SUCCESS;
|
||||||
|
auto status2 = CL_SUCCESS;
|
||||||
|
const cl_device_id dev = device();
|
||||||
|
*program_ = clCreateProgramWithBinary(context(), 1, &dev, &length_,
|
||||||
|
reinterpret_cast<const unsigned char**>(&source_ptr_),
|
||||||
|
&status1, &status2);
|
||||||
|
CheckError(status1);
|
||||||
|
CheckError(status2);
|
||||||
|
}
|
||||||
|
|
||||||
// Compiles the device program and returns whether or not there were any warnings/errors
|
// Compiles the device program and returns whether or not there were any warnings/errors
|
||||||
BuildStatus Build(const Device &device, std::vector<std::string> &options) {
|
BuildStatus Build(const Device &device, std::vector<std::string> &options) {
|
||||||
auto options_string = std::accumulate(options.begin(), options.end(), std::string{" "});
|
auto options_string = std::accumulate(options.begin(), options.end(), std::string{" "});
|
||||||
|
@ -313,7 +342,7 @@ class Program {
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Retrieves an intermediate representation of the compiled program
|
// Retrieves a binary or an intermediate representation of the compiled program
|
||||||
std::string GetIR() const {
|
std::string GetIR() const {
|
||||||
auto bytes = size_t{0};
|
auto bytes = size_t{0};
|
||||||
CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bytes, nullptr));
|
CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bytes, nullptr));
|
||||||
|
@ -329,7 +358,7 @@ class Program {
|
||||||
private:
|
private:
|
||||||
std::shared_ptr<cl_program> program_;
|
std::shared_ptr<cl_program> program_;
|
||||||
size_t length_;
|
size_t length_;
|
||||||
std::string source_;
|
std::string source_; // Note: the source can also be a binary or IR
|
||||||
const char* source_ptr_;
|
const char* source_ptr_;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -468,31 +497,33 @@ class Buffer {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Copies from device to host: reading the device buffer a-synchronously
|
// Copies from device to host: reading the device buffer a-synchronously
|
||||||
void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) {
|
void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
|
||||||
if (access_ == BufferAccess::kWriteOnly) { Error("reading from a write-only buffer"); }
|
if (access_ == BufferAccess::kWriteOnly) { Error("reading from a write-only buffer"); }
|
||||||
CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
|
CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
|
||||||
host, 0, nullptr, nullptr));
|
host, 0, nullptr, nullptr));
|
||||||
}
|
}
|
||||||
void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host,
|
void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host,
|
||||||
const size_t offset = 0) {
|
const size_t offset = 0) const {
|
||||||
if (host.size() < size) { Error("target host buffer is too small"); }
|
if (host.size() < size) { Error("target host buffer is too small"); }
|
||||||
ReadAsync(queue, size, host.data(), offset);
|
ReadAsync(queue, size, host.data(), offset);
|
||||||
}
|
}
|
||||||
void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host,
|
void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host,
|
||||||
const size_t offset = 0) {
|
const size_t offset = 0) const {
|
||||||
if (host.size() < size) { Error("target host buffer is too small"); }
|
if (host.size() < size) { Error("target host buffer is too small"); }
|
||||||
ReadAsync(queue, size, host.data(), offset);
|
ReadAsync(queue, size, host.data(), offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Copies from device to host: reading the device buffer
|
// Copies from device to host: reading the device buffer
|
||||||
void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) {
|
void Read(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
|
||||||
ReadAsync(queue, size, host, offset);
|
ReadAsync(queue, size, host, offset);
|
||||||
queue.Finish();
|
queue.Finish();
|
||||||
}
|
}
|
||||||
void Read(const Queue &queue, const size_t size, std::vector<T> &host, const size_t offset = 0) {
|
void Read(const Queue &queue, const size_t size, std::vector<T> &host,
|
||||||
|
const size_t offset = 0) const {
|
||||||
Read(queue, size, host.data(), offset);
|
Read(queue, size, host.data(), offset);
|
||||||
}
|
}
|
||||||
void Read(const Queue &queue, const size_t size, BufferHost<T> &host, const size_t offset = 0) {
|
void Read(const Queue &queue, const size_t size, BufferHost<T> &host,
|
||||||
|
const size_t offset = 0) const {
|
||||||
Read(queue, size, host.data(), offset);
|
Read(queue, size, host.data(), offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -601,17 +632,37 @@ class Kernel {
|
||||||
|
|
||||||
// Launches a kernel onto the specified queue
|
// Launches a kernel onto the specified queue
|
||||||
void Launch(const Queue &queue, const std::vector<size_t> &global,
|
void Launch(const Queue &queue, const std::vector<size_t> &global,
|
||||||
const std::vector<size_t> &local, Event &event) {
|
const std::vector<size_t> &local, EventPointer event) {
|
||||||
CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
|
CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
|
||||||
nullptr, global.data(), local.data(),
|
nullptr, global.data(), local.data(),
|
||||||
0, nullptr, &(event())));
|
0, nullptr, event));
|
||||||
|
}
|
||||||
|
|
||||||
|
// As above, but with an event waiting list
|
||||||
|
void Launch(const Queue &queue, const std::vector<size_t> &global,
|
||||||
|
const std::vector<size_t> &local, EventPointer event,
|
||||||
|
std::vector<Event>& waitForEvents) {
|
||||||
|
if (waitForEvents.size() == 0) { return Launch(queue, global, local, event); }
|
||||||
|
|
||||||
|
// Builds a plain version of the events waiting list
|
||||||
|
auto waitForEventsPlain = std::vector<cl_event>();
|
||||||
|
for (auto &waitEvent : waitForEvents) {
|
||||||
|
waitForEventsPlain.push_back(waitEvent());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Launches the kernel while waiting for other events
|
||||||
|
CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
|
||||||
|
nullptr, global.data(), local.data(),
|
||||||
|
static_cast<cl_uint>(waitForEventsPlain.size()),
|
||||||
|
waitForEventsPlain.data(),
|
||||||
|
event));
|
||||||
}
|
}
|
||||||
|
|
||||||
// As above, but with the default local workgroup size
|
// As above, but with the default local workgroup size
|
||||||
void Launch(const Queue &queue, const std::vector<size_t> &global, Event &event) {
|
void Launch(const Queue &queue, const std::vector<size_t> &global, EventPointer event) {
|
||||||
CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
|
CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
|
||||||
nullptr, global.data(), nullptr,
|
nullptr, global.data(), nullptr,
|
||||||
0, nullptr, &(event())));
|
0, nullptr, event));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Accessor to the private data-member
|
// Accessor to the private data-member
|
||||||
|
|
|
@ -19,8 +19,10 @@ const Database::DatabaseEntry Database::CopySingle = {
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, "AMD", {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
|
||||||
|
{ "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
|
||||||
|
{ "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
|
||||||
{ "Tahiti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
|
{ "Tahiti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
|
||||||
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
|
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // ARM GPUs
|
{ // ARM GPUs
|
||||||
|
@ -78,6 +80,8 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, "AMD", {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
||||||
|
{ "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
|
||||||
|
{ "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
|
||||||
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
|
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
|
||||||
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
||||||
}
|
}
|
||||||
|
@ -129,6 +133,8 @@ const Database::DatabaseEntry Database::CopyDouble = {
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, "AMD", {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
||||||
|
{ "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
|
||||||
|
{ "Pitcairn", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
||||||
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
|
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
|
||||||
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
||||||
}
|
}
|
||||||
|
@ -181,8 +187,10 @@ const Database::DatabaseEntry Database::CopyComplexDouble = {
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, "AMD", {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
||||||
|
{ "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",8} } },
|
||||||
|
{ "Pitcairn", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
||||||
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
||||||
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // ARM GPUs
|
{ // ARM GPUs
|
||||||
|
|
|
@ -19,6 +19,8 @@ const Database::DatabaseEntry Database::PadSingle = {
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, "AMD", {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
|
{ "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
|
||||||
|
{ "Pitcairn", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
|
||||||
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
|
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
|
||||||
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
}
|
}
|
||||||
|
@ -78,8 +80,10 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, "AMD", {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
|
{ "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
|
||||||
|
{ "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
|
||||||
{ "Tahiti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
{ "Tahiti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // ARM GPUs
|
{ // ARM GPUs
|
||||||
|
@ -124,7 +128,7 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
kDeviceTypeAll, "default", {
|
kDeviceTypeAll, "default", {
|
||||||
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
@ -137,8 +141,10 @@ const Database::DatabaseEntry Database::PadDouble = {
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, "AMD", {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
|
{ "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
|
||||||
|
{ "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
|
||||||
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // ARM GPUs
|
{ // ARM GPUs
|
||||||
|
@ -189,6 +195,8 @@ const Database::DatabaseEntry Database::PadComplexDouble = {
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, "AMD", {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
|
{ "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
|
{ "Pitcairn", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
{ "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
{ "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,6 +19,8 @@ const Database::DatabaseEntry Database::PadtransposeSingle = {
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, "AMD", {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
|
||||||
|
{ "Hawaii", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
|
||||||
|
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
|
||||||
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
|
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
|
||||||
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
|
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
|
||||||
}
|
}
|
||||||
|
@ -78,8 +80,10 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = {
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, "AMD", {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
|
||||||
|
{ "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
|
||||||
|
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
|
||||||
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
|
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
|
||||||
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
|
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // ARM GPUs
|
{ // ARM GPUs
|
||||||
|
@ -137,8 +141,10 @@ const Database::DatabaseEntry Database::PadtransposeDouble = {
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, "AMD", {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
|
||||||
|
{ "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
|
||||||
|
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
|
||||||
{ "Tahiti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
|
{ "Tahiti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
|
||||||
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
|
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // ARM GPUs
|
{ // ARM GPUs
|
||||||
|
@ -189,6 +195,8 @@ const Database::DatabaseEntry Database::PadtransposeComplexDouble = {
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, "AMD", {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
|
||||||
|
{ "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
|
||||||
|
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
|
||||||
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
|
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
|
||||||
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
|
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,8 +19,10 @@ const Database::DatabaseEntry Database::TransposeSingle = {
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, "AMD", {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
|
||||||
|
{ "Hawaii", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
|
||||||
|
{ "Pitcairn", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
|
||||||
{ "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
|
{ "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
|
||||||
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
|
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // ARM GPUs
|
{ // ARM GPUs
|
||||||
|
@ -78,6 +80,8 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = {
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, "AMD", {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
|
||||||
|
{ "Hawaii", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
|
||||||
|
{ "Pitcairn", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
|
||||||
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
|
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
|
||||||
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
|
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
|
||||||
}
|
}
|
||||||
|
@ -131,8 +135,10 @@ const Database::DatabaseEntry Database::TransposeDouble = {
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, "AMD", {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
|
||||||
|
{ "Hawaii", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
|
||||||
|
{ "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
|
||||||
{ "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
|
{ "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
|
||||||
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
|
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // ARM GPUs
|
{ // ARM GPUs
|
||||||
|
@ -183,6 +189,8 @@ const Database::DatabaseEntry Database::TransposeComplexDouble = {
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, "AMD", {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
|
||||||
|
{ "Hawaii", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
|
||||||
|
{ "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
|
||||||
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
|
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
|
||||||
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
|
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,6 +19,8 @@ const Database::DatabaseEntry Database::XaxpySingle = {
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, "AMD", {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
|
||||||
|
{ "Hawaii", { {"VW",2}, {"WGS",64}, {"WPT",2} } },
|
||||||
|
{ "Pitcairn", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
|
||||||
{ "Tahiti", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
|
{ "Tahiti", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
|
||||||
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
|
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
|
||||||
}
|
}
|
||||||
|
@ -78,6 +80,8 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = {
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, "AMD", {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",2}, {"WGS",64}, {"WPT",8} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",2}, {"WGS",64}, {"WPT",8} } },
|
||||||
|
{ "Hawaii", { {"VW",1}, {"WGS",128}, {"WPT",2} } },
|
||||||
|
{ "Pitcairn", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
|
||||||
{ "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
|
{ "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
|
||||||
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
|
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
|
||||||
}
|
}
|
||||||
|
@ -137,6 +141,8 @@ const Database::DatabaseEntry Database::XaxpyDouble = {
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, "AMD", {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
|
||||||
|
{ "Hawaii", { {"VW",1}, {"WGS",64}, {"WPT",2} } },
|
||||||
|
{ "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
|
||||||
{ "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
|
{ "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
|
||||||
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
|
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
|
||||||
}
|
}
|
||||||
|
@ -171,12 +177,12 @@ const Database::DatabaseEntry Database::XaxpyDouble = {
|
||||||
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
|
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
|
||||||
{ "Tesla K20m", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
|
{ "Tesla K20m", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
|
||||||
{ "Tesla K40m", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
|
{ "Tesla K40m", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
|
||||||
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
|
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
kDeviceTypeAll, "default", {
|
kDeviceTypeAll, "default", {
|
||||||
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
|
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
@ -189,8 +195,10 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = {
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, "AMD", {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
|
||||||
|
{ "Hawaii", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
|
||||||
|
{ "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
|
||||||
{ "Tahiti", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
|
{ "Tahiti", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
|
||||||
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
|
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // ARM GPUs
|
{ // ARM GPUs
|
||||||
|
|
|
@ -18,54 +18,38 @@ const Database::DatabaseEntry Database::XdotSingle = {
|
||||||
"Xdot", Precision::kSingle, {
|
"Xdot", Precision::kSingle, {
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, "AMD", {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WGS2",32} } },
|
||||||
{ "Tahiti", { {"VW",1}, {"WGS1",256}, {"WGS2",256} } },
|
{ "Hawaii", { {"WGS1",256}, {"WGS2",32} } },
|
||||||
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
|
{ "Pitcairn", { {"WGS1",128}, {"WGS2",32} } },
|
||||||
}
|
{ "Tahiti", { {"WGS1",128}, {"WGS2",32} } },
|
||||||
},
|
{ "default", { {"WGS1",128}, {"WGS2",32} } },
|
||||||
{ // ARM GPUs
|
|
||||||
kDeviceTypeGPU, "ARM", {
|
|
||||||
{ "Mali-T628", { {"VW",1}, {"WGS1",128}, {"WGS2",256} } },
|
|
||||||
{ "default", { {"VW",1}, {"WGS1",128}, {"WGS2",256} } },
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Intel CPUs
|
{ // Intel CPUs
|
||||||
kDeviceTypeCPU, "Intel", {
|
kDeviceTypeCPU, "Intel", {
|
||||||
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",1024}, {"WGS2",32} } },
|
||||||
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
|
{ "default", { {"WGS1",1024}, {"WGS2",32} } },
|
||||||
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
|
|
||||||
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Intel GPUs
|
{ // Intel GPUs
|
||||||
kDeviceTypeGPU, "Intel", {
|
kDeviceTypeGPU, "Intel", {
|
||||||
{ "Iris", { {"VW",1}, {"WGS1",512}, {"WGS2",32} } },
|
{ "Iris Pro", { {"WGS1",512}, {"WGS2",64} } },
|
||||||
{ "Iris Pro", { {"VW",1}, {"WGS1",128}, {"WGS2",512} } },
|
{ "default", { {"WGS1",512}, {"WGS2",64} } },
|
||||||
{ "default", { {"VW",1}, {"WGS1",128}, {"WGS2",32} } },
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{ // Intel accelerators
|
|
||||||
kDeviceTypeAccelerator, "Intel", {
|
|
||||||
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
|
|
||||||
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // NVIDIA GPUs
|
{ // NVIDIA GPUs
|
||||||
kDeviceTypeGPU, "NVIDIA", {
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
{ "GeForce GTX 480", { {"VW",1}, {"WGS1",256}, {"WGS2",128} } },
|
{ "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } },
|
||||||
{ "GeForce GTX 680", { {"VW",1}, {"WGS1",128}, {"WGS2",128} } },
|
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",128} } },
|
||||||
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
|
{ "GeForce GTX 980", { {"WGS1",256}, {"WGS2",32} } },
|
||||||
{ "GeForce GTX 980", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
|
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
|
||||||
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
|
{ "Tesla K20m", { {"WGS1",1024}, {"WGS2",32} } },
|
||||||
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
|
{ "default", { {"WGS1",128}, {"WGS2",32} } },
|
||||||
{ "Tesla K20m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
|
|
||||||
{ "Tesla K40m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
|
|
||||||
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
kDeviceTypeAll, "default", {
|
kDeviceTypeAll, "default", {
|
||||||
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
|
{ "default", { {"WGS1",128}, {"WGS2",32} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
@ -77,54 +61,38 @@ const Database::DatabaseEntry Database::XdotComplexSingle = {
|
||||||
"Xdot", Precision::kComplexSingle, {
|
"Xdot", Precision::kComplexSingle, {
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, "AMD", {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",32} } },
|
||||||
{ "Tahiti", { {"VW",1}, {"WGS1",64}, {"WGS2",256} } },
|
{ "Hawaii", { {"WGS1",256}, {"WGS2",32} } },
|
||||||
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
|
{ "Pitcairn", { {"WGS1",256}, {"WGS2",32} } },
|
||||||
}
|
{ "Tahiti", { {"WGS1",64}, {"WGS2",32} } },
|
||||||
},
|
{ "default", { {"WGS1",64}, {"WGS2",32} } },
|
||||||
{ // ARM GPUs
|
|
||||||
kDeviceTypeGPU, "ARM", {
|
|
||||||
{ "Mali-T628", { {"VW",1}, {"WGS1",128}, {"WGS2",512} } },
|
|
||||||
{ "default", { {"VW",1}, {"WGS1",128}, {"WGS2",512} } },
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Intel CPUs
|
{ // Intel CPUs
|
||||||
kDeviceTypeCPU, "Intel", {
|
kDeviceTypeCPU, "Intel", {
|
||||||
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } },
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",1024}, {"WGS2",32} } },
|
||||||
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
|
{ "default", { {"WGS1",1024}, {"WGS2",32} } },
|
||||||
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
|
|
||||||
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Intel GPUs
|
{ // Intel GPUs
|
||||||
kDeviceTypeGPU, "Intel", {
|
kDeviceTypeGPU, "Intel", {
|
||||||
{ "Iris", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
|
{ "Iris Pro", { {"WGS1",32}, {"WGS2",32} } },
|
||||||
{ "Iris Pro", { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } },
|
{ "default", { {"WGS1",32}, {"WGS2",32} } },
|
||||||
{ "default", { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } },
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{ // Intel accelerators
|
|
||||||
kDeviceTypeAccelerator, "Intel", {
|
|
||||||
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
|
|
||||||
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // NVIDIA GPUs
|
{ // NVIDIA GPUs
|
||||||
kDeviceTypeGPU, "NVIDIA", {
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
{ "GeForce GTX 480", { {"VW",1}, {"WGS1",512}, {"WGS2",512} } },
|
{ "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } },
|
||||||
{ "GeForce GTX 680", { {"VW",1}, {"WGS1",256}, {"WGS2",32} } },
|
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } },
|
||||||
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS1",128}, {"WGS2",32} } },
|
{ "GeForce GTX 980", { {"WGS1",256}, {"WGS2",64} } },
|
||||||
{ "GeForce GTX 980", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
|
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
|
||||||
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
|
{ "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } },
|
||||||
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
|
{ "default", { {"WGS1",128}, {"WGS2",32} } },
|
||||||
{ "Tesla K20m", { {"VW",1}, {"WGS1",256}, {"WGS2",512} } },
|
|
||||||
{ "Tesla K40m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
|
|
||||||
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
kDeviceTypeAll, "default", {
|
kDeviceTypeAll, "default", {
|
||||||
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
|
{ "default", { {"WGS1",32}, {"WGS2",32} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
@ -136,47 +104,32 @@ const Database::DatabaseEntry Database::XdotDouble = {
|
||||||
"Xdot", Precision::kDouble, {
|
"Xdot", Precision::kDouble, {
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, "AMD", {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",128} } },
|
||||||
{ "Tahiti", { {"VW",1}, {"WGS1",64}, {"WGS2",256} } },
|
{ "Hawaii", { {"WGS1",256}, {"WGS2",32} } },
|
||||||
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
|
{ "Pitcairn", { {"WGS1",128}, {"WGS2",32} } },
|
||||||
}
|
{ "Tahiti", { {"WGS1",256}, {"WGS2",32} } },
|
||||||
},
|
{ "default", { {"WGS1",64}, {"WGS2",32} } },
|
||||||
{ // ARM GPUs
|
|
||||||
kDeviceTypeGPU, "ARM", {
|
|
||||||
{ "Mali-T628", { {"VW",1}, {"WGS1",64}, {"WGS2",512} } },
|
|
||||||
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",512} } },
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Intel CPUs
|
{ // Intel CPUs
|
||||||
kDeviceTypeCPU, "Intel", {
|
kDeviceTypeCPU, "Intel", {
|
||||||
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS1",512}, {"WGS2",512} } },
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",64} } },
|
||||||
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
|
{ "default", { {"WGS1",512}, {"WGS2",64} } },
|
||||||
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS1",1024}, {"WGS2",512} } },
|
|
||||||
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",512} } },
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{ // Intel accelerators
|
|
||||||
kDeviceTypeAccelerator, "Intel", {
|
|
||||||
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
|
|
||||||
{ "default", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // NVIDIA GPUs
|
{ // NVIDIA GPUs
|
||||||
kDeviceTypeGPU, "NVIDIA", {
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
{ "GeForce GTX 480", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
|
{ "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } },
|
||||||
{ "GeForce GTX 680", { {"VW",1}, {"WGS1",64}, {"WGS2",128} } },
|
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } },
|
||||||
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
|
{ "GeForce GTX 980", { {"WGS1",128}, {"WGS2",32} } },
|
||||||
{ "GeForce GTX 980", { {"VW",1}, {"WGS1",32}, {"WGS2",512} } },
|
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
|
||||||
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
|
{ "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } },
|
||||||
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS1",128}, {"WGS2",128} } },
|
{ "default", { {"WGS1",128}, {"WGS2",32} } },
|
||||||
{ "Tesla K20m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
|
|
||||||
{ "Tesla K40m", { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } },
|
|
||||||
{ "default", { {"VW",1}, {"WGS1",32}, {"WGS2",128} } },
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
kDeviceTypeAll, "default", {
|
kDeviceTypeAll, "default", {
|
||||||
{ "default", { {"VW",1}, {"WGS1",32}, {"WGS2",128} } },
|
{ "default", { {"WGS1",64}, {"WGS2",32} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
@ -188,47 +141,32 @@ const Database::DatabaseEntry Database::XdotComplexDouble = {
|
||||||
"Xdot", Precision::kComplexDouble, {
|
"Xdot", Precision::kComplexDouble, {
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, "AMD", {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",32} } },
|
||||||
{ "Tahiti", { {"VW",1}, {"WGS1",64}, {"WGS2",256} } },
|
{ "Hawaii", { {"WGS1",256}, {"WGS2",32} } },
|
||||||
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",32} } },
|
{ "Pitcairn", { {"WGS1",256}, {"WGS2",32} } },
|
||||||
}
|
{ "Tahiti", { {"WGS1",256}, {"WGS2",32} } },
|
||||||
},
|
{ "default", { {"WGS1",64}, {"WGS2",32} } },
|
||||||
{ // ARM GPUs
|
|
||||||
kDeviceTypeGPU, "ARM", {
|
|
||||||
{ "Mali-T628", { {"VW",1}, {"WGS1",32}, {"WGS2",64} } },
|
|
||||||
{ "default", { {"VW",1}, {"WGS1",32}, {"WGS2",64} } },
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Intel CPUs
|
{ // Intel CPUs
|
||||||
kDeviceTypeCPU, "Intel", {
|
kDeviceTypeCPU, "Intel", {
|
||||||
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS1",256}, {"WGS2",1024} } },
|
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",1024}, {"WGS2",32} } },
|
||||||
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
|
{ "default", { {"WGS1",1024}, {"WGS2",32} } },
|
||||||
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS1",512}, {"WGS2",1024} } },
|
|
||||||
{ "default", { {"VW",1}, {"WGS1",64}, {"WGS2",1024} } },
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{ // Intel accelerators
|
|
||||||
kDeviceTypeAccelerator, "Intel", {
|
|
||||||
{ "Intel(R) Many Integrated Core Acceleration Card", { {"VW",1}, {"WGS1",32}, {"WGS2",1024} } },
|
|
||||||
{ "default", { {"VW",1}, {"WGS1",32}, {"WGS2",1024} } },
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // NVIDIA GPUs
|
{ // NVIDIA GPUs
|
||||||
kDeviceTypeGPU, "NVIDIA", {
|
kDeviceTypeGPU, "NVIDIA", {
|
||||||
{ "GeForce GTX 480", { {"VW",1}, {"WGS1",512}, {"WGS2",512} } },
|
{ "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } },
|
||||||
{ "GeForce GTX 680", { {"VW",1}, {"WGS1",256}, {"WGS2",64} } },
|
{ "GeForce GTX 680", { {"WGS1",256}, {"WGS2",64} } },
|
||||||
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS1",32}, {"WGS2",64} } },
|
{ "GeForce GTX 980", { {"WGS1",64}, {"WGS2",32} } },
|
||||||
{ "GeForce GTX 980", { {"VW",1}, {"WGS1",32}, {"WGS2",128} } },
|
{ "GeForce GTX TITAN X", { {"WGS1",128}, {"WGS2",32} } },
|
||||||
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS1",128}, {"WGS2",512} } },
|
{ "Tesla K20m", { {"WGS1",128}, {"WGS2",32} } },
|
||||||
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS1",128}, {"WGS2",128} } },
|
{ "default", { {"WGS1",64}, {"WGS2",32} } },
|
||||||
{ "Tesla K20m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
|
|
||||||
{ "Tesla K40m", { {"VW",1}, {"WGS1",128}, {"WGS2",1024} } },
|
|
||||||
{ "default", { {"VW",1}, {"WGS1",32}, {"WGS2",64} } },
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
kDeviceTypeAll, "default", {
|
kDeviceTypeAll, "default", {
|
||||||
{ "default", { {"VW",1}, {"WGS1",32}, {"WGS2",32} } },
|
{ "default", { {"WGS1",64}, {"WGS2",32} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,8 +19,10 @@ const Database::DatabaseEntry Database::XgemmSingle = {
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, "AMD", {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",1} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",1} } },
|
||||||
|
{ "Hawaii", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } },
|
||||||
|
{ "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
||||||
{ "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
|
{ "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
|
||||||
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
|
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // ARM GPUs
|
{ // ARM GPUs
|
||||||
|
@ -60,12 +62,12 @@ const Database::DatabaseEntry Database::XgemmSingle = {
|
||||||
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } },
|
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } },
|
||||||
{ "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
|
{ "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
|
||||||
{ "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
|
{ "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
|
||||||
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
|
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
kDeviceTypeAll, "default", {
|
kDeviceTypeAll, "default", {
|
||||||
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
|
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
@ -78,8 +80,10 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, "AMD", {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
|
||||||
|
{ "Hawaii", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
||||||
|
{ "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } },
|
||||||
{ "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
|
{ "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
|
||||||
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
|
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // ARM GPUs
|
{ // ARM GPUs
|
||||||
|
@ -100,7 +104,7 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
|
||||||
kDeviceTypeGPU, "Intel", {
|
kDeviceTypeGPU, "Intel", {
|
||||||
{ "Iris", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
{ "Iris", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
||||||
{ "Iris Pro", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
|
{ "Iris Pro", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
|
||||||
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
|
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Intel accelerators
|
{ // Intel accelerators
|
||||||
|
@ -119,12 +123,12 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
|
||||||
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
|
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
|
||||||
{ "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
|
{ "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
|
||||||
{ "Tesla K40m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
{ "Tesla K40m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
||||||
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
|
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
kDeviceTypeAll, "default", {
|
kDeviceTypeAll, "default", {
|
||||||
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
|
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
@ -137,8 +141,10 @@ const Database::DatabaseEntry Database::XgemmDouble = {
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, "AMD", {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
|
||||||
|
{ "Hawaii", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
|
||||||
|
{ "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
|
||||||
{ "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
|
{ "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
|
||||||
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
|
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // ARM GPUs
|
{ // ARM GPUs
|
||||||
|
@ -171,12 +177,12 @@ const Database::DatabaseEntry Database::XgemmDouble = {
|
||||||
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
||||||
{ "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
{ "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
||||||
{ "Tesla K40m", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
|
{ "Tesla K40m", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
|
||||||
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
kDeviceTypeAll, "default", {
|
kDeviceTypeAll, "default", {
|
||||||
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
@ -189,8 +195,10 @@ const Database::DatabaseEntry Database::XgemmComplexDouble = {
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, "AMD", {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
|
||||||
|
{ "Hawaii", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
|
||||||
|
{ "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
||||||
{ "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
{ "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
||||||
{ "default", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
|
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // ARM GPUs
|
{ // ARM GPUs
|
||||||
|
@ -222,12 +230,12 @@ const Database::DatabaseEntry Database::XgemmComplexDouble = {
|
||||||
{ "GeForce GTX TITAN X", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
{ "GeForce GTX TITAN X", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
||||||
{ "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
{ "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
||||||
{ "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
{ "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
||||||
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
kDeviceTypeAll, "default", {
|
kDeviceTypeAll, "default", {
|
||||||
{ "default", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
|
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,6 +19,8 @@ const Database::DatabaseEntry Database::XgemvSingle = {
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, "AMD", {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
|
{ "Hawaii", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
|
{ "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
{ "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
{ "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
{ "default", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
{ "default", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
}
|
}
|
||||||
|
@ -71,8 +73,10 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = {
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, "AMD", {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",256}, {"WPT2",2}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",256}, {"WPT2",2}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
|
||||||
|
{ "Hawaii", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
|
{ "Pitcairn", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
|
||||||
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
{ "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // Intel CPUs
|
{ // Intel CPUs
|
||||||
|
@ -119,6 +123,8 @@ const Database::DatabaseEntry Database::XgemvDouble = {
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, "AMD", {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
|
||||||
|
{ "Hawaii", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
|
{ "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
|
||||||
{ "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
|
{ "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
|
||||||
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
}
|
}
|
||||||
|
@ -164,6 +170,8 @@ const Database::DatabaseEntry Database::XgemvComplexDouble = {
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, "AMD", {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
|
||||||
|
{ "Hawaii", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
|
{ "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
{ "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
{ "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
{ "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,8 +19,10 @@ const Database::DatabaseEntry Database::XgerSingle = {
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, "AMD", {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
|
||||||
|
{ "Hawaii", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
|
||||||
|
{ "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
|
||||||
{ "Tahiti", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
|
{ "Tahiti", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
|
||||||
{ "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
|
{ "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // ARM GPUs
|
{ // ARM GPUs
|
||||||
|
@ -65,8 +67,10 @@ const Database::DatabaseEntry Database::XgerComplexSingle = {
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, "AMD", {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
|
||||||
|
{ "Hawaii", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } },
|
||||||
|
{ "Pitcairn", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } },
|
||||||
{ "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
|
{ "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
|
||||||
{ "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
|
{ "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // ARM GPUs
|
{ // ARM GPUs
|
||||||
|
@ -111,8 +115,10 @@ const Database::DatabaseEntry Database::XgerDouble = {
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, "AMD", {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
|
||||||
|
{ "Hawaii", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
|
||||||
|
{ "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
|
||||||
{ "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
|
{ "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
|
||||||
{ "default", { {"WGS1",32}, {"WGS2",2}, {"WPT",1} } },
|
{ "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{ // ARM GPUs
|
{ // ARM GPUs
|
||||||
|
@ -138,7 +144,7 @@ const Database::DatabaseEntry Database::XgerDouble = {
|
||||||
},
|
},
|
||||||
{ // Default
|
{ // Default
|
||||||
kDeviceTypeAll, "default", {
|
kDeviceTypeAll, "default", {
|
||||||
{ "default", { {"WGS1",16}, {"WGS2",2}, {"WPT",1} } },
|
{ "default", { {"WGS1",16}, {"WGS2",1}, {"WPT",1} } },
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
@ -151,6 +157,8 @@ const Database::DatabaseEntry Database::XgerComplexDouble = {
|
||||||
{ // AMD GPUs
|
{ // AMD GPUs
|
||||||
kDeviceTypeGPU, "AMD", {
|
kDeviceTypeGPU, "AMD", {
|
||||||
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
|
{ "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
|
||||||
|
{ "Hawaii", { {"WGS1",128}, {"WGS2",1}, {"WPT",1} } },
|
||||||
|
{ "Pitcairn", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
|
||||||
{ "Tahiti", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
|
{ "Tahiti", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
|
||||||
{ "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } },
|
{ "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } },
|
||||||
}
|
}
|
||||||
|
|
34
include/internal/public_api.h
Normal file
34
include/internal/public_api.h
Normal file
|
@ -0,0 +1,34 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file provides macros to define the public API. This is needed when building a Windows DLL.
|
||||||
|
// Note: this is only used for the C++ interface, the C interface has its own definition included in
|
||||||
|
// the header file itself.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#ifndef CLBLAST_PUBLIC_API_H_
|
||||||
|
#define CLBLAST_PUBLIC_API_H_
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Exports library functions under Windows when building a DLL. See also:
|
||||||
|
// https://msdn.microsoft.com/en-us/library/a90k134d.aspx
|
||||||
|
#ifdef _WIN32
|
||||||
|
#define PUBLIC_API __declspec(dllexport)
|
||||||
|
#else
|
||||||
|
#define PUBLIC_API
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
||||||
|
|
||||||
|
// CLBLAST_PUBLIC_API_H_
|
||||||
|
#endif
|
|
@ -19,6 +19,7 @@
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
#include "internal/cache.h"
|
||||||
#include "internal/utilities.h"
|
#include "internal/utilities.h"
|
||||||
#include "internal/database.h"
|
#include "internal/database.h"
|
||||||
|
|
||||||
|
@ -30,30 +31,11 @@ template <typename T>
|
||||||
class Routine {
|
class Routine {
|
||||||
public:
|
public:
|
||||||
|
|
||||||
// The cache of compiled OpenCL programs, along with some meta-data
|
|
||||||
struct ProgramCache {
|
|
||||||
Program program;
|
|
||||||
std::string device_name;
|
|
||||||
Precision precision;
|
|
||||||
std::string routine_name_;
|
|
||||||
|
|
||||||
// Finds out whether the properties match
|
|
||||||
bool MatchInCache(const std::string &ref_device, const Precision &ref_precision,
|
|
||||||
const std::string &ref_routine) {
|
|
||||||
return (device_name == ref_device &&
|
|
||||||
precision == ref_precision &&
|
|
||||||
routine_name_ == ref_routine);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// The actual cache, implemented as a vector of the above data-type
|
|
||||||
static std::vector<ProgramCache> program_cache_;
|
|
||||||
|
|
||||||
// Helper functions which check for errors in the status code
|
// Helper functions which check for errors in the status code
|
||||||
static constexpr bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); }
|
static constexpr bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); }
|
||||||
|
|
||||||
// Base class constructor
|
// Base class constructor
|
||||||
explicit Routine(Queue &queue, Event &event, const std::string &name,
|
explicit Routine(Queue &queue, EventPointer event, const std::string &name,
|
||||||
const std::vector<std::string> &routines, const Precision precision);
|
const std::vector<std::string> &routines, const Precision precision);
|
||||||
|
|
||||||
// Set-up phase of the kernel
|
// Set-up phase of the kernel
|
||||||
|
@ -63,7 +45,12 @@ class Routine {
|
||||||
|
|
||||||
// Runs a kernel given the global and local thread sizes
|
// Runs a kernel given the global and local thread sizes
|
||||||
StatusCode RunKernel(Kernel &kernel, std::vector<size_t> &global,
|
StatusCode RunKernel(Kernel &kernel, std::vector<size_t> &global,
|
||||||
const std::vector<size_t> &local);
|
const std::vector<size_t> &local, EventPointer event,
|
||||||
|
std::vector<Event>& waitForEvents);
|
||||||
|
|
||||||
|
// As above, but without an event waiting list
|
||||||
|
StatusCode RunKernel(Kernel &kernel, std::vector<size_t> &global,
|
||||||
|
const std::vector<size_t> &local, EventPointer event);
|
||||||
|
|
||||||
// Tests for valid inputs of matrices A, B, and C
|
// Tests for valid inputs of matrices A, B, and C
|
||||||
StatusCode TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer,
|
StatusCode TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer,
|
||||||
|
@ -75,17 +62,22 @@ class Routine {
|
||||||
StatusCode TestMatrixAP(const size_t n, const Buffer<T> &buffer,
|
StatusCode TestMatrixAP(const size_t n, const Buffer<T> &buffer,
|
||||||
const size_t offset, const size_t data_size);
|
const size_t offset, const size_t data_size);
|
||||||
|
|
||||||
// Tests for valid inputs of vectors X and Y
|
// Tests for valid inputs of vectors X and Y
|
||||||
StatusCode TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset,
|
StatusCode TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset,
|
||||||
const size_t inc, const size_t data_size);
|
const size_t inc, const size_t data_size);
|
||||||
StatusCode TestVectorY(const size_t n, const Buffer<T> &buffer, const size_t offset,
|
StatusCode TestVectorY(const size_t n, const Buffer<T> &buffer, const size_t offset,
|
||||||
const size_t inc, const size_t data_size);
|
const size_t inc, const size_t data_size);
|
||||||
|
|
||||||
|
// Tests for valid inputs of other vectors
|
||||||
StatusCode TestVectorDot(const size_t n, const Buffer<T> &buffer, const size_t offset,
|
StatusCode TestVectorDot(const size_t n, const Buffer<T> &buffer, const size_t offset,
|
||||||
const size_t data_size);
|
const size_t data_size);
|
||||||
|
StatusCode TestVectorIndex(const size_t n, const Buffer<unsigned int> &buffer,
|
||||||
|
const size_t offset, const size_t data_size);
|
||||||
|
|
||||||
// Copies/transposes a matrix and pads/unpads it with zeroes. This method is also able to write
|
// Copies/transposes a matrix and pads/unpads it with zeroes. This method is also able to write
|
||||||
// to symmetric and triangular matrices through optional arguments.
|
// to symmetric and triangular matrices through optional arguments.
|
||||||
StatusCode PadCopyTransposeMatrix(const size_t src_one, const size_t src_two,
|
StatusCode PadCopyTransposeMatrix(EventPointer event, std::vector<Event>& waitForEvents,
|
||||||
|
const size_t src_one, const size_t src_two,
|
||||||
const size_t src_ld, const size_t src_offset,
|
const size_t src_ld, const size_t src_offset,
|
||||||
const Buffer<T> &src,
|
const Buffer<T> &src,
|
||||||
const size_t dest_one, const size_t dest_two,
|
const size_t dest_one, const size_t dest_two,
|
||||||
|
@ -96,11 +88,29 @@ class Routine {
|
||||||
const bool upper = false, const bool lower = false,
|
const bool upper = false, const bool lower = false,
|
||||||
const bool diagonal_imag_zero = false);
|
const bool diagonal_imag_zero = false);
|
||||||
|
|
||||||
// Queries the cache and retrieve either a matching program or a boolean whether a match exists.
|
// Stores a newly compiled binary/program into the cache
|
||||||
// The first assumes that the program is available in the cache and will throw an exception
|
void StoreBinaryToCache(const std::string& binary) const {
|
||||||
// otherwise.
|
cache::StoreBinaryToCache(binary, device_name_, precision_, routine_name_);
|
||||||
const Program& GetProgramFromCache() const;
|
}
|
||||||
bool ProgramIsInCache() const;
|
void StoreProgramToCache(const Program& program) const {
|
||||||
|
cache::StoreProgramToCache(program, context_, precision_, routine_name_);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Queries the cache and retrieves either a matching binary/program or a boolean whether a match
|
||||||
|
// exists. The first assumes that the binary/program is available in the cache and will throw an
|
||||||
|
// exception otherwise.
|
||||||
|
std::string GetBinaryFromCache() const {
|
||||||
|
return cache::GetBinaryFromCache(device_name_, precision_, routine_name_);
|
||||||
|
}
|
||||||
|
Program GetProgramFromCache() const {
|
||||||
|
return cache::GetProgramFromCache(context_, precision_, routine_name_);
|
||||||
|
}
|
||||||
|
bool BinaryIsInCache() const {
|
||||||
|
return cache::BinaryIsInCache(device_name_, precision_, routine_name_);
|
||||||
|
}
|
||||||
|
bool ProgramIsInCache() const {
|
||||||
|
return cache::ProgramIsInCache(context_, precision_, routine_name_);
|
||||||
|
}
|
||||||
|
|
||||||
// Non-static variable for the precision. Note that the same variable (but static) might exist in
|
// Non-static variable for the precision. Note that the same variable (but static) might exist in
|
||||||
// a derived class.
|
// a derived class.
|
||||||
|
@ -112,7 +122,7 @@ class Routine {
|
||||||
|
|
||||||
// The OpenCL objects, accessible only from derived classes
|
// The OpenCL objects, accessible only from derived classes
|
||||||
Queue queue_;
|
Queue queue_;
|
||||||
Event event_;
|
EventPointer event_;
|
||||||
const Context context_;
|
const Context context_;
|
||||||
const Device device_;
|
const Device device_;
|
||||||
|
|
||||||
|
|
56
include/internal/routines/level1/xamax.h
Normal file
56
include/internal/routines/level1/xamax.h
Normal file
|
@ -0,0 +1,56 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xamax routine. The precision is implemented using a template argument.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#ifndef CLBLAST_ROUTINES_XAMAX_H_
|
||||||
|
#define CLBLAST_ROUTINES_XAMAX_H_
|
||||||
|
|
||||||
|
#include "internal/routine.h"
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// See comment at top of file for a description of the class
|
||||||
|
template <typename T>
|
||||||
|
class Xamax: public Routine<T> {
|
||||||
|
public:
|
||||||
|
|
||||||
|
// Members and methods from the base class
|
||||||
|
using Routine<T>::db_;
|
||||||
|
using Routine<T>::source_string_;
|
||||||
|
using Routine<T>::queue_;
|
||||||
|
using Routine<T>::event_;
|
||||||
|
using Routine<T>::context_;
|
||||||
|
using Routine<T>::GetProgramFromCache;
|
||||||
|
using Routine<T>::TestVectorX;
|
||||||
|
using Routine<T>::TestVectorIndex;
|
||||||
|
using Routine<T>::RunKernel;
|
||||||
|
using Routine<T>::ErrorIn;
|
||||||
|
|
||||||
|
// Constructor
|
||||||
|
Xamax(Queue &queue, EventPointer event, const std::string &name = "AMAX");
|
||||||
|
|
||||||
|
// Templated-precision implementation of the routine
|
||||||
|
StatusCode DoAmax(const size_t n,
|
||||||
|
const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
|
||||||
|
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
|
||||||
|
|
||||||
|
private:
|
||||||
|
// Static variable to get the precision
|
||||||
|
const static Precision precision_;
|
||||||
|
};
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
||||||
|
|
||||||
|
// CLBLAST_ROUTINES_XAMAX_H_
|
||||||
|
#endif
|
56
include/internal/routines/level1/xasum.h
Normal file
56
include/internal/routines/level1/xasum.h
Normal file
|
@ -0,0 +1,56 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xasum routine. The precision is implemented using a template argument.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#ifndef CLBLAST_ROUTINES_XASUM_H_
|
||||||
|
#define CLBLAST_ROUTINES_XASUM_H_
|
||||||
|
|
||||||
|
#include "internal/routine.h"
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// See comment at top of file for a description of the class
|
||||||
|
template <typename T>
|
||||||
|
class Xasum: public Routine<T> {
|
||||||
|
public:
|
||||||
|
|
||||||
|
// Members and methods from the base class
|
||||||
|
using Routine<T>::db_;
|
||||||
|
using Routine<T>::source_string_;
|
||||||
|
using Routine<T>::queue_;
|
||||||
|
using Routine<T>::event_;
|
||||||
|
using Routine<T>::context_;
|
||||||
|
using Routine<T>::GetProgramFromCache;
|
||||||
|
using Routine<T>::TestVectorX;
|
||||||
|
using Routine<T>::TestVectorDot;
|
||||||
|
using Routine<T>::RunKernel;
|
||||||
|
using Routine<T>::ErrorIn;
|
||||||
|
|
||||||
|
// Constructor
|
||||||
|
Xasum(Queue &queue, EventPointer event, const std::string &name = "ASUM");
|
||||||
|
|
||||||
|
// Templated-precision implementation of the routine
|
||||||
|
StatusCode DoAsum(const size_t n,
|
||||||
|
const Buffer<T> &asum_buffer, const size_t asum_offset,
|
||||||
|
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
|
||||||
|
|
||||||
|
private:
|
||||||
|
// Static variable to get the precision
|
||||||
|
const static Precision precision_;
|
||||||
|
};
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
||||||
|
|
||||||
|
// CLBLAST_ROUTINES_XASUM_H_
|
||||||
|
#endif
|
|
@ -28,6 +28,7 @@ class Xaxpy: public Routine<T> {
|
||||||
using Routine<T>::db_;
|
using Routine<T>::db_;
|
||||||
using Routine<T>::source_string_;
|
using Routine<T>::source_string_;
|
||||||
using Routine<T>::queue_;
|
using Routine<T>::queue_;
|
||||||
|
using Routine<T>::event_;
|
||||||
using Routine<T>::GetProgramFromCache;
|
using Routine<T>::GetProgramFromCache;
|
||||||
using Routine<T>::TestVectorX;
|
using Routine<T>::TestVectorX;
|
||||||
using Routine<T>::TestVectorY;
|
using Routine<T>::TestVectorY;
|
||||||
|
@ -35,7 +36,7 @@ class Xaxpy: public Routine<T> {
|
||||||
using Routine<T>::ErrorIn;
|
using Routine<T>::ErrorIn;
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
Xaxpy(Queue &queue, Event &event, const std::string &name = "AXPY");
|
Xaxpy(Queue &queue, EventPointer event, const std::string &name = "AXPY");
|
||||||
|
|
||||||
// Templated-precision implementation of the routine
|
// Templated-precision implementation of the routine
|
||||||
StatusCode DoAxpy(const size_t n, const T alpha,
|
StatusCode DoAxpy(const size_t n, const T alpha,
|
||||||
|
|
|
@ -28,6 +28,7 @@ class Xcopy: public Routine<T> {
|
||||||
using Routine<T>::db_;
|
using Routine<T>::db_;
|
||||||
using Routine<T>::source_string_;
|
using Routine<T>::source_string_;
|
||||||
using Routine<T>::queue_;
|
using Routine<T>::queue_;
|
||||||
|
using Routine<T>::event_;
|
||||||
using Routine<T>::GetProgramFromCache;
|
using Routine<T>::GetProgramFromCache;
|
||||||
using Routine<T>::TestVectorX;
|
using Routine<T>::TestVectorX;
|
||||||
using Routine<T>::TestVectorY;
|
using Routine<T>::TestVectorY;
|
||||||
|
@ -35,7 +36,7 @@ class Xcopy: public Routine<T> {
|
||||||
using Routine<T>::ErrorIn;
|
using Routine<T>::ErrorIn;
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
Xcopy(Queue &queue, Event &event, const std::string &name = "COPY");
|
Xcopy(Queue &queue, EventPointer event, const std::string &name = "COPY");
|
||||||
|
|
||||||
// Templated-precision implementation of the routine
|
// Templated-precision implementation of the routine
|
||||||
StatusCode DoCopy(const size_t n,
|
StatusCode DoCopy(const size_t n,
|
||||||
|
|
|
@ -28,6 +28,7 @@ class Xdot: public Routine<T> {
|
||||||
using Routine<T>::db_;
|
using Routine<T>::db_;
|
||||||
using Routine<T>::source_string_;
|
using Routine<T>::source_string_;
|
||||||
using Routine<T>::queue_;
|
using Routine<T>::queue_;
|
||||||
|
using Routine<T>::event_;
|
||||||
using Routine<T>::context_;
|
using Routine<T>::context_;
|
||||||
using Routine<T>::GetProgramFromCache;
|
using Routine<T>::GetProgramFromCache;
|
||||||
using Routine<T>::TestVectorX;
|
using Routine<T>::TestVectorX;
|
||||||
|
@ -37,7 +38,7 @@ class Xdot: public Routine<T> {
|
||||||
using Routine<T>::ErrorIn;
|
using Routine<T>::ErrorIn;
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
Xdot(Queue &queue, Event &event, const std::string &name = "DOT");
|
Xdot(Queue &queue, EventPointer event, const std::string &name = "DOT");
|
||||||
|
|
||||||
// Templated-precision implementation of the routine
|
// Templated-precision implementation of the routine
|
||||||
StatusCode DoDot(const size_t n,
|
StatusCode DoDot(const size_t n,
|
||||||
|
|
|
@ -28,7 +28,7 @@ class Xdotc: public Xdot<T> {
|
||||||
using Xdot<T>::DoDot;
|
using Xdot<T>::DoDot;
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
Xdotc(Queue &queue, Event &event, const std::string &name = "DOTC");
|
Xdotc(Queue &queue, EventPointer event, const std::string &name = "DOTC");
|
||||||
|
|
||||||
// Templated-precision implementation of the routine
|
// Templated-precision implementation of the routine
|
||||||
StatusCode DoDotc(const size_t n,
|
StatusCode DoDotc(const size_t n,
|
||||||
|
|
|
@ -28,7 +28,7 @@ class Xdotu: public Xdot<T> {
|
||||||
using Xdot<T>::DoDot;
|
using Xdot<T>::DoDot;
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
Xdotu(Queue &queue, Event &event, const std::string &name = "DOTU");
|
Xdotu(Queue &queue, EventPointer event, const std::string &name = "DOTU");
|
||||||
|
|
||||||
// Templated-precision implementation of the routine
|
// Templated-precision implementation of the routine
|
||||||
StatusCode DoDotu(const size_t n,
|
StatusCode DoDotu(const size_t n,
|
||||||
|
|
49
include/internal/routines/level1/xmax.h
Normal file
49
include/internal/routines/level1/xmax.h
Normal file
|
@ -0,0 +1,49 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xmax routine. The precision is implemented using a template argument.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#ifndef CLBLAST_ROUTINES_XMAX_H_
|
||||||
|
#define CLBLAST_ROUTINES_XMAX_H_
|
||||||
|
|
||||||
|
#include "internal/routine.h"
|
||||||
|
#include "internal/routines/level1/xamax.h"
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// See comment at top of file for a description of the class
|
||||||
|
template <typename T>
|
||||||
|
class Xmax: public Xamax<T> {
|
||||||
|
public:
|
||||||
|
|
||||||
|
// Members and methods from the base class
|
||||||
|
using Xamax<T>::DoAmax;
|
||||||
|
|
||||||
|
// Constructor
|
||||||
|
Xmax(Queue &queue, EventPointer event, const std::string &name = "MAX"):
|
||||||
|
Xamax<T>(queue, event, name) {
|
||||||
|
}
|
||||||
|
|
||||||
|
// Forwards to the regular absolute version. The implementation difference is realised in the
|
||||||
|
// kernel through a pre-processor macro based on the name of the routine.
|
||||||
|
StatusCode DoMax(const size_t n,
|
||||||
|
const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
|
||||||
|
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
|
||||||
|
return DoAmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
||||||
|
|
||||||
|
// CLBLAST_ROUTINES_XMAX_H_
|
||||||
|
#endif
|
49
include/internal/routines/level1/xmin.h
Normal file
49
include/internal/routines/level1/xmin.h
Normal file
|
@ -0,0 +1,49 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xmin routine. The precision is implemented using a template argument.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#ifndef CLBLAST_ROUTINES_XMIN_H_
|
||||||
|
#define CLBLAST_ROUTINES_XMIN_H_
|
||||||
|
|
||||||
|
#include "internal/routine.h"
|
||||||
|
#include "internal/routines/level1/xamax.h"
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// See comment at top of file for a description of the class
|
||||||
|
template <typename T>
|
||||||
|
class Xmin: public Xamax<T> {
|
||||||
|
public:
|
||||||
|
|
||||||
|
// Members and methods from the base class
|
||||||
|
using Xamax<T>::DoAmax;
|
||||||
|
|
||||||
|
// Constructor
|
||||||
|
Xmin(Queue &queue, EventPointer event, const std::string &name = "MIN"):
|
||||||
|
Xamax<T>(queue, event, name) {
|
||||||
|
}
|
||||||
|
|
||||||
|
// Forwards to the regular max-absolute version. The implementation difference is realised in the
|
||||||
|
// kernel through a pre-processor macro based on the name of the routine.
|
||||||
|
StatusCode DoMin(const size_t n,
|
||||||
|
const Buffer<unsigned int> &imin_buffer, const size_t imin_offset,
|
||||||
|
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
|
||||||
|
return DoAmax(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
||||||
|
|
||||||
|
// CLBLAST_ROUTINES_XMIN_H_
|
||||||
|
#endif
|
56
include/internal/routines/level1/xnrm2.h
Normal file
56
include/internal/routines/level1/xnrm2.h
Normal file
|
@ -0,0 +1,56 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xnrm2 routine. The precision is implemented using a template argument.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#ifndef CLBLAST_ROUTINES_XNRM2_H_
|
||||||
|
#define CLBLAST_ROUTINES_XNRM2_H_
|
||||||
|
|
||||||
|
#include "internal/routine.h"
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// See comment at top of file for a description of the class
|
||||||
|
template <typename T>
|
||||||
|
class Xnrm2: public Routine<T> {
|
||||||
|
public:
|
||||||
|
|
||||||
|
// Members and methods from the base class
|
||||||
|
using Routine<T>::db_;
|
||||||
|
using Routine<T>::source_string_;
|
||||||
|
using Routine<T>::queue_;
|
||||||
|
using Routine<T>::event_;
|
||||||
|
using Routine<T>::context_;
|
||||||
|
using Routine<T>::GetProgramFromCache;
|
||||||
|
using Routine<T>::TestVectorX;
|
||||||
|
using Routine<T>::TestVectorDot;
|
||||||
|
using Routine<T>::RunKernel;
|
||||||
|
using Routine<T>::ErrorIn;
|
||||||
|
|
||||||
|
// Constructor
|
||||||
|
Xnrm2(Queue &queue, EventPointer event, const std::string &name = "NRM2");
|
||||||
|
|
||||||
|
// Templated-precision implementation of the routine
|
||||||
|
StatusCode DoNrm2(const size_t n,
|
||||||
|
const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
|
||||||
|
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
|
||||||
|
|
||||||
|
private:
|
||||||
|
// Static variable to get the precision
|
||||||
|
const static Precision precision_;
|
||||||
|
};
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
||||||
|
|
||||||
|
// CLBLAST_ROUTINES_XNRM2_H_
|
||||||
|
#endif
|
|
@ -28,13 +28,14 @@ class Xscal: public Routine<T> {
|
||||||
using Routine<T>::db_;
|
using Routine<T>::db_;
|
||||||
using Routine<T>::source_string_;
|
using Routine<T>::source_string_;
|
||||||
using Routine<T>::queue_;
|
using Routine<T>::queue_;
|
||||||
|
using Routine<T>::event_;
|
||||||
using Routine<T>::GetProgramFromCache;
|
using Routine<T>::GetProgramFromCache;
|
||||||
using Routine<T>::TestVectorX;
|
using Routine<T>::TestVectorX;
|
||||||
using Routine<T>::RunKernel;
|
using Routine<T>::RunKernel;
|
||||||
using Routine<T>::ErrorIn;
|
using Routine<T>::ErrorIn;
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
Xscal(Queue &queue, Event &event, const std::string &name = "SCAL");
|
Xscal(Queue &queue, EventPointer event, const std::string &name = "SCAL");
|
||||||
|
|
||||||
// Templated-precision implementation of the routine
|
// Templated-precision implementation of the routine
|
||||||
StatusCode DoScal(const size_t n, const T alpha,
|
StatusCode DoScal(const size_t n, const T alpha,
|
||||||
|
|
49
include/internal/routines/level1/xsum.h
Normal file
49
include/internal/routines/level1/xsum.h
Normal file
|
@ -0,0 +1,49 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xsum routine. The precision is implemented using a template argument.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#ifndef CLBLAST_ROUTINES_XSUM_H_
|
||||||
|
#define CLBLAST_ROUTINES_XSUM_H_
|
||||||
|
|
||||||
|
#include "internal/routine.h"
|
||||||
|
#include "internal/routines/level1/xasum.h"
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// See comment at top of file for a description of the class
|
||||||
|
template <typename T>
|
||||||
|
class Xsum: public Xasum<T> {
|
||||||
|
public:
|
||||||
|
|
||||||
|
// Members and methods from the base class
|
||||||
|
using Xasum<T>::DoAsum;
|
||||||
|
|
||||||
|
// Constructor
|
||||||
|
Xsum(Queue &queue, EventPointer event, const std::string &name = "SUM"):
|
||||||
|
Xasum<T>(queue, event, name) {
|
||||||
|
}
|
||||||
|
|
||||||
|
// Forwards to the regular absolute version. The implementation difference is realised in the
|
||||||
|
// kernel through a pre-processor macro based on the name of the routine.
|
||||||
|
StatusCode DoSum(const size_t n,
|
||||||
|
const Buffer<T> &sum_buffer, const size_t sum_offset,
|
||||||
|
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
|
||||||
|
return DoAsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
||||||
|
|
||||||
|
// CLBLAST_ROUTINES_XSUM_H_
|
||||||
|
#endif
|
|
@ -28,6 +28,7 @@ class Xswap: public Routine<T> {
|
||||||
using Routine<T>::db_;
|
using Routine<T>::db_;
|
||||||
using Routine<T>::source_string_;
|
using Routine<T>::source_string_;
|
||||||
using Routine<T>::queue_;
|
using Routine<T>::queue_;
|
||||||
|
using Routine<T>::event_;
|
||||||
using Routine<T>::GetProgramFromCache;
|
using Routine<T>::GetProgramFromCache;
|
||||||
using Routine<T>::TestVectorX;
|
using Routine<T>::TestVectorX;
|
||||||
using Routine<T>::TestVectorY;
|
using Routine<T>::TestVectorY;
|
||||||
|
@ -35,7 +36,7 @@ class Xswap: public Routine<T> {
|
||||||
using Routine<T>::ErrorIn;
|
using Routine<T>::ErrorIn;
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
Xswap(Queue &queue, Event &event, const std::string &name = "SWAP");
|
Xswap(Queue &queue, EventPointer event, const std::string &name = "SWAP");
|
||||||
|
|
||||||
// Templated-precision implementation of the routine
|
// Templated-precision implementation of the routine
|
||||||
StatusCode DoSwap(const size_t n,
|
StatusCode DoSwap(const size_t n,
|
||||||
|
|
|
@ -30,7 +30,7 @@ class Xgbmv: public Xgemv<T> {
|
||||||
using Xgemv<T>::MatVec;
|
using Xgemv<T>::MatVec;
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
Xgbmv(Queue &queue, Event &event, const std::string &name = "GBMV");
|
Xgbmv(Queue &queue, EventPointer event, const std::string &name = "GBMV");
|
||||||
|
|
||||||
// Templated-precision implementation of the routine
|
// Templated-precision implementation of the routine
|
||||||
StatusCode DoGbmv(const Layout layout, const Transpose a_transpose,
|
StatusCode DoGbmv(const Layout layout, const Transpose a_transpose,
|
||||||
|
|
|
@ -28,6 +28,7 @@ class Xgemv: public Routine<T> {
|
||||||
using Routine<T>::db_;
|
using Routine<T>::db_;
|
||||||
using Routine<T>::source_string_;
|
using Routine<T>::source_string_;
|
||||||
using Routine<T>::queue_;
|
using Routine<T>::queue_;
|
||||||
|
using Routine<T>::event_;
|
||||||
using Routine<T>::GetProgramFromCache;
|
using Routine<T>::GetProgramFromCache;
|
||||||
using Routine<T>::TestVectorX;
|
using Routine<T>::TestVectorX;
|
||||||
using Routine<T>::TestVectorY;
|
using Routine<T>::TestVectorY;
|
||||||
|
@ -37,7 +38,7 @@ class Xgemv: public Routine<T> {
|
||||||
using Routine<T>::ErrorIn;
|
using Routine<T>::ErrorIn;
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
Xgemv(Queue &queue, Event &event, const std::string &name = "GEMV");
|
Xgemv(Queue &queue, EventPointer event, const std::string &name = "GEMV");
|
||||||
|
|
||||||
// Templated-precision implementation of the routine
|
// Templated-precision implementation of the routine
|
||||||
StatusCode DoGemv(const Layout layout, const Transpose a_transpose,
|
StatusCode DoGemv(const Layout layout, const Transpose a_transpose,
|
||||||
|
|
|
@ -28,6 +28,7 @@ class Xger: public Routine<T> {
|
||||||
using Routine<T>::db_;
|
using Routine<T>::db_;
|
||||||
using Routine<T>::source_string_;
|
using Routine<T>::source_string_;
|
||||||
using Routine<T>::queue_;
|
using Routine<T>::queue_;
|
||||||
|
using Routine<T>::event_;
|
||||||
using Routine<T>::GetProgramFromCache;
|
using Routine<T>::GetProgramFromCache;
|
||||||
using Routine<T>::TestVectorX;
|
using Routine<T>::TestVectorX;
|
||||||
using Routine<T>::TestVectorY;
|
using Routine<T>::TestVectorY;
|
||||||
|
@ -36,7 +37,7 @@ class Xger: public Routine<T> {
|
||||||
using Routine<T>::ErrorIn;
|
using Routine<T>::ErrorIn;
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
Xger(Queue &queue, Event &event, const std::string &name = "GER");
|
Xger(Queue &queue, EventPointer event, const std::string &name = "GER");
|
||||||
|
|
||||||
// Templated-precision implementation of the routine
|
// Templated-precision implementation of the routine
|
||||||
StatusCode DoGer(const Layout layout,
|
StatusCode DoGer(const Layout layout,
|
||||||
|
|
|
@ -28,7 +28,7 @@ class Xgerc: public Xger<T> {
|
||||||
using Xger<T>::DoGer;
|
using Xger<T>::DoGer;
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
Xgerc(Queue &queue, Event &event, const std::string &name = "GERC");
|
Xgerc(Queue &queue, EventPointer event, const std::string &name = "GERC");
|
||||||
|
|
||||||
// Templated-precision implementation of the routine
|
// Templated-precision implementation of the routine
|
||||||
StatusCode DoGerc(const Layout layout,
|
StatusCode DoGerc(const Layout layout,
|
||||||
|
|
|
@ -28,7 +28,7 @@ class Xgeru: public Xger<T> {
|
||||||
using Xger<T>::DoGer;
|
using Xger<T>::DoGer;
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
Xgeru(Queue &queue, Event &event, const std::string &name = "GERU");
|
Xgeru(Queue &queue, EventPointer event, const std::string &name = "GERU");
|
||||||
|
|
||||||
// Templated-precision implementation of the routine
|
// Templated-precision implementation of the routine
|
||||||
StatusCode DoGeru(const Layout layout,
|
StatusCode DoGeru(const Layout layout,
|
||||||
|
|
|
@ -30,7 +30,7 @@ class Xhbmv: public Xgemv<T> {
|
||||||
using Xgemv<T>::MatVec;
|
using Xgemv<T>::MatVec;
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
Xhbmv(Queue &queue, Event &event, const std::string &name = "HBMV");
|
Xhbmv(Queue &queue, EventPointer event, const std::string &name = "HBMV");
|
||||||
|
|
||||||
// Templated-precision implementation of the routine
|
// Templated-precision implementation of the routine
|
||||||
StatusCode DoHbmv(const Layout layout, const Triangle triangle,
|
StatusCode DoHbmv(const Layout layout, const Triangle triangle,
|
||||||
|
|
|
@ -30,7 +30,7 @@ class Xhemv: public Xgemv<T> {
|
||||||
using Xgemv<T>::MatVec;
|
using Xgemv<T>::MatVec;
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
Xhemv(Queue &queue, Event &event, const std::string &name = "HEMV");
|
Xhemv(Queue &queue, EventPointer event, const std::string &name = "HEMV");
|
||||||
|
|
||||||
// Templated-precision implementation of the routine
|
// Templated-precision implementation of the routine
|
||||||
StatusCode DoHemv(const Layout layout, const Triangle triangle,
|
StatusCode DoHemv(const Layout layout, const Triangle triangle,
|
||||||
|
|
|
@ -28,6 +28,7 @@ class Xher: public Routine<T> {
|
||||||
using Routine<T>::db_;
|
using Routine<T>::db_;
|
||||||
using Routine<T>::source_string_;
|
using Routine<T>::source_string_;
|
||||||
using Routine<T>::queue_;
|
using Routine<T>::queue_;
|
||||||
|
using Routine<T>::event_;
|
||||||
using Routine<T>::GetProgramFromCache;
|
using Routine<T>::GetProgramFromCache;
|
||||||
using Routine<T>::TestVectorX;
|
using Routine<T>::TestVectorX;
|
||||||
using Routine<T>::TestMatrixA;
|
using Routine<T>::TestMatrixA;
|
||||||
|
@ -36,7 +37,7 @@ class Xher: public Routine<T> {
|
||||||
using Routine<T>::ErrorIn;
|
using Routine<T>::ErrorIn;
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
Xher(Queue &queue, Event &event, const std::string &name = "HER");
|
Xher(Queue &queue, EventPointer event, const std::string &name = "HER");
|
||||||
|
|
||||||
// Translates alpha of type 'U' into type 'T'
|
// Translates alpha of type 'U' into type 'T'
|
||||||
T GetAlpha(const U alpha);
|
T GetAlpha(const U alpha);
|
||||||
|
|
|
@ -28,6 +28,7 @@ class Xher2: public Routine<T> {
|
||||||
using Routine<T>::db_;
|
using Routine<T>::db_;
|
||||||
using Routine<T>::source_string_;
|
using Routine<T>::source_string_;
|
||||||
using Routine<T>::queue_;
|
using Routine<T>::queue_;
|
||||||
|
using Routine<T>::event_;
|
||||||
using Routine<T>::GetProgramFromCache;
|
using Routine<T>::GetProgramFromCache;
|
||||||
using Routine<T>::TestVectorX;
|
using Routine<T>::TestVectorX;
|
||||||
using Routine<T>::TestVectorY;
|
using Routine<T>::TestVectorY;
|
||||||
|
@ -37,7 +38,7 @@ class Xher2: public Routine<T> {
|
||||||
using Routine<T>::ErrorIn;
|
using Routine<T>::ErrorIn;
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
Xher2(Queue &queue, Event &event, const std::string &name = "HER2");
|
Xher2(Queue &queue, EventPointer event, const std::string &name = "HER2");
|
||||||
|
|
||||||
// Templated-precision implementation of the routine
|
// Templated-precision implementation of the routine
|
||||||
StatusCode DoHer2(const Layout layout, const Triangle triangle,
|
StatusCode DoHer2(const Layout layout, const Triangle triangle,
|
||||||
|
|
|
@ -30,7 +30,7 @@ class Xhpmv: public Xgemv<T> {
|
||||||
using Xgemv<T>::MatVec;
|
using Xgemv<T>::MatVec;
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
Xhpmv(Queue &queue, Event &event, const std::string &name = "HPMV");
|
Xhpmv(Queue &queue, EventPointer event, const std::string &name = "HPMV");
|
||||||
|
|
||||||
// Templated-precision implementation of the routine
|
// Templated-precision implementation of the routine
|
||||||
StatusCode DoHpmv(const Layout layout, const Triangle triangle,
|
StatusCode DoHpmv(const Layout layout, const Triangle triangle,
|
||||||
|
|
|
@ -28,7 +28,7 @@ class Xhpr: public Xher<T,U> {
|
||||||
using Xher<T,U>::DoHer;
|
using Xher<T,U>::DoHer;
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
Xhpr(Queue &queue, Event &event, const std::string &name = "HPR");
|
Xhpr(Queue &queue, EventPointer event, const std::string &name = "HPR");
|
||||||
|
|
||||||
// Templated-precision implementation of the routine
|
// Templated-precision implementation of the routine
|
||||||
StatusCode DoHpr(const Layout layout, const Triangle triangle,
|
StatusCode DoHpr(const Layout layout, const Triangle triangle,
|
||||||
|
|
|
@ -28,7 +28,7 @@ class Xhpr2: public Xher2<T> {
|
||||||
using Xher2<T>::DoHer2;
|
using Xher2<T>::DoHer2;
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
Xhpr2(Queue &queue, Event &event, const std::string &name = "HPR2");
|
Xhpr2(Queue &queue, EventPointer event, const std::string &name = "HPR2");
|
||||||
|
|
||||||
// Templated-precision implementation of the routine
|
// Templated-precision implementation of the routine
|
||||||
StatusCode DoHpr2(const Layout layout, const Triangle triangle,
|
StatusCode DoHpr2(const Layout layout, const Triangle triangle,
|
||||||
|
|
|
@ -30,7 +30,7 @@ class Xsbmv: public Xgemv<T> {
|
||||||
using Xgemv<T>::MatVec;
|
using Xgemv<T>::MatVec;
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
Xsbmv(Queue &queue, Event &event, const std::string &name = "SBMV");
|
Xsbmv(Queue &queue, EventPointer event, const std::string &name = "SBMV");
|
||||||
|
|
||||||
// Templated-precision implementation of the routine
|
// Templated-precision implementation of the routine
|
||||||
StatusCode DoSbmv(const Layout layout, const Triangle triangle,
|
StatusCode DoSbmv(const Layout layout, const Triangle triangle,
|
||||||
|
|
|
@ -30,7 +30,7 @@ class Xspmv: public Xgemv<T> {
|
||||||
using Xgemv<T>::MatVec;
|
using Xgemv<T>::MatVec;
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
Xspmv(Queue &queue, Event &event, const std::string &name = "SPMV");
|
Xspmv(Queue &queue, EventPointer event, const std::string &name = "SPMV");
|
||||||
|
|
||||||
// Templated-precision implementation of the routine
|
// Templated-precision implementation of the routine
|
||||||
StatusCode DoSpmv(const Layout layout, const Triangle triangle,
|
StatusCode DoSpmv(const Layout layout, const Triangle triangle,
|
||||||
|
|
|
@ -28,7 +28,7 @@ class Xspr: public Xher<T,T> {
|
||||||
using Xher<T,T>::DoHer;
|
using Xher<T,T>::DoHer;
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
Xspr(Queue &queue, Event &event, const std::string &name = "SPR");
|
Xspr(Queue &queue, EventPointer event, const std::string &name = "SPR");
|
||||||
|
|
||||||
// Templated-precision implementation of the routine
|
// Templated-precision implementation of the routine
|
||||||
StatusCode DoSpr(const Layout layout, const Triangle triangle,
|
StatusCode DoSpr(const Layout layout, const Triangle triangle,
|
||||||
|
|
|
@ -28,7 +28,7 @@ class Xspr2: public Xher2<T> {
|
||||||
using Xher2<T>::DoHer2;
|
using Xher2<T>::DoHer2;
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
Xspr2(Queue &queue, Event &event, const std::string &name = "SPR2");
|
Xspr2(Queue &queue, EventPointer event, const std::string &name = "SPR2");
|
||||||
|
|
||||||
// Templated-precision implementation of the routine
|
// Templated-precision implementation of the routine
|
||||||
StatusCode DoSpr2(const Layout layout, const Triangle triangle,
|
StatusCode DoSpr2(const Layout layout, const Triangle triangle,
|
||||||
|
|
|
@ -30,7 +30,7 @@ class Xsymv: public Xgemv<T> {
|
||||||
using Xgemv<T>::MatVec;
|
using Xgemv<T>::MatVec;
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
Xsymv(Queue &queue, Event &event, const std::string &name = "SYMV");
|
Xsymv(Queue &queue, EventPointer event, const std::string &name = "SYMV");
|
||||||
|
|
||||||
// Templated-precision implementation of the routine
|
// Templated-precision implementation of the routine
|
||||||
StatusCode DoSymv(const Layout layout, const Triangle triangle,
|
StatusCode DoSymv(const Layout layout, const Triangle triangle,
|
||||||
|
|
|
@ -28,7 +28,7 @@ class Xsyr: public Xher<T,T> {
|
||||||
using Xher<T,T>::DoHer;
|
using Xher<T,T>::DoHer;
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
Xsyr(Queue &queue, Event &event, const std::string &name = "SYR");
|
Xsyr(Queue &queue, EventPointer event, const std::string &name = "SYR");
|
||||||
|
|
||||||
// Templated-precision implementation of the routine
|
// Templated-precision implementation of the routine
|
||||||
StatusCode DoSyr(const Layout layout, const Triangle triangle,
|
StatusCode DoSyr(const Layout layout, const Triangle triangle,
|
||||||
|
|
|
@ -28,7 +28,7 @@ class Xsyr2: public Xher2<T> {
|
||||||
using Xher2<T>::DoHer2;
|
using Xher2<T>::DoHer2;
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
Xsyr2(Queue &queue, Event &event, const std::string &name = "SYR2");
|
Xsyr2(Queue &queue, EventPointer event, const std::string &name = "SYR2");
|
||||||
|
|
||||||
// Templated-precision implementation of the routine
|
// Templated-precision implementation of the routine
|
||||||
StatusCode DoSyr2(const Layout layout, const Triangle triangle,
|
StatusCode DoSyr2(const Layout layout, const Triangle triangle,
|
||||||
|
|
|
@ -34,7 +34,7 @@ class Xtbmv: public Xgemv<T> {
|
||||||
using Xgemv<T>::MatVec;
|
using Xgemv<T>::MatVec;
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
Xtbmv(Queue &queue, Event &event, const std::string &name = "TBMV");
|
Xtbmv(Queue &queue, EventPointer event, const std::string &name = "TBMV");
|
||||||
|
|
||||||
// Templated-precision implementation of the routine
|
// Templated-precision implementation of the routine
|
||||||
StatusCode DoTbmv(const Layout layout, const Triangle triangle,
|
StatusCode DoTbmv(const Layout layout, const Triangle triangle,
|
||||||
|
|
|
@ -34,7 +34,7 @@ class Xtpmv: public Xgemv<T> {
|
||||||
using Xgemv<T>::MatVec;
|
using Xgemv<T>::MatVec;
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
Xtpmv(Queue &queue, Event &event, const std::string &name = "TPMV");
|
Xtpmv(Queue &queue, EventPointer event, const std::string &name = "TPMV");
|
||||||
|
|
||||||
// Templated-precision implementation of the routine
|
// Templated-precision implementation of the routine
|
||||||
StatusCode DoTpmv(const Layout layout, const Triangle triangle,
|
StatusCode DoTpmv(const Layout layout, const Triangle triangle,
|
||||||
|
|
|
@ -34,7 +34,7 @@ class Xtrmv: public Xgemv<T> {
|
||||||
using Xgemv<T>::MatVec;
|
using Xgemv<T>::MatVec;
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
Xtrmv(Queue &queue, Event &event, const std::string &name = "TRMV");
|
Xtrmv(Queue &queue, EventPointer event, const std::string &name = "TRMV");
|
||||||
|
|
||||||
// Templated-precision implementation of the routine
|
// Templated-precision implementation of the routine
|
||||||
StatusCode DoTrmv(const Layout layout, const Triangle triangle,
|
StatusCode DoTrmv(const Layout layout, const Triangle triangle,
|
||||||
|
|
|
@ -28,6 +28,7 @@ class Xgemm: public Routine<T> {
|
||||||
using Routine<T>::db_;
|
using Routine<T>::db_;
|
||||||
using Routine<T>::source_string_;
|
using Routine<T>::source_string_;
|
||||||
using Routine<T>::queue_;
|
using Routine<T>::queue_;
|
||||||
|
using Routine<T>::event_;
|
||||||
using Routine<T>::context_;
|
using Routine<T>::context_;
|
||||||
using Routine<T>::GetProgramFromCache;
|
using Routine<T>::GetProgramFromCache;
|
||||||
using Routine<T>::PadCopyTransposeMatrix;
|
using Routine<T>::PadCopyTransposeMatrix;
|
||||||
|
@ -38,7 +39,7 @@ class Xgemm: public Routine<T> {
|
||||||
using Routine<T>::ErrorIn;
|
using Routine<T>::ErrorIn;
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
Xgemm(Queue &queue, Event &event, const std::string &name = "GEMM");
|
Xgemm(Queue &queue, EventPointer event, const std::string &name = "GEMM");
|
||||||
|
|
||||||
// Templated-precision implementation of the routine
|
// Templated-precision implementation of the routine
|
||||||
StatusCode DoGemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
|
StatusCode DoGemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
|
||||||
|
|
|
@ -37,7 +37,7 @@ class Xhemm: public Xgemm<T> {
|
||||||
using Xgemm<T>::DoGemm;
|
using Xgemm<T>::DoGemm;
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
Xhemm(Queue &queue, Event &event, const std::string &name = "HEMM");
|
Xhemm(Queue &queue, EventPointer event, const std::string &name = "HEMM");
|
||||||
|
|
||||||
// Templated-precision implementation of the routine
|
// Templated-precision implementation of the routine
|
||||||
StatusCode DoHemm(const Layout layout, const Side side, const Triangle triangle,
|
StatusCode DoHemm(const Layout layout, const Side side, const Triangle triangle,
|
||||||
|
|
|
@ -30,6 +30,7 @@ class Xher2k: public Routine<T> {
|
||||||
using Routine<T>::db_;
|
using Routine<T>::db_;
|
||||||
using Routine<T>::source_string_;
|
using Routine<T>::source_string_;
|
||||||
using Routine<T>::queue_;
|
using Routine<T>::queue_;
|
||||||
|
using Routine<T>::event_;
|
||||||
using Routine<T>::context_;
|
using Routine<T>::context_;
|
||||||
using Routine<T>::GetProgramFromCache;
|
using Routine<T>::GetProgramFromCache;
|
||||||
using Routine<T>::PadCopyTransposeMatrix;
|
using Routine<T>::PadCopyTransposeMatrix;
|
||||||
|
@ -40,7 +41,7 @@ class Xher2k: public Routine<T> {
|
||||||
using Routine<T>::ErrorIn;
|
using Routine<T>::ErrorIn;
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
Xher2k(Queue &queue, Event &event, const std::string &name = "HER2K");
|
Xher2k(Queue &queue, EventPointer event, const std::string &name = "HER2K");
|
||||||
|
|
||||||
// Templated-precision implementation of the routine
|
// Templated-precision implementation of the routine
|
||||||
StatusCode DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
|
StatusCode DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
|
||||||
|
|
|
@ -30,6 +30,7 @@ class Xherk: public Routine<T> {
|
||||||
using Routine<T>::db_;
|
using Routine<T>::db_;
|
||||||
using Routine<T>::source_string_;
|
using Routine<T>::source_string_;
|
||||||
using Routine<T>::queue_;
|
using Routine<T>::queue_;
|
||||||
|
using Routine<T>::event_;
|
||||||
using Routine<T>::context_;
|
using Routine<T>::context_;
|
||||||
using Routine<T>::GetProgramFromCache;
|
using Routine<T>::GetProgramFromCache;
|
||||||
using Routine<T>::PadCopyTransposeMatrix;
|
using Routine<T>::PadCopyTransposeMatrix;
|
||||||
|
@ -39,7 +40,7 @@ class Xherk: public Routine<T> {
|
||||||
using Routine<T>::ErrorIn;
|
using Routine<T>::ErrorIn;
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
Xherk(Queue &queue, Event &event, const std::string &name = "HERK");
|
Xherk(Queue &queue, EventPointer event, const std::string &name = "HERK");
|
||||||
|
|
||||||
// Templated-precision implementation of the routine
|
// Templated-precision implementation of the routine
|
||||||
StatusCode DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
|
StatusCode DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
|
||||||
|
|
|
@ -39,7 +39,7 @@ class Xsymm: public Xgemm<T> {
|
||||||
using Xgemm<T>::DoGemm;
|
using Xgemm<T>::DoGemm;
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
Xsymm(Queue &queue, Event &event, const std::string &name = "SYMM");
|
Xsymm(Queue &queue, EventPointer event, const std::string &name = "SYMM");
|
||||||
|
|
||||||
// Templated-precision implementation of the routine
|
// Templated-precision implementation of the routine
|
||||||
StatusCode DoSymm(const Layout layout, const Side side, const Triangle triangle,
|
StatusCode DoSymm(const Layout layout, const Side side, const Triangle triangle,
|
||||||
|
|
|
@ -30,6 +30,7 @@ class Xsyr2k: public Routine<T> {
|
||||||
using Routine<T>::db_;
|
using Routine<T>::db_;
|
||||||
using Routine<T>::source_string_;
|
using Routine<T>::source_string_;
|
||||||
using Routine<T>::queue_;
|
using Routine<T>::queue_;
|
||||||
|
using Routine<T>::event_;
|
||||||
using Routine<T>::context_;
|
using Routine<T>::context_;
|
||||||
using Routine<T>::GetProgramFromCache;
|
using Routine<T>::GetProgramFromCache;
|
||||||
using Routine<T>::PadCopyTransposeMatrix;
|
using Routine<T>::PadCopyTransposeMatrix;
|
||||||
|
@ -40,7 +41,7 @@ class Xsyr2k: public Routine<T> {
|
||||||
using Routine<T>::ErrorIn;
|
using Routine<T>::ErrorIn;
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
Xsyr2k(Queue &queue, Event &event, const std::string &name = "SYR2K");
|
Xsyr2k(Queue &queue, EventPointer event, const std::string &name = "SYR2K");
|
||||||
|
|
||||||
// Templated-precision implementation of the routine
|
// Templated-precision implementation of the routine
|
||||||
StatusCode DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
|
StatusCode DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
|
||||||
|
|
|
@ -32,6 +32,7 @@ class Xsyrk: public Routine<T> {
|
||||||
using Routine<T>::db_;
|
using Routine<T>::db_;
|
||||||
using Routine<T>::source_string_;
|
using Routine<T>::source_string_;
|
||||||
using Routine<T>::queue_;
|
using Routine<T>::queue_;
|
||||||
|
using Routine<T>::event_;
|
||||||
using Routine<T>::context_;
|
using Routine<T>::context_;
|
||||||
using Routine<T>::GetProgramFromCache;
|
using Routine<T>::GetProgramFromCache;
|
||||||
using Routine<T>::PadCopyTransposeMatrix;
|
using Routine<T>::PadCopyTransposeMatrix;
|
||||||
|
@ -41,7 +42,7 @@ class Xsyrk: public Routine<T> {
|
||||||
using Routine<T>::ErrorIn;
|
using Routine<T>::ErrorIn;
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
Xsyrk(Queue &queue, Event &event, const std::string &name = "SYRK");
|
Xsyrk(Queue &queue, EventPointer event, const std::string &name = "SYRK");
|
||||||
|
|
||||||
// Templated-precision implementation of the routine
|
// Templated-precision implementation of the routine
|
||||||
StatusCode DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
|
StatusCode DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
|
||||||
|
|
|
@ -38,7 +38,7 @@ class Xtrmm: public Xgemm<T> {
|
||||||
using Xgemm<T>::DoGemm;
|
using Xgemm<T>::DoGemm;
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
Xtrmm(Queue &queue, Event &event, const std::string &name = "TRMM");
|
Xtrmm(Queue &queue, EventPointer event, const std::string &name = "TRMM");
|
||||||
|
|
||||||
// Templated-precision implementation of the routine
|
// Templated-precision implementation of the routine
|
||||||
StatusCode DoTrmm(const Layout layout, const Side side, const Triangle triangle,
|
StatusCode DoTrmm(const Layout layout, const Side side, const Triangle triangle,
|
||||||
|
|
|
@ -35,6 +35,9 @@ using double2 = std::complex<double>;
|
||||||
const std::string kKhronosHalfPrecision = "cl_khr_fp16";
|
const std::string kKhronosHalfPrecision = "cl_khr_fp16";
|
||||||
const std::string kKhronosDoublePrecision = "cl_khr_fp64";
|
const std::string kKhronosDoublePrecision = "cl_khr_fp64";
|
||||||
|
|
||||||
|
// Caught an unknown error
|
||||||
|
constexpr auto kUnknownError = -999;
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
// The routine-specific arguments in string form
|
// The routine-specific arguments in string form
|
||||||
|
@ -61,6 +64,9 @@ constexpr auto kArgBOffset = "offb";
|
||||||
constexpr auto kArgCOffset = "offc";
|
constexpr auto kArgCOffset = "offc";
|
||||||
constexpr auto kArgAPOffset = "offap";
|
constexpr auto kArgAPOffset = "offap";
|
||||||
constexpr auto kArgDotOffset = "offdot";
|
constexpr auto kArgDotOffset = "offdot";
|
||||||
|
constexpr auto kArgNrm2Offset = "offnrm2";
|
||||||
|
constexpr auto kArgAsumOffset = "offasum";
|
||||||
|
constexpr auto kArgImaxOffset = "offimax";
|
||||||
constexpr auto kArgAlpha = "alpha";
|
constexpr auto kArgAlpha = "alpha";
|
||||||
constexpr auto kArgBeta = "beta";
|
constexpr auto kArgBeta = "beta";
|
||||||
|
|
||||||
|
@ -69,12 +75,14 @@ constexpr auto kArgFraction = "fraction";
|
||||||
|
|
||||||
// The client-specific arguments in string form
|
// The client-specific arguments in string form
|
||||||
constexpr auto kArgCompareclblas = "clblas";
|
constexpr auto kArgCompareclblas = "clblas";
|
||||||
|
constexpr auto kArgComparecblas = "cblas";
|
||||||
constexpr auto kArgStepSize = "step";
|
constexpr auto kArgStepSize = "step";
|
||||||
constexpr auto kArgNumSteps = "num_steps";
|
constexpr auto kArgNumSteps = "num_steps";
|
||||||
constexpr auto kArgNumRuns = "runs";
|
constexpr auto kArgNumRuns = "runs";
|
||||||
|
|
||||||
// The client-specific arguments in string form
|
// The tester-specific arguments in string form
|
||||||
constexpr auto kArgFullTest = "full_test";
|
constexpr auto kArgFullTest = "full_test";
|
||||||
|
constexpr auto kArgVerbose = "verbose";
|
||||||
|
|
||||||
// The common arguments in string form
|
// The common arguments in string form
|
||||||
constexpr auto kArgPlatform = "platform";
|
constexpr auto kArgPlatform = "platform";
|
||||||
|
@ -113,6 +121,9 @@ struct Arguments {
|
||||||
size_t c_offset = 0;
|
size_t c_offset = 0;
|
||||||
size_t ap_offset = 0;
|
size_t ap_offset = 0;
|
||||||
size_t dot_offset = 0;
|
size_t dot_offset = 0;
|
||||||
|
size_t nrm2_offset = 0;
|
||||||
|
size_t asum_offset = 0;
|
||||||
|
size_t imax_offset = 0;
|
||||||
T alpha = T{1.0};
|
T alpha = T{1.0};
|
||||||
T beta = T{1.0};
|
T beta = T{1.0};
|
||||||
size_t x_size = 1;
|
size_t x_size = 1;
|
||||||
|
@ -121,16 +132,15 @@ struct Arguments {
|
||||||
size_t b_size = 1;
|
size_t b_size = 1;
|
||||||
size_t c_size = 1;
|
size_t c_size = 1;
|
||||||
size_t ap_size = 1;
|
size_t ap_size = 1;
|
||||||
size_t dot_size = 1;
|
size_t scalar_size = 1;
|
||||||
// Tuner-specific arguments
|
// Tuner-specific arguments
|
||||||
double fraction = 1.0;
|
double fraction = 1.0;
|
||||||
// Client-specific arguments
|
// Client-specific arguments
|
||||||
int compare_clblas = 1;
|
int compare_clblas = 1;
|
||||||
|
int compare_cblas = 1;
|
||||||
size_t step = 1;
|
size_t step = 1;
|
||||||
size_t num_steps = 0;
|
size_t num_steps = 0;
|
||||||
size_t num_runs = 10;
|
size_t num_runs = 10;
|
||||||
// Tester-specific arguments
|
|
||||||
bool full_test = false;
|
|
||||||
// Common arguments
|
// Common arguments
|
||||||
size_t platform_id = 0;
|
size_t platform_id = 0;
|
||||||
size_t device_id = 0;
|
size_t device_id = 0;
|
||||||
|
@ -149,7 +159,7 @@ struct Buffers {
|
||||||
Buffer<T> b_mat;
|
Buffer<T> b_mat;
|
||||||
Buffer<T> c_mat;
|
Buffer<T> c_mat;
|
||||||
Buffer<T> ap_mat;
|
Buffer<T> ap_mat;
|
||||||
Buffer<T> dot;
|
Buffer<T> scalar;
|
||||||
};
|
};
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
133
samples/cache.c
Normal file
133
samples/cache.c
Normal file
|
@ -0,0 +1,133 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file demonstrates the CLBlast kernel cache, which stores compiled OpenCL binaries for faster
|
||||||
|
// repeated kernel execution. The cache can be pre-initialized or cleared.
|
||||||
|
//
|
||||||
|
// Note that this example is meant for illustration purposes only. CLBlast provides other programs
|
||||||
|
// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <time.h>
|
||||||
|
|
||||||
|
// Includes the CLBlast library (C interface)
|
||||||
|
#include <clblast_c.h>
|
||||||
|
|
||||||
|
// Forward declaration
|
||||||
|
void run_example_routine(const cl_device_id device);
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Example use of the CLBlast kernel cache
|
||||||
|
int main(void) {
|
||||||
|
|
||||||
|
// OpenCL platform/device settings
|
||||||
|
const size_t platform_id = 0;
|
||||||
|
const size_t device_id = 0;
|
||||||
|
|
||||||
|
// Initializes the OpenCL platform
|
||||||
|
cl_uint num_platforms;
|
||||||
|
clGetPlatformIDs(0, NULL, &num_platforms);
|
||||||
|
cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
|
||||||
|
clGetPlatformIDs(num_platforms, platforms, NULL);
|
||||||
|
cl_platform_id platform = platforms[platform_id];
|
||||||
|
|
||||||
|
// Initializes the OpenCL device
|
||||||
|
cl_uint num_devices;
|
||||||
|
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
|
||||||
|
cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
|
||||||
|
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
|
||||||
|
cl_device_id device = devices[device_id];
|
||||||
|
|
||||||
|
// Run the routine multiple times in a row: after the first time the binary is already in the
|
||||||
|
// cache and compilation is no longer needed.
|
||||||
|
printf("Starting caching sample with an empty cache\n");
|
||||||
|
run_example_routine(device);
|
||||||
|
run_example_routine(device);
|
||||||
|
run_example_routine(device);
|
||||||
|
|
||||||
|
// Clearing the cache makes CLBlast re-compile the kernel once
|
||||||
|
printf("Clearing cache\n");
|
||||||
|
CLBlastClearCache();
|
||||||
|
run_example_routine(device);
|
||||||
|
run_example_routine(device);
|
||||||
|
|
||||||
|
// When the cache is empty, it can be pre-initialized with compiled kernels for all routines by
|
||||||
|
// calling the CLBlastFillCache function, such that all other CLBlast calls can benefit from
|
||||||
|
// pre-compiled kernels and thus execute at maximum speed.
|
||||||
|
printf("Clearing cache\n");
|
||||||
|
CLBlastClearCache();
|
||||||
|
printf("Filling cache (this might take a while)\n");
|
||||||
|
CLBlastFillCache(device);
|
||||||
|
run_example_routine(device);
|
||||||
|
|
||||||
|
// Clean-up
|
||||||
|
free(platforms);
|
||||||
|
free(devices);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Runs an example routine and reports the time
|
||||||
|
void run_example_routine(const cl_device_id device) {
|
||||||
|
|
||||||
|
// Example SASUM arguments
|
||||||
|
const size_t n = 1024*128;
|
||||||
|
|
||||||
|
// Creates the OpenCL context, queue, and an event
|
||||||
|
cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
|
||||||
|
cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL);
|
||||||
|
cl_event event = NULL;
|
||||||
|
|
||||||
|
// Populate host data structures with some example data
|
||||||
|
float* host_input = (float*)malloc(sizeof(float)*n);
|
||||||
|
float* host_output = (float*)malloc(sizeof(float)*1);
|
||||||
|
for (size_t i=0; i<n; ++i) { host_input[i] = -1.5f; }
|
||||||
|
for (size_t i=0; i<1; ++i) { host_output[i] = 0.0f; }
|
||||||
|
|
||||||
|
// Copy the data-structures to the device
|
||||||
|
cl_mem device_input = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(float), NULL, NULL);
|
||||||
|
cl_mem device_output = clCreateBuffer(context, CL_MEM_READ_WRITE, 1*sizeof(float), NULL, NULL);
|
||||||
|
clEnqueueWriteBuffer(queue, device_input, CL_TRUE, 0, n*sizeof(float), host_input, 0, NULL, NULL);
|
||||||
|
clEnqueueWriteBuffer(queue, device_output, CL_TRUE, 0, 1*sizeof(float), host_output, 0, NULL, NULL);
|
||||||
|
|
||||||
|
// Start the timer
|
||||||
|
clock_t start = clock();
|
||||||
|
|
||||||
|
// Calls an example routine
|
||||||
|
StatusCode status = CLBlastSasum(n,
|
||||||
|
device_output, 0,
|
||||||
|
device_input, 0, 1,
|
||||||
|
&queue, &event);
|
||||||
|
|
||||||
|
// Wait for completion
|
||||||
|
clWaitForEvents(1, &event);
|
||||||
|
|
||||||
|
// Retrieves the execution time
|
||||||
|
clock_t diff = clock() - start;
|
||||||
|
double time_ms = diff * 1000.0f / (double)CLOCKS_PER_SEC;
|
||||||
|
|
||||||
|
// Routine completed. See "clblast_c.h" for status codes (0 -> success).
|
||||||
|
printf("Completed routine with status %d in %.3lf ms\n", status, time_ms);
|
||||||
|
|
||||||
|
// Clean-up
|
||||||
|
free(host_input);
|
||||||
|
free(host_output);
|
||||||
|
clReleaseMemObject(device_input);
|
||||||
|
clReleaseMemObject(device_output);
|
||||||
|
clReleaseCommandQueue(queue);
|
||||||
|
clReleaseContext(context);
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
106
samples/dgemv.c
Normal file
106
samples/dgemv.c
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file demonstrates the use of the DGEMV routine. It is pure C99 and demonstrates the use of
|
||||||
|
// the C API to the CLBlast library.
|
||||||
|
//
|
||||||
|
// Note that this example is meant for illustration purposes only. CLBlast provides other programs
|
||||||
|
// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
// Includes the CLBlast library (C interface)
|
||||||
|
#include <clblast_c.h>
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Example use of the double-precision routine DGEMV
|
||||||
|
int main(void) {
|
||||||
|
|
||||||
|
// OpenCL platform/device settings
|
||||||
|
const size_t platform_id = 0;
|
||||||
|
const size_t device_id = 0;
|
||||||
|
|
||||||
|
// Example DGEMV arguments
|
||||||
|
const size_t m = 128;
|
||||||
|
const size_t n = 289;
|
||||||
|
const double alpha = 0.7;
|
||||||
|
const double beta = 0.0;
|
||||||
|
const size_t a_ld = n;
|
||||||
|
|
||||||
|
// Initializes the OpenCL platform
|
||||||
|
cl_uint num_platforms;
|
||||||
|
clGetPlatformIDs(0, NULL, &num_platforms);
|
||||||
|
cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
|
||||||
|
clGetPlatformIDs(num_platforms, platforms, NULL);
|
||||||
|
cl_platform_id platform = platforms[platform_id];
|
||||||
|
|
||||||
|
// Initializes the OpenCL device
|
||||||
|
cl_uint num_devices;
|
||||||
|
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
|
||||||
|
cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
|
||||||
|
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
|
||||||
|
cl_device_id device = devices[device_id];
|
||||||
|
|
||||||
|
// Creates the OpenCL context, queue, and an event
|
||||||
|
cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
|
||||||
|
cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL);
|
||||||
|
cl_event event = NULL;
|
||||||
|
|
||||||
|
// Populate host data structures with some example data
|
||||||
|
double* host_a = (double*)malloc(sizeof(double)*m*n);
|
||||||
|
double* host_x = (double*)malloc(sizeof(double)*n);
|
||||||
|
double* host_y = (double*)malloc(sizeof(double)*m);
|
||||||
|
for (size_t i=0; i<m*n; ++i) { host_a[i] = 12.193; }
|
||||||
|
for (size_t i=0; i<n; ++i) { host_x[i] = -8.199; }
|
||||||
|
for (size_t i=0; i<m; ++i) { host_y[i] = 0.0; }
|
||||||
|
|
||||||
|
// Copy the data-structures to the device
|
||||||
|
cl_mem device_a = clCreateBuffer(context, CL_MEM_READ_WRITE, m*n*sizeof(double), NULL, NULL);
|
||||||
|
cl_mem device_x = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(double), NULL, NULL);
|
||||||
|
cl_mem device_y = clCreateBuffer(context, CL_MEM_READ_WRITE, m*sizeof(double), NULL, NULL);
|
||||||
|
clEnqueueWriteBuffer(queue, device_a, CL_TRUE, 0, m*n*sizeof(double), host_a, 0, NULL, NULL);
|
||||||
|
clEnqueueWriteBuffer(queue, device_x, CL_TRUE, 0, n*sizeof(double), host_x, 0, NULL, NULL);
|
||||||
|
clEnqueueWriteBuffer(queue, device_y, CL_TRUE, 0, m*sizeof(double), host_y, 0, NULL, NULL);
|
||||||
|
|
||||||
|
// Call the DGEMV routine.
|
||||||
|
StatusCode status = CLBlastDgemv(kRowMajor, kNo,
|
||||||
|
m, n,
|
||||||
|
alpha,
|
||||||
|
device_a, 0, a_ld,
|
||||||
|
device_x, 0, 1,
|
||||||
|
beta,
|
||||||
|
device_y, 0, 1,
|
||||||
|
&queue, &event);
|
||||||
|
|
||||||
|
// Wait for completion
|
||||||
|
clWaitForEvents(1, &event);
|
||||||
|
|
||||||
|
// Example completed. See "clblast_c.h" for status codes (0 -> success).
|
||||||
|
printf("Completed DGEMV with status %d\n", status);
|
||||||
|
|
||||||
|
// Clean-up
|
||||||
|
free(platforms);
|
||||||
|
free(devices);
|
||||||
|
free(host_a);
|
||||||
|
free(host_x);
|
||||||
|
free(host_y);
|
||||||
|
clReleaseMemObject(device_a);
|
||||||
|
clReleaseMemObject(device_x);
|
||||||
|
clReleaseMemObject(device_y);
|
||||||
|
clReleaseCommandQueue(queue);
|
||||||
|
clReleaseContext(context);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
96
samples/sasum.c
Normal file
96
samples/sasum.c
Normal file
|
@ -0,0 +1,96 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file demonstrates the use of the SASUM routine. It is pure C99 and demonstrates the use of
|
||||||
|
// the C API to the CLBlast library.
|
||||||
|
//
|
||||||
|
// Note that this example is meant for illustration purposes only. CLBlast provides other programs
|
||||||
|
// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
// Includes the CLBlast library (C interface)
|
||||||
|
#include <clblast_c.h>
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Example use of the single-precision routine SASUM
|
||||||
|
int main(void) {
|
||||||
|
|
||||||
|
// OpenCL platform/device settings
|
||||||
|
const size_t platform_id = 0;
|
||||||
|
const size_t device_id = 0;
|
||||||
|
|
||||||
|
// Example SASUM arguments
|
||||||
|
const size_t n = 1000;
|
||||||
|
const float input_value = -1.5f;
|
||||||
|
|
||||||
|
// Initializes the OpenCL platform
|
||||||
|
cl_uint num_platforms;
|
||||||
|
clGetPlatformIDs(0, NULL, &num_platforms);
|
||||||
|
cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
|
||||||
|
clGetPlatformIDs(num_platforms, platforms, NULL);
|
||||||
|
cl_platform_id platform = platforms[platform_id];
|
||||||
|
|
||||||
|
// Initializes the OpenCL device
|
||||||
|
cl_uint num_devices;
|
||||||
|
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
|
||||||
|
cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
|
||||||
|
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
|
||||||
|
cl_device_id device = devices[device_id];
|
||||||
|
|
||||||
|
// Creates the OpenCL context, queue, and an event
|
||||||
|
cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
|
||||||
|
cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL);
|
||||||
|
cl_event event = NULL;
|
||||||
|
|
||||||
|
// Populate host data structures with some example data
|
||||||
|
float* host_input = (float*)malloc(sizeof(float)*n);
|
||||||
|
float* host_output = (float*)malloc(sizeof(float)*1);
|
||||||
|
for (size_t i=0; i<n; ++i) { host_input[i] = input_value; }
|
||||||
|
for (size_t i=0; i<1; ++i) { host_output[i] = 0.0f; }
|
||||||
|
|
||||||
|
// Copy the data-structures to the device
|
||||||
|
cl_mem device_input = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(float), NULL, NULL);
|
||||||
|
cl_mem device_output = clCreateBuffer(context, CL_MEM_READ_WRITE, 1*sizeof(float), NULL, NULL);
|
||||||
|
clEnqueueWriteBuffer(queue, device_input, CL_TRUE, 0, n*sizeof(float), host_input, 0, NULL, NULL);
|
||||||
|
clEnqueueWriteBuffer(queue, device_output, CL_TRUE, 0, 1*sizeof(float), host_output, 0, NULL, NULL);
|
||||||
|
|
||||||
|
// Call the SASUM routine.
|
||||||
|
StatusCode status = CLBlastSasum(n,
|
||||||
|
device_output, 0,
|
||||||
|
device_input, 0, 1,
|
||||||
|
&queue, &event);
|
||||||
|
|
||||||
|
// Wait for completion
|
||||||
|
clWaitForEvents(1, &event);
|
||||||
|
|
||||||
|
// Copies the result back to the host
|
||||||
|
clEnqueueReadBuffer(queue, device_output, CL_TRUE, 0, 1*sizeof(float), host_output, 0, NULL, NULL);
|
||||||
|
|
||||||
|
// Example completed. See "clblast_c.h" for status codes (0 -> success).
|
||||||
|
printf("Completed SASUM with status %d: %zu * |%.1lf| = %.1lf\n", status, n, input_value, host_output[0]);
|
||||||
|
|
||||||
|
// Clean-up
|
||||||
|
free(platforms);
|
||||||
|
free(devices);
|
||||||
|
free(host_input);
|
||||||
|
free(host_output);
|
||||||
|
clReleaseMemObject(device_input);
|
||||||
|
clReleaseMemObject(device_output);
|
||||||
|
clReleaseCommandQueue(queue);
|
||||||
|
clReleaseContext(context);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
|
@ -15,6 +15,7 @@
|
||||||
//
|
//
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
|
#include <stdlib.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
|
@ -47,11 +48,11 @@ int main(void) {
|
||||||
clGetPlatformIDs(num_platforms, platforms, NULL);
|
clGetPlatformIDs(num_platforms, platforms, NULL);
|
||||||
cl_platform_id platform = platforms[platform_id];
|
cl_platform_id platform = platforms[platform_id];
|
||||||
|
|
||||||
// Initializes the OpenCL device (note: example for GPU devices only)
|
// Initializes the OpenCL device
|
||||||
cl_uint num_devices;
|
cl_uint num_devices;
|
||||||
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
|
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
|
||||||
cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
|
cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
|
||||||
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, num_devices, devices, NULL);
|
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
|
||||||
cl_device_id device = devices[device_id];
|
cl_device_id device = devices[device_id];
|
||||||
|
|
||||||
// Creates the OpenCL context, queue, and an event
|
// Creates the OpenCL context, queue, and an event
|
||||||
|
@ -89,7 +90,7 @@ int main(void) {
|
||||||
clWaitForEvents(1, &event);
|
clWaitForEvents(1, &event);
|
||||||
|
|
||||||
// Example completed. See "clblast_c.h" for status codes (0 -> success).
|
// Example completed. See "clblast_c.h" for status codes (0 -> success).
|
||||||
printf("Completed with status %d\n", status);
|
printf("Completed SGEMM with status %d\n", status);
|
||||||
|
|
||||||
// Clean-up
|
// Clean-up
|
||||||
free(platforms);
|
free(platforms);
|
||||||
|
|
|
@ -8,8 +8,8 @@
|
||||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
//
|
//
|
||||||
// This file demonstrates the use of the SGEMM routine. It is a stand-alone example, but it does
|
// This file demonstrates the use of the SGEMM routine. It is a stand-alone example, but it does
|
||||||
// requires the Khronos C++ OpenCL API header file (not included). The example uses C++ features,
|
// require the Khronos C++ OpenCL API header file (downloaded by CMake). The example uses C++
|
||||||
// but CLBlast can also be used using the regular C-style OpenCL API.
|
// features, but CLBlast can also be used using the regular C-style OpenCL API.
|
||||||
//
|
//
|
||||||
// Note that this example is meant for illustration purposes only. CLBlast provides other programs
|
// Note that this example is meant for illustration purposes only. CLBlast provides other programs
|
||||||
// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
|
// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
|
||||||
|
@ -22,7 +22,7 @@
|
||||||
|
|
||||||
// Includes the C++ OpenCL API. If not yet available, it can be found here:
|
// Includes the C++ OpenCL API. If not yet available, it can be found here:
|
||||||
// https://www.khronos.org/registry/cl/api/1.1/cl.hpp
|
// https://www.khronos.org/registry/cl/api/1.1/cl.hpp
|
||||||
#include <cl.hpp>
|
#include "cl.hpp"
|
||||||
|
|
||||||
// Includes the CLBlast library
|
// Includes the CLBlast library
|
||||||
#include <clblast.h>
|
#include <clblast.h>
|
||||||
|
@ -52,16 +52,16 @@ int main() {
|
||||||
if (platforms.size() == 0 || platform_id >= platforms.size()) { return 1; }
|
if (platforms.size() == 0 || platform_id >= platforms.size()) { return 1; }
|
||||||
auto platform = platforms[platform_id];
|
auto platform = platforms[platform_id];
|
||||||
|
|
||||||
// Initializes the OpenCL device (note: example for GPU devices only)
|
// Initializes the OpenCL device
|
||||||
auto devices = std::vector<cl::Device>();
|
auto devices = std::vector<cl::Device>();
|
||||||
platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
|
platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
|
||||||
if (devices.size() == 0 || device_id >= devices.size()) { return 1; }
|
if (devices.size() == 0 || device_id >= devices.size()) { return 1; }
|
||||||
auto device = devices[device_id];
|
auto device = devices[device_id];
|
||||||
|
|
||||||
// Creates the OpenCL context, queue, and an event
|
// Creates the OpenCL context, queue, and an event
|
||||||
auto context = cl::Context({device});
|
auto context = cl::Context({device});
|
||||||
auto queue = cl::CommandQueue(context, device);
|
auto queue = cl::CommandQueue(context, device);
|
||||||
auto event = cl::Event();
|
auto event = cl_event{nullptr};
|
||||||
|
|
||||||
// Populate host matrices with some example data
|
// Populate host matrices with some example data
|
||||||
auto host_a = std::vector<float>(m*k);
|
auto host_a = std::vector<float>(m*k);
|
||||||
|
@ -84,8 +84,7 @@ int main() {
|
||||||
|
|
||||||
// Call the SGEMM routine. Note that the type of alpha and beta (float) determine the precision.
|
// Call the SGEMM routine. Note that the type of alpha and beta (float) determines the precision.
|
||||||
auto queue_plain = queue();
|
auto queue_plain = queue();
|
||||||
auto event_plain = event();
|
auto status = clblast::Gemm(clblast::Layout::kRowMajor,
|
||||||
auto status = Gemm(clblast::Layout::kRowMajor,
|
|
||||||
clblast::Transpose::kNo, clblast::Transpose::kNo,
|
clblast::Transpose::kNo, clblast::Transpose::kNo,
|
||||||
m, n, k,
|
m, n, k,
|
||||||
alpha,
|
alpha,
|
||||||
|
@ -93,15 +92,15 @@ int main() {
|
||||||
device_b(), 0, b_ld,
|
device_b(), 0, b_ld,
|
||||||
beta,
|
beta,
|
||||||
device_c(), 0, c_ld,
|
device_c(), 0, c_ld,
|
||||||
&queue_plain, &event_plain);
|
&queue_plain, &event);
|
||||||
|
|
||||||
// Record the execution time
|
// Record the execution time
|
||||||
event.wait();
|
clWaitForEvents(1, &event);
|
||||||
auto elapsed_time = std::chrono::steady_clock::now() - start_time;
|
auto elapsed_time = std::chrono::steady_clock::now() - start_time;
|
||||||
auto time_ms = std::chrono::duration<double,std::milli>(elapsed_time).count();
|
auto time_ms = std::chrono::duration<double,std::milli>(elapsed_time).count();
|
||||||
|
|
||||||
// Example completed. See "clblast.h" for status codes (0 -> success).
|
// Example completed. See "clblast.h" for status codes (0 -> success).
|
||||||
printf("Completed in %.3lf ms with status %d\n", time_ms, status);
|
printf("Completed SGEMM in %.3lf ms with status %d\n", time_ms, status);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -34,9 +34,9 @@ DEVICENAME_DEFAULT = "default"
|
||||||
# Attributes
|
# Attributes
|
||||||
DEVICETYPE_ATTRIBUTES = ["device_vendor", "device_type"]
|
DEVICETYPE_ATTRIBUTES = ["device_vendor", "device_type"]
|
||||||
DEVICE_ATTRIBUTES = ["device", "device_core_clock", "device_compute_units"]
|
DEVICE_ATTRIBUTES = ["device", "device_core_clock", "device_compute_units"]
|
||||||
KERNEL_ATTRIBUTES = ["precision", "kernel_family",
|
KERNEL_ATTRIBUTES = ["precision", "kernel_family"]
|
||||||
"arg_m", "arg_n", "arg_k", "arg_alpha", "arg_beta"]
|
ARGUMENT_ATTRIBUTES = ["arg_m", "arg_n", "arg_k", "arg_alpha", "arg_beta"]
|
||||||
ATTRIBUTES = DEVICE_ATTRIBUTES + DEVICETYPE_ATTRIBUTES + KERNEL_ATTRIBUTES
|
ATTRIBUTES = DEVICE_ATTRIBUTES + DEVICETYPE_ATTRIBUTES + KERNEL_ATTRIBUTES + ARGUMENT_ATTRIBUTES
|
||||||
|
|
||||||
# OpenCL vendor names and their short name
|
# OpenCL vendor names and their short name
|
||||||
VENDOR_NAMES = { "device_vendor": {
|
VENDOR_NAMES = { "device_vendor": {
|
||||||
|
@ -95,9 +95,18 @@ def RemoveDuplicates(df):
|
||||||
def RemoveEntriesByDevice(df, devicename):
|
def RemoveEntriesByDevice(df, devicename):
|
||||||
return df[df["device"] != devicename]
|
return df[df["device"] != devicename]
|
||||||
|
|
||||||
|
# Returns the rows of the database whose "kernel_family" column differs from the given family
# name. The input dataframe itself is not modified.
def RemoveEntriesByKernelFamily(df, familyname):
    return df[df["kernel_family"] != familyname]
|
||||||
|
|
||||||
def GetEntriesByField(df, field, value):
|
def GetEntriesByField(df, field, value):
|
||||||
return df[df[field] == value]
|
return df[df[field] == value]
|
||||||
|
|
||||||
|
# Sets the column 'field' to 'value' for every row selected by 'condition' (as accepted by
# 'df.loc', e.g. a boolean mask). The dataframe is modified in place and also returned.
# Example usage:
# df = UpdateDatabase(df, (df["kernel_family"] == "xdot") & (df["arg_n"] == "67108864"), "arg_n", "2097152")
def UpdateDatabase(df, condition, field, value):
    df.loc[condition, field] = value
    return df
|
||||||
|
|
||||||
# Fixes the problem that some vendors use multiple different names
|
# Fixes the problem that some vendors use multiple different names
|
||||||
def SanitizeVendorNames(df):
|
def SanitizeVendorNames(df):
|
||||||
df = df.replace(VENDOR_NAMES)
|
df = df.replace(VENDOR_NAMES)
|
||||||
|
@ -120,7 +129,7 @@ def CalculateDefaults(df):
|
||||||
dfdefault = pd.DataFrame()
|
dfdefault = pd.DataFrame()
|
||||||
|
|
||||||
# Defaults per type/vendor
|
# Defaults per type/vendor
|
||||||
groups = df.groupby(DEVICETYPE_ATTRIBUTES+KERNEL_ATTRIBUTES+["kernel"])
|
groups = df.groupby(DEVICETYPE_ATTRIBUTES+KERNEL_ATTRIBUTES+ARGUMENT_ATTRIBUTES+["kernel"])
|
||||||
for name, dfgroup in groups:
|
for name, dfgroup in groups:
|
||||||
default_values = dfgroup.min(axis=0)
|
default_values = dfgroup.min(axis=0)
|
||||||
default_values["device"] = DEVICENAME_DEFAULT
|
default_values["device"] = DEVICENAME_DEFAULT
|
||||||
|
@ -129,8 +138,14 @@ def CalculateDefaults(df):
|
||||||
default_values["time"] = 0.0
|
default_values["time"] = 0.0
|
||||||
dfdefault = dfdefault.append(default_values, ignore_index=True)
|
dfdefault = dfdefault.append(default_values, ignore_index=True)
|
||||||
|
|
||||||
|
# Checks for mis-matched arguments
|
||||||
|
groups = dfdefault.groupby(DEVICETYPE_ATTRIBUTES+KERNEL_ATTRIBUTES+["kernel"])
|
||||||
|
for name, dfgroup in groups:
|
||||||
|
if len(dfgroup) != 1:
|
||||||
|
print("[WARNING] Entries for a single kernel with multiple argument values")
|
||||||
|
|
||||||
# Defaults in general
|
# Defaults in general
|
||||||
groups = df.groupby(KERNEL_ATTRIBUTES+["kernel"])
|
groups = df.groupby(KERNEL_ATTRIBUTES+ARGUMENT_ATTRIBUTES+["kernel"])
|
||||||
for name, dfgroup in groups:
|
for name, dfgroup in groups:
|
||||||
default_values = dfgroup.min(axis=0)
|
default_values = dfgroup.min(axis=0)
|
||||||
default_values["device_vendor"] = VENDOR_DEFAULT
|
default_values["device_vendor"] = VENDOR_DEFAULT
|
||||||
|
@ -273,7 +288,6 @@ for file_json in glob.glob(glob_json):
|
||||||
new_size = len(database.index)
|
new_size = len(database.index)
|
||||||
print("with "+str(new_size-old_size)+" new items")
|
print("with "+str(new_size-old_size)+" new items")
|
||||||
|
|
||||||
|
|
||||||
# Stores the modified database back to disk
|
# Stores the modified database back to disk
|
||||||
if len(glob.glob(glob_json)) >= 1:
|
if len(glob.glob(glob_json)) >= 1:
|
||||||
print("## Storing the database to disk...")
|
print("## Storing the database to disk...")
|
||||||
|
|
|
@ -22,7 +22,8 @@ D2CL = "cl_double2"
|
||||||
|
|
||||||
# Structure holding data-type and precision information
|
# Structure holding data-type and precision information
|
||||||
class DataType():
|
class DataType():
|
||||||
def __init__(self, name, template, scalars, buffertype):
|
def __init__(self, precision_name, name, template, scalars, buffertype):
|
||||||
|
self.precision_name = precision_name
|
||||||
self.name = name
|
self.name = name
|
||||||
self.template = template
|
self.template = template
|
||||||
self.alpha_cpp = scalars[0]
|
self.alpha_cpp = scalars[0]
|
||||||
|
@ -57,5 +58,10 @@ class DataType():
|
||||||
return "<"+self.buffertype+","+self.beta_cpp+">, "+self.buffertype+", "+self.beta_cpp
|
return "<"+self.buffertype+","+self.beta_cpp+">, "+self.buffertype+", "+self.beta_cpp
|
||||||
return "<"+self.buffertype+">, "+self.buffertype+", "+self.beta_cpp
|
return "<"+self.buffertype+">, "+self.buffertype+", "+self.beta_cpp
|
||||||
|
|
||||||
|
# Current scalar is complex
|
||||||
|
def IsComplex(self, scalar):
|
||||||
|
return ((scalar == "alpha" and self.alpha_cpp in [FLT2, DBL2]) or
|
||||||
|
(scalar == "beta" and self.beta_cpp in [FLT2, DBL2]))
|
||||||
|
|
||||||
|
|
||||||
# ==================================================================================================
|
# ==================================================================================================
|
||||||
|
|
|
@ -8,15 +8,17 @@
|
||||||
# Cedric Nugteren <www.cedricnugteren.nl>
|
# Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
#
|
#
|
||||||
# This script automatically generates the bodies of the following files, creating the full CLBlast
|
# This script automatically generates the bodies of the following files, creating the full CLBlast
|
||||||
# API interface and implementation (C, C++, and clBLAS wrapper):
|
# API interface and implementation (C, C++, and reference BLAS wrappers):
|
||||||
# clblast.h
|
# clblast.h
|
||||||
# clblast.cc
|
# clblast.cc
|
||||||
# clblast_c.h
|
# clblast_c.h
|
||||||
# clblast_c.cc
|
# clblast_c.cc
|
||||||
# wrapper_clblas.h
|
# wrapper_clblas.h
|
||||||
|
# wrapper_cblas.h
|
||||||
# It also generates the main functions for the correctness and performance tests as found in
|
# It also generates the main functions for the correctness and performance tests as found in
|
||||||
# test/correctness/routines/levelX/xYYYY.cc
|
# test/correctness/routines/levelX/xYYYY.cc
|
||||||
# test/performance/routines/levelX/xYYYY.cc
|
# test/performance/routines/levelX/xYYYY.cc
|
||||||
|
# It also produces the API documentation found in doc/clblast.md
|
||||||
#
|
#
|
||||||
# ==================================================================================================
|
# ==================================================================================================
|
||||||
|
|
||||||
|
@ -31,75 +33,89 @@ from datatype import DataType, FLT, DBL, FLT2, DBL2, F2CL, D2CL
|
||||||
# ==================================================================================================
|
# ==================================================================================================
|
||||||
|
|
||||||
# Regular data-types
|
# Regular data-types
|
||||||
S = DataType("S", FLT, [FLT, FLT, FLT, FLT], FLT ) # single (32)
|
S = DataType("S", "S", FLT, [FLT, FLT, FLT, FLT], FLT ) # single (32)
|
||||||
D = DataType("D", DBL, [DBL, DBL, DBL, DBL], DBL ) # double (64)
|
D = DataType("D", "D", DBL, [DBL, DBL, DBL, DBL], DBL ) # double (64)
|
||||||
C = DataType("C", FLT2, [FLT2, FLT2, F2CL, F2CL], FLT2) # single-complex (3232)
|
C = DataType("C", "C", FLT2, [FLT2, FLT2, F2CL, F2CL], FLT2) # single-complex (3232)
|
||||||
Z = DataType("Z", DBL2, [DBL2, DBL2, D2CL, D2CL], DBL2) # double-complex (6464)
|
Z = DataType("Z", "Z", DBL2, [DBL2, DBL2, D2CL, D2CL], DBL2) # double-complex (6464)
|
||||||
|
|
||||||
# Special cases
|
# Special cases
|
||||||
Css = DataType("C", FLT, [FLT, FLT, FLT, FLT], FLT2) # As C, but with constants from S
|
Sc = DataType("C", "Sc", FLT2, [FLT2, FLT2, FLT2, FLT2], FLT2) # As C, but with real output
|
||||||
Zdd = DataType("Z", DBL, [DBL, DBL, DBL, DBL], DBL2) # As Z, but with constants from D
|
Dz = DataType("Z", "Dz", DBL2, [DBL2, DBL2, DBL2, DBL2], DBL2) # As Z, but with real output
|
||||||
Ccs = DataType("C", FLT2+","+FLT, [FLT2, FLT, F2CL, FLT], FLT2) # As C, but with one constant from S
|
iS = DataType("S", "iS", FLT, [FLT, FLT, FLT, FLT], FLT ) # As S, but with integer output
|
||||||
Zzd = DataType("Z", DBL2+","+DBL, [DBL2, DBL, D2CL, DBL], DBL2) # As Z, but with one constant from D
|
iD = DataType("D", "iD", DBL, [DBL, DBL, DBL, DBL], DBL ) # As D, but with integer output
|
||||||
|
iC = DataType("C", "iC", FLT2, [FLT2, FLT2, F2CL, F2CL], FLT2) # As C, but with integer output
|
||||||
|
iZ = DataType("Z", "iZ", DBL2, [DBL2, DBL2, D2CL, D2CL], DBL2) # As Z, but with integer output
|
||||||
|
Css = DataType("C", "C", FLT, [FLT, FLT, FLT, FLT], FLT2) # As C, but with constants from S
|
||||||
|
Zdd = DataType("Z", "Z", DBL, [DBL, DBL, DBL, DBL], DBL2) # As Z, but with constants from D
|
||||||
|
Ccs = DataType("C", "C", FLT2+","+FLT, [FLT2, FLT, F2CL, FLT], FLT2) # As C, but with one constant from S
|
||||||
|
Zzd = DataType("Z", "Z", DBL2+","+DBL, [DBL2, DBL, D2CL, DBL], DBL2) # As Z, but with one constant from D
|
||||||
|
|
||||||
# C++ template data-types
|
# C++ template data-types
|
||||||
T = DataType("typename T", "T", ["T", "T", "T", "T"], "T") # regular routine
|
T = DataType("T", "typename T", "T", ["T", "T", "T", "T"], "T") # regular routine
|
||||||
Tc = DataType("typename T", "std::complex<T>,T", ["T", "T", "T", "T"], "std::complex<T>") # for herk
|
Tc = DataType("Tc", "typename T", "std::complex<T>,T", ["T", "T", "T", "T"], "std::complex<T>") # for herk
|
||||||
TU = DataType("typename T, typename U", "T,U", ["T", "U", "T", "U"], "T") # for her2k
|
TU = DataType("TU", "typename T, typename U", "T,U", ["T", "U", "T", "U"], "T") # for her2k
|
||||||
|
|
||||||
# ==================================================================================================
|
# ==================================================================================================
|
||||||
|
|
||||||
# Populates a list of routines
|
# Populates a list of routines
|
||||||
routines = [
|
routines = [
|
||||||
[ # Level 1: vector-vector
|
[ # Level 1: vector-vector
|
||||||
#Routine(False, "1", "rotg", T, [S,D], [], [], [], [], ["a","b","c","s"], False, "Generate plane rotation"),
|
Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation", "", []),
|
||||||
#Routine(False, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["c","s"], False, "Apply plane rotation"),
|
Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], [], "", "Generate modified givens plane rotation", "", []),
|
||||||
Routine(True, "1", "swap", T, [S,D,C,Z], ["n"], [], [], ["x","y"], [], False, "Swap two vectors"),
|
Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation", "", []),
|
||||||
Routine(True, "1", "scal", T, [S,D,C,Z], ["n"], [], [], ["x"], ["alpha"], False, "Vector scaling"),
|
Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [], "", "Apply modified givens plane rotation", "", []),
|
||||||
Routine(True, "1", "copy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], [], False, "Vector copy"),
|
Routine(True, True, "1", "swap", T, [S,D,C,Z], ["n"], [], [], ["x","y"], [], "", "Swap two vectors", "Interchanges the contents of vectors x and y.", []),
|
||||||
Routine(True, "1", "axpy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], ["alpha"], False, "Vector-times-constant plus vector"),
|
Routine(True, True, "1", "scal", T, [S,D,C,Z], ["n"], [], [], ["x"], ["alpha"], "", "Vector scaling", "Multiplies all elements of vector x by a scalar constant alpha.", []),
|
||||||
Routine(True, "1", "dot", T, [S,D], ["n"], [], ["x","y"], ["dot"], [], True, "Dot product of two vectors"),
|
Routine(True, True, "1", "copy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], [], "", "Vector copy", "Copies the contents of vector x into vector y.", []),
|
||||||
Routine(True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], True, "Dot product of two complex vectors"),
|
Routine(True, True, "1", "axpy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation y = alpha * x + y, in which x and y are vectors and alpha is a scalar constant.", []),
|
||||||
Routine(True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], True, "Dot product of two complex vectors, one conjugated"),
|
Routine(True, True, "1", "dot", T, [S,D], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two vectors", "Multiplies the vectors x and y element-wise and accumulates the results. The sum is stored in the dot buffer.", []),
|
||||||
|
Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []),
|
||||||
|
Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []),
|
||||||
|
Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz],["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidean norm of a vector", "Accumulates the square of each element in the x vector and takes the square root. The resulting L2 norm is stored in the nrm2 buffer.", []),
|
||||||
|
Routine(True, True, "1", "asum", T, [S,D,Sc,Dz],["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of each element in the x vector. The results are stored in the asum buffer.", []),
|
||||||
|
Routine(True, False, "1", "sum", T, [S,D,Sc,Dz],["n"], [], ["x"], ["sum"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of each element in the x vector. The results are stored in the sum buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []),
|
||||||
|
Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the x vector. The resulting integer index is stored in the imax buffer.", []),
|
||||||
|
Routine(True, False, "1", "max", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the x vector. The resulting integer index is stored in the imax buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []),
|
||||||
|
Routine(True, False, "1", "min", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imin"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the x vector. The resulting integer index is stored in the imin buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []),
|
||||||
],
|
],
|
||||||
[ # Level 2: matrix-vector
|
[ # Level 2: matrix-vector
|
||||||
Routine(True, "2a", "gemv", T, [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], False, "General matrix-vector multiplication"),
|
Routine(True, True, "2a", "gemv", T, [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation y = alpha * A * x + beta * y, in which x is an input vector, y is an input and output vector, A is an input matrix, and alpha and beta are scalars. The matrix A can optionally be transposed before performing the operation.", []),
|
||||||
Routine(True, "2a", "gbmv", T, [S,D,C,Z], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], False, "General banded matrix-vector multiplication"),
|
Routine(True, True, "2a", "gbmv", T, [S,D,C,Z], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is banded instead.", []),
|
||||||
Routine(True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Hermitian matrix-vector multiplication"),
|
Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix A is an Hermitian matrix instead.", []),
|
||||||
Routine(True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Hermitian banded matrix-vector multiplication"),
|
Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is an Hermitian banded matrix instead.", []),
|
||||||
Routine(True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], False, "Hermitian packed matrix-vector multiplication"),
|
Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix A is an Hermitian packed matrix instead and represented as AP.", []),
|
||||||
Routine(True, "2a", "symv", T, [S,D], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Symmetric matrix-vector multiplication"),
|
Routine(True, True, "2a", "symv", T, [S,D], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix A is symmetric instead.", []),
|
||||||
Routine(True, "2a", "sbmv", T, [S,D], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], False, "Symmetric banded matrix-vector multiplication"),
|
Routine(True, True, "2a", "sbmv", T, [S,D], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is symmetric and banded instead.", []),
|
||||||
Routine(True, "2a", "spmv", T, [S,D], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], False, "Symmetric packed matrix-vector multiplication"),
|
Routine(True, True, "2a", "spmv", T, [S,D], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix A is a symmetric packed matrix instead and represented as AP.", []),
|
||||||
Routine(True, "2a", "trmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], True, "Triangular matrix-vector multiplication"),
|
Routine(True, True, "2a", "trmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix A is triangular instead.", []),
|
||||||
Routine(True, "2a", "tbmv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], True, "Triangular banded matrix-vector multiplication"),
|
Routine(True, True, "2a", "tbmv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is triangular and banded instead.", []),
|
||||||
Routine(True, "2a", "tpmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], True, "Triangular packed matrix-vector multiplication"),
|
Routine(True, True, "2a", "tpmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix A is a triangular packed matrix instead and represented as AP.", []),
|
||||||
Routine(False, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], False, "Solves a triangular system of equations"),
|
Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a triangular system of equations", "", []),
|
||||||
Routine(False, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], False, "Solves a banded triangular system of equations"),
|
Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a banded triangular system of equations", "", []),
|
||||||
Routine(False, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], False, "Solves a packed triangular system of equations"),
|
Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "", "Solves a packed triangular system of equations", "", []),
|
||||||
# Level 2: matrix update
|
# Level 2: matrix update
|
||||||
Routine(True, "2b", "ger", T, [S,D], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 matrix update"),
|
Routine(True, True, "2b", "ger", T, [S,D], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 matrix update", "", []),
|
||||||
Routine(True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 complex matrix update"),
|
Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex matrix update", "", []),
|
||||||
Routine(True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], False, "General rank-1 complex conjugated matrix update"),
|
Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex conjugated matrix update", "", []),
|
||||||
Routine(True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], False, "Hermitian rank-1 matrix update"),
|
Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Hermitian rank-1 matrix update", "", []),
|
||||||
Routine(True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], False, "Hermitian packed rank-1 matrix update"),
|
Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Hermitian packed rank-1 matrix update", "", []),
|
||||||
Routine(True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], False, "Hermitian rank-2 matrix update"),
|
Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Hermitian rank-2 matrix update", "", []),
|
||||||
Routine(True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], False, "Hermitian packed rank-2 matrix update"),
|
Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Hermitian packed rank-2 matrix update", "", []),
|
||||||
Routine(True, "2b", "syr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], False, "Symmetric rank-1 matrix update"),
|
Routine(True, True, "2b", "syr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Symmetric rank-1 matrix update", "", []),
|
||||||
Routine(True, "2b", "spr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], False, "Symmetric packed rank-1 matrix update"),
|
Routine(True, True, "2b", "spr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Symmetric packed rank-1 matrix update", "", []),
|
||||||
Routine(True, "2b", "syr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], False, "Symmetric rank-2 matrix update"),
|
Routine(True, True, "2b", "syr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Symmetric rank-2 matrix update", "", []),
|
||||||
Routine(True, "2b", "spr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], False, "Symmetric packed rank-2 matrix update"),
|
Routine(True, True, "2b", "spr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Symmetric packed rank-2 matrix update", "", []),
|
||||||
],
|
],
|
||||||
[ # Level 3: matrix-matrix
|
[ # Level 3: matrix-matrix
|
||||||
Routine(True, "3", "gemm", T, [S,D,C,Z], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], False, "General matrix-matrix multiplication"),
|
Routine(True, True, "3", "gemm", T, [S,D,C,Z], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "General matrix-matrix multiplication", "", []),
|
||||||
Routine(True, "3", "symm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], False, "Symmetric matrix-matrix multiplication"),
|
Routine(True, True, "3", "symm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "", []),
|
||||||
Routine(True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], False, "Hermitian matrix-matrix multiplication"),
|
Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "", []),
|
||||||
Routine(True, "3", "syrk", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], False, "Rank-K update of a symmetric matrix"),
|
Routine(True, True, "3", "syrk", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "", []),
|
||||||
Routine(True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], False, "Rank-K update of a hermitian matrix"),
|
Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "", []),
|
||||||
Routine(True, "3", "syr2k", T, [S,D,C,Z], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], False, "Rank-2K update of a symmetric matrix"),
|
Routine(True, True, "3", "syr2k", T, [S,D,C,Z], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "", []),
|
||||||
Routine(True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], False, "Rank-2K update of a hermitian matrix"),
|
Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "", []),
|
||||||
Routine(True, "3", "trmm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], False, "Triangular matrix-matrix multiplication"),
|
Routine(True, True, "3", "trmm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Triangular matrix-matrix multiplication", "", []),
|
||||||
Routine(False, "3", "trsm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], False, "Solves a triangular system of equations"),
|
Routine(False, True, "3", "trsm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Solves a triangular system of equations", "", []),
|
||||||
]]
|
]]
|
||||||
|
|
||||||
# ==================================================================================================
|
# ==================================================================================================
|
||||||
|
@ -151,7 +167,7 @@ def clblast_h(routines):
|
||||||
result = ""
|
result = ""
|
||||||
for routine in routines:
|
for routine in routines:
|
||||||
result += "\n// "+routine.description+": "+routine.ShortNames()+"\n"
|
result += "\n// "+routine.description+": "+routine.ShortNames()+"\n"
|
||||||
result += routine.RoutineHeaderCPP(12)+";\n"
|
result += routine.RoutineHeaderCPP(12, " = nullptr")+";\n"
|
||||||
return result
|
return result
|
||||||
|
|
||||||
# The C++ API implementation (.cc)
|
# The C++ API implementation (.cc)
|
||||||
|
@ -161,10 +177,9 @@ def clblast_cc(routines):
|
||||||
indent1 = " "*(20 + routine.Length())
|
indent1 = " "*(20 + routine.Length())
|
||||||
result += "\n// "+routine.description+": "+routine.ShortNames()+"\n"
|
result += "\n// "+routine.description+": "+routine.ShortNames()+"\n"
|
||||||
if routine.implemented:
|
if routine.implemented:
|
||||||
result += routine.RoutineHeaderCPP(12)+" {\n"
|
result += routine.RoutineHeaderCPP(12, "")+" {\n"
|
||||||
result += " auto queue_cpp = Queue(*queue);\n"
|
result += " auto queue_cpp = Queue(*queue);\n"
|
||||||
result += " auto event_cpp = Event(*event);\n"
|
result += " auto routine = X"+routine.name+"<"+routine.template.template+">(queue_cpp, event);\n"
|
||||||
result += " auto routine = X"+routine.name+"<"+routine.template.template+">(queue_cpp, event_cpp);\n"
|
|
||||||
result += " auto status = routine.SetUp();\n"
|
result += " auto status = routine.SetUp();\n"
|
||||||
result += " if (status != StatusCode::kSuccess) { return status; }\n"
|
result += " if (status != StatusCode::kSuccess) { return status; }\n"
|
||||||
result += " return routine.Do"+routine.name.capitalize()+"("
|
result += " return routine.Do"+routine.name.capitalize()+"("
|
||||||
|
@ -175,8 +190,8 @@ def clblast_cc(routines):
|
||||||
result += " return StatusCode::kNotImplemented;\n"
|
result += " return StatusCode::kNotImplemented;\n"
|
||||||
result += "}\n"
|
result += "}\n"
|
||||||
for flavour in routine.flavours:
|
for flavour in routine.flavours:
|
||||||
indent2 = " "*(23 + routine.Length() + len(flavour.template))
|
indent2 = " "*(34 + routine.Length() + len(flavour.template))
|
||||||
result += "template StatusCode "+routine.name.capitalize()+"<"+flavour.template+">("
|
result += "template StatusCode PUBLIC_API "+routine.name.capitalize()+"<"+flavour.template+">("
|
||||||
result += (",\n"+indent2).join([a for a in routine.ArgumentsType(flavour)])
|
result += (",\n"+indent2).join([a for a in routine.ArgumentsType(flavour)])
|
||||||
result += ",\n"+indent2+"cl_command_queue*, cl_event*);\n"
|
result += ",\n"+indent2+"cl_command_queue*, cl_event*);\n"
|
||||||
return result
|
return result
|
||||||
|
@ -189,7 +204,7 @@ def clblast_c_h(routines):
|
||||||
for routine in routines:
|
for routine in routines:
|
||||||
result += "\n// "+routine.description+": "+routine.ShortNames()+"\n"
|
result += "\n// "+routine.description+": "+routine.ShortNames()+"\n"
|
||||||
for flavour in routine.flavours:
|
for flavour in routine.flavours:
|
||||||
result += routine.RoutineHeaderC(flavour, 20)+";\n"
|
result += routine.RoutineHeaderC(flavour, 31, " PUBLIC_API")+";\n"
|
||||||
return result
|
return result
|
||||||
|
|
||||||
# The C API implementation (.cc)
|
# The C API implementation (.cc)
|
||||||
|
@ -200,7 +215,7 @@ def clblast_c_cc(routines):
|
||||||
for flavour in routine.flavours:
|
for flavour in routine.flavours:
|
||||||
template = "<"+flavour.template+">" if routine.NoScalars() else ""
|
template = "<"+flavour.template+">" if routine.NoScalars() else ""
|
||||||
indent = " "*(26 + routine.Length() + len(template))
|
indent = " "*(26 + routine.Length() + len(template))
|
||||||
result += routine.RoutineHeaderC(flavour, 20)+" {\n"
|
result += routine.RoutineHeaderC(flavour, 20, "")+" {\n"
|
||||||
result += " auto status = clblast::"+routine.name.capitalize()+template+"("
|
result += " auto status = clblast::"+routine.name.capitalize()+template+"("
|
||||||
result += (",\n"+indent).join([a for a in routine.ArgumentsCast(flavour, indent)])
|
result += (",\n"+indent).join([a for a in routine.ArgumentsCast(flavour, indent)])
|
||||||
result += ",\n"+indent+"queue, event);"
|
result += ",\n"+indent+"queue, event);"
|
||||||
|
@ -213,17 +228,18 @@ def clblast_c_cc(routines):
|
||||||
def wrapper_clblas(routines):
|
def wrapper_clblas(routines):
|
||||||
result = ""
|
result = ""
|
||||||
for routine in routines:
|
for routine in routines:
|
||||||
|
if routine.has_tests:
|
||||||
result += "\n// Forwards the clBLAS calls for %s\n" % (routine.ShortNames())
|
result += "\n// Forwards the clBLAS calls for %s\n" % (routine.ShortNames())
|
||||||
if routine.NoScalars():
|
if routine.NoScalars():
|
||||||
result += routine.RoutineHeaderWrapper(routine.template, True, 21)+";\n"
|
result += routine.RoutineHeaderWrapperCL(routine.template, True, 21)+";\n"
|
||||||
for flavour in routine.flavours:
|
for flavour in routine.flavours:
|
||||||
indent = " "*(17 + routine.Length())
|
indent = " "*(17 + routine.Length())
|
||||||
result += routine.RoutineHeaderWrapper(flavour, False, 21)+" {\n"
|
result += routine.RoutineHeaderWrapperCL(flavour, False, 21)+" {\n"
|
||||||
arguments = routine.ArgumentsWrapper(flavour)
|
arguments = routine.ArgumentsWrapperCL(flavour)
|
||||||
if routine.scratch:
|
if routine.scratch:
|
||||||
result += " auto queue = Queue(queues[0]);\n"
|
result += " auto queue = Queue(queues[0]);\n"
|
||||||
result += " auto context = queue.GetContext();\n"
|
result += " auto context = queue.GetContext();\n"
|
||||||
result += " auto scratch_buffer = Buffer<"+flavour.template+">(context, n*x_inc + x_offset);\n"
|
result += " auto scratch_buffer = Buffer<"+flavour.template+">(context, "+routine.scratch+");\n"
|
||||||
arguments += ["scratch_buffer()"]
|
arguments += ["scratch_buffer()"]
|
||||||
result += " return clblas"+flavour.name+routine.name+"("
|
result += " return clblas"+flavour.name+routine.name+"("
|
||||||
result += (",\n"+indent).join([a for a in arguments])
|
result += (",\n"+indent).join([a for a in arguments])
|
||||||
|
@ -231,6 +247,51 @@ def wrapper_clblas(routines):
|
||||||
result += "\n}\n"
|
result += "\n}\n"
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
# The wrapper to the reference CBLAS routines (for performance/correctness testing)
|
||||||
|
def wrapper_cblas(routines):
|
||||||
|
result = ""
|
||||||
|
for routine in routines:
|
||||||
|
if routine.has_tests:
|
||||||
|
result += "\n// Forwards the Netlib BLAS calls for %s\n" % (routine.ShortNames())
|
||||||
|
for flavour in routine.flavours:
|
||||||
|
indent = " "*(10 + routine.Length())
|
||||||
|
result += routine.RoutineHeaderWrapperC(flavour, False, 12)+" {\n"
|
||||||
|
arguments = routine.ArgumentsWrapperC(flavour)
|
||||||
|
|
||||||
|
# Complex scalars: pack the real and imaginary parts into a two-element array
|
||||||
|
for scalar in routine.scalars:
|
||||||
|
if flavour.IsComplex(scalar):
|
||||||
|
result += " const auto "+scalar+"_array = std::vector<"+flavour.buffertype[:-1]+">{"+scalar+".real(), "+scalar+".imag()};\n"
|
||||||
|
|
||||||
|
# Special case for scalar outputs
|
||||||
|
assignment = ""
|
||||||
|
postfix = ""
|
||||||
|
endofline = ""
|
||||||
|
extra_argument = ""
|
||||||
|
for output_buffer in routine.outputs:
|
||||||
|
if output_buffer in routine.ScalarBuffersFirst():
|
||||||
|
if flavour in [C,Z]:
|
||||||
|
postfix += "_sub"
|
||||||
|
indent += " "
|
||||||
|
extra_argument += ",\n"+indent+"reinterpret_cast<return_pointer_"+flavour.buffertype[:-1]+">(&"+output_buffer+"_buffer["+output_buffer+"_offset])"
|
||||||
|
elif output_buffer in routine.IndexBuffers():
|
||||||
|
assignment = "((int*)&"+output_buffer+"_buffer[0])["+output_buffer+"_offset] = "
|
||||||
|
indent += " "*len(assignment)
|
||||||
|
else:
|
||||||
|
assignment = output_buffer+"_buffer["+output_buffer+"_offset]"
|
||||||
|
if (flavour.name in ["Sc","Dz"]):
|
||||||
|
assignment = assignment+".real("
|
||||||
|
endofline += ")"
|
||||||
|
else:
|
||||||
|
assignment = assignment+" = "
|
||||||
|
indent += " "*len(assignment)
|
||||||
|
|
||||||
|
result += " "+assignment+"cblas_"+flavour.name.lower()+routine.name+postfix+"("
|
||||||
|
result += (",\n"+indent).join([a for a in arguments])
|
||||||
|
result += extra_argument+endofline+");"
|
||||||
|
result += "\n}\n"
|
||||||
|
return result
|
||||||
|
|
||||||
# ==================================================================================================
|
# ==================================================================================================
|
||||||
|
|
||||||
# Checks for the number of command-line arguments
|
# Checks for the number of command-line arguments
|
||||||
|
@ -246,9 +307,10 @@ files = [
|
||||||
path_clblast+"/include/clblast_c.h",
|
path_clblast+"/include/clblast_c.h",
|
||||||
path_clblast+"/src/clblast_c.cc",
|
path_clblast+"/src/clblast_c.cc",
|
||||||
path_clblast+"/test/wrapper_clblas.h",
|
path_clblast+"/test/wrapper_clblas.h",
|
||||||
|
path_clblast+"/test/wrapper_cblas.h",
|
||||||
]
|
]
|
||||||
header_lines = [84, 63, 80, 24, 22]
|
header_lines = [84, 71, 93, 22, 29, 41]
|
||||||
footer_lines = [6, 3, 5, 2, 6]
|
footer_lines = [17, 71, 19, 14, 6, 6]
|
||||||
|
|
||||||
# Checks whether the command-line arguments are valid; exits otherwise
|
# Checks whether the command-line arguments are valid; exits otherwise
|
||||||
for f in files:
|
for f in files:
|
||||||
|
@ -282,6 +344,8 @@ for i in xrange(0,len(files)):
|
||||||
body += clblast_c_cc(routines[level-1])
|
body += clblast_c_cc(routines[level-1])
|
||||||
if i == 4:
|
if i == 4:
|
||||||
body += wrapper_clblas(routines[level-1])
|
body += wrapper_clblas(routines[level-1])
|
||||||
|
if i == 5:
|
||||||
|
body += wrapper_cblas(routines[level-1])
|
||||||
f.write("".join(file_header))
|
f.write("".join(file_header))
|
||||||
f.write(body)
|
f.write(body)
|
||||||
f.write("".join(file_footer))
|
f.write("".join(file_footer))
|
||||||
|
@ -291,6 +355,7 @@ for i in xrange(0,len(files)):
|
||||||
# Outputs all the correctness-test implementations
|
# Outputs all the correctness-test implementations
|
||||||
for level in [1,2,3]:
|
for level in [1,2,3]:
|
||||||
for routine in routines[level-1]:
|
for routine in routines[level-1]:
|
||||||
|
if routine.has_tests:
|
||||||
filename = path_clblast+"/test/correctness/routines/level"+str(level)+"/x"+routine.name+".cc"
|
filename = path_clblast+"/test/correctness/routines/level"+str(level)+"/x"+routine.name+".cc"
|
||||||
with open(filename, "w") as f:
|
with open(filename, "w") as f:
|
||||||
body = ""
|
body = ""
|
||||||
|
@ -315,6 +380,7 @@ for level in [1,2,3]:
|
||||||
# Outputs all the performance-test implementations
|
# Outputs all the performance-test implementations
|
||||||
for level in [1,2,3]:
|
for level in [1,2,3]:
|
||||||
for routine in routines[level-1]:
|
for routine in routines[level-1]:
|
||||||
|
if routine.has_tests:
|
||||||
filename = path_clblast+"/test/performance/routines/level"+str(level)+"/x"+routine.name+".cc"
|
filename = path_clblast+"/test/performance/routines/level"+str(level)+"/x"+routine.name+".cc"
|
||||||
with open(filename, "w") as f:
|
with open(filename, "w") as f:
|
||||||
body = ""
|
body = ""
|
||||||
|
@ -325,13 +391,13 @@ for level in [1,2,3]:
|
||||||
body += "using double2 = clblast::double2;\n\n"
|
body += "using double2 = clblast::double2;\n\n"
|
||||||
body += "// Main function (not within the clblast namespace)\n"
|
body += "// Main function (not within the clblast namespace)\n"
|
||||||
body += "int main(int argc, char *argv[]) {\n"
|
body += "int main(int argc, char *argv[]) {\n"
|
||||||
default = PrecisionToFullName(routine.flavours[0].name)
|
default = PrecisionToFullName(routine.flavours[0].precision_name)
|
||||||
body += " switch(clblast::GetPrecision(argc, argv, clblast::Precision::k"+default+")) {\n"
|
body += " switch(clblast::GetPrecision(argc, argv, clblast::Precision::k"+default+")) {\n"
|
||||||
for precision in ["H","S","D","C","Z"]:
|
for precision in ["H","S","D","C","Z"]:
|
||||||
body += " case clblast::Precision::k"+PrecisionToFullName(precision)+":"
|
body += " case clblast::Precision::k"+PrecisionToFullName(precision)+":"
|
||||||
found = False
|
found = False
|
||||||
for flavour in routine.flavours:
|
for flavour in routine.flavours:
|
||||||
if flavour.name == precision:
|
if flavour.precision_name == precision:
|
||||||
body += "\n clblast::RunClient<clblast::TestX"+routine.name+flavour.TestTemplate()
|
body += "\n clblast::RunClient<clblast::TestX"+routine.name+flavour.TestTemplate()
|
||||||
body += ">(argc, argv); break;\n"
|
body += ">(argc, argv); break;\n"
|
||||||
found = True
|
found = True
|
||||||
|
@ -345,3 +411,61 @@ for level in [1,2,3]:
|
||||||
f.write(footer)
|
f.write(footer)
|
||||||
|
|
||||||
# ==================================================================================================
|
# ==================================================================================================
|
||||||
|
|
||||||
|
# Outputs the API documentation
|
||||||
|
filename = path_clblast+"/doc/clblast.md"
|
||||||
|
with open(filename, "w") as f:
|
||||||
|
|
||||||
|
# Outputs the header
|
||||||
|
f.write("CLBlast: API reference\n")
|
||||||
|
f.write("================\n")
|
||||||
|
f.write("\n\n")
|
||||||
|
|
||||||
|
# Loops over the routines
|
||||||
|
for level in [1,2,3]:
|
||||||
|
for routine in routines[level-1]:
|
||||||
|
if routine.implemented:
|
||||||
|
|
||||||
|
# Routine header
|
||||||
|
f.write("x"+routine.name.upper()+": "+routine.description+"\n")
|
||||||
|
f.write("-------------\n")
|
||||||
|
f.write("\n")
|
||||||
|
f.write(routine.details+"\n")
|
||||||
|
f.write("\n")
|
||||||
|
|
||||||
|
# Routine API
|
||||||
|
f.write("C++ API:\n")
|
||||||
|
f.write("```\n")
|
||||||
|
f.write(routine.RoutineHeaderCPP(12, "")+"\n")
|
||||||
|
f.write("```\n")
|
||||||
|
f.write("\n")
|
||||||
|
f.write("C API:\n")
|
||||||
|
f.write("```\n")
|
||||||
|
for flavour in routine.flavours:
|
||||||
|
f.write(routine.RoutineHeaderC(flavour, 20, "")+"\n")
|
||||||
|
f.write("```\n")
|
||||||
|
f.write("\n")
|
||||||
|
|
||||||
|
# Routine arguments
|
||||||
|
f.write("Arguments to "+routine.name.upper()+":\n")
|
||||||
|
f.write("\n")
|
||||||
|
for argument in routine.ArgumentsDoc():
|
||||||
|
f.write("* "+argument+"\n")
|
||||||
|
f.write("* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.\n")
|
||||||
|
f.write("* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.\n")
|
||||||
|
f.write("\n")
|
||||||
|
|
||||||
|
# Routine requirements
|
||||||
|
if len(routine.RequirementsDoc()) > 0:
|
||||||
|
f.write("Requirements for "+routine.name.upper()+":\n")
|
||||||
|
f.write("\n")
|
||||||
|
for requirement in routine.RequirementsDoc():
|
||||||
|
f.write("* "+requirement+"\n")
|
||||||
|
f.write("\n")
|
||||||
|
|
||||||
|
|
||||||
|
# Routine footer
|
||||||
|
f.write("\n\n")
|
||||||
|
|
||||||
|
|
||||||
|
# ==================================================================================================
|
||||||
|
|
|
@ -28,7 +28,7 @@ def OptionToCLBlast(x):
|
||||||
}[x]
|
}[x]
|
||||||
|
|
||||||
# As above, but for clBLAS data-types
|
# As above, but for clBLAS data-types
|
||||||
def OptionToWrapper(x):
|
def OptionToWrapperCL(x):
|
||||||
return {
|
return {
|
||||||
'layout': "clblasOrder",
|
'layout': "clblasOrder",
|
||||||
'a_transpose': "clblasTranspose",
|
'a_transpose': "clblasTranspose",
|
||||||
|
@ -39,16 +39,38 @@ def OptionToWrapper(x):
|
||||||
'diagonal': "clblasDiag",
|
'diagonal': "clblasDiag",
|
||||||
}[x]
|
}[x]
|
||||||
|
|
||||||
# Buffers without 'ld' or 'inc' parameter
|
# As above, but for CBLAS data-types
|
||||||
NO_LD_INC = ["dot","ap"]
|
def OptionToWrapperC(x):
|
||||||
|
return {
|
||||||
|
'layout': "CBLAS_ORDER",
|
||||||
|
'a_transpose': "CBLAS_TRANSPOSE",
|
||||||
|
'b_transpose': "CBLAS_TRANSPOSE",
|
||||||
|
'ab_transpose': "CBLAS_TRANSPOSE",
|
||||||
|
'side': "CBLAS_SIDE",
|
||||||
|
'triangle': "CBLAS_UPLO",
|
||||||
|
'diagonal': "CBLAS_DIAG",
|
||||||
|
}[x]
|
||||||
|
|
||||||
|
# Translates an option name to a documentation string
|
||||||
|
def OptionToDoc(x):
|
||||||
|
return {
|
||||||
|
'layout': "Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.",
|
||||||
|
'a_transpose': "Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.",
|
||||||
|
'b_transpose': "Transposing the input matrix B, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.",
|
||||||
|
'ab_transpose': "Transposing the packed input matrix AP, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.",
|
||||||
|
'side': "The horizontal position of the triangular matrix, either `Side::kLeft` (141) or `Side::kRight` (142).",
|
||||||
|
'triangle': "The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).",
|
||||||
|
'diagonal': "The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for a non-unit values on the diagonal or `Diagonal::kUnit` (132) for a unit values on the diagonal.",
|
||||||
|
}[x]
|
||||||
|
|
||||||
# ==================================================================================================
|
# ==================================================================================================
|
||||||
|
|
||||||
# Class holding routine-specific information (e.g. name, which arguments, which precisions)
|
# Class holding routine-specific information (e.g. name, which arguments, which precisions)
|
||||||
class Routine():
|
class Routine():
|
||||||
def __init__(self, implemented, level, name, template, flavours, sizes, options,
|
def __init__(self, implemented, has_tests, level, name, template, flavours, sizes, options,
|
||||||
inputs, outputs, scalars, scratch, description):
|
inputs, outputs, scalars, scratch, description, details, requirements):
|
||||||
self.implemented = implemented
|
self.implemented = implemented
|
||||||
|
self.has_tests = has_tests
|
||||||
self.level = level
|
self.level = level
|
||||||
self.name = name
|
self.name = name
|
||||||
self.template = template
|
self.template = template
|
||||||
|
@ -60,6 +82,26 @@ class Routine():
|
||||||
self.scalars = scalars
|
self.scalars = scalars
|
||||||
self.scratch = scratch # Scratch buffer (e.g. for xDOT)
|
self.scratch = scratch # Scratch buffer (e.g. for xDOT)
|
||||||
self.description = description
|
self.description = description
|
||||||
|
self.details = details
|
||||||
|
self.requirements = requirements
|
||||||
|
|
||||||
|
# List of scalar buffers
|
||||||
|
def ScalarBuffersFirst(self):
|
||||||
|
return ["dot","nrm2","asum","sum","imax","imin"]
|
||||||
|
def ScalarBuffersSecond(self):
|
||||||
|
return ["sa","sb","sc","ss","sd1","sd2","sx1","sy1","sparam"]
|
||||||
|
|
||||||
|
# List of scalars other than alpha and beta
|
||||||
|
def OtherScalars(self):
|
||||||
|
return ["cos","sin"]
|
||||||
|
|
||||||
|
# List of buffers with unsigned int type
|
||||||
|
def IndexBuffers(self):
|
||||||
|
return ["imax","imin"]
|
||||||
|
|
||||||
|
# List of buffers without 'inc' or 'ld'
|
||||||
|
def BuffersWithoutLdInc(self):
|
||||||
|
return self.ScalarBuffersFirst() + self.ScalarBuffersSecond() + ["ap"]
|
||||||
|
|
||||||
# Retrieves the number of characters in the routine's name
|
# Retrieves the number of characters in the routine's name
|
||||||
def Length(self):
|
def Length(self):
|
||||||
|
@ -87,6 +129,12 @@ class Routine():
|
||||||
return ["ap","a","b","c"]
|
return ["ap","a","b","c"]
|
||||||
return ["y","c"]
|
return ["y","c"]
|
||||||
|
|
||||||
|
# Distinguish between vectors and matrices
|
||||||
|
def BuffersVector(self):
|
||||||
|
return ["x","y"]
|
||||||
|
def BuffersMatrix(self):
|
||||||
|
return ["a","b","c","ap"]
|
||||||
|
|
||||||
# ==============================================================================================
|
# ==============================================================================================
|
||||||
|
|
||||||
# Retrieves a variable name for a specific input/output vector/matrix (e.g. 'x')
|
# Retrieves a variable name for a specific input/output vector/matrix (e.g. 'x')
|
||||||
|
@ -94,7 +142,7 @@ class Routine():
|
||||||
if (name in self.inputs) or (name in self.outputs):
|
if (name in self.inputs) or (name in self.outputs):
|
||||||
a = [name+"_buffer"]
|
a = [name+"_buffer"]
|
||||||
b = [name+"_offset"]
|
b = [name+"_offset"]
|
||||||
c = [name+"_"+self.Postfix(name)] if (name not in NO_LD_INC) else []
|
c = [name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else []
|
||||||
return [", ".join(a+b+c)]
|
return [", ".join(a+b+c)]
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
@ -104,21 +152,32 @@ class Routine():
|
||||||
if (name in self.inputs) or (name in self.outputs):
|
if (name in self.inputs) or (name in self.outputs):
|
||||||
a = [prefix+"cl_mem "+name+"_buffer"]
|
a = [prefix+"cl_mem "+name+"_buffer"]
|
||||||
b = ["const size_t "+name+"_offset"]
|
b = ["const size_t "+name+"_offset"]
|
||||||
c = ["const size_t "+name+"_"+self.Postfix(name)] if (name not in NO_LD_INC) else []
|
c = ["const size_t "+name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else []
|
||||||
|
return [", ".join(a+b+c)]
|
||||||
|
return []
|
||||||
|
|
||||||
|
# As above but as vectors
|
||||||
|
def BufferDefVector(self, name, flavour):
|
||||||
|
prefix = "const " if (name in self.inputs) else ""
|
||||||
|
if (name in self.inputs) or (name in self.outputs):
|
||||||
|
a = [prefix+"std::vector<"+flavour.buffertype+">& "+name+"_buffer"]
|
||||||
|
b = ["const size_t "+name+"_offset"]
|
||||||
|
c = ["const size_t "+name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else []
|
||||||
return [", ".join(a+b+c)]
|
return [", ".join(a+b+c)]
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# As above but with Claduc buffers
|
# As above but with Claduc buffers
|
||||||
def BufferCladuc(self, name):
|
def BufferCladuc(self, name):
|
||||||
if (name in self.inputs) or (name in self.outputs):
|
if (name in self.inputs) or (name in self.outputs):
|
||||||
a = ["Buffer<"+self.template.buffertype+">("+name+"_buffer)"]
|
buffertype = "unsigned int" if (name in self.IndexBuffers()) else self.template.buffertype
|
||||||
|
a = ["Buffer<"+buffertype+">("+name+"_buffer)"]
|
||||||
b = [name+"_offset"]
|
b = [name+"_offset"]
|
||||||
c = [name+"_"+self.Postfix(name)] if (name not in NO_LD_INC) else []
|
c = [name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else []
|
||||||
return [", ".join(a+b+c)]
|
return [", ".join(a+b+c)]
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# As above but with a static cast for clBLAS wrapper
|
# As above but with a static cast for clBLAS wrapper
|
||||||
def BufferWrapper(self, name):
|
def BufferWrapperCL(self, name):
|
||||||
if (name in self.inputs) or (name in self.outputs):
|
if (name in self.inputs) or (name in self.outputs):
|
||||||
a = [name+"_buffer"]
|
a = [name+"_buffer"]
|
||||||
b = [name+"_offset"]
|
b = [name+"_offset"]
|
||||||
|
@ -130,16 +189,47 @@ class Routine():
|
||||||
return [", ".join(a+b+c)]
|
return [", ".join(a+b+c)]
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
# As above but with a static cast for CBLAS wrapper
|
||||||
|
def BufferWrapperC(self, name, flavour):
|
||||||
|
prefix = "const " if (name in self.inputs) else ""
|
||||||
|
if (name in self.inputs) or (name in self.outputs):
|
||||||
|
if name == "sy1":
|
||||||
|
a = [name+"_buffer["+name+"_offset]"]
|
||||||
|
elif flavour.precision_name in ["C","Z"]:
|
||||||
|
a = ["reinterpret_cast<"+prefix+flavour.buffertype[:-1]+"*>(&"+name+"_buffer["+name+"_offset])"]
|
||||||
|
else:
|
||||||
|
a = ["&"+name+"_buffer["+name+"_offset]"]
|
||||||
|
c = []
|
||||||
|
if (name in ["x","y"]):
|
||||||
|
c = ["static_cast<int>("+name+"_"+self.Postfix(name)+")"]
|
||||||
|
elif (name in ["a","b","c"]):
|
||||||
|
c = [name+"_"+self.Postfix(name)]
|
||||||
|
return [", ".join(a+c)]
|
||||||
|
return []
|
||||||
|
|
||||||
# As above, but only data-types
|
# As above, but only data-types
|
||||||
def BufferType(self, name):
|
def BufferType(self, name):
|
||||||
prefix = "const " if (name in self.inputs) else ""
|
prefix = "const " if (name in self.inputs) else ""
|
||||||
if (name in self.inputs) or (name in self.outputs):
|
if (name in self.inputs) or (name in self.outputs):
|
||||||
a = [prefix+"cl_mem"]
|
a = [prefix+"cl_mem"]
|
||||||
b = ["const size_t"]
|
b = ["const size_t"]
|
||||||
c = ["const size_t"] if (name not in NO_LD_INC) else []
|
c = ["const size_t"] if (name not in self.BuffersWithoutLdInc()) else []
|
||||||
return [", ".join(a+b+c)]
|
return [", ".join(a+b+c)]
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
# Retrieves the documentation of the buffers
|
||||||
|
def BufferDoc(self, name):
|
||||||
|
prefix = "const " if (name in self.inputs) else ""
|
||||||
|
inout = "input" if (name in self.inputs) else "output"
|
||||||
|
if (name in self.inputs) or (name in self.outputs):
|
||||||
|
math_name = name.upper()+" matrix" if (name in self.BuffersMatrix()) else name+" vector"
|
||||||
|
incld_description = "Leading dimension " if (name in self.BuffersMatrix()) else "Stride/increment "
|
||||||
|
a = ["`"+prefix+"cl_mem "+name+"_buffer`: OpenCL buffer to store the "+inout+" "+math_name+"."]
|
||||||
|
b = ["`const size_t "+name+"_offset`: The offset in elements from the start of the "+inout+" "+math_name+"."]
|
||||||
|
c = ["`const size_t "+name+"_"+self.Postfix(name)+"`: "+incld_description+"of the "+inout+" "+math_name+"."] if (name not in self.BuffersWithoutLdInc()) else []
|
||||||
|
return a+b+c
|
||||||
|
return []
|
||||||
|
|
||||||
# ==============================================================================================
|
# ==============================================================================================
|
||||||
|
|
||||||
# Retrieves the name of a scalar (alpha/beta)
|
# Retrieves the name of a scalar (alpha/beta)
|
||||||
|
@ -168,6 +258,14 @@ class Routine():
|
||||||
return [name]
|
return [name]
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
# Retrieves the use of a scalar for CBLAS (alpha/beta)
|
||||||
|
def ScalarUseWrapperC(self, name, flavour):
|
||||||
|
if name in self.scalars:
|
||||||
|
if flavour.IsComplex(name):
|
||||||
|
return [name+"_array.data()"]
|
||||||
|
return [name]
|
||||||
|
return []
|
||||||
|
|
||||||
# Retrieves the definition of a scalar (alpha/beta)
|
# Retrieves the definition of a scalar (alpha/beta)
|
||||||
def ScalarDef(self, name, flavour):
|
def ScalarDef(self, name, flavour):
|
||||||
if name in self.scalars:
|
if name in self.scalars:
|
||||||
|
@ -192,6 +290,14 @@ class Routine():
|
||||||
return ["const "+flavour.beta_cpp]
|
return ["const "+flavour.beta_cpp]
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
# Retrieves the documentation of a scalar
|
||||||
|
def ScalarDoc(self, name):
|
||||||
|
if name in self.scalars:
|
||||||
|
if name == "alpha":
|
||||||
|
return ["`const "+self.template.alpha_cpp+" "+name+"`: Input scalar constant."]
|
||||||
|
return ["`const "+self.template.beta_cpp+" "+name+"`: Input scalar constant."]
|
||||||
|
return []
|
||||||
|
|
||||||
# ==============================================================================================
|
# ==============================================================================================
|
||||||
|
|
||||||
# Retrieves a list of comma-separated sizes (m, n, k)
|
# Retrieves a list of comma-separated sizes (m, n, k)
|
||||||
|
@ -212,6 +318,13 @@ class Routine():
|
||||||
return [", ".join(["const size_t" for s in self.sizes])]
|
return [", ".join(["const size_t" for s in self.sizes])]
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
# Retrieves the documentation of the sizes
|
||||||
|
def SizesDoc(self):
|
||||||
|
if self.sizes:
|
||||||
|
definitions = ["`const size_t "+s+"`: Integer size argument." for s in self.sizes]
|
||||||
|
return definitions
|
||||||
|
return []
|
||||||
|
|
||||||
# ==============================================================================================
|
# ==============================================================================================
|
||||||
|
|
||||||
# Retrieves a list of options
|
# Retrieves a list of options
|
||||||
|
@ -235,9 +348,16 @@ class Routine():
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# As above, but now using clBLAS data-types
|
# As above, but now using clBLAS data-types
|
||||||
def OptionsDefWrapper(self):
|
def OptionsDefWrapperCL(self):
|
||||||
if self.options:
|
if self.options:
|
||||||
definitions = ["const "+OptionToWrapper(o)+" "+o for o in self.options]
|
definitions = ["const "+OptionToWrapperCL(o)+" "+o for o in self.options]
|
||||||
|
return [", ".join(definitions)]
|
||||||
|
return []
|
||||||
|
|
||||||
|
# As above, but now using CBLAS data-types
|
||||||
|
def OptionsDefWrapperC(self):
|
||||||
|
if self.options:
|
||||||
|
definitions = ["const "+OptionToWrapperC(o)+" "+o for o in self.options]
|
||||||
return [", ".join(definitions)]
|
return [", ".join(definitions)]
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
@ -248,72 +368,129 @@ class Routine():
|
||||||
return [", ".join(definitions)]
|
return [", ".join(definitions)]
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
# Retrieves the documentation of the options
|
||||||
|
def OptionsDoc(self):
|
||||||
|
if self.options:
|
||||||
|
definitions = ["`const "+OptionToCLBlast(o)+"`: "+OptionToDoc(o) for o in self.options]
|
||||||
|
return definitions
|
||||||
|
return []
|
||||||
|
|
||||||
# ==============================================================================================
|
# ==============================================================================================
|
||||||
|
|
||||||
# Retrieves a combination of all the argument names, with Claduc casts
|
# Retrieves a combination of all the argument names, with Claduc casts
|
||||||
def ArgumentsCladuc(self, flavour, indent):
|
def ArgumentsCladuc(self, flavour, indent):
|
||||||
return (self.Options() + self.Sizes() + self.BufferCladuc("dot") +
|
return (self.Options() + self.Sizes() +
|
||||||
|
list(chain(*[self.BufferCladuc(b) for b in self.ScalarBuffersFirst()])) +
|
||||||
self.Scalar("alpha") +
|
self.Scalar("alpha") +
|
||||||
list(chain(*[self.BufferCladuc(b) for b in self.BuffersFirst()])) +
|
list(chain(*[self.BufferCladuc(b) for b in self.BuffersFirst()])) +
|
||||||
self.Scalar("beta") +
|
self.Scalar("beta") +
|
||||||
list(chain(*[self.BufferCladuc(b) for b in self.BuffersSecond()])) +
|
list(chain(*[self.BufferCladuc(b) for b in self.BuffersSecond()])) +
|
||||||
list(chain(*[self.Scalar(s) for s in ["d1","d2","a","b","c","s"]])))
|
list(chain(*[self.BufferCladuc(b) for b in self.ScalarBuffersSecond()])) +
|
||||||
|
list(chain(*[self.Scalar(s) for s in self.OtherScalars()])))
|
||||||
|
|
||||||
# Retrieves a combination of all the argument names, with CLBlast casts
|
# Retrieves a combination of all the argument names, with CLBlast casts
|
||||||
def ArgumentsCast(self, flavour, indent):
|
def ArgumentsCast(self, flavour, indent):
|
||||||
return (self.OptionsCast(indent) + self.Sizes() + self.Buffer("dot") +
|
return (self.OptionsCast(indent) + self.Sizes() +
|
||||||
|
list(chain(*[self.Buffer(b) for b in self.ScalarBuffersFirst()])) +
|
||||||
self.ScalarUse("alpha", flavour) +
|
self.ScalarUse("alpha", flavour) +
|
||||||
list(chain(*[self.Buffer(b) for b in self.BuffersFirst()])) +
|
list(chain(*[self.Buffer(b) for b in self.BuffersFirst()])) +
|
||||||
self.ScalarUse("beta", flavour) +
|
self.ScalarUse("beta", flavour) +
|
||||||
list(chain(*[self.Buffer(b) for b in self.BuffersSecond()])) +
|
list(chain(*[self.Buffer(b) for b in self.BuffersSecond()])) +
|
||||||
list(chain(*[self.ScalarUse(s, flavour) for s in ["d1","d2","a","b","c","s"]])))
|
list(chain(*[self.Buffer(b) for b in self.ScalarBuffersSecond()])) +
|
||||||
|
list(chain(*[self.ScalarUse(s, flavour) for s in self.OtherScalars()])))
|
||||||
|
|
||||||
# As above, but for the clBLAS wrapper
|
# As above, but for the clBLAS wrapper
|
||||||
def ArgumentsWrapper(self, flavour):
|
def ArgumentsWrapperCL(self, flavour):
|
||||||
return (self.Options() + self.Sizes() + self.BufferWrapper("dot") +
|
return (self.Options() + self.Sizes() +
|
||||||
|
list(chain(*[self.BufferWrapperCL(b) for b in self.ScalarBuffersFirst()])) +
|
||||||
self.ScalarUseWrapper("alpha", flavour) +
|
self.ScalarUseWrapper("alpha", flavour) +
|
||||||
list(chain(*[self.BufferWrapper(b) for b in self.BuffersFirst()])) +
|
list(chain(*[self.BufferWrapperCL(b) for b in self.BuffersFirst()])) +
|
||||||
self.ScalarUseWrapper("beta", flavour) +
|
self.ScalarUseWrapper("beta", flavour) +
|
||||||
list(chain(*[self.BufferWrapper(b) for b in self.BuffersSecond()])) +
|
list(chain(*[self.BufferWrapperCL(b) for b in self.BuffersSecond()])) +
|
||||||
list(chain(*[self.ScalarUseWrapper(s, flavour) for s in ["d1","d2","a","b","c","s"]])))
|
list(chain(*[self.BufferWrapperCL(b) for b in self.ScalarBuffersSecond()])) +
|
||||||
|
list(chain(*[self.ScalarUseWrapper(s, flavour) for s in self.OtherScalars()])))
|
||||||
|
|
||||||
|
# As above, but for the CBLAS wrapper
|
||||||
|
def ArgumentsWrapperC(self, flavour):
|
||||||
|
return (self.Options() + self.Sizes() +
|
||||||
|
self.ScalarUseWrapperC("alpha", flavour) +
|
||||||
|
list(chain(*[self.BufferWrapperC(b, flavour) for b in self.BuffersFirst()])) +
|
||||||
|
self.ScalarUseWrapperC("beta", flavour) +
|
||||||
|
list(chain(*[self.BufferWrapperC(b, flavour) for b in self.BuffersSecond()])) +
|
||||||
|
list(chain(*[self.BufferWrapperC(b, flavour) for b in self.ScalarBuffersSecond()])) +
|
||||||
|
list(chain(*[self.ScalarUseWrapperC(s, flavour) for s in self.OtherScalars()])))
|
||||||
|
|
||||||
# Retrieves a combination of all the argument definitions
|
# Retrieves a combination of all the argument definitions
|
||||||
def ArgumentsDef(self, flavour):
|
def ArgumentsDef(self, flavour):
|
||||||
return (self.OptionsDef() + self.SizesDef() + self.BufferDef("dot") +
|
return (self.OptionsDef() + self.SizesDef() +
|
||||||
|
list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersFirst()])) +
|
||||||
self.ScalarDef("alpha", flavour) +
|
self.ScalarDef("alpha", flavour) +
|
||||||
list(chain(*[self.BufferDef(b) for b in self.BuffersFirst()])) +
|
list(chain(*[self.BufferDef(b) for b in self.BuffersFirst()])) +
|
||||||
self.ScalarDef("beta", flavour) +
|
self.ScalarDef("beta", flavour) +
|
||||||
list(chain(*[self.BufferDef(b) for b in self.BuffersSecond()])) +
|
list(chain(*[self.BufferDef(b) for b in self.BuffersSecond()])) +
|
||||||
list(chain(*[self.ScalarDef(s, flavour) for s in ["d1","d2","a","b","c","s"]])))
|
list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersSecond()])) +
|
||||||
|
list(chain(*[self.ScalarDef(s, flavour) for s in self.OtherScalars()])))
|
||||||
|
|
||||||
# As above, but clBLAS wrapper plain datatypes
|
# As above, but clBLAS wrapper plain datatypes
|
||||||
def ArgumentsDefWrapper(self, flavour):
|
def ArgumentsDefWrapperCL(self, flavour):
|
||||||
return (self.OptionsDefWrapper() + self.SizesDef() + self.BufferDef("dot") +
|
return (self.OptionsDefWrapperCL() + self.SizesDef() +
|
||||||
|
list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersFirst()])) +
|
||||||
self.ScalarDefPlain("alpha", flavour) +
|
self.ScalarDefPlain("alpha", flavour) +
|
||||||
list(chain(*[self.BufferDef(b) for b in self.BuffersFirst()])) +
|
list(chain(*[self.BufferDef(b) for b in self.BuffersFirst()])) +
|
||||||
self.ScalarDefPlain("beta", flavour) +
|
self.ScalarDefPlain("beta", flavour) +
|
||||||
list(chain(*[self.BufferDef(b) for b in self.BuffersSecond()])) +
|
list(chain(*[self.BufferDef(b) for b in self.BuffersSecond()])) +
|
||||||
list(chain(*[self.ScalarDefPlain(s, flavour) for s in ["d1","d2","a","b","c","s"]])))
|
list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersSecond()])) +
|
||||||
|
list(chain(*[self.ScalarDefPlain(s, flavour) for s in self.OtherScalars()])))
|
||||||
|
|
||||||
|
# As above, but CBLAS wrapper plain datatypes
|
||||||
|
def ArgumentsDefWrapperC(self, flavour):
|
||||||
|
return (self.OptionsDefWrapperC() + self.SizesDef() +
|
||||||
|
list(chain(*[self.BufferDefVector(b, flavour) for b in self.ScalarBuffersFirst()])) +
|
||||||
|
self.ScalarDefPlain("alpha", flavour) +
|
||||||
|
list(chain(*[self.BufferDefVector(b, flavour) for b in self.BuffersFirst()])) +
|
||||||
|
self.ScalarDefPlain("beta", flavour) +
|
||||||
|
list(chain(*[self.BufferDefVector(b, flavour) for b in self.BuffersSecond()])) +
|
||||||
|
list(chain(*[self.BufferDefVector(b, flavour) for b in self.ScalarBuffersSecond()])) +
|
||||||
|
list(chain(*[self.ScalarDefPlain(s, flavour) for s in self.OtherScalars()])))
|
||||||
|
|
||||||
# Retrieves a combination of all the argument types
|
# Retrieves a combination of all the argument types
|
||||||
def ArgumentsType(self, flavour):
  """Collects the types of all routine arguments into one flat list.

  The order matches the argument definitions: options, sizes, leading scalar buffers,
  alpha, first buffers, beta, second buffers, trailing scalar buffers, other scalars.
  """
  options = self.OptionsType()
  sizes = self.SizesType()
  scalar_buffers_first = list(chain.from_iterable(
      self.BufferType(name) for name in self.ScalarBuffersFirst()))
  alpha = self.ScalarType("alpha", flavour)
  buffers_first = list(chain.from_iterable(
      self.BufferType(name) for name in self.BuffersFirst()))
  beta = self.ScalarType("beta", flavour)
  buffers_second = list(chain.from_iterable(
      self.BufferType(name) for name in self.BuffersSecond()))
  scalar_buffers_second = list(chain.from_iterable(
      self.BufferType(name) for name in self.ScalarBuffersSecond()))
  other_scalars = list(chain.from_iterable(
      self.ScalarType(name, flavour) for name in self.OtherScalars()))
  return (options + sizes + scalar_buffers_first + alpha + buffers_first + beta +
          buffers_second + scalar_buffers_second + other_scalars)
|
||||||
|
|
||||||
|
# Retrieves a combination of the documentation entries of all the arguments
|
||||||
|
def ArgumentsDoc(self):
  """Collects the documentation entries of all routine arguments into one flat list.

  The order matches the other Arguments* methods: options, sizes, leading scalar buffers,
  alpha, first buffers, beta, second buffers, trailing scalar buffers, other scalars.
  The leading scalar buffers are included exactly once; they used to be concatenated
  twice, which duplicated their entries in the result.
  """
  return (self.OptionsDoc() + self.SizesDoc() +
          list(chain(*[self.BufferDoc(b) for b in self.ScalarBuffersFirst()])) +
          self.ScalarDoc("alpha") +
          list(chain(*[self.BufferDoc(b) for b in self.BuffersFirst()])) +
          self.ScalarDoc("beta") +
          list(chain(*[self.BufferDoc(b) for b in self.BuffersSecond()])) +
          list(chain(*[self.BufferDoc(b) for b in self.ScalarBuffersSecond()])) +
          list(chain(*[self.ScalarDoc(s) for s in self.OtherScalars()])))
|
||||||
|
|
||||||
|
# ==============================================================================================
|
||||||
|
|
||||||
|
# Retrieves a list of routine requirements for documentation
|
||||||
|
def RequirementsDoc(self):
  """Returns the list of routine requirements for the documentation (currently none)."""
  requirements = list()
  return requirements
|
||||||
|
|
||||||
# ==============================================================================================
|
# ==============================================================================================
|
||||||
|
|
||||||
# Retrieves the C++ templated definition for a routine
|
# Retrieves the C++ templated definition for a routine
|
||||||
def RoutineHeaderCPP(self, spaces, default_event):
  """Builds the templated C++ header of the routine as a single string.

  Arguments are placed one per line, aligned using `spaces` plus self.Length() blanks.
  `default_event` is appended verbatim after the trailing `cl_event* event` parameter.
  """
  separator = ",\n" + " " * (spaces + self.Length())
  pieces = [
    "template <", self.template.name, ">\n",
    "StatusCode ", self.name.capitalize(), "(",
    separator.join(self.ArgumentsDef(self.template)),
    separator, "cl_command_queue* queue, cl_event* event", default_event, ")",
  ]
  return "".join(pieces)
|
||||||
|
|
||||||
# As above, but now without variable names
|
# As above, but now without variable names
|
||||||
|
@ -326,15 +503,15 @@ class Routine():
|
||||||
return result
|
return result
|
||||||
|
|
||||||
# As above, but now for C
|
# As above, but now for C
|
||||||
def RoutineHeaderC(self, flavour, spaces, extra_qualifier):
  """Builds the C API header of the routine for the given flavour as a single string.

  `extra_qualifier` is inserted verbatim between the `StatusCode` return type and the
  function name. Arguments are placed one per line, aligned using `spaces` plus
  self.Length() blanks.
  """
  separator = ",\n" + " " * (spaces + self.Length())
  pieces = [
    "StatusCode", extra_qualifier, " CLBlast", flavour.name, self.name, "(",
    separator.join(self.ArgumentsDef(flavour)),
    separator, "cl_command_queue* queue, cl_event* event)",
  ]
  return "".join(pieces)
|
||||||
|
|
||||||
# As above, but now for the clBLAS wrapper
|
# As above, but now for the clBLAS wrapper
|
||||||
def RoutineHeaderWrapper(self, flavour, def_only, spaces):
|
def RoutineHeaderWrapperCL(self, flavour, def_only, spaces):
|
||||||
template = "<"+flavour.template+">" if self.NoScalars() and not def_only else ""
|
template = "<"+flavour.template+">" if self.NoScalars() and not def_only else ""
|
||||||
indent = " "*(spaces + self.Length() + len(template))
|
indent = " "*(spaces + self.Length() + len(template))
|
||||||
result = ""
|
result = ""
|
||||||
|
@ -344,9 +521,16 @@ class Routine():
|
||||||
result += flavour.name
|
result += flavour.name
|
||||||
result += ">\n"
|
result += ">\n"
|
||||||
result += "clblasStatus clblasX"+self.name+template+"("
|
result += "clblasStatus clblasX"+self.name+template+"("
|
||||||
result += (",\n"+indent).join([a for a in self.ArgumentsDefWrapper(flavour)])
|
result += (",\n"+indent).join([a for a in self.ArgumentsDefWrapperCL(flavour)])
|
||||||
result += ",\n"+indent+"cl_uint num_queues, cl_command_queue *queues"
|
result += ",\n"+indent+"cl_uint num_queues, cl_command_queue *queues"
|
||||||
result += ",\n"+indent+"cl_uint num_wait_events, const cl_event *wait_events, cl_event *events)"
|
result += ",\n"+indent+"cl_uint num_wait_events, const cl_event *wait_events, cl_event *events)"
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
# As above, but now for the CBLAS wrapper
|
||||||
|
def RoutineHeaderWrapperC(self, flavour, def_only, spaces):
  """Builds the header of the CBLAS wrapper function as a single string.

  Arguments are placed one per line, aligned using `spaces` plus self.Length() blanks.
  `def_only` is accepted but not used by this method.
  """
  separator = ",\n" + " " * (spaces + self.Length())
  arguments = separator.join(self.ArgumentsDefWrapperC(flavour))
  return "void cblasX" + self.name + "(" + arguments + ")"
|
||||||
|
|
||||||
# ==================================================================================================
|
# ==================================================================================================
|
||||||
|
|
113
src/cache.cc
Normal file
113
src/cache.cc
Normal file
|
@ -0,0 +1,113 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the caching functionality of compiled binaries and programs.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
#include <mutex>
|
||||||
|
|
||||||
|
#include "internal/cache.h"
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
namespace cache {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Stores the compiled binary or IR in the cache
|
||||||
|
void StoreBinaryToCache(const std::string &binary, const std::string &device_name,
|
||||||
|
const Precision &precision, const std::string &routine_name) {
|
||||||
|
binary_cache_mutex_.lock();
|
||||||
|
binary_cache_.push_back(BinaryCache{binary, device_name, precision, routine_name});
|
||||||
|
binary_cache_mutex_.unlock();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Stores the compiled program in the cache
|
||||||
|
void StoreProgramToCache(const Program &program, const Context &context,
|
||||||
|
const Precision &precision, const std::string &routine_name) {
|
||||||
|
program_cache_mutex_.lock();
|
||||||
|
program_cache_.push_back(ProgramCache{program, context.pointer(), precision, routine_name});
|
||||||
|
program_cache_mutex_.unlock();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Queries the cache and retrieves a matching binary. Assumes that the match is available, throws
|
||||||
|
// otherwise.
|
||||||
|
const std::string& GetBinaryFromCache(const std::string &device_name, const Precision &precision,
|
||||||
|
const std::string &routine_name) {
|
||||||
|
binary_cache_mutex_.lock();
|
||||||
|
for (auto &cached_binary: binary_cache_) {
|
||||||
|
if (cached_binary.MatchInCache(device_name, precision, routine_name)) {
|
||||||
|
binary_cache_mutex_.unlock();
|
||||||
|
return cached_binary.binary;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
binary_cache_mutex_.unlock();
|
||||||
|
throw std::runtime_error("Internal CLBlast error: Expected binary in cache, but found none.");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Queries the cache and retrieves a matching program. Assumes that the match is available, throws
|
||||||
|
// otherwise.
|
||||||
|
const Program& GetProgramFromCache(const Context &context, const Precision &precision,
                                   const std::string &routine_name) {
  // Scoped lock: released on the early return, on the throw below and if MatchInCache throws
  const std::lock_guard<decltype(program_cache_mutex_)> lock(program_cache_mutex_);
  for (auto &cached_program: program_cache_) {
    if (cached_program.MatchInCache(context.pointer(), precision, routine_name)) {
      // NOTE(review): the returned reference points into an element of program_cache_ and is no
      // longer protected by the mutex once this function returns; a concurrent store or clear
      // could presumably invalidate it. The signature is kept as-is for the callers.
      return cached_program.program;
    }
  }
  throw std::runtime_error("Internal CLBlast error: Expected program in cache, but found none.");
}
|
||||||
|
|
||||||
|
// Queries the cache to see whether or not the compiled kernel is already there
|
||||||
|
bool BinaryIsInCache(const std::string &device_name, const Precision &precision,
|
||||||
|
const std::string &routine_name) {
|
||||||
|
binary_cache_mutex_.lock();
|
||||||
|
for (auto &cached_binary: binary_cache_) {
|
||||||
|
if (cached_binary.MatchInCache(device_name, precision, routine_name)) {
|
||||||
|
binary_cache_mutex_.unlock();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
binary_cache_mutex_.unlock();
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Queries the cache to see whether or not the compiled kernel is already there
|
||||||
|
bool ProgramIsInCache(const Context &context, const Precision &precision,
|
||||||
|
const std::string &routine_name) {
|
||||||
|
program_cache_mutex_.lock();
|
||||||
|
for (auto &cached_program: program_cache_) {
|
||||||
|
if (cached_program.MatchInCache(context.pointer(), precision, routine_name)) {
|
||||||
|
program_cache_mutex_.unlock();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
program_cache_mutex_.unlock();
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Clears the cache of stored binaries and programs
|
||||||
|
StatusCode ClearCache() {
  // Each cache is cleared under its own mutex. The two locks are taken one after the other
  // (never nested), and each is released at the end of its scope even if clear() throws.
  {
    const std::lock_guard<decltype(binary_cache_mutex_)> lock(binary_cache_mutex_);
    binary_cache_.clear();
  }
  {
    const std::lock_guard<decltype(program_cache_mutex_)> lock(program_cache_mutex_);
    program_cache_.clear();
  }
  return StatusCode::kSuccess;
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace cache
|
||||||
|
} // namespace clblast
|
695
src/clblast.cc
695
src/clblast.cc
File diff suppressed because it is too large
Load diff
380
src/clblast_c.cc
380
src/clblast_c.cc
|
@ -13,9 +13,7 @@
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
extern "C" {
|
#include "clblast_c.h"
|
||||||
#include "clblast_c.h"
|
|
||||||
}
|
|
||||||
#include "clblast.h"
|
#include "clblast.h"
|
||||||
#include "internal/utilities.h"
|
#include "internal/utilities.h"
|
||||||
|
|
||||||
|
@ -27,6 +25,118 @@ using double2 = clblast::double2;
|
||||||
// BLAS level-1 (vector-vector) routines
|
// BLAS level-1 (vector-vector) routines
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
|
// ROTG
|
||||||
|
// Single-precision: forwards to the C++ template and converts its status to the C status type
StatusCode CLBlastSrotg(cl_mem sa_buffer, const size_t sa_offset,
                        cl_mem sb_buffer, const size_t sb_offset,
                        cl_mem sc_buffer, const size_t sc_offset,
                        cl_mem ss_buffer, const size_t ss_offset,
                        cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Rotg<float>(sa_buffer, sa_offset,
                                                      sb_buffer, sb_offset,
                                                      sc_buffer, sc_offset,
                                                      ss_buffer, ss_offset,
                                                      queue, event));
}
// Double-precision variant of the above
StatusCode CLBlastDrotg(cl_mem sa_buffer, const size_t sa_offset,
                        cl_mem sb_buffer, const size_t sb_offset,
                        cl_mem sc_buffer, const size_t sc_offset,
                        cl_mem ss_buffer, const size_t ss_offset,
                        cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Rotg<double>(sa_buffer, sa_offset,
                                                       sb_buffer, sb_offset,
                                                       sc_buffer, sc_offset,
                                                       ss_buffer, ss_offset,
                                                       queue, event));
}
|
||||||
|
|
||||||
|
// ROTMG
|
||||||
|
// Single-precision: forwards to the C++ template and converts its status to the C status type
StatusCode CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
                         cl_mem sd2_buffer, const size_t sd2_offset,
                         cl_mem sx1_buffer, const size_t sx1_offset,
                         const cl_mem sy1_buffer, const size_t sy1_offset,
                         cl_mem sparam_buffer, const size_t sparam_offset,
                         cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Rotmg<float>(sd1_buffer, sd1_offset,
                                                       sd2_buffer, sd2_offset,
                                                       sx1_buffer, sx1_offset,
                                                       sy1_buffer, sy1_offset,
                                                       sparam_buffer, sparam_offset,
                                                       queue, event));
}
// Double-precision variant of the above
StatusCode CLBlastDrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
                         cl_mem sd2_buffer, const size_t sd2_offset,
                         cl_mem sx1_buffer, const size_t sx1_offset,
                         const cl_mem sy1_buffer, const size_t sy1_offset,
                         cl_mem sparam_buffer, const size_t sparam_offset,
                         cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Rotmg<double>(sd1_buffer, sd1_offset,
                                                        sd2_buffer, sd2_offset,
                                                        sx1_buffer, sx1_offset,
                                                        sy1_buffer, sy1_offset,
                                                        sparam_buffer, sparam_offset,
                                                        queue, event));
}
|
||||||
|
|
||||||
|
// ROT
|
||||||
|
// Single-precision: the template argument of clblast::Rot is deduced from the scalar arguments
StatusCode CLBlastSrot(const size_t n,
                       cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                       cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                       const float cos,
                       const float sin,
                       cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Rot(n,
                                              x_buffer, x_offset, x_inc,
                                              y_buffer, y_offset, y_inc,
                                              cos,
                                              sin,
                                              queue, event));
}
// Double-precision variant of the above
StatusCode CLBlastDrot(const size_t n,
                       cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                       cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                       const double cos,
                       const double sin,
                       cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Rot(n,
                                              x_buffer, x_offset, x_inc,
                                              y_buffer, y_offset, y_inc,
                                              cos,
                                              sin,
                                              queue, event));
}
|
||||||
|
|
||||||
|
// ROTM
|
||||||
|
// Single-precision: forwards to the C++ template and converts its status to the C status type
StatusCode CLBlastSrotm(const size_t n,
                        cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                        cl_mem sparam_buffer, const size_t sparam_offset,
                        cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Rotm<float>(n,
                                                      x_buffer, x_offset, x_inc,
                                                      y_buffer, y_offset, y_inc,
                                                      sparam_buffer, sparam_offset,
                                                      queue, event));
}
// Double-precision variant of the above
StatusCode CLBlastDrotm(const size_t n,
                        cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                        cl_mem sparam_buffer, const size_t sparam_offset,
                        cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Rotm<double>(n,
                                                       x_buffer, x_offset, x_inc,
                                                       y_buffer, y_offset, y_inc,
                                                       sparam_buffer, sparam_offset,
                                                       queue, event));
}
|
||||||
|
|
||||||
// SWAP
|
// SWAP
|
||||||
StatusCode CLBlastSswap(const size_t n,
|
StatusCode CLBlastSswap(const size_t n,
|
||||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||||
|
@ -281,6 +391,258 @@ StatusCode CLBlastZdotc(const size_t n,
|
||||||
return static_cast<StatusCode>(status);
|
return static_cast<StatusCode>(status);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// NRM2
|
||||||
|
// Single-precision: forwards to the C++ template and converts its status to the C status type
StatusCode CLBlastSnrm2(const size_t n,
                        cl_mem nrm2_buffer, const size_t nrm2_offset,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Nrm2<float>(n,
                                                      nrm2_buffer, nrm2_offset,
                                                      x_buffer, x_offset, x_inc,
                                                      queue, event));
}
// Double-precision variant
StatusCode CLBlastDnrm2(const size_t n,
                        cl_mem nrm2_buffer, const size_t nrm2_offset,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Nrm2<double>(n,
                                                       nrm2_buffer, nrm2_offset,
                                                       x_buffer, x_offset, x_inc,
                                                       queue, event));
}
// Complex single-precision variant
StatusCode CLBlastScnrm2(const size_t n,
                         cl_mem nrm2_buffer, const size_t nrm2_offset,
                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                         cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Nrm2<float2>(n,
                                                       nrm2_buffer, nrm2_offset,
                                                       x_buffer, x_offset, x_inc,
                                                       queue, event));
}
// Complex double-precision variant
StatusCode CLBlastDznrm2(const size_t n,
                         cl_mem nrm2_buffer, const size_t nrm2_offset,
                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                         cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Nrm2<double2>(n,
                                                        nrm2_buffer, nrm2_offset,
                                                        x_buffer, x_offset, x_inc,
                                                        queue, event));
}
|
||||||
|
|
||||||
|
// ASUM
|
||||||
|
// Single-precision: forwards to the C++ template and converts its status to the C status type
StatusCode CLBlastSasum(const size_t n,
                        cl_mem asum_buffer, const size_t asum_offset,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Asum<float>(n,
                                                      asum_buffer, asum_offset,
                                                      x_buffer, x_offset, x_inc,
                                                      queue, event));
}
// Double-precision variant
StatusCode CLBlastDasum(const size_t n,
                        cl_mem asum_buffer, const size_t asum_offset,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Asum<double>(n,
                                                       asum_buffer, asum_offset,
                                                       x_buffer, x_offset, x_inc,
                                                       queue, event));
}
// Complex single-precision variant
StatusCode CLBlastScasum(const size_t n,
                         cl_mem asum_buffer, const size_t asum_offset,
                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                         cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Asum<float2>(n,
                                                       asum_buffer, asum_offset,
                                                       x_buffer, x_offset, x_inc,
                                                       queue, event));
}
// Complex double-precision variant
StatusCode CLBlastDzasum(const size_t n,
                         cl_mem asum_buffer, const size_t asum_offset,
                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                         cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Asum<double2>(n,
                                                        asum_buffer, asum_offset,
                                                        x_buffer, x_offset, x_inc,
                                                        queue, event));
}
|
||||||
|
|
||||||
|
// SUM
|
||||||
|
// Single-precision: forwards to the C++ template and converts its status to the C status type
StatusCode CLBlastSsum(const size_t n,
                       cl_mem sum_buffer, const size_t sum_offset,
                       const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                       cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Sum<float>(n,
                                                     sum_buffer, sum_offset,
                                                     x_buffer, x_offset, x_inc,
                                                     queue, event));
}
// Double-precision variant
StatusCode CLBlastDsum(const size_t n,
                       cl_mem sum_buffer, const size_t sum_offset,
                       const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                       cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Sum<double>(n,
                                                      sum_buffer, sum_offset,
                                                      x_buffer, x_offset, x_inc,
                                                      queue, event));
}
// Complex single-precision variant
StatusCode CLBlastScsum(const size_t n,
                        cl_mem sum_buffer, const size_t sum_offset,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Sum<float2>(n,
                                                      sum_buffer, sum_offset,
                                                      x_buffer, x_offset, x_inc,
                                                      queue, event));
}
// Complex double-precision variant
StatusCode CLBlastDzsum(const size_t n,
                        cl_mem sum_buffer, const size_t sum_offset,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Sum<double2>(n,
                                                       sum_buffer, sum_offset,
                                                       x_buffer, x_offset, x_inc,
                                                       queue, event));
}
|
||||||
|
|
||||||
|
// AMAX
|
||||||
|
// Single-precision: forwards to the C++ template and converts its status to the C status type
StatusCode CLBlastiSamax(const size_t n,
                         cl_mem imax_buffer, const size_t imax_offset,
                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                         cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Amax<float>(n,
                                                      imax_buffer, imax_offset,
                                                      x_buffer, x_offset, x_inc,
                                                      queue, event));
}
// Double-precision variant
StatusCode CLBlastiDamax(const size_t n,
                         cl_mem imax_buffer, const size_t imax_offset,
                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                         cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Amax<double>(n,
                                                       imax_buffer, imax_offset,
                                                       x_buffer, x_offset, x_inc,
                                                       queue, event));
}
// Complex single-precision variant
StatusCode CLBlastiCamax(const size_t n,
                         cl_mem imax_buffer, const size_t imax_offset,
                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                         cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Amax<float2>(n,
                                                       imax_buffer, imax_offset,
                                                       x_buffer, x_offset, x_inc,
                                                       queue, event));
}
// Complex double-precision variant
StatusCode CLBlastiZamax(const size_t n,
                         cl_mem imax_buffer, const size_t imax_offset,
                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                         cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Amax<double2>(n,
                                                        imax_buffer, imax_offset,
                                                        x_buffer, x_offset, x_inc,
                                                        queue, event));
}
|
||||||
|
|
||||||
|
// MAX
|
||||||
|
// Single-precision: forwards to the C++ template and converts its status to the C status type
StatusCode CLBlastiSmax(const size_t n,
                        cl_mem imax_buffer, const size_t imax_offset,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Max<float>(n,
                                                     imax_buffer, imax_offset,
                                                     x_buffer, x_offset, x_inc,
                                                     queue, event));
}
// Double-precision variant
StatusCode CLBlastiDmax(const size_t n,
                        cl_mem imax_buffer, const size_t imax_offset,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Max<double>(n,
                                                      imax_buffer, imax_offset,
                                                      x_buffer, x_offset, x_inc,
                                                      queue, event));
}
// Complex single-precision variant
StatusCode CLBlastiCmax(const size_t n,
                        cl_mem imax_buffer, const size_t imax_offset,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Max<float2>(n,
                                                      imax_buffer, imax_offset,
                                                      x_buffer, x_offset, x_inc,
                                                      queue, event));
}
// Complex double-precision variant
StatusCode CLBlastiZmax(const size_t n,
                        cl_mem imax_buffer, const size_t imax_offset,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Max<double2>(n,
                                                       imax_buffer, imax_offset,
                                                       x_buffer, x_offset, x_inc,
                                                       queue, event));
}
|
||||||
|
|
||||||
|
// MIN
|
||||||
|
// Single-precision: forwards to the C++ template and converts its status to the C status type
StatusCode CLBlastiSmin(const size_t n,
                        cl_mem imin_buffer, const size_t imin_offset,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Min<float>(n,
                                                     imin_buffer, imin_offset,
                                                     x_buffer, x_offset, x_inc,
                                                     queue, event));
}
// Double-precision variant
StatusCode CLBlastiDmin(const size_t n,
                        cl_mem imin_buffer, const size_t imin_offset,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Min<double>(n,
                                                      imin_buffer, imin_offset,
                                                      x_buffer, x_offset, x_inc,
                                                      queue, event));
}
// Complex single-precision variant
StatusCode CLBlastiCmin(const size_t n,
                        cl_mem imin_buffer, const size_t imin_offset,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Min<float2>(n,
                                                      imin_buffer, imin_offset,
                                                      x_buffer, x_offset, x_inc,
                                                      queue, event));
}
// Complex double-precision variant
StatusCode CLBlastiZmin(const size_t n,
                        cl_mem imin_buffer, const size_t imin_offset,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Min<double2>(n,
                                                       imin_buffer, imin_offset,
                                                       x_buffer, x_offset, x_inc,
                                                       queue, event));
}
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
// BLAS level-2 (matrix-vector) routines
|
// BLAS level-2 (matrix-vector) routines
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -2022,3 +2384,15 @@ StatusCode CLBlastZtrsm(const Layout layout, const Side side, const Triangle tri
|
||||||
}
|
}
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Clears the cache of stored binaries
|
||||||
|
StatusCode CLBlastClearCache() {
  // Forward to the C++ API and convert its status to the C status type
  const auto status = clblast::ClearCache();
  return static_cast<StatusCode>(status);
}
|
||||||
|
|
||||||
|
// Fills the cache with binaries for a specific device
|
||||||
|
StatusCode CLBlastFillCache(const cl_device_id device) {
  // Forward to the C++ API and convert its status to the C status type
  const auto status = clblast::FillCache(device);
  return static_cast<StatusCode>(status);
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
|
@ -40,6 +40,7 @@ R"(
|
||||||
typedef float16 real16;
|
typedef float16 real16;
|
||||||
#define ZERO 0.0f
|
#define ZERO 0.0f
|
||||||
#define ONE 1.0f
|
#define ONE 1.0f
|
||||||
|
#define SMALLEST -1.0e37f
|
||||||
|
|
||||||
// Double-precision
|
// Double-precision
|
||||||
#elif PRECISION == 64
|
#elif PRECISION == 64
|
||||||
|
@ -50,6 +51,7 @@ R"(
|
||||||
typedef double16 real16;
|
typedef double16 real16;
|
||||||
#define ZERO 0.0
|
#define ZERO 0.0
|
||||||
#define ONE 1.0
|
#define ONE 1.0
|
||||||
|
#define SMALLEST -1.0e37
|
||||||
|
|
||||||
// Complex single-precision
|
// Complex single-precision
|
||||||
#elif PRECISION == 3232
|
#elif PRECISION == 3232
|
||||||
|
@ -64,6 +66,7 @@ R"(
|
||||||
real sC; real sD; real sE; real sF;} real16;
|
real sC; real sD; real sE; real sF;} real16;
|
||||||
#define ZERO 0.0f
|
#define ZERO 0.0f
|
||||||
#define ONE 1.0f
|
#define ONE 1.0f
|
||||||
|
#define SMALLEST -1.0e37f
|
||||||
|
|
||||||
// Complex Double-precision
|
// Complex Double-precision
|
||||||
#elif PRECISION == 6464
|
#elif PRECISION == 6464
|
||||||
|
@ -78,6 +81,16 @@ R"(
|
||||||
real sC; real sD; real sE; real sF;} real16;
|
real sC; real sD; real sE; real sF;} real16;
|
||||||
#define ZERO 0.0
|
#define ZERO 0.0
|
||||||
#define ONE 1.0
|
#define ONE 1.0
|
||||||
|
#define SMALLEST -1.0e37
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Single-element version of a complex number
|
||||||
|
#if PRECISION == 3232
|
||||||
|
typedef float singlereal;
|
||||||
|
#elif PRECISION == 6464
|
||||||
|
typedef double singlereal;
|
||||||
|
#else
|
||||||
|
typedef real singlereal;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
@ -109,6 +122,13 @@ R"(
|
||||||
#define SetToOne(a) a = ONE
|
#define SetToOne(a) a = ONE
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// The absolute value (component-wise)
|
||||||
|
#if PRECISION == 3232 || PRECISION == 6464
|
||||||
|
#define AbsoluteValue(value) value.x = fabs(value.x); value.y = fabs(value.y)
|
||||||
|
#else
|
||||||
|
#define AbsoluteValue(value) value = fabs(value)
|
||||||
|
#endif
|
||||||
|
|
||||||
// Adds two complex variables
|
// Adds two complex variables
|
||||||
#if PRECISION == 3232 || PRECISION == 6464
|
#if PRECISION == 3232 || PRECISION == 6464
|
||||||
#define Add(c, a, b) c.x = a.x + b.x; c.y = a.y + b.y
|
#define Add(c, a, b) c.x = a.x + b.x; c.y = a.y + b.y
|
||||||
|
|
140
src/kernels/level1/xamax.opencl
Normal file
140
src/kernels/level1/xamax.opencl
Normal file
|
@ -0,0 +1,140 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file contains the Xamax kernel. It implements an index of absolute max computation using
|
||||||
|
// reduction kernels. Reduction is split in two parts. In the first (main) kernel the X vector is
|
||||||
|
// loaded, followed by a per-thread and a per-workgroup reduction. The second (epilogue) kernel
|
||||||
|
// is executed with a single workgroup only, computing the final result.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
|
||||||
|
// literal). Comment-out this line for syntax-highlighting when developing.
|
||||||
|
R"(
|
||||||
|
|
||||||
|
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
|
||||||
|
// this kernel file is used outside of the CLBlast library.
|
||||||
|
#ifndef WGS1
|
||||||
|
#define WGS1 64 // The local work-group size of the main kernel
|
||||||
|
#endif
|
||||||
|
#ifndef WGS2
|
||||||
|
#define WGS2 64 // The local work-group size of the epilogue kernel
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// The main reduction kernel, performing the loading and the majority of the operation. Every
// work-item walks over the X vector with a stride of WGS1*num_groups, keeps its best candidate
// (value and position), after which the candidates of a work-group are reduced in local memory.
// One (value, position) pair per work-group is written to 'maxgm' and 'imaxgm'.
//
// Notes on the current behaviour:
//  - The stored position is the memory position 'id*x_inc + x_offset' of the element.
//  - Because of the '>=' comparisons, a candidate encountered later replaces an equal earlier one.
//  - Work-items without any element to process contribute the initial value and position 0.
__attribute__((reqd_work_group_size(WGS1, 1, 1)))
__kernel void Xamax(const int n,
                    const __global real* restrict xgm, const int x_offset, const int x_inc,
                    __global singlereal* maxgm, __global unsigned int* imaxgm) {
  __local singlereal maxlm[WGS1];
  __local unsigned int imaxlm[WGS1];
  const int lid = get_local_id(0);
  const int wgid = get_group_id(0);
  const int num_groups = get_num_groups(0);

  // Performs loading and the first steps of the reduction
  #if defined(ROUTINE_MAX) || defined(ROUTINE_MIN) // non-absolute version
    singlereal max = SMALLEST;
  #else
    singlereal max = ZERO;
  #endif
  unsigned int imax = 0;
  int id = wgid*WGS1 + lid;
  while (id < n) {
    const int x_index = id*x_inc + x_offset;
    #if PRECISION == 3232 || PRECISION == 6464
      const real xvalue = xgm[x_index];
      singlereal x = xvalue.x;
    #else
      singlereal x = xgm[x_index];
    #endif
    #if defined(ROUTINE_MAX) // non-absolute maximum version
      // nothing special here
    #elif defined(ROUTINE_MIN) // non-absolute minimum version
      x = -x;
    #else // absolute maximum version
      #if PRECISION == 3232 || PRECISION == 6464
        // BLAS defines the magnitude of a complex element for ixAMAX as |real| + |imaginary|
        x = fabs(xvalue.x) + fabs(xvalue.y);
      #else
        x = fabs(x);
      #endif
    #endif
    if (x >= max) {
      max = x;
      imax = x_index;
    }
    id += WGS1*num_groups;
  }
  maxlm[lid] = max;
  imaxlm[lid] = imax;
  barrier(CLK_LOCAL_MEM_FENCE);

  // Performs reduction in local memory: each step halves the number of remaining candidates
  #pragma unroll
  for (int s=WGS1/2; s>0; s=s>>1) {
    if (lid < s) {
      if (maxlm[lid + s] >= maxlm[lid]) {
        maxlm[lid] = maxlm[lid + s];
        imaxlm[lid] = imaxlm[lid + s];
      }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
  }

  // Stores the per-workgroup result
  if (lid == 0) {
    maxgm[wgid] = maxlm[0];
    imaxgm[wgid] = imaxlm[0];
  }
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Epilogue of the index-of-maximum reduction: merges the per-work-group candidates produced by the
// main kernel into one final position. This kernel has to be launched with a single work-group of
// WGS2 work-items; it reads 2*WGS2 candidates from 'maxgm' and 'imaxgm'.
__attribute__((reqd_work_group_size(WGS2, 1, 1)))
__kernel void XamaxEpilogue(const __global singlereal* restrict maxgm,
                            const __global unsigned int* restrict imaxgm,
                            __global unsigned int* imax, const int imax_offset) {
  __local singlereal best_value[WGS2];
  __local unsigned int best_index[WGS2];
  const int tid = get_local_id(0);

  // First reduction step, fused with the load: picks the better of the two candidates that are
  // WGS2 apart (the one from the upper half wins when both values are equal)
  const int winner = (maxgm[tid + WGS2] >= maxgm[tid]) ? (tid + WGS2) : tid;
  best_value[tid] = maxgm[winner];
  best_index[tid] = imaxgm[winner];
  barrier(CLK_LOCAL_MEM_FENCE);

  // Tree reduction in local memory, halving the number of candidates each step
  #pragma unroll
  for (int stride = WGS2/2; stride > 0; stride >>= 1) {
    if (tid < stride && best_value[tid + stride] >= best_value[tid]) {
      best_value[tid] = best_value[tid + stride];
      best_index[tid] = best_index[tid + stride];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
  }

  // The first work-item holds the overall winner and writes out its position
  if (tid == 0) {
    imax[imax_offset] = best_index[0];
  }
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// End of the C++11 raw string literal
|
||||||
|
)"
|
||||||
|
|
||||||
|
// =================================================================================================
|
111
src/kernels/level1/xasum.opencl
Normal file
111
src/kernels/level1/xasum.opencl
Normal file
|
@ -0,0 +1,111 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file contains the Xasum kernel. It implements an absolute sum computation using reduction
|
||||||
|
// kernels. Reduction is split in two parts. In the first (main) kernel the X vector is loaded,
|
||||||
|
// followed by a per-thread and a per-workgroup reduction. The second (epilogue) kernel
|
||||||
|
// is executed with a single workgroup only, computing the final result.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
|
||||||
|
// literal). Comment-out this line for syntax-highlighting when developing.
|
||||||
|
R"(
|
||||||
|
|
||||||
|
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
|
||||||
|
// this kernel file is used outside of the CLBlast library.
|
||||||
|
#ifndef WGS1
|
||||||
|
#define WGS1 64 // The local work-group size of the main kernel
|
||||||
|
#endif
|
||||||
|
#ifndef WGS2
|
||||||
|
#define WGS2 64 // The local work-group size of the epilogue kernel
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Main kernel of the sum reduction: every work-item accumulates a strided subset of the X vector
// (component-wise absolute values, unless ROUTINE_SUM is defined), followed by a reduction over
// the work-group in local memory. One partial sum per work-group is written to 'output'.
__attribute__((reqd_work_group_size(WGS1, 1, 1)))
__kernel void Xasum(const int n,
                    const __global real* restrict xgm, const int x_offset, const int x_inc,
                    __global real* output) {
  __local real partial[WGS1];
  const int tid = get_local_id(0);
  const int group = get_group_id(0);
  const int stride = WGS1*get_num_groups(0);

  // Per-work-item accumulation over the elements group*WGS1 + tid, + stride, + 2*stride, ...
  real total;
  SetToZero(total);
  for (int id = group*WGS1 + tid; id < n; id += stride) {
    real value = xgm[id*x_inc + x_offset];
    #if !defined(ROUTINE_SUM) // the non-absolute version adds the values as they are
      AbsoluteValue(value);
    #endif
    Add(total, total, value);
  }
  partial[tid] = total;
  barrier(CLK_LOCAL_MEM_FENCE);

  // Tree reduction in local memory
  #pragma unroll
  for (int s = WGS1/2; s > 0; s >>= 1) {
    if (tid < s) {
      Add(partial[tid], partial[tid], partial[tid + s]);
    }
    barrier(CLK_LOCAL_MEM_FENCE);
  }

  // The first work-item writes the result of this work-group
  if (tid == 0) {
    output[group] = partial[0];
  }
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Epilogue of the sum reduction: adds up the 2*WGS2 partial sums in 'input' and stores the final
// value at 'asum[asum_offset]'. This kernel has to be launched with a single work-group only.
__attribute__((reqd_work_group_size(WGS2, 1, 1)))
__kernel void XasumEpilogue(const __global real* restrict input,
                            __global real* asum, const int asum_offset) {
  __local real partial[WGS2];
  const int tid = get_local_id(0);

  // First reduction step, fused with the load from global memory
  Add(partial[tid], input[tid], input[tid + WGS2]);
  barrier(CLK_LOCAL_MEM_FENCE);

  // Tree reduction in local memory
  #pragma unroll
  for (int s = WGS2/2; s > 0; s >>= 1) {
    if (tid < s) {
      Add(partial[tid], partial[tid], partial[tid + s]);
    }
    barrier(CLK_LOCAL_MEM_FENCE);
  }

  // Stores the final result. For complex data the real and imaginary sums are combined into the
  // real component only; the imaginary component of the output element is not written.
  if (tid == 0) {
    const real total = partial[0];
    #if PRECISION == 3232 || PRECISION == 6464
      asum[asum_offset].x = total.x + total.y; // the result is a non-complex number
    #else
      asum[asum_offset] = total;
    #endif
  }
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// End of the C++11 raw string literal
|
||||||
|
)"
|
||||||
|
|
||||||
|
// =================================================================================================
|
|
@ -30,7 +30,8 @@ __kernel void Xaxpy(const int n, const real alpha,
|
||||||
// Loops over the work that needs to be done (allows for an arbitrary number of threads)
|
// Loops over the work that needs to be done (allows for an arbitrary number of threads)
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
|
for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
|
||||||
MultiplyAdd(ygm[id*y_inc + y_offset], alpha, xgm[id*x_inc + x_offset]);
|
real xvalue = xgm[id*x_inc + x_offset];
|
||||||
|
MultiplyAdd(ygm[id*y_inc + y_offset], alpha, xvalue);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -45,7 +46,9 @@ __kernel void XaxpyFast(const int n, const real alpha,
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int w=0; w<WPT; ++w) {
|
for (int w=0; w<WPT; ++w) {
|
||||||
const int id = w*get_global_size(0) + get_global_id(0);
|
const int id = w*get_global_size(0) + get_global_id(0);
|
||||||
ygm[id] = MultiplyAddVector(ygm[id], alpha, xgm[id]);
|
realV xvalue = xgm[id];
|
||||||
|
realV yvalue = ygm[id];
|
||||||
|
ygm[id] = MultiplyAddVector(yvalue, alpha, xvalue);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
109
src/kernels/level1/xnrm2.opencl
Normal file
109
src/kernels/level1/xnrm2.opencl
Normal file
|
@ -0,0 +1,109 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file contains the Xnrm2 kernel. It implements a squared norm computation using reduction
|
||||||
|
// kernels. Reduction is split in two parts. In the first (main) kernel the X vector is squared,
|
||||||
|
// followed by a per-thread and a per-workgroup reduction. The second (epilogue) kernel
|
||||||
|
// is executed with a single workgroup only, computing the final result.
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
|
||||||
|
// literal). Comment-out this line for syntax-highlighting when developing.
|
||||||
|
R"(
|
||||||
|
|
||||||
|
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
|
||||||
|
// this kernel file is used outside of the CLBlast library.
|
||||||
|
#ifndef WGS1
|
||||||
|
#define WGS1 64 // The local work-group size of the main kernel
|
||||||
|
#endif
|
||||||
|
#ifndef WGS2
|
||||||
|
#define WGS2 64 // The local work-group size of the epilogue kernel
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Main kernel of the norm reduction: every work-item accumulates the product of each of its
// elements with its complex conjugate over a strided subset of the X vector, followed by a
// reduction over the work-group in local memory. One partial sum per work-group is written to
// 'output'.
__attribute__((reqd_work_group_size(WGS1, 1, 1)))
__kernel void Xnrm2(const int n,
                    const __global real* restrict xgm, const int x_offset, const int x_inc,
                    __global real* output) {
  __local real partial[WGS1];
  const int tid = get_local_id(0);
  const int group = get_group_id(0);
  const int stride = WGS1*get_num_groups(0);

  // Per-work-item accumulation of value * conjugate(value)
  real total;
  SetToZero(total);
  for (int id = group*WGS1 + tid; id < n; id += stride) {
    real value = xgm[id*x_inc + x_offset];
    real conjugate = value;
    COMPLEX_CONJUGATE(conjugate);
    MultiplyAdd(total, value, conjugate);
  }
  partial[tid] = total;
  barrier(CLK_LOCAL_MEM_FENCE);

  // Tree reduction in local memory
  #pragma unroll
  for (int s = WGS1/2; s > 0; s >>= 1) {
    if (tid < s) {
      Add(partial[tid], partial[tid], partial[tid + s]);
    }
    barrier(CLK_LOCAL_MEM_FENCE);
  }

  // The first work-item writes the result of this work-group
  if (tid == 0) {
    output[group] = partial[0];
  }
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Epilogue of the norm reduction: adds up the 2*WGS2 partial sums in 'input', takes the square
// root and stores the final value at 'nrm2[nrm2_offset]'. This kernel has to be launched with a
// single work-group only.
__attribute__((reqd_work_group_size(WGS2, 1, 1)))
__kernel void Xnrm2Epilogue(const __global real* restrict input,
                            __global real* nrm2, const int nrm2_offset) {
  __local real partial[WGS2];
  const int tid = get_local_id(0);

  // First reduction step, fused with the load from global memory
  Add(partial[tid], input[tid], input[tid + WGS2]);
  barrier(CLK_LOCAL_MEM_FENCE);

  // Tree reduction in local memory
  #pragma unroll
  for (int s = WGS2/2; s > 0; s >>= 1) {
    if (tid < s) {
      Add(partial[tid], partial[tid], partial[tid + s]);
    }
    barrier(CLK_LOCAL_MEM_FENCE);
  }

  // Computes the square root of the accumulated sum and stores the final result. For complex data
  // only the real component of the output element is written.
  if (tid == 0) {
    const real total = partial[0];
    #if PRECISION == 3232 || PRECISION == 6464
      nrm2[nrm2_offset].x = sqrt(total.x); // the result is a non-complex number
    #else
      nrm2[nrm2_offset] = sqrt(total);
    #endif
  }
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// End of the C++11 raw string literal
|
||||||
|
)"
|
||||||
|
|
||||||
|
// =================================================================================================
|
|
@ -29,8 +29,9 @@ __kernel void Xscal(const int n, const real alpha,
|
||||||
// Loops over the work that needs to be done (allows for an arbitrary number of threads)
|
// Loops over the work that needs to be done (allows for an arbitrary number of threads)
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
|
for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
|
||||||
|
real xvalue = xgm[id*x_inc + x_offset];
|
||||||
real result;
|
real result;
|
||||||
Multiply(result, alpha, xgm[id*x_inc + x_offset]);
|
Multiply(result, alpha, xvalue);
|
||||||
xgm[id*x_inc + x_offset] = result;
|
xgm[id*x_inc + x_offset] = result;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -45,8 +46,9 @@ __kernel void XscalFast(const int n, const real alpha,
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int w=0; w<WPT; ++w) {
|
for (int w=0; w<WPT; ++w) {
|
||||||
const int id = w*get_global_size(0) + get_global_id(0);
|
const int id = w*get_global_size(0) + get_global_id(0);
|
||||||
|
realV xvalue = xgm[id];
|
||||||
realV result;
|
realV result;
|
||||||
result = MultiplyVector(result, alpha, xgm[id]);
|
result = MultiplyVector(result, alpha, xvalue);
|
||||||
xgm[id] = result;
|
xgm[id] = result;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
112
src/routine.cc
112
src/routine.cc
|
@ -11,18 +11,17 @@
|
||||||
//
|
//
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
#include "internal/routine.h"
|
#include "internal/routine.h"
|
||||||
|
|
||||||
namespace clblast {
|
namespace clblast {
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
// The cache of compiled OpenCL programs
|
|
||||||
template <typename T>
|
|
||||||
std::vector<typename Routine<T>::ProgramCache> Routine<T>::program_cache_;
|
|
||||||
|
|
||||||
// Constructor: not much here, because no status codes can be returned
|
// Constructor: not much here, because no status codes can be returned
|
||||||
template <typename T>
|
template <typename T>
|
||||||
Routine<T>::Routine(Queue &queue, Event &event, const std::string &name,
|
Routine<T>::Routine(Queue &queue, EventPointer event, const std::string &name,
|
||||||
const std::vector<std::string> &routines, const Precision precision):
|
const std::vector<std::string> &routines, const Precision precision):
|
||||||
precision_(precision),
|
precision_(precision),
|
||||||
routine_name_(name),
|
routine_name_(name),
|
||||||
|
@ -43,9 +42,24 @@ Routine<T>::Routine(Queue &queue, Event &event, const std::string &name,
|
||||||
template <typename T>
|
template <typename T>
|
||||||
StatusCode Routine<T>::SetUp() {
|
StatusCode Routine<T>::SetUp() {
|
||||||
|
|
||||||
// Queries the cache to see whether or not the compiled kernel is already there. If not, it will
|
// Queries the cache to see whether or not the program (context-specific) is already there
|
||||||
// be built and added to the cache.
|
if (ProgramIsInCache()) { return StatusCode::kSuccess; }
|
||||||
if (!ProgramIsInCache()) {
|
|
||||||
|
// Queries the cache to see whether or not the binary (device-specific) is already there. If it
|
||||||
|
// is, a program is created and stored in the cache
|
||||||
|
if (BinaryIsInCache()) {
|
||||||
|
try {
|
||||||
|
auto& binary = cache::GetBinaryFromCache(device_name_, precision_, routine_name_);
|
||||||
|
auto program = Program(device_, context_, binary);
|
||||||
|
auto options = std::vector<std::string>();
|
||||||
|
program.Build(device_, options);
|
||||||
|
StoreProgramToCache(program);
|
||||||
|
} catch (...) { return StatusCode::kBuildProgramFailure; }
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Otherwise, the kernel will be compiled and program will be built. Both the binary and the
|
||||||
|
// program will be added to the cache.
|
||||||
|
|
||||||
// Inspects whether or not cl_khr_fp64 is supported in case of double precision
|
// Inspects whether or not cl_khr_fp64 is supported in case of double precision
|
||||||
auto extensions = device_.Capabilities();
|
auto extensions = device_.Capabilities();
|
||||||
|
@ -97,10 +111,11 @@ StatusCode Routine<T>::SetUp() {
|
||||||
}
|
}
|
||||||
if (build_status == BuildStatus::kInvalid) { return StatusCode::kInvalidBinary; }
|
if (build_status == BuildStatus::kInvalid) { return StatusCode::kInvalidBinary; }
|
||||||
|
|
||||||
// Store the compiled program in the cache
|
// Store the compiled binary and program in the cache
|
||||||
program_cache_.push_back({program, device_name_, precision_, routine_name_});
|
const auto binary = program.GetIR();
|
||||||
|
StoreBinaryToCache(binary);
|
||||||
|
StoreProgramToCache(program);
|
||||||
} catch (...) { return StatusCode::kBuildProgramFailure; }
|
} catch (...) { return StatusCode::kBuildProgramFailure; }
|
||||||
}
|
|
||||||
|
|
||||||
// No errors, normal termination of this function
|
// No errors, normal termination of this function
|
||||||
return StatusCode::kSuccess;
|
return StatusCode::kSuccess;
|
||||||
|
@ -111,7 +126,8 @@ StatusCode Routine<T>::SetUp() {
|
||||||
// Enqueues a kernel, waits for completion, and checks for errors
|
// Enqueues a kernel, waits for completion, and checks for errors
|
||||||
template <typename T>
|
template <typename T>
|
||||||
StatusCode Routine<T>::RunKernel(Kernel &kernel, std::vector<size_t> &global,
|
StatusCode Routine<T>::RunKernel(Kernel &kernel, std::vector<size_t> &global,
|
||||||
const std::vector<size_t> &local) {
|
const std::vector<size_t> &local, EventPointer event,
|
||||||
|
std::vector<Event>& waitForEvents) {
|
||||||
|
|
||||||
// Tests for validity of the local thread sizes
|
// Tests for validity of the local thread sizes
|
||||||
if (local.size() > max_work_item_dimensions_) {
|
if (local.size() > max_work_item_dimensions_) {
|
||||||
|
@ -135,18 +151,21 @@ StatusCode Routine<T>::RunKernel(Kernel &kernel, std::vector<size_t> &global,
|
||||||
|
|
||||||
// Launches the kernel (and checks for launch errors)
|
// Launches the kernel (and checks for launch errors)
|
||||||
try {
|
try {
|
||||||
kernel.Launch(queue_, global, local, event_);
|
kernel.Launch(queue_, global, local, event, waitForEvents);
|
||||||
} catch (...) { return StatusCode::kKernelLaunchError; }
|
} catch (...) { return StatusCode::kKernelLaunchError; }
|
||||||
|
|
||||||
// Waits for completion of the kernel
|
|
||||||
try {
|
|
||||||
queue_.Finish(event_);
|
|
||||||
} catch (...) { return StatusCode::kKernelRunError; }
|
|
||||||
|
|
||||||
// No errors, normal termination of this function
|
// No errors, normal termination of this function
|
||||||
return StatusCode::kSuccess;
|
return StatusCode::kSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// As above, but without an event waiting list
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Routine<T>::RunKernel(Kernel &kernel, std::vector<size_t> &global,
|
||||||
|
const std::vector<size_t> &local, EventPointer event) {
|
||||||
|
auto emptyWaitingList = std::vector<Event>();
|
||||||
|
return RunKernel(kernel, global, local, event, emptyWaitingList);
|
||||||
|
}
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
// Tests matrix A for validity: checks for a valid OpenCL buffer, a valid lead-dimension, and for a
|
// Tests matrix A for validity: checks for a valid OpenCL buffer, a valid lead-dimension, and for a
|
||||||
|
@ -156,7 +175,7 @@ StatusCode Routine<T>::TestMatrixA(const size_t one, const size_t two, const Buf
|
||||||
const size_t offset, const size_t ld, const size_t data_size) {
|
const size_t offset, const size_t ld, const size_t data_size) {
|
||||||
if (ld < one) { return StatusCode::kInvalidLeadDimA; }
|
if (ld < one) { return StatusCode::kInvalidLeadDimA; }
|
||||||
try {
|
try {
|
||||||
auto required_size = (ld*two + offset)*data_size;
|
auto required_size = (ld*(two-1) + one + offset)*data_size;
|
||||||
auto buffer_size = buffer.GetSize();
|
auto buffer_size = buffer.GetSize();
|
||||||
if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryA; }
|
if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryA; }
|
||||||
} catch (...) { return StatusCode::kInvalidMatrixA; }
|
} catch (...) { return StatusCode::kInvalidMatrixA; }
|
||||||
|
@ -170,7 +189,7 @@ StatusCode Routine<T>::TestMatrixB(const size_t one, const size_t two, const Buf
|
||||||
const size_t offset, const size_t ld, const size_t data_size) {
|
const size_t offset, const size_t ld, const size_t data_size) {
|
||||||
if (ld < one) { return StatusCode::kInvalidLeadDimB; }
|
if (ld < one) { return StatusCode::kInvalidLeadDimB; }
|
||||||
try {
|
try {
|
||||||
auto required_size = (ld*two + offset)*data_size;
|
auto required_size = (ld*(two-1) + one + offset)*data_size;
|
||||||
auto buffer_size = buffer.GetSize();
|
auto buffer_size = buffer.GetSize();
|
||||||
if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryB; }
|
if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryB; }
|
||||||
} catch (...) { return StatusCode::kInvalidMatrixB; }
|
} catch (...) { return StatusCode::kInvalidMatrixB; }
|
||||||
|
@ -184,7 +203,7 @@ StatusCode Routine<T>::TestMatrixC(const size_t one, const size_t two, const Buf
|
||||||
const size_t offset, const size_t ld, const size_t data_size) {
|
const size_t offset, const size_t ld, const size_t data_size) {
|
||||||
if (ld < one) { return StatusCode::kInvalidLeadDimC; }
|
if (ld < one) { return StatusCode::kInvalidLeadDimC; }
|
||||||
try {
|
try {
|
||||||
auto required_size = (ld*two + offset)*data_size;
|
auto required_size = (ld*(two-1) + one + offset)*data_size;
|
||||||
auto buffer_size = buffer.GetSize();
|
auto buffer_size = buffer.GetSize();
|
||||||
if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryC; }
|
if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryC; }
|
||||||
} catch (...) { return StatusCode::kInvalidMatrixC; }
|
} catch (...) { return StatusCode::kInvalidMatrixC; }
|
||||||
|
@ -212,7 +231,7 @@ StatusCode Routine<T>::TestVectorX(const size_t n, const Buffer<T> &buffer, cons
|
||||||
const size_t inc, const size_t data_size) {
|
const size_t inc, const size_t data_size) {
|
||||||
if (inc == 0) { return StatusCode::kInvalidIncrementX; }
|
if (inc == 0) { return StatusCode::kInvalidIncrementX; }
|
||||||
try {
|
try {
|
||||||
auto required_size = (n*inc + offset)*data_size;
|
auto required_size = ((n-1)*inc + 1 + offset)*data_size;
|
||||||
auto buffer_size = buffer.GetSize();
|
auto buffer_size = buffer.GetSize();
|
||||||
if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryX; }
|
if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryX; }
|
||||||
} catch (...) { return StatusCode::kInvalidVectorX; }
|
} catch (...) { return StatusCode::kInvalidVectorX; }
|
||||||
|
@ -226,7 +245,7 @@ StatusCode Routine<T>::TestVectorY(const size_t n, const Buffer<T> &buffer, cons
|
||||||
const size_t inc, const size_t data_size) {
|
const size_t inc, const size_t data_size) {
|
||||||
if (inc == 0) { return StatusCode::kInvalidIncrementY; }
|
if (inc == 0) { return StatusCode::kInvalidIncrementY; }
|
||||||
try {
|
try {
|
||||||
auto required_size = (n*inc + offset)*data_size;
|
auto required_size = ((n-1)*inc + 1 + offset)*data_size;
|
||||||
auto buffer_size = buffer.GetSize();
|
auto buffer_size = buffer.GetSize();
|
||||||
if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryY; }
|
if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryY; }
|
||||||
} catch (...) { return StatusCode::kInvalidVectorY; }
|
} catch (...) { return StatusCode::kInvalidVectorY; }
|
||||||
|
@ -248,11 +267,25 @@ StatusCode Routine<T>::TestVectorDot(const size_t n, const Buffer<T> &buffer, co
|
||||||
return StatusCode::kSuccess;
|
return StatusCode::kSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Tests vector index for validity: checks for a valid increment, a valid OpenCL buffer, and for a
|
||||||
|
// sufficient buffer size.
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Routine<T>::TestVectorIndex(const size_t n, const Buffer<unsigned int> &buffer,
|
||||||
|
const size_t offset, const size_t data_size) {
|
||||||
|
try {
|
||||||
|
auto required_size = (n + offset)*data_size;
|
||||||
|
auto buffer_size = buffer.GetSize();
|
||||||
|
if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryDot; }
|
||||||
|
} catch (...) { return StatusCode::kInvalidVectorDot; }
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
// Copies or transposes a matrix and pads/unpads it with zeros
|
// Copies or transposes a matrix and pads/unpads it with zeros
|
||||||
template <typename T>
|
template <typename T>
|
||||||
StatusCode Routine<T>::PadCopyTransposeMatrix(const size_t src_one, const size_t src_two,
|
StatusCode Routine<T>::PadCopyTransposeMatrix(EventPointer event, std::vector<Event>& waitForEvents,
|
||||||
|
const size_t src_one, const size_t src_two,
|
||||||
const size_t src_ld, const size_t src_offset,
|
const size_t src_ld, const size_t src_offset,
|
||||||
const Buffer<T> &src,
|
const Buffer<T> &src,
|
||||||
const size_t dest_one, const size_t dest_two,
|
const size_t dest_one, const size_t dest_two,
|
||||||
|
@ -334,13 +367,13 @@ StatusCode Routine<T>::PadCopyTransposeMatrix(const size_t src_one, const size_t
|
||||||
auto global = std::vector<size_t>{dest_one / db_["TRA_WPT"],
|
auto global = std::vector<size_t>{dest_one / db_["TRA_WPT"],
|
||||||
dest_two / db_["TRA_WPT"]};
|
dest_two / db_["TRA_WPT"]};
|
||||||
auto local = std::vector<size_t>{db_["TRA_DIM"], db_["TRA_DIM"]};
|
auto local = std::vector<size_t>{db_["TRA_DIM"], db_["TRA_DIM"]};
|
||||||
status = RunKernel(kernel, global, local);
|
status = RunKernel(kernel, global, local, event, waitForEvents);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
auto global = std::vector<size_t>{Ceil(CeilDiv(dest_one, db_["PADTRA_WPT"]), db_["PADTRA_TILE"]),
|
auto global = std::vector<size_t>{Ceil(CeilDiv(dest_one, db_["PADTRA_WPT"]), db_["PADTRA_TILE"]),
|
||||||
Ceil(CeilDiv(dest_two, db_["PADTRA_WPT"]), db_["PADTRA_TILE"])};
|
Ceil(CeilDiv(dest_two, db_["PADTRA_WPT"]), db_["PADTRA_TILE"])};
|
||||||
auto local = std::vector<size_t>{db_["PADTRA_TILE"], db_["PADTRA_TILE"]};
|
auto local = std::vector<size_t>{db_["PADTRA_TILE"], db_["PADTRA_TILE"]};
|
||||||
status = RunKernel(kernel, global, local);
|
status = RunKernel(kernel, global, local, event, waitForEvents);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
|
@ -348,13 +381,13 @@ StatusCode Routine<T>::PadCopyTransposeMatrix(const size_t src_one, const size_t
|
||||||
auto global = std::vector<size_t>{dest_one / db_["COPY_VW"],
|
auto global = std::vector<size_t>{dest_one / db_["COPY_VW"],
|
||||||
dest_two / db_["COPY_WPT"]};
|
dest_two / db_["COPY_WPT"]};
|
||||||
auto local = std::vector<size_t>{db_["COPY_DIMX"], db_["COPY_DIMY"]};
|
auto local = std::vector<size_t>{db_["COPY_DIMX"], db_["COPY_DIMY"]};
|
||||||
status = RunKernel(kernel, global, local);
|
status = RunKernel(kernel, global, local, event, waitForEvents);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
auto global = std::vector<size_t>{Ceil(CeilDiv(dest_one, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
|
auto global = std::vector<size_t>{Ceil(CeilDiv(dest_one, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
|
||||||
Ceil(CeilDiv(dest_two, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
|
Ceil(CeilDiv(dest_two, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
|
||||||
auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
|
auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
|
||||||
status = RunKernel(kernel, global, local);
|
status = RunKernel(kernel, global, local, event, waitForEvents);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return status;
|
return status;
|
||||||
|
@ -363,29 +396,6 @@ StatusCode Routine<T>::PadCopyTransposeMatrix(const size_t src_one, const size_t
|
||||||
|
|
||||||
// =================================================================================================
|
// =================================================================================================
|
||||||
|
|
||||||
// Queries the cache and retrieves a matching program. Assumes that the match is available, throws
|
|
||||||
// otherwise.
|
|
||||||
template <typename T>
|
|
||||||
const Program& Routine<T>::GetProgramFromCache() const {
|
|
||||||
for (auto &cached_program: program_cache_) {
|
|
||||||
if (cached_program.MatchInCache(device_name_, precision_, routine_name_)) {
|
|
||||||
return cached_program.program;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
throw std::runtime_error("Internal CLBlast error: Expected program in cache, but found none.");
|
|
||||||
}
|
|
||||||
|
|
||||||
// Queries the cache to see whether or not the compiled kernel is already there
|
|
||||||
template <typename T>
|
|
||||||
bool Routine<T>::ProgramIsInCache() const {
|
|
||||||
for (auto &cached_program: program_cache_) {
|
|
||||||
if (cached_program.MatchInCache(device_name_, precision_, routine_name_)) { return true; }
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// =================================================================================================
|
|
||||||
|
|
||||||
// Compiles the templated class
|
// Compiles the templated class
|
||||||
template class Routine<float>;
|
template class Routine<float>;
|
||||||
template class Routine<double>;
|
template class Routine<double>;
|
||||||
|
|
112
src/routines/level1/xamax.cc
Normal file
112
src/routines/level1/xamax.cc
Normal file
|
@ -0,0 +1,112 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xamax class (see the header for information about the class).
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#include "internal/routines/level1/xamax.h"
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Specific implementations to get the memory-type based on a template argument
|
||||||
|
template <> const Precision Xamax<float>::precision_ = Precision::kSingle;
|
||||||
|
template <> const Precision Xamax<double>::precision_ = Precision::kDouble;
|
||||||
|
template <> const Precision Xamax<float2>::precision_ = Precision::kComplexSingle;
|
||||||
|
template <> const Precision Xamax<double2>::precision_ = Precision::kComplexDouble;
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Constructor: hands the queue, event and routine name to the Routine<T> base class (together with
// the "Xdot" entry and this class' precision), then stores the kernel source. The included .opencl
// file must expand to the expression that is assigned to 'source_string_'.
template <typename T>
Xamax<T>::Xamax(Queue &queue, EventPointer event, const std::string &name)
    : Routine<T>(queue, event, name, {"Xdot"}, precision_) {
  source_string_ =
    #include "../../kernels/level1/xamax.opencl"
  ;
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// The main routine
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Xamax<T>::DoAmax(const size_t n,
|
||||||
|
const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
|
||||||
|
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
|
||||||
|
|
||||||
|
// Makes sure all dimensions are larger than zero
|
||||||
|
if (n == 0) { return StatusCode::kInvalidDimension; }
|
||||||
|
|
||||||
|
// Tests the vectors for validity
|
||||||
|
auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
status = TestVectorIndex(1, imax_buffer, imax_offset, sizeof(unsigned int));
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
|
// Retrieves the Xamax kernels from the compiled binary
|
||||||
|
try {
|
||||||
|
const auto program = GetProgramFromCache();
|
||||||
|
auto kernel1 = Kernel(program, "Xamax");
|
||||||
|
auto kernel2 = Kernel(program, "XamaxEpilogue");
|
||||||
|
|
||||||
|
// Creates the buffer for intermediate values
|
||||||
|
auto temp_size = 2*db_["WGS2"];
|
||||||
|
auto temp_buffer1 = Buffer<T>(context_, temp_size);
|
||||||
|
auto temp_buffer2 = Buffer<unsigned int>(context_, temp_size);
|
||||||
|
|
||||||
|
// Sets the kernel arguments
|
||||||
|
kernel1.SetArgument(0, static_cast<int>(n));
|
||||||
|
kernel1.SetArgument(1, x_buffer());
|
||||||
|
kernel1.SetArgument(2, static_cast<int>(x_offset));
|
||||||
|
kernel1.SetArgument(3, static_cast<int>(x_inc));
|
||||||
|
kernel1.SetArgument(4, temp_buffer1());
|
||||||
|
kernel1.SetArgument(5, temp_buffer2());
|
||||||
|
|
||||||
|
// Event waiting list
|
||||||
|
auto eventWaitList = std::vector<Event>();
|
||||||
|
|
||||||
|
// Launches the main kernel
|
||||||
|
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
|
||||||
|
auto local1 = std::vector<size_t>{db_["WGS1"]};
|
||||||
|
auto kernelEvent = Event();
|
||||||
|
status = RunKernel(kernel1, global1, local1, kernelEvent.pointer());
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
eventWaitList.push_back(kernelEvent);
|
||||||
|
|
||||||
|
// Sets the arguments for the epilogue kernel
|
||||||
|
kernel2.SetArgument(0, temp_buffer1());
|
||||||
|
kernel2.SetArgument(1, temp_buffer2());
|
||||||
|
kernel2.SetArgument(2, imax_buffer());
|
||||||
|
kernel2.SetArgument(3, static_cast<int>(imax_offset));
|
||||||
|
|
||||||
|
// Launches the epilogue kernel
|
||||||
|
auto global2 = std::vector<size_t>{db_["WGS2"]};
|
||||||
|
auto local2 = std::vector<size_t>{db_["WGS2"]};
|
||||||
|
status = RunKernel(kernel2, global2, local2, event_, eventWaitList);
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
|
// Succesfully finished the computation
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
} catch (...) { return StatusCode::kInvalidKernel; }
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Compiles the templated class
|
||||||
|
template class Xamax<float>;
|
||||||
|
template class Xamax<double>;
|
||||||
|
template class Xamax<float2>;
|
||||||
|
template class Xamax<double2>;
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
109
src/routines/level1/xasum.cc
Normal file
109
src/routines/level1/xasum.cc
Normal file
|
@ -0,0 +1,109 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xasum class (see the header for information about the class).
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#include "internal/routines/level1/xasum.h"
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Specific implementations to get the memory-type based on a template argument
|
||||||
|
template <> const Precision Xasum<float>::precision_ = Precision::kSingle;
|
||||||
|
template <> const Precision Xasum<double>::precision_ = Precision::kDouble;
|
||||||
|
template <> const Precision Xasum<float2>::precision_ = Precision::kComplexSingle;
|
||||||
|
template <> const Precision Xasum<double2>::precision_ = Precision::kComplexDouble;
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Constructor: hands the queue, event and routine name to the Routine<T> base class (together with
// the "Xdot" entry and this class' precision), then stores the kernel source. The included .opencl
// file must expand to the expression that is assigned to 'source_string_'.
template <typename T>
Xasum<T>::Xasum(Queue &queue, EventPointer event, const std::string &name)
    : Routine<T>(queue, event, name, {"Xdot"}, precision_) {
  source_string_ =
    #include "../../kernels/level1/xasum.opencl"
  ;
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// The main routine
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Xasum<T>::DoAsum(const size_t n,
|
||||||
|
const Buffer<T> &asum_buffer, const size_t asum_offset,
|
||||||
|
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
|
||||||
|
|
||||||
|
// Makes sure all dimensions are larger than zero
|
||||||
|
if (n == 0) { return StatusCode::kInvalidDimension; }
|
||||||
|
|
||||||
|
// Tests the vectors for validity
|
||||||
|
auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
status = TestVectorDot(1, asum_buffer, asum_offset, sizeof(T));
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
|
// Retrieves the Xasum kernels from the compiled binary
|
||||||
|
try {
|
||||||
|
const auto program = GetProgramFromCache();
|
||||||
|
auto kernel1 = Kernel(program, "Xasum");
|
||||||
|
auto kernel2 = Kernel(program, "XasumEpilogue");
|
||||||
|
|
||||||
|
// Creates the buffer for intermediate values
|
||||||
|
auto temp_size = 2*db_["WGS2"];
|
||||||
|
auto temp_buffer = Buffer<T>(context_, temp_size);
|
||||||
|
|
||||||
|
// Sets the kernel arguments
|
||||||
|
kernel1.SetArgument(0, static_cast<int>(n));
|
||||||
|
kernel1.SetArgument(1, x_buffer());
|
||||||
|
kernel1.SetArgument(2, static_cast<int>(x_offset));
|
||||||
|
kernel1.SetArgument(3, static_cast<int>(x_inc));
|
||||||
|
kernel1.SetArgument(4, temp_buffer());
|
||||||
|
|
||||||
|
// Event waiting list
|
||||||
|
auto eventWaitList = std::vector<Event>();
|
||||||
|
|
||||||
|
// Launches the main kernel
|
||||||
|
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
|
||||||
|
auto local1 = std::vector<size_t>{db_["WGS1"]};
|
||||||
|
auto kernelEvent = Event();
|
||||||
|
status = RunKernel(kernel1, global1, local1, kernelEvent.pointer());
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
eventWaitList.push_back(kernelEvent);
|
||||||
|
|
||||||
|
// Sets the arguments for the epilogue kernel
|
||||||
|
kernel2.SetArgument(0, temp_buffer());
|
||||||
|
kernel2.SetArgument(1, asum_buffer());
|
||||||
|
kernel2.SetArgument(2, static_cast<int>(asum_offset));
|
||||||
|
|
||||||
|
// Launches the epilogue kernel
|
||||||
|
auto global2 = std::vector<size_t>{db_["WGS2"]};
|
||||||
|
auto local2 = std::vector<size_t>{db_["WGS2"]};
|
||||||
|
status = RunKernel(kernel2, global2, local2, event_, eventWaitList);
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
|
// Succesfully finished the computation
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
} catch (...) { return StatusCode::kInvalidKernel; }
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Compiles the templated class
|
||||||
|
template class Xasum<float>;
|
||||||
|
template class Xasum<double>;
|
||||||
|
template class Xasum<float2>;
|
||||||
|
template class Xasum<double2>;
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
|
@ -29,7 +29,7 @@ template <> const Precision Xaxpy<double2>::precision_ = Precision::kComplexDoub
|
||||||
|
|
||||||
// Constructor: forwards to base class constructor
|
// Constructor: forwards to base class constructor
|
||||||
template <typename T>
|
template <typename T>
|
||||||
Xaxpy<T>::Xaxpy(Queue &queue, Event &event, const std::string &name):
|
Xaxpy<T>::Xaxpy(Queue &queue, EventPointer event, const std::string &name):
|
||||||
Routine<T>(queue, event, name, {"Xaxpy"}, precision_) {
|
Routine<T>(queue, event, name, {"Xaxpy"}, precision_) {
|
||||||
source_string_ =
|
source_string_ =
|
||||||
#include "../../kernels/level1/level1.opencl"
|
#include "../../kernels/level1/level1.opencl"
|
||||||
|
@ -64,7 +64,7 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
|
||||||
|
|
||||||
// Retrieves the Xaxpy kernel from the compiled binary
|
// Retrieves the Xaxpy kernel from the compiled binary
|
||||||
try {
|
try {
|
||||||
auto& program = GetProgramFromCache();
|
const auto program = GetProgramFromCache();
|
||||||
auto kernel = Kernel(program, kernel_name);
|
auto kernel = Kernel(program, kernel_name);
|
||||||
|
|
||||||
// Sets the kernel arguments
|
// Sets the kernel arguments
|
||||||
|
@ -89,19 +89,16 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
|
||||||
if (use_fast_kernel) {
|
if (use_fast_kernel) {
|
||||||
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
|
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
|
||||||
auto local = std::vector<size_t>{db_["WGS"]};
|
auto local = std::vector<size_t>{db_["WGS"]};
|
||||||
status = RunKernel(kernel, global, local);
|
status = RunKernel(kernel, global, local, event_);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
|
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
|
||||||
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
|
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
|
||||||
auto local = std::vector<size_t>{db_["WGS"]};
|
auto local = std::vector<size_t>{db_["WGS"]};
|
||||||
status = RunKernel(kernel, global, local);
|
status = RunKernel(kernel, global, local, event_);
|
||||||
}
|
}
|
||||||
if (ErrorIn(status)) { return status; }
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
// Waits for all kernels to finish
|
|
||||||
queue_.Finish();
|
|
||||||
|
|
||||||
// Succesfully finished the computation
|
// Succesfully finished the computation
|
||||||
return StatusCode::kSuccess;
|
return StatusCode::kSuccess;
|
||||||
} catch (...) { return StatusCode::kInvalidKernel; }
|
} catch (...) { return StatusCode::kInvalidKernel; }
|
||||||
|
|
|
@ -29,7 +29,7 @@ template <> const Precision Xcopy<double2>::precision_ = Precision::kComplexDoub
|
||||||
|
|
||||||
// Constructor: forwards to base class constructor
|
// Constructor: forwards to base class constructor
|
||||||
template <typename T>
|
template <typename T>
|
||||||
Xcopy<T>::Xcopy(Queue &queue, Event &event, const std::string &name):
|
Xcopy<T>::Xcopy(Queue &queue, EventPointer event, const std::string &name):
|
||||||
Routine<T>(queue, event, name, {"Xaxpy"}, precision_) {
|
Routine<T>(queue, event, name, {"Xaxpy"}, precision_) {
|
||||||
source_string_ =
|
source_string_ =
|
||||||
#include "../../kernels/level1/level1.opencl"
|
#include "../../kernels/level1/level1.opencl"
|
||||||
|
@ -64,7 +64,7 @@ StatusCode Xcopy<T>::DoCopy(const size_t n,
|
||||||
|
|
||||||
// Retrieves the Xcopy kernel from the compiled binary
|
// Retrieves the Xcopy kernel from the compiled binary
|
||||||
try {
|
try {
|
||||||
auto& program = GetProgramFromCache();
|
const auto program = GetProgramFromCache();
|
||||||
auto kernel = Kernel(program, kernel_name);
|
auto kernel = Kernel(program, kernel_name);
|
||||||
|
|
||||||
// Sets the kernel arguments
|
// Sets the kernel arguments
|
||||||
|
@ -87,19 +87,16 @@ StatusCode Xcopy<T>::DoCopy(const size_t n,
|
||||||
if (use_fast_kernel) {
|
if (use_fast_kernel) {
|
||||||
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
|
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
|
||||||
auto local = std::vector<size_t>{db_["WGS"]};
|
auto local = std::vector<size_t>{db_["WGS"]};
|
||||||
status = RunKernel(kernel, global, local);
|
status = RunKernel(kernel, global, local, event_);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
|
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
|
||||||
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
|
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
|
||||||
auto local = std::vector<size_t>{db_["WGS"]};
|
auto local = std::vector<size_t>{db_["WGS"]};
|
||||||
status = RunKernel(kernel, global, local);
|
status = RunKernel(kernel, global, local, event_);
|
||||||
}
|
}
|
||||||
if (ErrorIn(status)) { return status; }
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
// Waits for all kernels to finish
|
|
||||||
queue_.Finish();
|
|
||||||
|
|
||||||
// Succesfully finished the computation
|
// Succesfully finished the computation
|
||||||
return StatusCode::kSuccess;
|
return StatusCode::kSuccess;
|
||||||
} catch (...) { return StatusCode::kInvalidKernel; }
|
} catch (...) { return StatusCode::kInvalidKernel; }
|
||||||
|
|
|
@ -29,7 +29,7 @@ template <> const Precision Xdot<double2>::precision_ = Precision::kComplexDoubl
|
||||||
|
|
||||||
// Constructor: forwards to base class constructor
|
// Constructor: forwards to base class constructor
|
||||||
template <typename T>
|
template <typename T>
|
||||||
Xdot<T>::Xdot(Queue &queue, Event &event, const std::string &name):
|
Xdot<T>::Xdot(Queue &queue, EventPointer event, const std::string &name):
|
||||||
Routine<T>(queue, event, name, {"Xdot"}, precision_) {
|
Routine<T>(queue, event, name, {"Xdot"}, precision_) {
|
||||||
source_string_ =
|
source_string_ =
|
||||||
#include "../../kernels/level1/xdot.opencl"
|
#include "../../kernels/level1/xdot.opencl"
|
||||||
|
@ -59,7 +59,7 @@ StatusCode Xdot<T>::DoDot(const size_t n,
|
||||||
|
|
||||||
// Retrieves the Xdot kernels from the compiled binary
|
// Retrieves the Xdot kernels from the compiled binary
|
||||||
try {
|
try {
|
||||||
auto& program = GetProgramFromCache();
|
const auto program = GetProgramFromCache();
|
||||||
auto kernel1 = Kernel(program, "Xdot");
|
auto kernel1 = Kernel(program, "Xdot");
|
||||||
auto kernel2 = Kernel(program, "XdotEpilogue");
|
auto kernel2 = Kernel(program, "XdotEpilogue");
|
||||||
|
|
||||||
|
@ -78,11 +78,16 @@ StatusCode Xdot<T>::DoDot(const size_t n,
|
||||||
kernel1.SetArgument(7, temp_buffer());
|
kernel1.SetArgument(7, temp_buffer());
|
||||||
kernel1.SetArgument(8, static_cast<int>(do_conjugate));
|
kernel1.SetArgument(8, static_cast<int>(do_conjugate));
|
||||||
|
|
||||||
|
// Event waiting list
|
||||||
|
auto eventWaitList = std::vector<Event>();
|
||||||
|
|
||||||
// Launches the main kernel
|
// Launches the main kernel
|
||||||
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
|
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
|
||||||
auto local1 = std::vector<size_t>{db_["WGS1"]};
|
auto local1 = std::vector<size_t>{db_["WGS1"]};
|
||||||
status = RunKernel(kernel1, global1, local1);
|
auto kernelEvent = Event();
|
||||||
|
status = RunKernel(kernel1, global1, local1, kernelEvent.pointer());
|
||||||
if (ErrorIn(status)) { return status; }
|
if (ErrorIn(status)) { return status; }
|
||||||
|
eventWaitList.push_back(kernelEvent);
|
||||||
|
|
||||||
// Sets the arguments for the epilogue kernel
|
// Sets the arguments for the epilogue kernel
|
||||||
kernel2.SetArgument(0, temp_buffer());
|
kernel2.SetArgument(0, temp_buffer());
|
||||||
|
@ -92,12 +97,9 @@ StatusCode Xdot<T>::DoDot(const size_t n,
|
||||||
// Launches the epilogue kernel
|
// Launches the epilogue kernel
|
||||||
auto global2 = std::vector<size_t>{db_["WGS2"]};
|
auto global2 = std::vector<size_t>{db_["WGS2"]};
|
||||||
auto local2 = std::vector<size_t>{db_["WGS2"]};
|
auto local2 = std::vector<size_t>{db_["WGS2"]};
|
||||||
status = RunKernel(kernel2, global2, local2);
|
status = RunKernel(kernel2, global2, local2, event_, eventWaitList);
|
||||||
if (ErrorIn(status)) { return status; }
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
// Waits for all kernels to finish
|
|
||||||
queue_.Finish();
|
|
||||||
|
|
||||||
// Succesfully finished the computation
|
// Succesfully finished the computation
|
||||||
return StatusCode::kSuccess;
|
return StatusCode::kSuccess;
|
||||||
} catch (...) { return StatusCode::kInvalidKernel; }
|
} catch (...) { return StatusCode::kInvalidKernel; }
|
||||||
|
|
|
@ -21,7 +21,7 @@ namespace clblast {
|
||||||
|
|
||||||
// Constructor: forwards to base class constructor
|
// Constructor: forwards to base class constructor
|
||||||
template <typename T>
|
template <typename T>
|
||||||
Xdotc<T>::Xdotc(Queue &queue, Event &event, const std::string &name):
|
Xdotc<T>::Xdotc(Queue &queue, EventPointer event, const std::string &name):
|
||||||
Xdot<T>(queue, event, name) {
|
Xdot<T>(queue, event, name) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -20,7 +20,7 @@ namespace clblast {
|
||||||
|
|
||||||
// Constructor: forwards to base class constructor
|
// Constructor: forwards to base class constructor
|
||||||
template <typename T>
|
template <typename T>
|
||||||
Xdotu<T>::Xdotu(Queue &queue, Event &event, const std::string &name):
|
Xdotu<T>::Xdotu(Queue &queue, EventPointer event, const std::string &name):
|
||||||
Xdot<T>(queue, event, name) {
|
Xdot<T>(queue, event, name) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
109
src/routines/level1/xnrm2.cc
Normal file
109
src/routines/level1/xnrm2.cc
Normal file
|
@ -0,0 +1,109 @@
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||||
|
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||||
|
// width of 100 characters per line.
|
||||||
|
//
|
||||||
|
// Author(s):
|
||||||
|
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||||
|
//
|
||||||
|
// This file implements the Xnrm2 class (see the header for information about the class).
|
||||||
|
//
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
#include "internal/routines/level1/xnrm2.h"
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
namespace clblast {
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Specific implementations to get the memory-type based on a template argument
|
||||||
|
template <> const Precision Xnrm2<float>::precision_ = Precision::kSingle;
|
||||||
|
template <> const Precision Xnrm2<double>::precision_ = Precision::kDouble;
|
||||||
|
template <> const Precision Xnrm2<float2>::precision_ = Precision::kComplexSingle;
|
||||||
|
template <> const Precision Xnrm2<double2>::precision_ = Precision::kComplexDouble;
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Constructor: hands the queue, event and routine name to the Routine<T> base class (together with
// the "Xdot" entry and this class' precision), then stores the kernel source. The included .opencl
// file must expand to the expression that is assigned to 'source_string_'.
template <typename T>
Xnrm2<T>::Xnrm2(Queue &queue, EventPointer event, const std::string &name)
    : Routine<T>(queue, event, name, {"Xdot"}, precision_) {
  source_string_ =
    #include "../../kernels/level1/xnrm2.opencl"
  ;
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// The main routine
|
||||||
|
template <typename T>
|
||||||
|
StatusCode Xnrm2<T>::DoNrm2(const size_t n,
|
||||||
|
const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
|
||||||
|
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
|
||||||
|
|
||||||
|
// Makes sure all dimensions are larger than zero
|
||||||
|
if (n == 0) { return StatusCode::kInvalidDimension; }
|
||||||
|
|
||||||
|
// Tests the vectors for validity
|
||||||
|
auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
status = TestVectorDot(1, nrm2_buffer, nrm2_offset, sizeof(T));
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
|
// Retrieves the Xnrm2 kernels from the compiled binary
|
||||||
|
try {
|
||||||
|
const auto program = GetProgramFromCache();
|
||||||
|
auto kernel1 = Kernel(program, "Xnrm2");
|
||||||
|
auto kernel2 = Kernel(program, "Xnrm2Epilogue");
|
||||||
|
|
||||||
|
// Creates the buffer for intermediate values
|
||||||
|
auto temp_size = 2*db_["WGS2"];
|
||||||
|
auto temp_buffer = Buffer<T>(context_, temp_size);
|
||||||
|
|
||||||
|
// Sets the kernel arguments
|
||||||
|
kernel1.SetArgument(0, static_cast<int>(n));
|
||||||
|
kernel1.SetArgument(1, x_buffer());
|
||||||
|
kernel1.SetArgument(2, static_cast<int>(x_offset));
|
||||||
|
kernel1.SetArgument(3, static_cast<int>(x_inc));
|
||||||
|
kernel1.SetArgument(4, temp_buffer());
|
||||||
|
|
||||||
|
// Event waiting list
|
||||||
|
auto eventWaitList = std::vector<Event>();
|
||||||
|
|
||||||
|
// Launches the main kernel
|
||||||
|
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
|
||||||
|
auto local1 = std::vector<size_t>{db_["WGS1"]};
|
||||||
|
auto kernelEvent = Event();
|
||||||
|
status = RunKernel(kernel1, global1, local1, kernelEvent.pointer());
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
eventWaitList.push_back(kernelEvent);
|
||||||
|
|
||||||
|
// Sets the arguments for the epilogue kernel
|
||||||
|
kernel2.SetArgument(0, temp_buffer());
|
||||||
|
kernel2.SetArgument(1, nrm2_buffer());
|
||||||
|
kernel2.SetArgument(2, static_cast<int>(nrm2_offset));
|
||||||
|
|
||||||
|
// Launches the epilogue kernel
|
||||||
|
auto global2 = std::vector<size_t>{db_["WGS2"]};
|
||||||
|
auto local2 = std::vector<size_t>{db_["WGS2"]};
|
||||||
|
status = RunKernel(kernel2, global2, local2, event_, eventWaitList);
|
||||||
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
|
// Succesfully finished the computation
|
||||||
|
return StatusCode::kSuccess;
|
||||||
|
} catch (...) { return StatusCode::kInvalidKernel; }
|
||||||
|
}
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
|
||||||
|
// Compiles the templated class
|
||||||
|
template class Xnrm2<float>;
|
||||||
|
template class Xnrm2<double>;
|
||||||
|
template class Xnrm2<float2>;
|
||||||
|
template class Xnrm2<double2>;
|
||||||
|
|
||||||
|
// =================================================================================================
|
||||||
|
} // namespace clblast
|
|
@ -29,7 +29,7 @@ template <> const Precision Xscal<double2>::precision_ = Precision::kComplexDoub
|
||||||
|
|
||||||
// Constructor: forwards to base class constructor
|
// Constructor: forwards to base class constructor
|
||||||
template <typename T>
|
template <typename T>
|
||||||
Xscal<T>::Xscal(Queue &queue, Event &event, const std::string &name):
|
Xscal<T>::Xscal(Queue &queue, EventPointer event, const std::string &name):
|
||||||
Routine<T>(queue, event, name, {"Xaxpy"}, precision_) {
|
Routine<T>(queue, event, name, {"Xaxpy"}, precision_) {
|
||||||
source_string_ =
|
source_string_ =
|
||||||
#include "../../kernels/level1/level1.opencl"
|
#include "../../kernels/level1/level1.opencl"
|
||||||
|
@ -60,7 +60,7 @@ StatusCode Xscal<T>::DoScal(const size_t n, const T alpha,
|
||||||
|
|
||||||
// Retrieves the Xscal kernel from the compiled binary
|
// Retrieves the Xscal kernel from the compiled binary
|
||||||
try {
|
try {
|
||||||
auto& program = GetProgramFromCache();
|
const auto program = GetProgramFromCache();
|
||||||
auto kernel = Kernel(program, kernel_name);
|
auto kernel = Kernel(program, kernel_name);
|
||||||
|
|
||||||
// Sets the kernel arguments
|
// Sets the kernel arguments
|
||||||
|
@ -81,19 +81,16 @@ StatusCode Xscal<T>::DoScal(const size_t n, const T alpha,
|
||||||
if (use_fast_kernel) {
|
if (use_fast_kernel) {
|
||||||
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
|
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
|
||||||
auto local = std::vector<size_t>{db_["WGS"]};
|
auto local = std::vector<size_t>{db_["WGS"]};
|
||||||
status = RunKernel(kernel, global, local);
|
status = RunKernel(kernel, global, local, event_);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
|
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
|
||||||
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
|
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
|
||||||
auto local = std::vector<size_t>{db_["WGS"]};
|
auto local = std::vector<size_t>{db_["WGS"]};
|
||||||
status = RunKernel(kernel, global, local);
|
status = RunKernel(kernel, global, local, event_);
|
||||||
}
|
}
|
||||||
if (ErrorIn(status)) { return status; }
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
// Waits for all kernels to finish
|
|
||||||
queue_.Finish();
|
|
||||||
|
|
||||||
// Succesfully finished the computation
|
// Succesfully finished the computation
|
||||||
return StatusCode::kSuccess;
|
return StatusCode::kSuccess;
|
||||||
} catch (...) { return StatusCode::kInvalidKernel; }
|
} catch (...) { return StatusCode::kInvalidKernel; }
|
||||||
|
|
|
@ -29,7 +29,7 @@ template <> const Precision Xswap<double2>::precision_ = Precision::kComplexDoub
|
||||||
|
|
||||||
// Constructor: forwards to base class constructor
|
// Constructor: forwards to base class constructor
|
||||||
template <typename T>
|
template <typename T>
|
||||||
Xswap<T>::Xswap(Queue &queue, Event &event, const std::string &name):
|
Xswap<T>::Xswap(Queue &queue, EventPointer event, const std::string &name):
|
||||||
Routine<T>(queue, event, name, {"Xaxpy"}, precision_) {
|
Routine<T>(queue, event, name, {"Xaxpy"}, precision_) {
|
||||||
source_string_ =
|
source_string_ =
|
||||||
#include "../../kernels/level1/level1.opencl"
|
#include "../../kernels/level1/level1.opencl"
|
||||||
|
@ -64,7 +64,7 @@ StatusCode Xswap<T>::DoSwap(const size_t n,
|
||||||
|
|
||||||
// Retrieves the Xswap kernel from the compiled binary
|
// Retrieves the Xswap kernel from the compiled binary
|
||||||
try {
|
try {
|
||||||
auto& program = GetProgramFromCache();
|
const auto program = GetProgramFromCache();
|
||||||
auto kernel = Kernel(program, kernel_name);
|
auto kernel = Kernel(program, kernel_name);
|
||||||
|
|
||||||
// Sets the kernel arguments
|
// Sets the kernel arguments
|
||||||
|
@ -87,19 +87,16 @@ StatusCode Xswap<T>::DoSwap(const size_t n,
|
||||||
if (use_fast_kernel) {
|
if (use_fast_kernel) {
|
||||||
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
|
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
|
||||||
auto local = std::vector<size_t>{db_["WGS"]};
|
auto local = std::vector<size_t>{db_["WGS"]};
|
||||||
status = RunKernel(kernel, global, local);
|
status = RunKernel(kernel, global, local, event_);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
|
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
|
||||||
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
|
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
|
||||||
auto local = std::vector<size_t>{db_["WGS"]};
|
auto local = std::vector<size_t>{db_["WGS"]};
|
||||||
status = RunKernel(kernel, global, local);
|
status = RunKernel(kernel, global, local, event_);
|
||||||
}
|
}
|
||||||
if (ErrorIn(status)) { return status; }
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
// Waits for all kernels to finish
|
|
||||||
queue_.Finish();
|
|
||||||
|
|
||||||
// Succesfully finished the computation
|
// Succesfully finished the computation
|
||||||
return StatusCode::kSuccess;
|
return StatusCode::kSuccess;
|
||||||
} catch (...) { return StatusCode::kInvalidKernel; }
|
} catch (...) { return StatusCode::kInvalidKernel; }
|
||||||
|
|
|
@ -21,7 +21,7 @@ namespace clblast {
|
||||||
|
|
||||||
// Constructor: forwards to base class constructor
|
// Constructor: forwards to base class constructor
|
||||||
template <typename T>
|
template <typename T>
|
||||||
Xgbmv<T>::Xgbmv(Queue &queue, Event &event, const std::string &name):
|
Xgbmv<T>::Xgbmv(Queue &queue, EventPointer event, const std::string &name):
|
||||||
Xgemv<T>(queue, event, name) {
|
Xgemv<T>(queue, event, name) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -29,7 +29,7 @@ template <> const Precision Xgemv<double2>::precision_ = Precision::kComplexDoub
|
||||||
|
|
||||||
// Constructor: forwards to base class constructor
|
// Constructor: forwards to base class constructor
|
||||||
template <typename T>
|
template <typename T>
|
||||||
Xgemv<T>::Xgemv(Queue &queue, Event &event, const std::string &name):
|
Xgemv<T>::Xgemv(Queue &queue, EventPointer event, const std::string &name):
|
||||||
Routine<T>(queue, event, name, {"Pad", "Xgemv"}, precision_) {
|
Routine<T>(queue, event, name, {"Pad", "Xgemv"}, precision_) {
|
||||||
source_string_ =
|
source_string_ =
|
||||||
#include "../../kernels/level2/xgemv.opencl"
|
#include "../../kernels/level2/xgemv.opencl"
|
||||||
|
@ -136,7 +136,7 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
|
||||||
|
|
||||||
// Retrieves the Xgemv kernel from the compiled binary
|
// Retrieves the Xgemv kernel from the compiled binary
|
||||||
try {
|
try {
|
||||||
auto& program = GetProgramFromCache();
|
const auto program = GetProgramFromCache();
|
||||||
auto kernel = Kernel(program, kernel_name);
|
auto kernel = Kernel(program, kernel_name);
|
||||||
|
|
||||||
// Sets the kernel arguments
|
// Sets the kernel arguments
|
||||||
|
@ -162,12 +162,9 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
|
||||||
// Launches the kernel
|
// Launches the kernel
|
||||||
auto global = std::vector<size_t>{global_size};
|
auto global = std::vector<size_t>{global_size};
|
||||||
auto local = std::vector<size_t>{local_size};
|
auto local = std::vector<size_t>{local_size};
|
||||||
status = RunKernel(kernel, global, local);
|
status = RunKernel(kernel, global, local, event_);
|
||||||
if (ErrorIn(status)) { return status; }
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
// Waits for all kernels to finish
|
|
||||||
queue_.Finish();
|
|
||||||
|
|
||||||
// Succesfully finished the computation
|
// Succesfully finished the computation
|
||||||
return StatusCode::kSuccess;
|
return StatusCode::kSuccess;
|
||||||
} catch (...) { return StatusCode::kInvalidKernel; }
|
} catch (...) { return StatusCode::kInvalidKernel; }
|
||||||
|
|
|
@ -29,7 +29,7 @@ template <> const Precision Xger<double2>::precision_ = Precision::kComplexDoubl
|
||||||
|
|
||||||
// Constructor: forwards to base class constructor
|
// Constructor: forwards to base class constructor
|
||||||
template <typename T>
|
template <typename T>
|
||||||
Xger<T>::Xger(Queue &queue, Event &event, const std::string &name):
|
Xger<T>::Xger(Queue &queue, EventPointer event, const std::string &name):
|
||||||
Routine<T>(queue, event, name, {"Xger"}, precision_) {
|
Routine<T>(queue, event, name, {"Xger"}, precision_) {
|
||||||
source_string_ =
|
source_string_ =
|
||||||
#include "../../kernels/level2/level2.opencl"
|
#include "../../kernels/level2/level2.opencl"
|
||||||
|
@ -66,7 +66,7 @@ StatusCode Xger<T>::DoGer(const Layout layout,
|
||||||
|
|
||||||
// Retrieves the Xgemv kernel from the compiled binary
|
// Retrieves the Xgemv kernel from the compiled binary
|
||||||
try {
|
try {
|
||||||
auto& program = GetProgramFromCache();
|
const auto program = GetProgramFromCache();
|
||||||
auto kernel = Kernel(program, "Xger");
|
auto kernel = Kernel(program, "Xger");
|
||||||
|
|
||||||
// Sets the kernel arguments
|
// Sets the kernel arguments
|
||||||
|
@ -89,12 +89,9 @@ StatusCode Xger<T>::DoGer(const Layout layout,
|
||||||
auto a_two_ceiled = Ceil(CeilDiv(a_two, db_["WPT"]), db_["WGS2"]);
|
auto a_two_ceiled = Ceil(CeilDiv(a_two, db_["WPT"]), db_["WGS2"]);
|
||||||
auto global = std::vector<size_t>{a_one_ceiled, a_two_ceiled};
|
auto global = std::vector<size_t>{a_one_ceiled, a_two_ceiled};
|
||||||
auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
|
auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
|
||||||
status = RunKernel(kernel, global, local);
|
status = RunKernel(kernel, global, local, event_);
|
||||||
if (ErrorIn(status)) { return status; }
|
if (ErrorIn(status)) { return status; }
|
||||||
|
|
||||||
// Waits for all kernels to finish
|
|
||||||
queue_.Finish();
|
|
||||||
|
|
||||||
// Succesfully finished the computation
|
// Succesfully finished the computation
|
||||||
return StatusCode::kSuccess;
|
return StatusCode::kSuccess;
|
||||||
} catch (...) { return StatusCode::kInvalidKernel; }
|
} catch (...) { return StatusCode::kInvalidKernel; }
|
||||||
|
|
|
@ -20,7 +20,7 @@ namespace clblast {
|
||||||
|
|
||||||
// Constructor: forwards to base class constructor
|
// Constructor: forwards to base class constructor
|
||||||
template <typename T>
|
template <typename T>
|
||||||
Xgerc<T>::Xgerc(Queue &queue, Event &event, const std::string &name):
|
Xgerc<T>::Xgerc(Queue &queue, EventPointer event, const std::string &name):
|
||||||
Xger<T>(queue, event, name) {
|
Xger<T>(queue, event, name) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -20,7 +20,7 @@ namespace clblast {
|
||||||
|
|
||||||
// Constructor: forwards to base class constructor
|
// Constructor: forwards to base class constructor
|
||||||
template <typename T>
|
template <typename T>
|
||||||
Xgeru<T>::Xgeru(Queue &queue, Event &event, const std::string &name):
|
Xgeru<T>::Xgeru(Queue &queue, EventPointer event, const std::string &name):
|
||||||
Xger<T>(queue, event, name) {
|
Xger<T>(queue, event, name) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -21,7 +21,7 @@ namespace clblast {
|
||||||
|
|
||||||
// Constructor: forwards to base class constructor
|
// Constructor: forwards to base class constructor
|
||||||
template <typename T>
|
template <typename T>
|
||||||
Xhbmv<T>::Xhbmv(Queue &queue, Event &event, const std::string &name):
|
Xhbmv<T>::Xhbmv(Queue &queue, EventPointer event, const std::string &name):
|
||||||
Xgemv<T>(queue, event, name) {
|
Xgemv<T>(queue, event, name) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Reference in a new issue