Merge pull request #127 from CNugteren/development

Update to version 0.10.0
pull/156/head 0.10.0
Cedric Nugteren 2016-11-27 15:59:21 +01:00 committed by GitHub
commit e52f9a9ff2
278 changed files with 17037 additions and 8161 deletions

View File

@ -1,4 +1,21 @@
Version 0.10.0
- Updated to version 8.0 of the CLCudaAPI C++11 OpenCL header
- Changed the enums in the C API to avoid potential name clashes with external code
- Added a Netlib CBLAS compatible API (not recommended for full control over performance)
- Greatly improved the way exceptions are handled in the library (thanks to 'intelfx')
- Improved performance of GEMM kernels for small sizes by using a direct single-kernel implementation
- Fixed a bug in the tests and samples related to waiting for an invalid event
- Fixed a bug in the SYRK/SYR2K/HERK/HER2K routines that would occur with specific tuning parameters
- Fixed a bug in the TRMM routine that would overwrite input data before consuming everything
- Added support for compilation under Visual Studio 2013 (MSVC++ 12.0)
- Added an option to set OpenCL compiler options through the env variable CLBLAST_BUILD_OPTIONS
- Added an option to run tuned kernels multiple times to average execution times
- Added an option to build a static version of the library
- Made it possible to use the command-line environmental vars everywhere and without re-running CMake
- Various minor fixes and enhancements
- Added tuned parameters for various devices (see README)
Version 0.9.0
- Updated to version 6.0 of the CLCudaAPI C++11 OpenCL header
- Significantly improved the performance of rotated GEMV computations

View File

@ -18,14 +18,16 @@ set(CMAKE_USER_MAKE_RULES_OVERRIDE_CXX ${CMAKE_CURRENT_SOURCE_DIR}/cmake/cxx_fla
# CMake project details
project("clblast" C CXX)
set(clblast_VERSION_MAJOR 0)
set(clblast_VERSION_MINOR 9)
set(clblast_VERSION_MINOR 10)
set(clblast_VERSION_PATCH 0)
# Options and their default values
option(BUILD_SHARED_LIBS "Build a shared (ON) or static library (OFF)" ON)
option(SAMPLES "Enable compilation of the examples" OFF)
option(TUNERS "Enable compilation of the tuners" OFF)
option(CLIENTS "Enable compilation of the clients to test and compare performance" OFF)
option(TESTS "Enable compilation of the correctness tests" OFF)
option(NETLIB "Enable compilation of the CBLAS Netlib API" OFF)
# Compile in verbose mode with additional diagnostic messages
option(VERBOSE "Compile in verbose mode for additional diagnostic messages" OFF)
@ -64,12 +66,24 @@ elseif(MSVC)
endif()
endif()
# DLL Settings
if(MSVC)
if(BUILD_SHARED_LIBS)
add_definitions(" /DCLBLAST_DLL")
endif()
endif(MSVC)
# C++ compiler settings
if(MSVC)
set(FLAGS "/Ox")
set(FLAGS "${FLAGS} /wd4715")
else()
set(FLAGS "-O3 -std=c++11")
set(FLAGS "-std=c++11")
if(VERBOSE)
set(FLAGS "${FLAGS} -O1 -g")
else()
set(FLAGS "${FLAGS} -O3")
endif()
if(CMAKE_CXX_COMPILER_ID STREQUAL GNU)
set(FLAGS "${FLAGS} -Wall -Wno-comment -Wno-return-type -Wno-switch -Wno-missing-noreturn")
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9.0)
@ -134,9 +148,13 @@ endif()
# ==================================================================================================
# Sets the supported routines and the used kernels. New routines and kernels should be added here.
set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger xgemm xgemv)
set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger
xgemm xgemm_direct xgemv)
set(SAMPLE_PROGRAMS_CPP sgemm)
set(SAMPLE_PROGRAMS_C sasum dgemv sgemm haxpy cache)
if(NETLIB)
set(SAMPLE_PROGRAMS_C ${SAMPLE_PROGRAMS_C} sgemm_netlib)
endif()
set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv
xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
@ -151,12 +169,16 @@ set(PRECISIONS 32 64 3232 6464 16)
set(SOURCES
src/database/database.cpp
src/routines/common.cpp
src/utilities/clblast_exceptions.cpp
src/utilities/utilities.cpp
src/cache.cpp
src/clblast.cpp
src/clblast_c.cpp
src/routine.cpp
src/utilities.cpp
)
if(NETLIB)
set(SOURCES ${SOURCES} src/clblast_netlib_c.cpp)
endif()
foreach(ROUTINE ${LEVEL1_ROUTINES})
set(SOURCES ${SOURCES} src/routines/level1/${ROUTINE}.cpp)
endforeach()
@ -171,7 +193,12 @@ foreach(ROUTINE ${LEVELX_ROUTINES})
endforeach()
# Creates and links the library
add_library(clblast SHARED ${SOURCES})
if(BUILD_SHARED_LIBS)
add_library(clblast SHARED ${SOURCES})
else(BUILD_SHARED_LIBS)
add_library(clblast STATIC ${SOURCES})
endif()
target_link_libraries(clblast ${OPENCL_LIBRARIES})
# Includes directories: CLBlast and OpenCL
@ -183,7 +210,9 @@ target_include_directories(clblast PUBLIC
# Sets the proper __declspec(dllexport) keyword for Visual Studio when the library is built
if(MSVC)
target_compile_definitions(clblast PRIVATE COMPILING_DLL=1) # requires at least CMake 2.8.11
if(BUILD_SHARED_LIBS)
target_compile_definitions(clblast PRIVATE COMPILING_DLL=1) # requires at least CMake 2.8.11
endif()
endif()
# Installs the library
@ -191,19 +220,19 @@ install(TARGETS clblast EXPORT CLBlast DESTINATION lib)
install(FILES include/clblast.h DESTINATION include)
install(FILES include/clblast_c.h DESTINATION include)
install(FILES include/clblast_half.h DESTINATION include)
if(NETLIB)
install(FILES include/clblast_netlib_c.h DESTINATION include)
endif()
# Installs the config for find_package in dependent projects
install(EXPORT CLBlast DESTINATION lib/cmake/CLBLast FILE CLBlastConfig.cmake)
# ==================================================================================================
# Sets a default platform ($DEVICEPLATFORM) and device ($CLBLAST_DEVICE) to run tuners and tests on
set(DEVICEPLATFORM )
if(DEFINED ENV{CLBLAST_DEVICE})
set(DEVICEPLATFORM ${DEVICEPLATFORM} -device $ENV{CLBLAST_DEVICE})
endif()
if(DEFINED ENV{CLBLAST_PLATFORM})
set(DEVICEPLATFORM ${DEVICEPLATFORM} -platform $ENV{CLBLAST_PLATFORM})
# Install pkg-config file on Linux
if(UNIX)
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/clblast.pc.in"
"${CMAKE_CURRENT_BINARY_DIR}/clblast.pc" @ONLY IMMEDIATE)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/clblast.pc
DESTINATION lib/pkgconfig)
endif()
# ==================================================================================================
@ -239,7 +268,7 @@ if(TUNERS)
# Visual Studio requires the sources of non-exported objects/libraries
set(TUNERS_COMMON )
if(MSVC)
set(TUNERS_COMMON ${TUNERS_COMMON} src/utilities.cpp)
set(TUNERS_COMMON ${TUNERS_COMMON} src/utilities/utilities.cpp)
endif()
# Adds tuning executables
@ -255,7 +284,7 @@ if(TUNERS)
set(ALLTUNERSDEPENDS )
foreach(KERNEL ${KERNELS})
foreach(PRECISION ${PRECISIONS})
set(ALLTUNERS ${ALLTUNERS} COMMAND clblast_tuner_${KERNEL} -precision ${PRECISION} ${DEVICEPLATFORM})
set(ALLTUNERS ${ALLTUNERS} COMMAND clblast_tuner_${KERNEL} -precision ${PRECISION})
endforeach()
set(ALLTUNERSDEPENDS clblast_tuner_${KERNEL})
endforeach()
@ -272,9 +301,10 @@ if(CLIENTS OR TESTS)
set(REF_INCLUDES )
set(REF_LIBRARIES )
if(CLBLAS_FOUND)
find_package(Threads)
set(REF_LIBRARIES ${REF_LIBRARIES} ${CLBLAS_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
set(REF_INCLUDES ${REF_INCLUDES} ${CLBLAS_INCLUDE_DIRS})
set(REF_LIBRARIES ${REF_LIBRARIES} ${CLBLAS_LIBRARIES})
if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
if(MSVC)
add_definitions(" /DCLBLAST_REF_CLBLAS")
else()
add_definitions(" -DCLBLAST_REF_CLBLAS")
@ -283,7 +313,7 @@ if(CLIENTS OR TESTS)
if(CBLAS_FOUND)
set(REF_INCLUDES ${REF_INCLUDES} ${CBLAS_INCLUDE_DIRS})
set(REF_LIBRARIES ${REF_LIBRARIES} ${CBLAS_LIBRARIES})
if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
if(MSVC)
add_definitions(" /DCLBLAST_REF_CBLAS")
else()
add_definitions(" -DCLBLAST_REF_CBLAS")
@ -301,7 +331,7 @@ if(CLIENTS)
# Visual Studio requires the sources of non-exported objects/libraries
set(CLIENTS_COMMON )
if(MSVC)
set(CLIENTS_COMMON ${CLIENTS_COMMON} src/utilities.cpp test/performance/client.cpp)
set(CLIENTS_COMMON ${CLIENTS_COMMON} src/utilities/utilities.cpp test/performance/client.cpp)
else()
# Creates the common performance-tests objects (requires CMake 2.8.8)
add_library(test_performance_common OBJECT test/performance/client.cpp)
@ -309,7 +339,7 @@ if(CLIENTS)
# Adds CLBlast's interface include paths because we can't link to CLBlast here
target_include_directories(test_performance_common PRIVATE
$<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES>
${clblast_SOURCE_DIR})
${clblast_SOURCE_DIR} ${REF_INCLUDES})
set(CLIENTS_COMMON ${CLIENTS_COMMON} $<TARGET_OBJECTS:test_performance_common>)
endif()
@ -348,7 +378,7 @@ if(TESTS)
# Visual Studio requires the sources of non-exported objects/libraries
set(TESTS_COMMON )
if(MSVC)
set(TESTS_COMMON ${TESTS_COMMON} src/utilities.cpp
set(TESTS_COMMON ${TESTS_COMMON} src/utilities/utilities.cpp
test/correctness/tester.cpp test/correctness/testblas.cpp)
else()
# Creates the common correctness-tests objects (requires CMake 2.8.8)
@ -356,7 +386,7 @@ if(TESTS)
test/correctness/tester.cpp test/correctness/testblas.cpp)
target_include_directories(test_correctness_common PUBLIC
$<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES>
${clblast_SOURCE_DIR})
${clblast_SOURCE_DIR} ${REF_INCLUDES})
set(TESTS_COMMON ${TESTS_COMMON} $<TARGET_OBJECTS:test_correctness_common>)
endif()
@ -381,14 +411,14 @@ if(TESTS)
target_link_libraries(clblast_test_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
install(TARGETS clblast_test_${ROUTINE} DESTINATION bin)
target_include_directories(clblast_test_${ROUTINE} PUBLIC ${clblast_SOURCE_DIR} ${REF_INCLUDES})
add_test(clblast_test_${ROUTINE} clblast_test_${ROUTINE} ${DEVICEPLATFORM})
add_test(clblast_test_${ROUTINE} clblast_test_${ROUTINE})
endforeach()
# Adds 'alltests' target: runs all tests
set(ALLTESTS )
set(ALLTESTSDEPENDS )
foreach(ROUTINE ${ROUTINES})
set(ALLTESTS ${ALLTESTS} COMMAND clblast_test_${ROUTINE} ${DEVICEPLATFORM})
set(ALLTESTS ${ALLTESTS} COMMAND clblast_test_${ROUTINE})
set(ALLTESTSDEPENDS clblast_test_${ROUTINE})
endforeach()
add_custom_target(alltests ${ALLTESTS} DEPENDS ${ALLTESTSDEPENDS})

20
CONTRIBUTING.md 100644
View File

@ -0,0 +1,20 @@
CLBlast: Contributing guidelines
================
For information about the CLBlast library, see the [README](README.md) file instead.
Tuning results
-------------
A [dedicated GitHub issue](https://github.com/CNugteren/CLBlast/issues/1) is available to post new tuning results. If you compiled with the tuners (see the [README](README.md) for instructions), ran one of the tuners on your device (or all perhaps?), and feel that these results should be included in the next release of CLBlast, please post them there. You can do this by attaching the JSON files to the issue (archived in a .ZIP file).
Code improvements and additions
-------------
Pull requests are welcome as long as they:
* Contain unit additions or modifications
* Follow the CLBlast coding style, which is loosely based on the [Google C++ style guide](https://google-styleguide.googlecode.com/svn/trunk/cppguide.html) and the Effective C++ books by Scott Meyers. We use an indentation width of 2 spaces and a max-width of 100 characters.
* Are made against the `development` branch.

207
LICENSE
View File

@ -1,14 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
Copyright (c) 2015 Cedric Nugteren
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
1. Definitions.
http://www.apache.org/licenses/LICENSE-2.0
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "{}"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2015 Cedric Nugteren
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@ -48,7 +48,7 @@ The pre-requisites for compilation of CLBlast are:
- Clang 3.3 or newer
- AppleClang 5.0 or newer
- ICC 14.0 or newer
- MSVC (Visual Studio) 2015 or newer
- MSVC (Visual Studio) 2013 or newer
* An OpenCL 1.1 or newer library, for example:
- Apple OpenCL
- NVIDIA CUDA SDK
@ -74,6 +74,10 @@ A custom installation folder can be specified when calling CMake:
cmake -DCMAKE_INSTALL_PREFIX=/path/to/install/directory ..
Building a static version of the library instead of a shared one (.dylib/.so/.dll) can be done by disabling the `BUILD_SHARED_LIBS` option when calling CMake. For example:
cmake -DBUILD_SHARED_LIBS=OFF ..
Using the library
-------------
@ -90,6 +94,12 @@ Afterwards, any of CLBlast's routines can be called directly: there is no need t
cmake -DSAMPLES=ON ..
Furthermore, it is possible to optionally set an OS environmental variable `CLBLAST_BUILD_OPTIONS` to pass specific build options to the OpenCL compiler.
There is also a Netlib CBLAS C API available. This is however not recommended for full control over performance, since at every call it will copy all buffers to and from the OpenCL device. Especially for level 1 and level 2 BLAS functions performance will be impacted severely. However, it can be useful if you don't want to touch OpenCL at all. You can set the default device and platform by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables. This API can be used as follows after providing the `-DNETLIB=ON` flag to CMake:
#include <clblast_netlib_c.h>
Using the tuners (optional)
-------------
@ -105,8 +115,9 @@ The CLBlast library will be tuned in the future for the most commonly used OpenC
- GeForce GTX 750 Ti
- GeForce GTX 980
- GeForce GTX 1070
- GeForce GTX Titan
- GeForce GTX Titan X
- GeForce GTX TITAN
- GeForce GTX TITAN Black
- GeForce GTX TITAN X
- Tesla K20m
- Tesla K40m
* AMD GPUs:
@ -115,10 +126,12 @@ The CLBlast library will be tuned in the future for the most commonly used OpenC
- Oland
- Pitcairn
- Tahiti
- Tonga
* Intel GPUs:
- HD Graphics 530
- HD Graphics Haswell Ultrabook GT2 Mobile
- HD Graphics 5500 BroadWell U-Processor GT2
- HD Graphics Haswell Ultrabook GT2 Mobile
- HD Graphics IvyBridge M GT2
- HD Graphics Skylake ULT GT2
- Iris
- Iris Pro
@ -134,9 +147,9 @@ If your device is not (yet) among this list or if you want to tune CLBlast for s
cmake -DTUNERS=ON ..
Note that CLBlast's tuners are based on the [CLTune auto-tuning library](https://github.com/CNugteren/CLTune), which has to be installed separately (requires version 2.3.1 or higher).
Note that CLBlast's tuners are based on the [CLTune auto-tuning library](https://github.com/CNugteren/CLTune), which has to be installed separately (requires version 2.6.0 or higher).
Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels correspond to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance. Running `make alltuners` runs all tuners for all precisions in one go. You can set the default device and platform for `alltuners` by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables before running CMake.
Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels correspond to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance. Running `make alltuners` runs all tuners for all precisions in one go. You can set the default device and platform for `alltuners` by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables.
The tuners output a JSON-file with the results. The best results need to be added to `src/database/kernels/xxxxx.hpp` in the appropriate section. However, this can be done automatically based on the JSON-data using a Python script in `scripts/database/database.py`. If you want the found parameters to be included in future releases of CLBlast, please attach the JSON files to the corresponding issue on GitHub or [email the main author](http://www.cedricnugteren.nl).
@ -168,7 +181,7 @@ To build these tests, another BLAS library is needed to serve as a reference. Th
Afterwards, executables in the form of `clblast_test_xxxxx` are available, in which `xxxxx` is the name of a routine (e.g. `xgemm`). Note that CLBlast is tested for correctness against [clBLAS](http://github.com/clMathLibraries/clBLAS) and/or a regular CPU BLAS library. If both are installed on your system, setting the command-line option `-clblas 1` or `-cblas 1` will select the library to test against for the `clblast_test_xxxxx` executables. All tests have a `-verbose` option to enable additional diagnostic output. They also have a `-full_test` option to increase coverage further.
All tests can be run directly together in one go through the `make alltests` target or using CTest (`make test` or `ctest`). In the latter case the output is less verbose. Both cases allow you to set the default device and platform to non-zero by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables before running CMake.
All tests can be run directly together in one go through the `make alltests` target or using CTest (`make test` or `ctest`). In the latter case the output is less verbose. Both cases allow you to set the default device and platform to non-zero by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables. Further options can be supplied through the `CLBLAST_ARGUMENTS` environmental variable (e.g. export CLBLAST_ARGUMENTS="-full_test -cblas 1 -clblas 0" on a UNIX system).
Compiling the performance tests/clients (optional)
@ -277,11 +290,11 @@ The `samples/haxpy.c` example shows how to use these convenience functions when
Contributing
-------------
Contributions are welcome in the form of tuning results for OpenCL devices previously untested. Furthermore, merge requests are welcome as long as they contain unit additions or modifications. Furthermore, they should follow the CLBlast coding style, which is based on the [Google C++ style guide](https://google-styleguide.googlecode.com/svn/trunk/cppguide.html) and the Effective C++ books by Scott Meyers.
Contributions are welcome in the form of tuning results for OpenCL devices previously untested or pull requests. See [the contributing guidelines](CONTRIBUTING.md) for more details.
The contributing authors (code, pull requests, testing) so far are:
* [Cedric Nugteren](http://www.cedricnugteren.nl) - main author
* [Cedric Nugteren](http://cnugteren.github.io) - main author
* [Anton Lokhmotov](https://github.com/psyhtest)
* [Dragan Djuric](https://github.com/blueberry)
* [Marco Hutter](http://marco-hutter.de/)
@ -289,6 +302,7 @@ The contributing authors (code, pull requests, testing) so far are:
* [Gian-Carlo Pascutto](https://github.com/gcp)
* [Ivan Shapovalov](https://github.com/intelfx)
* [Dimitri Van Assche](https://github.com/dvasschemacq)
* [Shehzan Mohammed](https://shehzan10.github.io)
Tuning and testing on a variety of OpenCL devices was made possible by:
@ -296,9 +310,10 @@ Tuning and testing on a variety of OpenCL devices was made possible by:
* [ASCI DAS4 and DAS5](http://www.cs.vu.nl/das4/)
* [dividiti](http://www.dividiti.com)
* [SURFsara HPC center](http://www.surfsara.com)
* [ArrayFire](http://arrayfire.org)
Support us
-------------
This project started in March 2015 as an evenings and weekends free-time project next to a full-time job for Cedric Nugteren. If you are in the position to support the project by OpenCL-hardware donations or otherwise, please find contact information on the [website of the main author](http://www.cedricnugteren.nl).
This project started in March 2015 as an evenings and weekends free-time project next to a full-time job for Cedric Nugteren. If you are in the position to support the project by OpenCL-hardware donations or otherwise, please find contact information on the [website of the main author](http://cnugteren.github.io).

10
clblast.pc.in 100644
View File

@ -0,0 +1,10 @@
# pkg-config template for CLBlast. The @VAR@ placeholders are substituted by
# CMake's configure_file(... @ONLY) at configure time (see CMakeLists.txt),
# producing clblast.pc which is installed into lib/pkgconfig on UNIX systems.
prefix=@CMAKE_INSTALL_PREFIX@
exec_prefix=${prefix}
includedir=${prefix}/include
libdir=${exec_prefix}/lib
# Package metadata plus the compile and link flags consumers of the library need.
Name: CLBlast
Description: CLBlast is a modern, lightweight, performant and tunable OpenCL BLAS library written in C++11
Version: @clblast_VERSION_MAJOR@.@clblast_VERSION_MINOR@.@clblast_VERSION_PATCH@
Libs: -L${libdir} -lclblast
Cflags: -I${includedir}

File diff suppressed because it is too large Load Diff

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -27,8 +27,8 @@
// Exports library functions under Windows when building a DLL. See also:
// https://msdn.microsoft.com/en-us/library/a90k134d.aspx
#ifdef _WIN32
#ifdef COMPILING_DLL
#if defined(_WIN32) && defined(CLBLAST_DLL)
#if defined(COMPILING_DLL)
#define PUBLIC_API __declspec(dllexport)
#else
#define PUBLIC_API __declspec(dllimport)
@ -46,14 +46,34 @@ enum class StatusCode {
// Status codes in common with the OpenCL standard
kSuccess = 0, // CL_SUCCESS
kOpenCLCompilerNotAvailable= -3, // CL_COMPILER_NOT_AVAILABLE
kTempBufferAllocFailure = -4, // CL_MEM_OBJECT_ALLOCATION_FAILURE
kBuildProgramFailure = -11, // CL_BUILD_PROGRAM_FAILURE: OpenCL compilation error
kOpenCLOutOfResources = -5, // CL_OUT_OF_RESOURCES
kOpenCLOutOfHostMemory = -6, // CL_OUT_OF_HOST_MEMORY
kOpenCLBuildProgramFailure = -11, // CL_BUILD_PROGRAM_FAILURE: OpenCL compilation error
kInvalidValue = -30, // CL_INVALID_VALUE
kInvalidCommandQueue = -36, // CL_INVALID_COMMAND_QUEUE
kInvalidMemObject = -38, // CL_INVALID_MEM_OBJECT
kInvalidBinary = -42, // CL_INVALID_BINARY
kInvalidBuildOptions = -43, // CL_INVALID_BUILD_OPTIONS
kInvalidProgram = -44, // CL_INVALID_PROGRAM
kInvalidProgramExecutable = -45, // CL_INVALID_PROGRAM_EXECUTABLE
kInvalidKernelName = -46, // CL_INVALID_KERNEL_NAME
kInvalidKernelDefinition = -47, // CL_INVALID_KERNEL_DEFINITION
kInvalidKernel = -48, // CL_INVALID_KERNEL
kInvalidArgIndex = -49, // CL_INVALID_ARG_INDEX
kInvalidArgValue = -50, // CL_INVALID_ARG_VALUE
kInvalidArgSize = -51, // CL_INVALID_ARG_SIZE
kInvalidKernelArgs = -52, // CL_INVALID_KERNEL_ARGS
kInvalidLocalNumDimensions = -53, // CL_INVALID_WORK_DIMENSION: Too many thread dimensions
kInvalidLocalThreadsTotal = -54, // CL_INVALID_WORK_GROUP_SIZE: Too many threads in total
kInvalidLocalThreadsDim = -55, // CL_INVALID_WORK_ITEM_SIZE: ... or for a specific dimension
kInvalidTempBufferSize = -61, // CL_INVALID_BUFFER_SIZE
kInvalidGlobalOffset = -56, // CL_INVALID_GLOBAL_OFFSET
kInvalidEventWaitList = -57, // CL_INVALID_EVENT_WAIT_LIST
kInvalidEvent = -58, // CL_INVALID_EVENT
kInvalidOperation = -59, // CL_INVALID_OPERATION
kInvalidBufferSize = -61, // CL_INVALID_BUFFER_SIZE
kInvalidGlobalWorkSize = -63, // CL_INVALID_GLOBAL_WORK_SIZE
// Status codes in common with the clBLAS library
kNotImplemented = -1024, // Routine or functionality not implemented yet
@ -75,13 +95,14 @@ enum class StatusCode {
kInsufficientMemoryY = -1007, // Vector Y's OpenCL buffer is too small
// Custom additional status codes for CLBlast
kKernelLaunchError = -2048, // Problem occurred when enqueuing the kernel
kKernelRunError = -2047, // Problem occurred while running the kernel
kInvalidLocalMemUsage = -2046, // Not enough local memory available on this device
kNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device
kNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device
kInvalidVectorScalar = -2043, // The unit-sized vector is not a valid OpenCL buffer
kInsufficientMemoryScalar = -2042, // The unit-sized vector's OpenCL buffer is too small
kDatabaseError = -2041, // Entry for the device was not found in the database
kUnknownError = -2040, // A catch-all error code representing an unspecified error
kUnexpectedError = -2039, // A catch-all error code representing an unexpected exception
};
// Matrix layout and transpose types

File diff suppressed because it is too large Load Diff

View File

@ -25,6 +25,11 @@
#include <CL/opencl.h>
#endif
// MSVC 2013 doesn't fully support C99
#ifdef _MSC_VER
#define inline __inline
#endif
// =================================================================================================
// Host data-type for half-precision floating-point (16-bit). This is based on the OpenCL type,

View File

@ -0,0 +1,920 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains the Netlib CBLAS interface to the CLBlast BLAS routines, performing all buffer
// copies automatically and running on the default OpenCL platform and device. For full control over
// performance, it is advised to use the regular clblast.h or clblast_c.h headers instead.
//
// =================================================================================================
#ifndef CLBLAST_CLBLAST_NETLIB_C_H_
#define CLBLAST_CLBLAST_NETLIB_C_H_
// Exports library functions under Windows when building a DLL. See also:
// https://msdn.microsoft.com/en-us/library/a90k134d.aspx
#if defined(_WIN32) && defined(CLBLAST_DLL)
#if defined(COMPILING_DLL)
#define PUBLIC_API __declspec(dllexport)
#else
#define PUBLIC_API __declspec(dllimport)
#endif
#else
#define PUBLIC_API
#endif
// The C interface
#ifdef __cplusplus
extern "C" {
#endif
// =================================================================================================
// Matrix layout and transpose types. The numeric values (101..142) match the Netlib CBLAS
// numbering, which is what makes the CBLAS aliases defined below direct drop-in replacements.
typedef enum CLBlastLayout_ { CLBlastLayoutRowMajor = 101,
CLBlastLayoutColMajor = 102 } CLBlastLayout;
typedef enum CLBlastTranspose_ { CLBlastTransposeNo = 111, CLBlastTransposeYes = 112,
CLBlastTransposeConjugate = 113 } CLBlastTranspose;
typedef enum CLBlastTriangle_ { CLBlastTriangleUpper = 121,
CLBlastTriangleLower = 122 } CLBlastTriangle;
typedef enum CLBlastDiagonal_ { CLBlastDiagonalNonUnit = 131,
CLBlastDiagonalUnit = 132 } CLBlastDiagonal;
typedef enum CLBlastSide_ { CLBlastSideLeft = 141, CLBlastSideRight = 142 } CLBlastSide;
// For full compatibility with CBLAS: alias the CLBlast enums and constants to the standard
// CBLAS type and constant names, so code written against a CBLAS header compiles unchanged.
typedef CLBlastLayout CBLAS_ORDER;
typedef CLBlastTranspose CBLAS_TRANSPOSE;
typedef CLBlastTriangle CBLAS_UPLO;
typedef CLBlastDiagonal CBLAS_DIAG;
typedef CLBlastSide CBLAS_SIDE;
#define CblasRowMajor CLBlastLayoutRowMajor
#define CblasColMajor CLBlastLayoutColMajor
#define CblasNoTrans CLBlastTransposeNo
#define CblasTrans CLBlastTransposeYes
#define CblasConjTrans CLBlastTransposeConjugate
#define CblasUpper CLBlastTriangleUpper
#define CblasLower CLBlastTriangleLower
#define CblasNonUnit CLBlastDiagonalNonUnit
#define CblasUnit CLBlastDiagonalUnit
#define CblasLeft CLBlastSideLeft
#define CblasRight CLBlastSideRight
// =================================================================================================
// BLAS level-1 (vector-vector) routines
// =================================================================================================
// Generate Givens plane rotation: SROTG/DROTG
void PUBLIC_API cblas_srotg(float* sa,
float* sb,
float* sc,
float* ss);
void PUBLIC_API cblas_drotg(double* sa,
double* sb,
double* sc,
double* ss);
// Generate modified Givens plane rotation: SROTMG/DROTMG
void PUBLIC_API cblas_srotmg(float* sd1,
float* sd2,
float* sx1,
const float sy1,
float* sparam);
void PUBLIC_API cblas_drotmg(double* sd1,
double* sd2,
double* sx1,
const double sy1,
double* sparam);
// Apply Givens plane rotation: SROT/DROT
void PUBLIC_API cblas_srot(const int n,
float* x, const int x_inc,
float* y, const int y_inc,
const float cos,
const float sin);
void PUBLIC_API cblas_drot(const int n,
double* x, const int x_inc,
double* y, const int y_inc,
const double cos,
const double sin);
// Apply modified Givens plane rotation: SROTM/DROTM
void PUBLIC_API cblas_srotm(const int n,
float* x, const int x_inc,
float* y, const int y_inc,
float* sparam);
void PUBLIC_API cblas_drotm(const int n,
double* x, const int x_inc,
double* y, const int y_inc,
double* sparam);
// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP
void PUBLIC_API cblas_sswap(const int n,
float* x, const int x_inc,
float* y, const int y_inc);
void PUBLIC_API cblas_dswap(const int n,
double* x, const int x_inc,
double* y, const int y_inc);
void PUBLIC_API cblas_cswap(const int n,
void* x, const int x_inc,
void* y, const int y_inc);
void PUBLIC_API cblas_zswap(const int n,
void* x, const int x_inc,
void* y, const int y_inc);
// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL
void PUBLIC_API cblas_sscal(const int n,
const float alpha,
float* x, const int x_inc);
void PUBLIC_API cblas_dscal(const int n,
const double alpha,
double* x, const int x_inc);
void PUBLIC_API cblas_cscal(const int n,
const void* alpha,
void* x, const int x_inc);
void PUBLIC_API cblas_zscal(const int n,
const void* alpha,
void* x, const int x_inc);
// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY
void PUBLIC_API cblas_scopy(const int n,
const float* x, const int x_inc,
float* y, const int y_inc);
void PUBLIC_API cblas_dcopy(const int n,
const double* x, const int x_inc,
double* y, const int y_inc);
void PUBLIC_API cblas_ccopy(const int n,
const void* x, const int x_inc,
void* y, const int y_inc);
void PUBLIC_API cblas_zcopy(const int n,
const void* x, const int x_inc,
void* y, const int y_inc);
// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY
void PUBLIC_API cblas_saxpy(const int n,
const float alpha,
const float* x, const int x_inc,
float* y, const int y_inc);
void PUBLIC_API cblas_daxpy(const int n,
const double alpha,
const double* x, const int x_inc,
double* y, const int y_inc);
void PUBLIC_API cblas_caxpy(const int n,
const void* alpha,
const void* x, const int x_inc,
void* y, const int y_inc);
void PUBLIC_API cblas_zaxpy(const int n,
const void* alpha,
const void* x, const int x_inc,
void* y, const int y_inc);
// Dot product of two vectors: SDOT/DDOT/HDOT
float PUBLIC_API cblas_sdot(const int n,
const float* x, const int x_inc,
const float* y, const int y_inc);
double PUBLIC_API cblas_ddot(const int n,
const double* x, const int x_inc,
const double* y, const int y_inc);
// Dot product of two complex vectors: CDOTU/ZDOTU
void PUBLIC_API cblas_cdotu_sub(const int n,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* dot);
void PUBLIC_API cblas_zdotu_sub(const int n,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* dot);
// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC
void PUBLIC_API cblas_cdotc_sub(const int n,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* dot);
void PUBLIC_API cblas_zdotc_sub(const int n,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* dot);
// Euclidean norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2
float PUBLIC_API cblas_snrm2(const int n,
const float* x, const int x_inc);
double PUBLIC_API cblas_dnrm2(const int n,
const double* x, const int x_inc);
float PUBLIC_API cblas_scnrm2(const int n,
const void* x, const int x_inc);
double PUBLIC_API cblas_dznrm2(const int n,
const void* x, const int x_inc);
// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM
float PUBLIC_API cblas_sasum(const int n,
const float* x, const int x_inc);
double PUBLIC_API cblas_dasum(const int n,
const double* x, const int x_inc);
float PUBLIC_API cblas_scasum(const int n,
const void* x, const int x_inc);
double PUBLIC_API cblas_dzasum(const int n,
const void* x, const int x_inc);
// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM
float PUBLIC_API cblas_ssum(const int n,
const float* x, const int x_inc);
double PUBLIC_API cblas_dsum(const int n,
const double* x, const int x_inc);
float PUBLIC_API cblas_scsum(const int n,
const void* x, const int x_inc);
double PUBLIC_API cblas_dzsum(const int n,
const void* x, const int x_inc);
// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX
int PUBLIC_API cblas_isamax(const int n,
const float* x, const int x_inc);
int PUBLIC_API cblas_idamax(const int n,
const double* x, const int x_inc);
int PUBLIC_API cblas_icamax(const int n,
const void* x, const int x_inc);
int PUBLIC_API cblas_izamax(const int n,
const void* x, const int x_inc);
// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX
int PUBLIC_API cblas_ismax(const int n,
const float* x, const int x_inc);
int PUBLIC_API cblas_idmax(const int n,
const double* x, const int x_inc);
int PUBLIC_API cblas_icmax(const int n,
const void* x, const int x_inc);
int PUBLIC_API cblas_izmax(const int n,
const void* x, const int x_inc);
// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN
int PUBLIC_API cblas_ismin(const int n,
const float* x, const int x_inc);
int PUBLIC_API cblas_idmin(const int n,
const double* x, const int x_inc);
int PUBLIC_API cblas_icmin(const int n,
const void* x, const int x_inc);
int PUBLIC_API cblas_izmin(const int n,
const void* x, const int x_inc);
// =================================================================================================
// BLAS level-2 (matrix-vector) routines
// =================================================================================================
// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV
void PUBLIC_API cblas_sgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const int m, const int n,
const float alpha,
const float* a, const int a_ld,
const float* x, const int x_inc,
const float beta,
float* y, const int y_inc);
void PUBLIC_API cblas_dgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const int m, const int n,
const double alpha,
const double* a, const int a_ld,
const double* x, const int x_inc,
const double beta,
double* y, const int y_inc);
void PUBLIC_API cblas_cgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const int m, const int n,
const void* alpha,
const void* a, const int a_ld,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc);
void PUBLIC_API cblas_zgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const int m, const int n,
const void* alpha,
const void* a, const int a_ld,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc);
// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV
void PUBLIC_API cblas_sgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const int m, const int n, const int kl, const int ku,
const float alpha,
const float* a, const int a_ld,
const float* x, const int x_inc,
const float beta,
float* y, const int y_inc);
void PUBLIC_API cblas_dgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const int m, const int n, const int kl, const int ku,
const double alpha,
const double* a, const int a_ld,
const double* x, const int x_inc,
const double beta,
double* y, const int y_inc);
void PUBLIC_API cblas_cgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const int m, const int n, const int kl, const int ku,
const void* alpha,
const void* a, const int a_ld,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc);
void PUBLIC_API cblas_zgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const int m, const int n, const int kl, const int ku,
const void* alpha,
const void* a, const int a_ld,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc);
// Hermitian matrix-vector multiplication: CHEMV/ZHEMV
void PUBLIC_API cblas_chemv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const void* alpha,
const void* a, const int a_ld,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc);
void PUBLIC_API cblas_zhemv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const void* alpha,
const void* a, const int a_ld,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc);
// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV
void PUBLIC_API cblas_chbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n, const int k,
const void* alpha,
const void* a, const int a_ld,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc);
void PUBLIC_API cblas_zhbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n, const int k,
const void* alpha,
const void* a, const int a_ld,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc);
// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV
void PUBLIC_API cblas_chpmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const void* alpha,
const void* ap,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc);
void PUBLIC_API cblas_zhpmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const void* alpha,
const void* ap,
const void* x, const int x_inc,
const void* beta,
void* y, const int y_inc);
// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV
void PUBLIC_API cblas_ssymv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const float alpha,
const float* a, const int a_ld,
const float* x, const int x_inc,
const float beta,
float* y, const int y_inc);
void PUBLIC_API cblas_dsymv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const double alpha,
const double* a, const int a_ld,
const double* x, const int x_inc,
const double beta,
double* y, const int y_inc);
// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV
void PUBLIC_API cblas_ssbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n, const int k,
const float alpha,
const float* a, const int a_ld,
const float* x, const int x_inc,
const float beta,
float* y, const int y_inc);
void PUBLIC_API cblas_dsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n, const int k,
const double alpha,
const double* a, const int a_ld,
const double* x, const int x_inc,
const double beta,
double* y, const int y_inc);
// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV
void PUBLIC_API cblas_sspmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const float alpha,
const float* ap,
const float* x, const int x_inc,
const float beta,
float* y, const int y_inc);
void PUBLIC_API cblas_dspmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const double alpha,
const double* ap,
const double* x, const int x_inc,
const double beta,
double* y, const int y_inc);
// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV
void PUBLIC_API cblas_strmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n,
const float* a, const int a_ld,
float* x, const int x_inc);
void PUBLIC_API cblas_dtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n,
const double* a, const int a_ld,
double* x, const int x_inc);
void PUBLIC_API cblas_ctrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n,
const void* a, const int a_ld,
void* x, const int x_inc);
void PUBLIC_API cblas_ztrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n,
const void* a, const int a_ld,
void* x, const int x_inc);
// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV
void PUBLIC_API cblas_stbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n, const int k,
const float* a, const int a_ld,
float* x, const int x_inc);
void PUBLIC_API cblas_dtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n, const int k,
const double* a, const int a_ld,
double* x, const int x_inc);
void PUBLIC_API cblas_ctbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n, const int k,
const void* a, const int a_ld,
void* x, const int x_inc);
void PUBLIC_API cblas_ztbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n, const int k,
const void* a, const int a_ld,
void* x, const int x_inc);
// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV
void PUBLIC_API cblas_stpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n,
const float* ap,
float* x, const int x_inc);
void PUBLIC_API cblas_dtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n,
const double* ap,
double* x, const int x_inc);
void PUBLIC_API cblas_ctpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n,
const void* ap,
void* x, const int x_inc);
void PUBLIC_API cblas_ztpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n,
const void* ap,
void* x, const int x_inc);
// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV
void PUBLIC_API cblas_strsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n,
const float* a, const int a_ld,
float* x, const int x_inc);
void PUBLIC_API cblas_dtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n,
const double* a, const int a_ld,
double* x, const int x_inc);
void PUBLIC_API cblas_ctrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n,
const void* a, const int a_ld,
void* x, const int x_inc);
void PUBLIC_API cblas_ztrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n,
const void* a, const int a_ld,
void* x, const int x_inc);
// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV
void PUBLIC_API cblas_stbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n, const int k,
const float* a, const int a_ld,
float* x, const int x_inc);
void PUBLIC_API cblas_dtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n, const int k,
const double* a, const int a_ld,
double* x, const int x_inc);
void PUBLIC_API cblas_ctbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n, const int k,
const void* a, const int a_ld,
void* x, const int x_inc);
void PUBLIC_API cblas_ztbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n, const int k,
const void* a, const int a_ld,
void* x, const int x_inc);
// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV
void PUBLIC_API cblas_stpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n,
const float* ap,
float* x, const int x_inc);
void PUBLIC_API cblas_dtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n,
const double* ap,
double* x, const int x_inc);
void PUBLIC_API cblas_ctpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n,
const void* ap,
void* x, const int x_inc);
void PUBLIC_API cblas_ztpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int n,
const void* ap,
void* x, const int x_inc);
// General rank-1 matrix update: SGER/DGER/HGER
void PUBLIC_API cblas_sger(const CLBlastLayout layout,
const int m, const int n,
const float alpha,
const float* x, const int x_inc,
const float* y, const int y_inc,
float* a, const int a_ld);
void PUBLIC_API cblas_dger(const CLBlastLayout layout,
const int m, const int n,
const double alpha,
const double* x, const int x_inc,
const double* y, const int y_inc,
double* a, const int a_ld);
// General rank-1 complex matrix update: CGERU/ZGERU
void PUBLIC_API cblas_cgeru(const CLBlastLayout layout,
const int m, const int n,
const void* alpha,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* a, const int a_ld);
void PUBLIC_API cblas_zgeru(const CLBlastLayout layout,
const int m, const int n,
const void* alpha,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* a, const int a_ld);
// General rank-1 complex conjugated matrix update: CGERC/ZGERC
void PUBLIC_API cblas_cgerc(const CLBlastLayout layout,
const int m, const int n,
const void* alpha,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* a, const int a_ld);
void PUBLIC_API cblas_zgerc(const CLBlastLayout layout,
const int m, const int n,
const void* alpha,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* a, const int a_ld);
// Hermitian rank-1 matrix update: CHER/ZHER
void PUBLIC_API cblas_cher(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const float alpha,
const void* x, const int x_inc,
void* a, const int a_ld);
void PUBLIC_API cblas_zher(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const double alpha,
const void* x, const int x_inc,
void* a, const int a_ld);
// Hermitian packed rank-1 matrix update: CHPR/ZHPR
void PUBLIC_API cblas_chpr(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const float alpha,
const void* x, const int x_inc,
void* ap);
void PUBLIC_API cblas_zhpr(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const double alpha,
const void* x, const int x_inc,
void* ap);
// Hermitian rank-2 matrix update: CHER2/ZHER2
void PUBLIC_API cblas_cher2(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const void* alpha,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* a, const int a_ld);
void PUBLIC_API cblas_zher2(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const void* alpha,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* a, const int a_ld);
// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2
void PUBLIC_API cblas_chpr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const void* alpha,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* ap);
void PUBLIC_API cblas_zhpr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const void* alpha,
const void* x, const int x_inc,
const void* y, const int y_inc,
void* ap);
// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR
void PUBLIC_API cblas_ssyr(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const float alpha,
const float* x, const int x_inc,
float* a, const int a_ld);
void PUBLIC_API cblas_dsyr(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const double alpha,
const double* x, const int x_inc,
double* a, const int a_ld);
// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR
void PUBLIC_API cblas_sspr(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const float alpha,
const float* x, const int x_inc,
float* ap);
void PUBLIC_API cblas_dspr(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const double alpha,
const double* x, const int x_inc,
double* ap);
// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2
void PUBLIC_API cblas_ssyr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const float alpha,
const float* x, const int x_inc,
const float* y, const int y_inc,
float* a, const int a_ld);
void PUBLIC_API cblas_dsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const double alpha,
const double* x, const int x_inc,
const double* y, const int y_inc,
double* a, const int a_ld);
// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2
void PUBLIC_API cblas_sspr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const float alpha,
const float* x, const int x_inc,
const float* y, const int y_inc,
float* ap);
void PUBLIC_API cblas_dspr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
const int n,
const double alpha,
const double* x, const int x_inc,
const double* y, const int y_inc,
double* ap);
// =================================================================================================
// BLAS level-3 (matrix-matrix) routines
// =================================================================================================
// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM
void PUBLIC_API cblas_sgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
const int m, const int n, const int k,
const float alpha,
const float* a, const int a_ld,
const float* b, const int b_ld,
const float beta,
float* c, const int c_ld);
void PUBLIC_API cblas_dgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
const int m, const int n, const int k,
const double alpha,
const double* a, const int a_ld,
const double* b, const int b_ld,
const double beta,
double* c, const int c_ld);
void PUBLIC_API cblas_cgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
const int m, const int n, const int k,
const void* alpha,
const void* a, const int a_ld,
const void* b, const int b_ld,
const void* beta,
void* c, const int c_ld);
void PUBLIC_API cblas_zgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
const int m, const int n, const int k,
const void* alpha,
const void* a, const int a_ld,
const void* b, const int b_ld,
const void* beta,
void* c, const int c_ld);
// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM
// NOTE(review): the real-precision variants (s/d) take alpha/beta by value,
// while the complex variants (c/z) receive them through const void* pointers.
void PUBLIC_API cblas_ssymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
const int m, const int n,
const float alpha,
const float* a, const int a_ld,
const float* b, const int b_ld,
const float beta,
float* c, const int c_ld);
void PUBLIC_API cblas_dsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
const int m, const int n,
const double alpha,
const double* a, const int a_ld,
const double* b, const int b_ld,
const double beta,
double* c, const int c_ld);
void PUBLIC_API cblas_csymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
const int m, const int n,
const void* alpha,
const void* a, const int a_ld,
const void* b, const int b_ld,
const void* beta,
void* c, const int c_ld);
void PUBLIC_API cblas_zsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
const int m, const int n,
const void* alpha,
const void* a, const int a_ld,
const void* b, const int b_ld,
const void* beta,
void* c, const int c_ld);
// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM
// Complex-only routines; all scalar and matrix arguments are passed as void pointers.
void PUBLIC_API cblas_chemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
const int m, const int n,
const void* alpha,
const void* a, const int a_ld,
const void* b, const int b_ld,
const void* beta,
void* c, const int c_ld);
void PUBLIC_API cblas_zhemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
const int m, const int n,
const void* alpha,
const void* a, const int a_ld,
const void* b, const int b_ld,
const void* beta,
void* c, const int c_ld);
// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK
// Only matrix A is read; matrix C is updated in place (non-const pointer).
void PUBLIC_API cblas_ssyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
const int n, const int k,
const float alpha,
const float* a, const int a_ld,
const float beta,
float* c, const int c_ld);
void PUBLIC_API cblas_dsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
const int n, const int k,
const double alpha,
const double* a, const int a_ld,
const double beta,
double* c, const int c_ld);
void PUBLIC_API cblas_csyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
const int n, const int k,
const void* alpha,
const void* a, const int a_ld,
const void* beta,
void* c, const int c_ld);
void PUBLIC_API cblas_zsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
const int n, const int k,
const void* alpha,
const void* a, const int a_ld,
const void* beta,
void* c, const int c_ld);
// Rank-K update of a hermitian matrix: CHERK/ZHERK
// NOTE(review): unlike SYRK, alpha and beta here are real-valued (float/double)
// even though the matrices themselves are complex (void pointers).
void PUBLIC_API cblas_cherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
const int n, const int k,
const float alpha,
const void* a, const int a_ld,
const float beta,
void* c, const int c_ld);
void PUBLIC_API cblas_zherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
const int n, const int k,
const double alpha,
const void* a, const int a_ld,
const double beta,
void* c, const int c_ld);
// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K
// Reads both A and B (sharing one transpose flag, 'ab_transpose'); updates C in place.
void PUBLIC_API cblas_ssyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
const int n, const int k,
const float alpha,
const float* a, const int a_ld,
const float* b, const int b_ld,
const float beta,
float* c, const int c_ld);
void PUBLIC_API cblas_dsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
const int n, const int k,
const double alpha,
const double* a, const int a_ld,
const double* b, const int b_ld,
const double beta,
double* c, const int c_ld);
void PUBLIC_API cblas_csyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
const int n, const int k,
const void* alpha,
const void* a, const int a_ld,
const void* b, const int b_ld,
const void* beta,
void* c, const int c_ld);
void PUBLIC_API cblas_zsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
const int n, const int k,
const void* alpha,
const void* a, const int a_ld,
const void* b, const int b_ld,
const void* beta,
void* c, const int c_ld);
// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K
// NOTE(review): alpha is complex (void pointer) but beta is real-valued here.
void PUBLIC_API cblas_cher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
const int n, const int k,
const void* alpha,
const void* a, const int a_ld,
const void* b, const int b_ld,
const float beta,
void* c, const int c_ld);
void PUBLIC_API cblas_zher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
const int n, const int k,
const void* alpha,
const void* a, const int a_ld,
const void* b, const int b_ld,
const double beta,
void* c, const int c_ld);
// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM
// Matrix B is both input and output (non-const pointer): it is overwritten with the result.
void PUBLIC_API cblas_strmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int m, const int n,
const float alpha,
const float* a, const int a_ld,
float* b, const int b_ld);
void PUBLIC_API cblas_dtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int m, const int n,
const double alpha,
const double* a, const int a_ld,
double* b, const int b_ld);
void PUBLIC_API cblas_ctrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int m, const int n,
const void* alpha,
const void* a, const int a_ld,
void* b, const int b_ld);
void PUBLIC_API cblas_ztrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int m, const int n,
const void* alpha,
const void* a, const int a_ld,
void* b, const int b_ld);
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM
// As with TRMM, the solution overwrites the right-hand side matrix B in place.
void PUBLIC_API cblas_strsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int m, const int n,
const float alpha,
const float* a, const int a_ld,
float* b, const int b_ld);
void PUBLIC_API cblas_dtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int m, const int n,
const double alpha,
const double* a, const int a_ld,
double* b, const int b_ld);
void PUBLIC_API cblas_ctrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int m, const int n,
const void* alpha,
const void* a, const int a_ld,
void* b, const int b_ld);
void PUBLIC_API cblas_ztrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
const int m, const int n,
const void* alpha,
const void* a, const int a_ld,
void* b, const int b_ld);
// =================================================================================================
// Extra non-BLAS routines (level-X)
// =================================================================================================
// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY
// Reads matrix A and writes the scaled/optionally-transposed result into matrix B.
void PUBLIC_API cblas_somatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const int m, const int n,
const float alpha,
const float* a, const int a_ld,
float* b, const int b_ld);
void PUBLIC_API cblas_domatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const int m, const int n,
const double alpha,
const double* a, const int a_ld,
double* b, const int b_ld);
void PUBLIC_API cblas_comatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const int m, const int n,
const void* alpha,
const void* a, const int a_ld,
void* b, const int b_ld);
void PUBLIC_API cblas_zomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
const int m, const int n,
const void* alpha,
const void* a, const int a_ld,
void* b, const int b_ld);
// =================================================================================================
// Close of the extern "C" block (opened earlier in this header) and the include guard.
#ifdef __cplusplus
} // extern "C"
#endif
// CLBLAST_CLBLAST_NETLIB_C_H_
#endif

View File

@ -106,14 +106,16 @@ void run_example_routine(const cl_device_id device) {
clock_t start = clock();
// Calls an example routine
StatusCode status = CLBlastSasum(n,
device_output, 0,
device_input, 0, 1,
&queue, &event);
CLBlastStatusCode status = CLBlastSasum(n,
device_output, 0,
device_input, 0, 1,
&queue, &event);
// Wait for completion
clWaitForEvents(1, &event);
clReleaseEvent(event);
if (status == CLBlastSuccess) {
clWaitForEvents(1, &event);
clReleaseEvent(event);
}
// Retrieves the execution time
clock_t diff = clock() - start;

View File

@ -74,18 +74,20 @@ int main(void) {
clEnqueueWriteBuffer(queue, device_y, CL_TRUE, 0, m*sizeof(double), host_y, 0, NULL, NULL);
// Call the DGEMV routine.
StatusCode status = CLBlastDgemv(kRowMajor, kNo,
m, n,
alpha,
device_a, 0, a_ld,
device_x, 0, 1,
beta,
device_y, 0, 1,
&queue, &event);
CLBlastStatusCode status = CLBlastDgemv(CLBlastLayoutRowMajor, CLBlastTransposeNo,
m, n,
alpha,
device_a, 0, a_ld,
device_x, 0, 1,
beta,
device_y, 0, 1,
&queue, &event);
// Wait for completion
clWaitForEvents(1, &event);
clReleaseEvent(event);
if (status == CLBlastSuccess) {
clWaitForEvents(1, &event);
clReleaseEvent(event);
}
// Example completed. See "clblast_c.h" for status codes (0 -> success).
printf("Completed DGEMV with status %d\n", status);

View File

@ -71,14 +71,16 @@ int main(void) {
clEnqueueWriteBuffer(queue, device_b, CL_TRUE, 0, n*sizeof(cl_half), host_b, 0, NULL, NULL);
// Call the HAXPY routine.
StatusCode status = CLBlastHaxpy(n, alpha,
device_a, 0, 1,
device_b, 0, 1,
&queue, &event);
CLBlastStatusCode status = CLBlastHaxpy(n, alpha,
device_a, 0, 1,
device_b, 0, 1,
&queue, &event);
// Wait for completion
clWaitForEvents(1, &event);
clReleaseEvent(event);
if (status == CLBlastSuccess) {
clWaitForEvents(1, &event);
clReleaseEvent(event);
}
// Copies the result back to the host
clEnqueueReadBuffer(queue, device_b, CL_TRUE, 0, n*sizeof(cl_half), host_b, 0, NULL, NULL);

View File

@ -67,14 +67,16 @@ int main(void) {
clEnqueueWriteBuffer(queue, device_output, CL_TRUE, 0, 1*sizeof(float), host_output, 0, NULL, NULL);
// Call the SASUM routine.
StatusCode status = CLBlastSasum(n,
device_output, 0,
device_input, 0, 1,
&queue, &event);
CLBlastStatusCode status = CLBlastSasum(n,
device_output, 0,
device_input, 0, 1,
&queue, &event);
// Wait for completion
clWaitForEvents(1, &event);
clReleaseEvent(event);
if (status == CLBlastSuccess) {
clWaitForEvents(1, &event);
clReleaseEvent(event);
}
// Copies the result back to the host
clEnqueueReadBuffer(queue, device_output, CL_TRUE, 0, 1*sizeof(float), host_output, 0, NULL, NULL);

View File

@ -77,18 +77,21 @@ int main(void) {
clEnqueueWriteBuffer(queue, device_c, CL_TRUE, 0, m*n*sizeof(float), host_c, 0, NULL, NULL);
// Call the SGEMM routine.
StatusCode status = CLBlastSgemm(kRowMajor, kNo, kNo,
m, n, k,
alpha,
device_a, 0, a_ld,
device_b, 0, b_ld,
beta,
device_c, 0, c_ld,
&queue, &event);
CLBlastStatusCode status = CLBlastSgemm(CLBlastLayoutRowMajor,
CLBlastTransposeNo, CLBlastTransposeNo,
m, n, k,
alpha,
device_a, 0, a_ld,
device_b, 0, b_ld,
beta,
device_c, 0, c_ld,
&queue, &event);
// Wait for completion
clWaitForEvents(1, &event);
clReleaseEvent(event);
if (status == CLBlastSuccess) {
clWaitForEvents(1, &event);
clReleaseEvent(event);
}
// Example completed. See "clblast_c.h" for status codes (0 -> success).
printf("Completed SGEMM with status %d\n", status);

View File

@ -95,8 +95,10 @@ int main() {
&queue_plain, &event);
// Record the execution time
clWaitForEvents(1, &event);
clReleaseEvent(event);
if (status == clblast::StatusCode::kSuccess) {
clWaitForEvents(1, &event);
clReleaseEvent(event);
}
auto elapsed_time = std::chrono::steady_clock::now() - start_time;
auto time_ms = std::chrono::duration<double,std::milli>(elapsed_time).count();

View File

@ -0,0 +1,69 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file demonstrates the use of the Netlib CBLAS API of the CLBlast library. This API is not
// recommended if you want full control over performance: it will internally copy buffers from and
// to the OpenCL device.
//
// Note that this example is meant for illustration purposes only. CLBlast provides other programs
// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
//
// =================================================================================================
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
// Includes the CLBlast library (Netlib CBLAS interface)
#include <clblast_netlib_c.h>
// =================================================================================================
// Example use of the single-precision routine SGEMM
int main(void) {
  // Problem dimensions and scalars for the SGEMM example
  const int m = 128;
  const int n = 64;
  const int k = 512;
  const float alpha = 0.7f;
  const float beta = 1.0f;

  // Leading dimensions of the row-major matrices
  const int a_ld = k;
  const int b_ld = n;
  const int c_ld = n;

  // Allocate the host matrices
  const int a_elems = m * k;
  const int b_elems = n * k;
  const int c_elems = m * n;
  float* a_host = (float*)malloc(a_elems * sizeof(float));
  float* b_host = (float*)malloc(b_elems * sizeof(float));
  float* c_host = (float*)malloc(c_elems * sizeof(float));

  // Fill the matrices with some example data
  for (int idx = 0; idx < a_elems; ++idx) { a_host[idx] = 12.193f; }
  for (int idx = 0; idx < b_elems; ++idx) { b_host[idx] = -8.199f; }
  for (int idx = 0; idx < c_elems; ++idx) { c_host[idx] = 0.0f; }

  // Run SGEMM through the Netlib CBLAS-compatible interface; buffers are
  // copied to and from the OpenCL device internally by the library.
  cblas_sgemm(CLBlastLayoutRowMajor,
              CLBlastTransposeNo, CLBlastTransposeNo,
              m, n, k,
              alpha,
              a_host, a_ld,
              b_host, b_ld,
              beta,
              c_host, c_ld);

  // Example completed
  printf("Completed SGEMM\n");

  // Release the host memory
  free(a_host);
  free(b_host);
  free(c_host);
  return 0;
}
// =================================================================================================

View File

@ -18,7 +18,7 @@ import database.bests as bests
import database.defaults as defaults
# Server storing a copy of the database
DATABASE_SERVER_URL = "http://www.cedricnugteren.nl/tuning/clblast.json"
DATABASE_SERVER_URL = "https://raw.githubusercontent.com/CNugteren/CLBlast-database/master/database.json"
# OpenCL vendor names and their short name
VENDOR_TRANSLATION_TABLE = {

View File

@ -54,19 +54,20 @@ def get_cpp_header(family):
//
// This file populates the database with best-found tuning parameters for the '%s' kernels.
//\n"""
% family.title() + get_cpp_separator() + "\n\nnamespace clblast {\n" + get_cpp_separator())
% family.title() + get_cpp_separator() + \
"\n\nnamespace clblast {\n" + "namespace database {\n" + get_cpp_separator())
def get_cpp_footer():
"""Retrieves the C++ footer"""
return "\n} // namespace clblast\n"
return "\n} // namespace database\n" + "} // namespace clblast\n"
def get_cpp_precision(family, precision):
"""Retrieves the C++ code for the start of a new precision"""
precision_string = precision_to_string(precision)
camelcase_name = family.title().replace("_", "")
return("\n\nconst Database::DatabaseEntry Database::%s%s = {\n \"%s\", Precision::k%s, {\n"
return("\n\nconst Database::DatabaseEntry %s%s = {\n \"%s\", Precision::k%s, {\n"
% (camelcase_name, precision_string, camelcase_name, precision_string))

View File

@ -5,6 +5,8 @@
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
import ast
from collections import defaultdict
import clblast
import bests
@ -137,6 +139,10 @@ def get_smallest_best_parameters(group):
return min_parameters
def get_parameter_names(section):
    """Collects the parameter set of every result entry in a database section."""
    names = []
    for entry in section["results"]:
        names.append(entry["parameters"])
    return names
def get_common_best_parameters(group, group_identifier, verbose):
"""Sets defaults based on the best values of entries supported by all devices. This might cause a problem in case
not every device was tuned with the same parameters. In that case it falls back to the above method to retrieve
@ -154,19 +160,48 @@ def get_common_best_parameters(group, group_identifier, verbose):
result["relative_performance"] = minimum_time / result["time"]
# Determine which parameters are available for all devices
common_parameters = [result["parameters"] for result in group[0]["results"]] # Parameters of the first section
common_parameters = get_parameter_names(group[0]) # Parameters of the first section
for i in range(1, num_devices):
section_parameters = [result["parameters"] for result in group[i]["results"]]
section_parameters = get_parameter_names(group[i])
common_parameters = [p for p in section_parameters if p in common_parameters] # Intersection of the parameters
# Fall back to another method in case there are no shared entries at all across devices
if len(common_parameters) == 0:
if verbose:
print("[database] No common kernels for: " + str(group_identifier) + " with devices: %d " % num_devices)
smallest_best_parameters = get_smallest_best_parameters(group)
print("[database] No common kernels for: " + str(group_identifier) + " across all %d devices " % num_devices)
# Computes the amount of devices with shared parameters
parameters_count = defaultdict(int)
for i in range(0, num_devices):
for parameters in get_parameter_names(group[i]):
parameters_count[str(parameters)] += 1
num_devices_common = max(parameters_count.values())
# Fall back method in case there are no shared entries at all across devices
if num_devices_common == 1:
print("[database] Warning: No common kernels for: " + str(group_identifier) + " at all")
smallest_best_parameters = get_smallest_best_parameters(group)
if verbose:
print("[database] " + str(group_identifier))
return smallest_best_parameters
# Checks if perhaps there are many more shared parameters with a bit fewer devices
num_parameters_common = defaultdict(int)
for count in parameters_count.values():
if count != 1:
num_parameters_common[str(count)] += 1
if num_parameters_common[str(num_devices_common - 1)] > num_parameters_common[str(num_devices_common)]:
num_devices_common -= 1
if verbose:
print("[database] " + str(group_identifier))
return smallest_best_parameters
print("[database] Found %d common kernels for: " % num_parameters_common[str(num_devices_common)] +
str(group_identifier) + " across %d out of %d devices " % (num_devices_common, num_devices))
# Populates the common parameters
for parameters_string in parameters_count.keys():
count = parameters_count[parameters_string]
if count == num_devices_common:
parameters = ast.literal_eval(parameters_string)
common_parameters.append(parameters)
# Removes entries with parameters which are not common
common_results = []

180
scripts/generator/generator.py 100644 → 100755
View File

@ -12,6 +12,8 @@
# clblast.cpp
# clblast_c.h
# clblast_c.cpp
# clblast_netlib_c.h
# clblast_netlib_c.cpp
# wrapper_clblas.h
# wrapper_cblas.h
# It also generates the main functions for the correctness and performance tests as found in
@ -29,9 +31,18 @@ import generator.doc as doc
from generator.routine import Routine
from generator.datatype import H, S, D, C, Z, Sc, Dz, iH, iS, iD, iC, iZ, Css, Zdd, Ccs, Zzd, T, Tc, TU
HEADER_LINES = [96, 73, 97, 22, 29, 41]
FOOTER_LINES = [17, 75, 19, 14, 6, 6]
FILES = [
"/include/clblast.h",
"/src/clblast.cpp",
"/include/clblast_c.h",
"/src/clblast_c.cpp",
"/test/wrapper_clblas.hpp",
"/test/wrapper_cblas.hpp",
"/include/clblast_netlib_c.h",
"/src/clblast_netlib_c.cpp",
]
HEADER_LINES = [117, 73, 118, 22, 29, 41, 65, 32]
FOOTER_LINES = [17, 80, 19, 18, 6, 6, 9, 2]
# Different possibilities for requirements
ald_m = "The value of `a_ld` must be at least `m`."
@ -48,70 +59,105 @@ bld_trans_n_k = "When `transpose == Transpose::kNo`, then `b_ld` must be at leas
cld_m = "The value of `c_ld` must be at least `m`."
cld_n = "The value of `c_ld` must be at least `n`."
# Helper functions to compute vector and matrix sizes
def size_helper(condition, size_one, size_two, multiplier):
    """Builds a C ternary-expression string that picks between two buffer sizes,
    each scaled by 'multiplier', based on 'condition'."""
    return "({}) ? {} * {} : {} * {}".format(condition, size_one, multiplier,
                                             size_two, multiplier)
def layout_transpose_condition(prefix):
    """Builds the C condition string for when a matrix with transpose flag
    '<prefix>_transpose' is effectively transposed given the storage layout."""
    template = ("(layout == CLBlastLayoutColMajor && {0}_transpose != CLBlastTransposeNo) || "
                "(layout == CLBlastLayoutRowMajor && {0}_transpose == CLBlastTransposeNo)")
    return template.format(prefix)
# Different possibilities for the vector and matrix sizes
xn = "n * x_inc"
xm = "m * x_inc"
yn = "n * y_inc"
ym = "m * y_inc"
an = "n * a_ld"
apn = "((n*(n+1)) / 2)"
cn = "n * c_ld"
xmn = size_helper("a_transpose != CLBlastTransposeNo", "m", "n", "x_inc")
ynm = size_helper("a_transpose != CLBlastTransposeNo", "n", "m", "y_inc")
amn = size_helper("layout == CLBlastLayoutRowMajor", "m", "n", "a_ld")
amns = size_helper("side == CLBlastSideLeft", "m", "n", "a_ld")
amk = size_helper(layout_transpose_condition("a"), "m", "k", "a_ld")
ank = size_helper(layout_transpose_condition("a"), "n", "k", "a_ld")
ankab = size_helper(layout_transpose_condition("ab"), "n", "k", "a_ld")
bkn = size_helper(layout_transpose_condition("b"), "k", "n", "b_ld")
bnkab = size_helper(layout_transpose_condition("ab"), "n", "k", "b_ld")
bmn = size_helper("layout == CLBlastLayoutRowMajor", "m", "n", "b_ld")
bnma = size_helper(layout_transpose_condition("a"), "n", "m", "b_ld")
cmn = size_helper("layout == CLBlastLayoutRowMajor", "m", "n", "c_ld")
ammn = size_helper("layout == CLBlastLayoutRowMajor", "m", "((side == CLBlastSideLeft) ? m : n)", "a_ld")
bmnn = size_helper("layout == CLBlastLayoutRowMajor", "((side == CLBlastSideLeft) ? m : n)", "n", "b_ld")
# ==================================================================================================
# Populates a list of routines
ROUTINES = [
[ # Level 1: vector-vector
Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation", "", []),
Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], [], "", "Generate modified givens plane rotation", "", []),
Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation", "", []),
Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [], "", "Apply modified givens plane rotation", "", []),
Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []),
Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []),
Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []),
Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []),
Routine(True, True, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two vectors", "Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. The sum is stored in the _dot_ buffer.", []),
Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []),
Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []),
Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []),
Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []),
Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []),
Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []),
Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []),
Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []),
Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], ["1","1","1","1"], [], "", "Generate givens plane rotation", "", []),
Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], ["1","1","1","1","1"], [], "", "Generate modified givens plane rotation", "", []),
Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], [xn,yn], ["cos","sin"],"", "Apply givens plane rotation", "", []),
Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [xn,yn,"1"], [], "", "Apply modified givens plane rotation", "", []),
Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], [xn,yn], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []),
Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], [xn], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []),
Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [xn,yn], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []),
Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [xn,yn], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []),
Routine(True, True, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two vectors", "Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. The sum is stored in the _dot_ buffer.", []),
Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []),
Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []),
Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], [xn,"1"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []),
Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], [xn,"1"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []),
Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], [xn,"1"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []),
Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [xn,"1"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []),
Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [xn,"1"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []),
Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [xn,"1"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []),
],
[ # Level 2: matrix-vector
Routine(True, True, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]),
Routine(True, True, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]),
Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]),
Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]),
Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2a", "symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]),
Routine(True, True, "2a", "sbmv", T, [S,D,H], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]),
Routine(True, True, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]),
Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]),
Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and represented as _AP_.", []),
Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a triangular system of equations", "", []),
Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]),
Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "", "Solves a packed triangular system of equations", "", []),
Routine(True, True, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], [amn,xmn,ynm], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]),
Routine(True, True, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], [amn,xmn,ynm], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]),
Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]),
Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]),
Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], [apn,xn,yn], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2a", "symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]),
Routine(True, True, "2a", "sbmv", T, [S,D,H], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]),
Routine(True, True, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], [apn,xn,yn], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]),
Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]),
Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [apn,xn], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []),
Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a triangular system of equations", "", []),
Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]),
Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [apn,xn], [], "", "Solves a packed triangular system of equations", "", []),
# Level 2: matrix update
Routine(True, True, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]),
Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]),
Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]),
Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermetian matrix to be updated, and alpha is a scalar value.", [ald_n]),
Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]),
Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]),
Routine(True, True, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]),
Routine(True, True, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]),
Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]),
Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]),
Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], [xn,an], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermetian matrix to be updated, and alpha is a scalar value.", [ald_n]),
Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], [xn,apn], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], [xn,yn,an], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]),
Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], [xn,yn,apn], ["alpha"], "", "Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], [xn,an], ["alpha"], "", "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]),
Routine(True, True, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], [xn,apn], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
Routine(True, True, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], [xn,yn,an], ["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]),
Routine(True, True, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], [xn,yn,apn], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
],
[ # Level 3: matrix-matrix
Routine(True, True, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]),
Routine(True, True, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. Otherwise, in case of `side == kRight`, _A_ is a symmtric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]),
Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]),
Routine(True, True, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]),
Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]),
Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]),
Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Solves a triangular system of equations", "", []),
Routine(True, True, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], [amk,bkn,cmn], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]),
Routine(True, True, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], [ammn,bmnn,cmn], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. Otherwise, in case of `side == kRight`, _A_ is a symmtric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]),
Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], [ammn,bmnn,cmn], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]),
Routine(True, True, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], [ank,cn], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]),
Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], [ank,cn], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]),
Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], [ankab,bnkab,cn],["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], [ankab,bnkab,cn],["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]),
Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Solves a triangular system of equations", "", []),
],
[ # Level X: extra routines (not part of BLAS)
Routine(True, True, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]),
Routine(True, True, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], [amn,bnma], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]),
]]
@ -124,33 +170,23 @@ def main(argv):
cl_args = parser.parse_args(argv)
library_root = cl_args.clblast_root
# Sets all the files the output
files = [
library_root + "/include/clblast.h",
library_root + "/src/clblast.cpp",
library_root + "/include/clblast_c.h",
library_root + "/src/clblast_c.cpp",
library_root + "/test/wrapper_clblas.hpp",
library_root + "/test/wrapper_cblas.hpp",
]
# Checks whether the command-line arguments are valid; exits otherwise
for f in files:
if not os.path.isfile(f):
for f in FILES:
if not os.path.isfile(library_root + f):
print("[ERROR] The path '" + library_root + "' does not point to the root of the CLBlast library")
sys.exit()
# Iterates over all regular files to output
for i in range(0, len(files)):
for i in range(0, len(FILES)):
# Stores the header and the footer of the original file
with open(files[i]) as f:
with open(library_root + FILES[i]) as f:
original = f.readlines()
file_header = original[:HEADER_LINES[i]]
file_footer = original[-FOOTER_LINES[i]:]
# Re-writes the body of the file
with open(files[i], "w") as f:
with open(library_root + FILES[i], "w") as f:
body = ""
levels = [1, 2, 3] if (i == 4 or i == 5) else [1, 2, 3, 4]
for level in levels:
@ -168,6 +204,10 @@ def main(argv):
body += cpp.wrapper_clblas(routine)
if i == 5:
body += cpp.wrapper_cblas(routine)
if i == 6:
body += cpp.clblast_netlib_c_h(routine)
if i == 7:
body += cpp.clblast_netlib_c_cc(routine)
f.write("".join(file_header))
f.write(body)
f.write("".join(file_footer))

View File

@ -45,17 +45,18 @@ def clblast_h(routine):
def clblast_cc(routine):
"""The C++ API implementation (.cpp)"""
indent1 = " " * (20 + routine.length())
indent1 = " " * (15 + routine.length())
result = NL + "// " + routine.description + ": " + routine.short_names() + NL
if routine.implemented:
result += routine.routine_header_cpp(12, "") + " {" + NL
result += " auto queue_cpp = Queue(*queue);" + NL
result += " auto routine = X" + routine.name + "<" + routine.template.template + ">(queue_cpp, event);" + NL
result += " auto status = routine.SetUp();" + NL
result += " if (status != StatusCode::kSuccess) { return status; }" + NL
result += " return routine.Do" + routine.name.capitalize() + "("
result += " try {" + NL
result += " auto queue_cpp = Queue(*queue);" + NL
result += " auto routine = X" + routine.name + "<" + routine.template.template + ">(queue_cpp, event);" + NL
result += " routine.Do" + routine.name.capitalize() + "("
result += ("," + NL + indent1).join([a for a in routine.arguments_clcudaapi()])
result += ");" + NL
result += " return StatusCode::kSuccess;" + NL
result += " } catch (...) { return DispatchException(); }" + NL
else:
result += routine.routine_header_type_cpp(12) + " {" + NL
result += " return StatusCode::kNotImplemented;" + NL
@ -72,7 +73,7 @@ def clblast_c_h(routine):
"""The C API header (.h)"""
result = NL + "// " + routine.description + ": " + routine.short_names() + NL
for flavour in routine.flavours:
result += routine.routine_header_c(flavour, 31, " PUBLIC_API") + ";" + NL
result += routine.routine_header_c(flavour, 38, " PUBLIC_API") + ";" + NL
return result
@ -81,12 +82,89 @@ def clblast_c_cc(routine):
result = NL + "// " + routine.name.upper() + NL
for flavour in routine.flavours:
template = "<" + flavour.template + ">" if routine.no_scalars() else ""
indent = " " * (26 + routine.length() + len(template))
result += routine.routine_header_c(flavour, 20, "") + " {" + NL
result += " auto status = clblast::" + routine.name.capitalize() + template + "("
indent = " " * (16 + routine.length() + len(template))
result += routine.routine_header_c(flavour, 27, "") + " {" + NL
result += " try {" + NL
result += " return static_cast<CLBlastStatusCode>(" + NL
result += " clblast::" + routine.name.capitalize() + template + "("
result += ("," + NL + indent).join([a for a in routine.arguments_cast(flavour, indent)])
result += "," + NL + indent + "queue, event);"
result += NL + " return static_cast<StatusCode>(status);" + NL + "}" + NL
result += "," + NL + indent + "queue, event)" + NL
result += " );" + NL
result += " } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }" + NL
result += "}" + NL
return result
def clblast_netlib_c_h(routine):
    """Generates the Netlib CBLAS-compatible API header (.h) section for one routine."""
    # One comment line describing the routine, then one declaration per precision.
    header = NL + "// " + routine.description + ": " + routine.short_names() + NL
    # The Netlib CBLAS API only exists for the four standard precisions.
    declarations = [
        routine.routine_header_netlib(flavour, 20, " PUBLIC_API") + ";" + NL
        for flavour in routine.flavours
        if flavour.precision_name in ["S", "D", "C", "Z"]
    ]
    return header + "".join(declarations)
def clblast_netlib_c_cc(routine):
    """The Netlib CBLAS API implementation (.cpp): for each CBLAS-supported
    precision, emits a C function that sets up OpenCL, copies host data into
    device buffers, calls the CLBlast C++ API, and copies results back."""
    result = NL + "// " + routine.name.upper() + NL
    for flavour in routine.flavours:
        # There is a version available in CBLAS
        if flavour.precision_name in ["S", "D", "C", "Z"]:
            template = "<" + flavour.template + ">" if routine.no_scalars() else ""
            # NOTE(review): name_postfix is computed but never used in this body;
            # presumably the "_sub" naming for dotu/dotc is handled elsewhere -- confirm.
            name_postfix = "_sub" if routine.name in routine.routines_scalar_no_return() else ""
            indent = " " * (21 + routine.length() + len(template))
            result += routine.routine_header_netlib(flavour, 9, "") + " {" + NL
            # Initialize OpenCL
            result += " auto device = get_device();" + NL
            result += " auto context = clblast::Context(device);" + NL
            result += " auto queue = clblast::Queue(context, device);" + NL
            # Set alpha and beta
            result += "".join(" " + s + NL for s in routine.scalar_create_cpp(flavour))
            # Copy data structures to the device
            for i, name in enumerate(routine.inputs + routine.outputs):
                result += " " + routine.set_size(name, routine.buffer_sizes[i]) + NL
            for i, name in enumerate(routine.inputs + routine.outputs):
                buffer_type = routine.get_buffer_type(name, flavour)
                result += " " + routine.create_buffer(name, buffer_type) + NL
                # By-value scalar arguments are wrapped in a one-element host array first
                if name in routine.scalar_buffers_second_non_pointer():
                    result += " " + buffer_type + " " + name + "_vec[1]; " + name + "_vec[0] = " + name + ";" + NL
            for name in routine.inputs + routine.outputs:
                if name not in routine.scalar_buffers_first():
                    prefix = "" if name in routine.outputs else "const "
                    buffer_type = routine.get_buffer_type(name, flavour)
                    result += " " + routine.write_buffer(name, prefix + buffer_type) + NL
            # The function call
            result += " auto queue_cl = queue();" + NL
            result += " auto s = clblast::" + routine.name.capitalize() + template + "("
            result += ("," + NL + indent).join([a for a in routine.arguments_netlib(flavour, indent)])
            result += "," + NL + indent + "&queue_cl);" + NL
            # Error handling
            result += " if (s != clblast::StatusCode::kSuccess) {" + NL
            result += " throw std::runtime_error(\"CLBlast returned with error code \" + clblast::ToString(s));" + NL
            result += " }" + NL
            # Copy back and clean-up
            for name in routine.outputs:
                # Scalar results need a host-side array to read back into
                if name in routine.scalar_buffers_first() and routine.name not in routine.routines_scalar_no_return():
                    buffer_type = routine.get_buffer_type(name, flavour)
                    result += " " + buffer_type + " " + name + "[" + name + "_size];" + NL
            for name in routine.outputs:
                buffer_type = routine.get_buffer_type(name, flavour)
                result += " " + routine.read_buffer(name, buffer_type) + NL
            for name in routine.outputs:
                # Scalar-returning routines return the value directly; complex
                # (non-index) results return the real part only
                if name in routine.scalar_buffers_first() and routine.name not in routine.routines_scalar_no_return():
                    result += " return " + name + "[0]"
                    if flavour.buffer_type in ["float2", "double2"]:
                        if name not in routine.index_buffers():
                            result += ".real()"
                    result += ";" + NL
            result += "}" + NL
    return result
@ -218,8 +296,9 @@ def performance_test(routine, level_string):
result += "using double2 = clblast::double2;" + NL + NL
result += "// Main function (not within the clblast namespace)" + NL
result += "int main(int argc, char *argv[]) {" + NL
result += " const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);" + NL
default = convert.precision_to_full_name(routine.flavours[0].precision_name)
result += " switch(clblast::GetPrecision(argc, argv, clblast::Precision::k" + default + ")) {" + NL
result += " switch(clblast::GetPrecision(command_line_args, clblast::Precision::k" + default + ")) {" + NL
for precision in ["H", "S", "D", "C", "Z"]:
result += " case clblast::Precision::k" + convert.precision_to_full_name(precision) + ":"
found = False

View File

@ -54,6 +54,22 @@ class DataType:
return self.beta_cl + "{{beta.real(), beta.imag()}}"
return "beta"
def use_alpha_clblast(self):
    """Transforms a Netlib CBLAS 'alpha' parameter to CLBlast style"""
    # Complex scalars cross the C boundary as void*; rebuild them here as a
    # two-element brace-initializer of the CLBlast complex type.
    cast_types = {D_FLOAT2: "float", D_DOUBLE2: "double"}
    if self.alpha_cpp in cast_types:
        c_type = cast_types[self.alpha_cpp]
        components = ["reinterpret_cast<const " + c_type + "*>(alpha)[" + str(i) + "]" for i in (0, 1)]
        return self.alpha_cpp + "{" + ", ".join(components) + "}"
    return "alpha"
def use_beta_clblast(self):
    """As use_alpha_clblast, but for the 'beta' scalar instead"""
    # Same void* -> brace-initializer unpacking as for alpha.
    cast_types = {D_FLOAT2: "float", D_DOUBLE2: "double"}
    if self.beta_cpp in cast_types:
        c_type = cast_types[self.beta_cpp]
        components = ["reinterpret_cast<const " + c_type + "*>(beta)[" + str(i) + "]" for i in (0, 1)]
        return self.beta_cpp + "{" + ", ".join(components) + "}"
    return "beta"
def test_template(self):
"""Returns the template as used in the correctness/performance tests"""
if self.buffer_type != self.beta_cpp:
@ -65,6 +81,10 @@ class DataType:
return ((scalar == "alpha" and self.alpha_cpp in [D_FLOAT2, D_DOUBLE2]) or
(scalar == "beta" and self.beta_cpp in [D_FLOAT2, D_DOUBLE2]))
def is_non_standard(self):
    """True when the buffer type is not a plain C scalar type (half or complex)."""
    non_standard = (D_HALF, D_FLOAT2, D_DOUBLE2)
    return any(self.buffer_type == candidate for candidate in non_standard)
# Regular data-types
H = DataType("H", "H", D_HALF, [D_HALF] * 2 + [D_HALF_OPENCL] * 2, D_HALF) # half (16)

View File

@ -32,7 +32,7 @@ def generate(routine):
result += "C API:" + NL
result += "```" + NL
for flavour in routine.flavours:
result += routine.routine_header_c(flavour, 20, "") + NL
result += routine.routine_header_c(flavour, 27, "") + NL
result += "```" + NL + NL
# Routine arguments

View File

@ -13,7 +13,8 @@ import generator.convert as convert
class Routine:
"""Class holding routine-specific information (e.g. name, which arguments, which precisions)"""
def __init__(self, implemented, has_tests, level, name, template, flavours, sizes, options,
inputs, outputs, scalars, scratch, description, details, requirements):
inputs, outputs, buffer_sizes, scalars, scratch,
description, details, requirements):
self.implemented = implemented
self.has_tests = has_tests
self.level = level
@ -24,6 +25,7 @@ class Routine:
self.options = options
self.inputs = inputs
self.outputs = outputs
self.buffer_sizes = buffer_sizes
self.scalars = scalars
self.scratch = scratch # Scratch buffer (e.g. for xDOT)
self.description = description
@ -40,6 +42,11 @@ class Routine:
"""List of scalar buffers"""
return ["sa", "sb", "sc", "ss", "sd1", "sd2", "sx1", "sy1", "sparam"]
@staticmethod
def scalar_buffers_second_non_pointer():
    """Scalar buffers that are passed by value rather than as pointers in the Netlib API."""
    by_value_buffers = ["sy1"]
    return by_value_buffers
@staticmethod
def other_scalars():
"""List of scalars other than alpha and beta"""
@ -65,6 +72,34 @@ class Routine:
"""Distinguish between vectors and matrices"""
return ["a", "b", "c", "ap"]
@staticmethod
def routines_scalar_no_return():
    # Routines whose Netlib CBLAS form does not return its scalar result
    # directly: these get a "_sub" name postfix and write through an output
    # argument instead (see clblast_netlib_c_cc).
    return ["dotu", "dotc"]
@staticmethod
def set_size(name, size):
"""Sets the size of a buffer"""
return "const auto " + name + "_size = " + size + ";"
@staticmethod
def create_buffer(name, template):
"""Creates a new CLCudaAPI buffer"""
return "auto " + name + "_buffer = clblast::Buffer<" + template + ">(context, " + name + "_size);"
def write_buffer(self, name, template):
    """Emits a C++ line copying host data into the device buffer for 'name'."""
    # By-value scalars were wrapped into a one-element '<name>_vec' array first.
    suffix = "_vec" if name in self.scalar_buffers_second_non_pointer() else ""
    host_pointer = "reinterpret_cast<{0}*>({1}{2})".format(template, name, suffix)
    return "{0}_buffer.Write(queue, {0}_size, {1});".format(name, host_pointer)
@staticmethod
def read_buffer(name, template):
"""Reads from a CLCudaAPI buffer"""
data_structure = "reinterpret_cast<" + template + "*>(" + name + ")"
return name + "_buffer.Read(queue, " + name + "_size, " + data_structure + ");"
def non_index_inputs(self):
"""Lists of input/output buffers not index (integer)"""
buffers = self.inputs[:] # make a copy
@ -85,6 +120,11 @@ class Routine:
"""List of buffers without 'inc' or 'ld'"""
return self.scalar_buffers_first() + self.scalar_buffers_second() + ["ap"]
def get_buffer_type(self, name, flavour):
    # Returns the element type backing buffer 'name': index (integer) buffers
    # are plain 'int', everything else uses the flavour's buffer type.
    if name in self.index_buffers():
        return "int"
    return flavour.buffer_type
def length(self):
"""Retrieves the number of characters in the routine's name"""
return len(self.name)
@ -133,6 +173,15 @@ class Routine:
return [", ".join(a + b + c)]
return []
def buffer_zero_offset(self, name):
    """Retrieves the buffer argument list for 'name' with its offset hard-coded to zero."""
    if name not in self.inputs and name not in self.outputs:
        return []
    parts = [name + "_buffer()", "0"]
    # The ld/inc argument is omitted for buffers that do not carry one (e.g. packed 'ap').
    if name not in self.buffers_without_ld_inc():
        parts.append(name + "_" + self.postfix(name))
    return [", ".join(parts)]
def buffer_def(self, name):
"""As above but with data-types"""
prefix = "const " if name in self.inputs else ""
@ -163,6 +212,17 @@ class Routine:
return [", ".join(a + b + c)]
return []
def buffer_def_pointer(self, name, flavour):
    """Retrieves the definition of buffer 'name' as a plain C pointer (Netlib CBLAS style)."""
    if name not in self.inputs and name not in self.outputs:
        return []
    # Inputs are read-only; non-standard element types (half/complex) pass as void*.
    qualifier = "const " if name in self.inputs else ""
    element_type = "void" if flavour.is_non_standard() else flavour.buffer_type
    star = "" if name in self.scalar_buffers_second_non_pointer() else "*"
    pieces = [qualifier + element_type + star + " " + name]
    if name not in self.buffers_without_ld_inc():
        pieces.append("const int " + name + "_" + self.postfix(name))
    return [", ".join(pieces)]
def buffer_clcudaapi(self, name):
"""As above but with CLCudaAPI buffers"""
if name in self.inputs or name in self.outputs:
@ -238,6 +298,12 @@ class Routine:
return [name]
return []
def scalar_cpp(self, name):
"""As above, but with _cpp as a suffix"""
if name in self.scalars:
return [name + "_cpp"]
return []
def scalar_half_to_float(self, name):
"""As above, but converts from float to half"""
if name in self.scalars:
@ -288,6 +354,16 @@ class Routine:
return ["const " + flavour.beta_cpp + " " + name]
return []
def scalar_def_void(self, name, flavour):
    """Retrieves the definition of a scalar (alpha/beta), rendered as a void
    pointer for complex flavours (the C API cannot pass complex by value)."""
    if name not in self.scalars:
        return []
    if name == "alpha":
        data_type = "void*" if flavour.is_complex("alpha") else flavour.alpha_cpp
    else:
        data_type = "void*" if flavour.is_complex("beta") else flavour.beta_cpp
    return ["const " + data_type + " " + name]
def scalar_type(self, name, flavour):
"""Retrieves the type of a scalar (alpha/beta)"""
if name in self.scalars:
@ -304,6 +380,16 @@ class Routine:
return ["`const " + self.template.beta_cpp + " " + name + "`: Input scalar constant."]
return []
def scalar_create_cpp(self, flavour):
    """Emits C++ statements converting the alpha/beta arguments of the C API
    into typed '<name>_cpp' constants, one statement per known scalar."""
    statements = []
    for scalar in self.scalars:
        if scalar not in ("alpha", "beta"):
            continue  # only alpha and beta have conversion expressions
        value = flavour.use_alpha_clblast() if scalar == "alpha" else flavour.use_beta_clblast()
        statements.append("const auto " + scalar + "_cpp = " + value + ";")
    return statements
def sizes_list(self):
"""Retrieves a list of comma-separated sizes (m, n, k)"""
if self.sizes:
@ -316,6 +402,12 @@ class Routine:
return [", ".join(["const size_t " + s for s in self.sizes])]
return []
def sizes_def_netlib(self):
    """Retrieves the definition of the sizes (m/n/k) for the Netlib CBLAS API,
    which uses plain 'int' rather than 'size_t'."""
    if not self.sizes:
        return []
    return [", ".join("const int " + size for size in self.sizes)]
def sizes_type(self):
"""Retrieves the types of the sizes (m,n,k)"""
if self.sizes:
@ -349,6 +441,13 @@ class Routine:
return [", ".join(definitions)]
return []
def options_def_c(self):
    """Retrieves the definitions of the layout/transpose/etc. options for the
    C API, using the 'CLBlast'-prefixed enum type names to avoid clashes."""
    if not self.options:
        return []
    definitions = ["const CLBlast" + convert.option_to_clblast(option) + " " + option
                   for option in self.options]
    return [", ".join(definitions)]
def options_def_wrapper_clblas(self):
"""As above, but now using clBLAS data-types"""
if self.options:
@ -421,6 +520,17 @@ class Routine:
list(chain(*[self.buffer(b) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar_use(s, flavour) for s in self.other_scalars()])))
def arguments_netlib(self, flavour, indent):
"""As above, but for the Netlib CBLAS API"""
# Argument order mirrors the other arguments_* helpers: options, sizes, then
# the alpha / buffers / beta sequence. All buffer offsets are hard-coded to
# zero, since Netlib CBLAS has no offset arguments, and alpha/beta use the
# '_cpp' converted constants produced by scalar_create_cpp.
return (self.options_cast(indent) + self.sizes_list() +
list(chain(*[self.buffer_zero_offset(b) for b in self.scalar_buffers_first()])) +
self.scalar_cpp("alpha") +
list(chain(*[self.buffer_zero_offset(b) for b in self.buffers_first()])) +
self.scalar_cpp("beta") +
list(chain(*[self.buffer_zero_offset(b) for b in self.buffers_second()])) +
list(chain(*[self.buffer_zero_offset(b) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar(s) for s in self.other_scalars()])))
def arguments_wrapper_clblas(self, flavour):
"""As above, but for the clBLAS wrapper"""
return (self.options_list() + self.sizes_list() +
@ -453,6 +563,30 @@ class Routine:
list(chain(*[self.buffer_def(b) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()])))
def arguments_def_netlib(self, flavour):
"""As above, but for the Netlib CBLAS API"""
# Buffers become plain C pointers and alpha/beta become void pointers for
# complex flavours (see buffer_def_pointer / scalar_def_void).
result=(self.options_def_c() + self.sizes_def_netlib() +
self.scalar_def_void("alpha", flavour) +
list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.buffers_first()])) +
self.scalar_def_void("beta", flavour) +
list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.buffers_second()])) +
list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()])))
# Unlike arguments_def_c, the scalar output buffers are appended at the END
# and only for routines in routines_scalar_no_return(): for the remaining
# routines the scalar result is the function's return value instead
# (see routine_header_netlib).
if self.name in self.routines_scalar_no_return():
result += list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.scalar_buffers_first()]))
return result
def arguments_def_c(self, flavour):
"""As above, but for the C API"""
# Same argument order as the C++ API, but using the CLBlast-prefixed enum
# option types (options_def_c) so the generated header stays clash-free.
return (self.options_def_c() + self.sizes_def() +
list(chain(*[self.buffer_def(b) for b in self.scalar_buffers_first()])) +
self.scalar_def("alpha", flavour) +
list(chain(*[self.buffer_def(b) for b in self.buffers_first()])) +
self.scalar_def("beta", flavour) +
list(chain(*[self.buffer_def(b) for b in self.buffers_second()])) +
list(chain(*[self.buffer_def(b) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()])))
def arguments_def_wrapper_clblas(self, flavour):
"""As above, but clBLAS wrapper plain data-types"""
return (self.options_def_wrapper_clblas() + self.sizes_def() +
@ -523,11 +657,30 @@ class Routine:
def routine_header_c(self, flavour, spaces, extra_qualifier):
"""As above, but now for C"""
indent = " " * (spaces + self.length())
result = "StatusCode" + extra_qualifier + " CLBlast" + flavour.name + self.name + "("
result += (",\n" + indent).join([a for a in self.arguments_def(flavour)])
result = "CLBlastStatusCode" + extra_qualifier + " CLBlast" + flavour.name + self.name + "("
result += (",\n" + indent).join([a for a in self.arguments_def_c(flavour)])
result += ",\n" + indent + "cl_command_queue* queue, cl_event* event)"
return result
def routine_header_netlib(self, flavour, spaces, extra_qualifier):
"""As above, but now for the original Netlib CBLAS API"""
# Determine the return type: 'void' by default, but
# - routines that output an index buffer return an 'int'
# - routines that output a scalar return it by value, unless listed in
#   routines_scalar_no_return(), in which case they keep 'void' and get a
#   '_sub' name suffix (Netlib's cblas_*_sub convention)
return_type = "void"
for output in self.outputs:
if output in self.index_buffers():
return_type = "int"
break
if output in self.scalar_buffers_first() and self.name not in self.routines_scalar_no_return():
# Strips the '2' from the buffer type -- NOTE(review): presumably maps
# a complex type name back to its real-valued base; confirm against
# the flavour definitions
return_type = flavour.buffer_type.replace("2", "")
break
# Indent continuation lines to line up with the opening parenthesis
indent = " " * (spaces + len(return_type) + self.length())
routine_name = self.name
if self.name in self.routines_scalar_no_return():
routine_name += "_sub"
indent += " "
result = return_type + extra_qualifier + " cblas_" + flavour.name.lower() + routine_name + "("
result += (",\n" + indent).join([a for a in self.arguments_def_netlib(flavour)]) + ")"
return result
def routine_header_wrapper_clblas(self, flavour, def_only, spaces):
"""As above, but now for the clBLAS wrapper"""
template = "<" + flavour.template + ">" if self.no_scalars() and not def_only else ""

View File

@ -17,7 +17,6 @@ purplish = "#550077" # [ 85, 0,119] lumi=26
blueish = "#4765b1" # [ 71,101,177] lumi=100
redish = "#d67568" # [214,117,104] lumi=136
greenish = "#9bd4ca" # [155,212,202] lumi=199
colourset = c(blueish, redish, greenish, purplish)
# Sets the graph markers (circles, triangles, etc.)
pchs = c(15, 18, 17, 12)
@ -31,11 +30,14 @@ options("width"=170)
# ==================================================================================================
# Constants
num_runs <- 4
# Settings
num_runs <- 5
num_runs_short <- 50
xtics_subset_threshold <- 100
xtics_subset_stepsize <- 8
devices <- c("-platform","-device")
options_string <- "-q -no_abbrv -cblas 0"
library_names <- c("CLBlast", "clBLAS")
# Command-line arguments
command_line <- commandArgs(trailingOnly=TRUE)
@ -50,6 +52,19 @@ device_id <- command_line[2]
devices_values <- c(platform_id, device_id)
devices_string <- paste(devices, devices_values, collapse=" ")
# Filters the raw client output: only lines containing a ";" are kept, since
# the benchmark clients print their results as ";"-separated CSV (consumed by
# read.csv below); other status/log lines are dropped. Returns NULL when no
# lines match (result_string starts out as c(), i.e. NULL).
filter_string <- function(raw_result_string) {
result_string <- c()
for (line in raw_result_string) {
if (grepl(";",line)) {
result_string <-
c(result_string, line)
}
}
return(result_string)
}
# ==================================================================================================
# The main function
@ -65,12 +80,28 @@ main <- function(routine_name, precision, test_names, test_values,
if (precision == 6464) { display_name <- gsub("^X","Z",display_name); }
executable <- paste("./clblast_client_", routine_name, sep="")
# Display
library_names <- c("CLBlast", "clBLAS")
if (precision == 16) { library_names <- c("CLBlast FP16", "CLBlast FP32", "clBLAS FP32"); }
colourset <- c(blueish, redish)
if (precision == 16) { colourset <- c(blueish, purplish, redish); }
# Configures the outputfile
pdf(paste(display_name, ".pdf", sep=""), height=8, width=13)
par(mfrow=c(2, 3))
par(oma=c(0, 0, 0, 0))
par(mar=c(4.6, 4.4, 1.5, 0)) # bottom, left, top, right [c(5.1, 4.1, 4.1, 2.1)]
par(mgp=c(2.8, 0.6, 0)) # location of xlab/ylab, tick-mark labels, tick marks [c(3, 1, 0)]
file_name <- paste(display_name, ".pdf", sep="")
if (length(test_names) == 6) {
pdf(file_name, height=8, width=13)
par(mfrow=c(2, 3))
par(oma=c(0, 0, 0, 0))
par(mar=c(4.6, 4.4, 1.5, 0)) # bottom, left, top, right [c(5.1, 4.1, 4.1, 2.1)]
par(mgp=c(2.8, 0.6, 0)) # location of xlab/ylab, tick-mark labels, tick marks [c(3, 1, 0)]
}
else { # length(test_names) == 2
pdf(file_name, height=8, width=13)
par(mfrow=c(2, 1))
par(oma=c(0, 0, 0, 0))
par(mar=c(4.6, 4.4, 1.5, 0)) # bottom, left, top, right [c(5.1, 4.1, 4.1, 2.1)]
par(mgp=c(2.8, 0.6, 0)) # location of xlab/ylab, tick-mark labels, tick marks [c(3, 1, 0)]
}
# Loops over the test-cases
for (test_id in 1:length(test_names)) {
@ -84,19 +115,32 @@ main <- function(routine_name, precision, test_names, test_values,
arguments <- paste(devices_string, params_string, options_string, sep=" ")
print(paste("Running", executable, arguments, sep=" "))
raw_result_string <- system2(command=executable, args=arguments, stdout=TRUE)
# Filter the string: only lines containing a ";" can be valid lines
result_string <- c()
for (line in raw_result_string) {
if (grepl(";",line)) {
result_string <-
c(result_string, line)
}
}
result_string <- filter_string(raw_result_string)
# Reads the result into a dataframe
command_db <- read.csv(text=result_string, sep=";")
# For half-precision: also runs the FP32 version for comparison
if (precision == 16) {
params_string <- gsub("-precision 16", "-precision 32", params_string)
arguments <- paste(devices_string, params_string, options_string, sep=" ")
print(paste("Running", executable, arguments, sep=" "))
raw_result_string <- system2(command=executable, args=arguments, stdout=TRUE)
result_string <- filter_string(raw_result_string)
# Reads the result into a dataframe
command_db_32 <- read.csv(text=result_string, sep=";")
stopifnot(nrow(command_db) == nrow(command_db_32))
# Combines the results
command_db["ms_FP32_1"] = command_db_32$ms_1
command_db["GFLOPS_FP32_1"] = command_db_32$GFLOPS_1
command_db["GBs_FP32_1"] = command_db_32$GBs_1
command_db["ms_FP32_2"] = command_db_32$ms_2
command_db["GFLOPS_FP32_2"] = command_db_32$GFLOPS_2
command_db["GBs_FP32_2"] = command_db_32$GBs_2
}
# Append the results to the final dataframe
if (command_id == 1) {
db <- command_db
@ -120,22 +164,36 @@ main <- function(routine_name, precision, test_names, test_values,
# Plots the graph with GFLOPS on the Y-axis
if (metric_gflops) {
plot_graph(xdata=xdata, ydata=list(db$GFLOPS_1, db$GFLOPS_2), log_setting=log_scale,
if (precision == 16) {
ydata = list(db$GFLOPS_1, db$GFLOPS_FP32_1, db$GFLOPS_FP32_2)
ymax = max(max(db$GFLOPS_1), max(db$GFLOPS_FP32_1), max(db$GFLOPS_FP32_2))
} else {
ydata = list(db$GFLOPS_1, db$GFLOPS_2)
ymax = max(max(db$GFLOPS_1), max(db$GFLOPS_2))
}
plot_graph(xdata=xdata, ydata=ydata, log_setting=log_scale,
xmin=min(xdata), xmax=max(xdata),
ymin=0, ymax=max(max(db$GFLOPS_1),max(db$GFLOPS_2)),
ymin=0, ymax=ymax,
xtics=xtics,
xlabel=test_xlabels[[test_id]], ylabel="GFLOPS (higher is better)",
graph_title=paste(display_name, test_names[[test_id]], sep=" "),
multiple=50, experiment_names=library_names)
multiple=50, experiment_names=library_names, colourset=colourset)
# Plots the graph with GB/s on the Y-axis
} else {
plot_graph(xdata=xdata, ydata=list(db$GBs_1, db$GBs_2), log_setting=log_scale,
if (precision == 16) {
ydata = list(db$GBs_1, db$GBs_FP32_1, db$GBs_FP32_2)
ymax = max(max(db$GBs_1), max(db$GBs_FP32_1), max(db$GBs_FP32_2))
} else {
ydata = list(db$GBs_1, db$GBs_2)
ymax = max(max(db$GBs_1), max(db$GBs_2))
}
plot_graph(xdata=xdata, ydata=ydata, log_setting=log_scale,
xmin=min(xdata), xmax=max(xdata),
ymin=0, ymax=max(max(db$GBs_1),max(db$GBs_2)),
ymin=0, ymax=ymax,
xtics=xtics,
xlabel=test_xlabels[[test_id]], ylabel="GB/s (higher is better)",
graph_title=paste(display_name, test_names[[test_id]], sep=" "),
multiple=10, experiment_names=library_names)
multiple=10, experiment_names=library_names, colourset=colourset)
}
}
}
@ -147,7 +205,7 @@ plot_graph <- function(xdata, ydata, log_setting,
xmin, xmax, ymin, ymax,
xtics, xlabel, ylabel,
graph_title,
multiple, experiment_names) {
multiple, experiment_names, colourset) {
# Update the ymax to the next multiple of something
ymax <- multiple*ceiling(ymax/multiple)
@ -169,7 +227,12 @@ plot_graph <- function(xdata, ydata, log_setting,
main="", xlab="", ylab="",
ylim=c(ymin, ymax), xlim=c(xmin, xmax), axes=F, "n")
axis(side=2, las=2)
axis(side=1, at=xdata, labels=xtics, las=2)
if (length(xdata) > xtics_subset_threshold) { # Too many indices to print, plot only every Nth
subset <- seq(from=1, to=length(xdata), by=xtics_subset_stepsize)
axis(side=1, at=xdata[subset], labels=xtics[subset], las=2)
} else {
axis(side=1, at=xdata, labels=xtics, las=2)
}
title(xlab=xlabel, line=-1)
title(ylab=ylabel, line=2)
title(graph_title, line=-2)

View File

@ -0,0 +1,56 @@
# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project uses a tab-size of two spaces and a max-width of 100 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
#
# This file implements the performance script for small sizes of Xgemm, testing the direct kernel
#
# ==================================================================================================
# Includes the common functions
# (locates this script's own path so common.r can be sourced relative to it)
args <- commandArgs(trailingOnly = FALSE)
thisfile <- (normalizePath(sub("--file=", "", args[grep("--file=", args)])))
source(file.path(dirname(thisfile), "common.r"))
# ==================================================================================================
# Settings
routine_name <- "xgemm"
# Client arguments, in order; the vectors in 'test_values' below supply one
# value per entry of this list
parameters <- c("-m","-n","-k","-layout","-transA","-transB",
"-num_steps","-step","-runs","-precision")
precision <- 32
# Sets the names of the test-cases
test_names <- list(
"small matrices in steps of 16",
"small matrices in steps of 1"
)
# Defines the test-cases
# Each vector: m, n, k, layout, transA, transB, num_steps, step, runs, precision.
# NOTE(review): layout 102 / trans 111 presumably follow the CBLAS enum
# encoding (column-major, no-transpose) -- confirm against the client options.
test_values <- list(
list(c( 128, 128, 128, 102, 111, 111, 57, 16, num_runs_short, precision)),
list(c( 128, 128, 128, 102, 111, 111, 385, 1, num_runs_short, precision))
)
# Defines the x-labels corresponding to the test-cases
test_xlabels <- list(
"matrix sizes (m=n=k)",
"matrix sizes (m=n=k)"
)
# Defines the x-axis of the test-cases
test_xaxis <- list(
c("m", ""),
c("m", "")
)
# ==================================================================================================
# Start the script
main(routine_name=routine_name, precision=precision, test_names=test_names, test_values=test_values,
test_xlabels=test_xlabels, test_xaxis=test_xaxis, metric_gflops=TRUE)
# ==================================================================================================

View File

@ -35,32 +35,32 @@ test_names <- list(
# Defines the test-cases
test_values <- list(
list(c( 128, 128, 102, 111, 111, 16, 128, num_runs, precision)),
list(c( 129, 129, 102, 111, 111, 16, 128, num_runs, precision)),
list(c( 512, 512, 102, 111, 111, 16, 1, num_runs, precision)),
list(c(2048, 2048, 102, 111, 111, 16, 1, num_runs, precision)),
list(c( 128, 128, 102, 141, 121, 16, 128, num_runs, precision)),
list(c( 129, 129, 102, 141, 121, 16, 128, num_runs, precision)),
list(c( 512, 512, 102, 141, 121, 16, 1, num_runs, precision)),
list(c(2048, 2048, 102, 141, 121, 16, 1, num_runs, precision)),
list(
c(1024, 1024, 101, 111, 111, 1, 0, num_runs, precision),
c(1024, 1024, 101, 111, 112, 1, 0, num_runs, precision),
c(1024, 1024, 101, 112, 111, 1, 0, num_runs, precision),
c(1024, 1024, 101, 112, 112, 1, 0, num_runs, precision),
c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision),
c(1024, 1024, 102, 111, 112, 1, 0, num_runs, precision),
c(1024, 1024, 102, 112, 111, 1, 0, num_runs, precision),
c(1024, 1024, 102, 112, 112, 1, 0, num_runs, precision)
c(1024, 1024, 101, 141, 121, 1, 0, num_runs, precision),
c(1024, 1024, 101, 141, 122, 1, 0, num_runs, precision),
c(1024, 1024, 101, 142, 121, 1, 0, num_runs, precision),
c(1024, 1024, 101, 142, 122, 1, 0, num_runs, precision),
c(1024, 1024, 102, 141, 121, 1, 0, num_runs, precision),
c(1024, 1024, 102, 141, 122, 1, 0, num_runs, precision),
c(1024, 1024, 102, 142, 121, 1, 0, num_runs, precision),
c(1024, 1024, 102, 142, 122, 1, 0, num_runs, precision)
),
list(
c( 8, 8, 102, 111, 111, 1, 0, num_runs, precision),
c( 16, 16, 102, 111, 111, 1, 0, num_runs, precision),
c( 32, 32, 102, 111, 111, 1, 0, num_runs, precision),
c( 64, 64, 102, 111, 111, 1, 0, num_runs, precision),
c( 128, 128, 102, 111, 111, 1, 0, num_runs, precision),
c( 256, 256, 102, 111, 111, 1, 0, num_runs, precision),
c( 512, 512, 102, 111, 111, 1, 0, num_runs, precision),
c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision),
c(2048, 2048, 102, 111, 111, 1, 0, num_runs, precision),
c(4096, 4096, 102, 111, 111, 1, 0, num_runs, precision),
c(8192, 8192, 102, 111, 111, 1, 0, num_runs, precision)
c( 8, 8, 102, 141, 121, 1, 0, num_runs, precision),
c( 16, 16, 102, 141, 121, 1, 0, num_runs, precision),
c( 32, 32, 102, 141, 121, 1, 0, num_runs, precision),
c( 64, 64, 102, 141, 121, 1, 0, num_runs, precision),
c( 128, 128, 102, 141, 121, 1, 0, num_runs, precision),
c( 256, 256, 102, 141, 121, 1, 0, num_runs, precision),
c( 512, 512, 102, 141, 121, 1, 0, num_runs, precision),
c(1024, 1024, 102, 141, 121, 1, 0, num_runs, precision),
c(2048, 2048, 102, 141, 121, 1, 0, num_runs, precision),
c(4096, 4096, 102, 141, 121, 1, 0, num_runs, precision),
c(8192, 8192, 102, 141, 121, 1, 0, num_runs, precision)
)
)

View File

@ -35,32 +35,32 @@ test_names <- list(
# Defines the test-cases
test_values <- list(
list(c( 128, 128, 102, 111, 111, 16, 128, num_runs, precision)),
list(c( 129, 129, 102, 111, 111, 16, 128, num_runs, precision)),
list(c( 512, 512, 102, 111, 111, 16, 1, num_runs, precision)),
list(c(2048, 2048, 102, 111, 111, 16, 1, num_runs, precision)),
list(c( 128, 128, 102, 121, 111, 16, 128, num_runs, precision)),
list(c( 129, 129, 102, 121, 111, 16, 128, num_runs, precision)),
list(c( 512, 512, 102, 121, 111, 16, 1, num_runs, precision)),
list(c(2048, 2048, 102, 121, 111, 16, 1, num_runs, precision)),
list(
c(1024, 1024, 101, 111, 111, 1, 0, num_runs, precision),
c(1024, 1024, 101, 111, 112, 1, 0, num_runs, precision),
c(1024, 1024, 101, 112, 111, 1, 0, num_runs, precision),
c(1024, 1024, 101, 112, 112, 1, 0, num_runs, precision),
c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision),
c(1024, 1024, 102, 111, 112, 1, 0, num_runs, precision),
c(1024, 1024, 102, 112, 111, 1, 0, num_runs, precision),
c(1024, 1024, 102, 112, 112, 1, 0, num_runs, precision)
c(1024, 1024, 101, 121, 111, 1, 0, num_runs, precision),
c(1024, 1024, 101, 121, 112, 1, 0, num_runs, precision),
c(1024, 1024, 101, 122, 111, 1, 0, num_runs, precision),
c(1024, 1024, 101, 122, 112, 1, 0, num_runs, precision),
c(1024, 1024, 102, 121, 111, 1, 0, num_runs, precision),
c(1024, 1024, 102, 121, 112, 1, 0, num_runs, precision),
c(1024, 1024, 102, 122, 111, 1, 0, num_runs, precision),
c(1024, 1024, 102, 122, 112, 1, 0, num_runs, precision)
),
list(
c( 8, 8, 102, 111, 111, 1, 0, num_runs, precision),
c( 16, 16, 102, 111, 111, 1, 0, num_runs, precision),
c( 32, 32, 102, 111, 111, 1, 0, num_runs, precision),
c( 64, 64, 102, 111, 111, 1, 0, num_runs, precision),
c( 128, 128, 102, 111, 111, 1, 0, num_runs, precision),
c( 256, 256, 102, 111, 111, 1, 0, num_runs, precision),
c( 512, 512, 102, 111, 111, 1, 0, num_runs, precision),
c(1024, 1024, 102, 111, 111, 1, 0, num_runs, precision),
c(2048, 2048, 102, 111, 111, 1, 0, num_runs, precision),
c(4096, 4096, 102, 111, 111, 1, 0, num_runs, precision),
c(8192, 8192, 102, 111, 111, 1, 0, num_runs, precision)
c( 8, 8, 102, 121, 111, 1, 0, num_runs, precision),
c( 16, 16, 102, 121, 111, 1, 0, num_runs, precision),
c( 32, 32, 102, 121, 111, 1, 0, num_runs, precision),
c( 64, 64, 102, 121, 111, 1, 0, num_runs, precision),
c( 128, 128, 102, 121, 111, 1, 0, num_runs, precision),
c( 256, 256, 102, 121, 111, 1, 0, num_runs, precision),
c( 512, 512, 102, 121, 111, 1, 0, num_runs, precision),
c(1024, 1024, 102, 121, 111, 1, 0, num_runs, precision),
c(2048, 2048, 102, 121, 111, 1, 0, num_runs, precision),
c(4096, 4096, 102, 121, 111, 1, 0, num_runs, precision),
c(8192, 8192, 102, 121, 111, 1, 0, num_runs, precision)
)
)

View File

@ -1,121 +0,0 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the tests for the OpenCL buffers (matrices and vectors). These tests are
// templated and thus header-only.
//
// =================================================================================================
#ifndef CLBLAST_BUFFER_TEST_H_
#define CLBLAST_BUFFER_TEST_H_
#include "clblast.h"
namespace clblast {
// =================================================================================================
// Tests matrix 'A' for validity: checks the leading dimension and whether the
// OpenCL buffer is large enough to hold the matrix at the given offset.
template <typename T>
StatusCode TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer,
const size_t offset, const size_t ld) {
// The leading dimension may not be smaller than the first dimension
if (ld < one) { return StatusCode::kInvalidLeadDimA; }
try {
// Minimum bytes: (two - 1) full strides, plus one final column/row, plus the offset
const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T);
if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryA; }
} catch (...) { return StatusCode::kInvalidMatrixA; }
return StatusCode::kSuccess;
}
// Tests matrix 'B' for validity: same checks as TestMatrixA, but returning the
// B-specific status codes.
template <typename T>
StatusCode TestMatrixB(const size_t one, const size_t two, const Buffer<T> &buffer,
const size_t offset, const size_t ld) {
if (ld < one) { return StatusCode::kInvalidLeadDimB; }
try {
// Minimum bytes: (two - 1) full strides, plus one final column/row, plus the offset
const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T);
if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryB; }
} catch (...) { return StatusCode::kInvalidMatrixB; }
return StatusCode::kSuccess;
}
// Tests matrix 'C' for validity: same checks as TestMatrixA, but returning the
// C-specific status codes.
template <typename T>
StatusCode TestMatrixC(const size_t one, const size_t two, const Buffer<T> &buffer,
const size_t offset, const size_t ld) {
if (ld < one) { return StatusCode::kInvalidLeadDimC; }
try {
// Minimum bytes: (two - 1) full strides, plus one final column/row, plus the offset
const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T);
if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryC; }
} catch (...) { return StatusCode::kInvalidMatrixC; }
return StatusCode::kSuccess;
}
// Tests matrix 'AP' for validity: a packed triangular matrix of order n, which
// stores n*(n+1)/2 elements; there is no leading-dimension check. Note that
// failures are reported with the 'A' status codes.
template <typename T>
StatusCode TestMatrixAP(const size_t n, const Buffer<T> &buffer, const size_t offset) {
try {
const auto required_size = (((n * (n + 1)) / 2) + offset) * sizeof(T);
if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryA; }
} catch (...) { return StatusCode::kInvalidMatrixA; }
return StatusCode::kSuccess;
}
// =================================================================================================
// Tests vector 'X' for validity: the increment may not be zero and the buffer
// must hold (n - 1) strides plus one element, starting at the offset.
template <typename T>
StatusCode TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset,
const size_t inc) {
if (inc == 0) { return StatusCode::kInvalidIncrementX; }
try {
const auto required_size = ((n - 1) * inc + 1 + offset) * sizeof(T);
if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryX; }
} catch (...) { return StatusCode::kInvalidVectorX; }
return StatusCode::kSuccess;
}
// Tests vector 'Y' for validity: same checks as TestVectorX, but returning the
// Y-specific status codes.
template <typename T>
StatusCode TestVectorY(const size_t n, const Buffer<T> &buffer, const size_t offset,
const size_t inc) {
if (inc == 0) { return StatusCode::kInvalidIncrementY; }
try {
const auto required_size = ((n - 1) * inc + 1 + offset) * sizeof(T);
if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryY; }
} catch (...) { return StatusCode::kInvalidVectorY; }
return StatusCode::kSuccess;
}
// =================================================================================================
// Tests vector 'scalar' for validity: a unit-stride output vector of n scalar
// results (no increment argument).
template <typename T>
StatusCode TestVectorScalar(const size_t n, const Buffer<T> &buffer, const size_t offset) {
try {
const auto required_size = (n + offset) * sizeof(T);
if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryScalar; }
} catch (...) { return StatusCode::kInvalidVectorScalar; }
return StatusCode::kSuccess;
}
// Tests vector 'index' for validity. NOTE(review): failures deliberately(?)
// reuse the 'Scalar' status codes (kInsufficientMemoryScalar /
// kInvalidVectorScalar) -- there are no index-specific codes here.
template <typename T>
StatusCode TestVectorIndex(const size_t n, const Buffer<T> &buffer, const size_t offset) {
try {
const auto required_size = (n + offset) * sizeof(T);
if (buffer.GetSize() < required_size) { return StatusCode::kInsufficientMemoryScalar; }
} catch (...) { return StatusCode::kInvalidVectorScalar; }
return StatusCode::kSuccess;
}
// =================================================================================================
} // namespace clblast
// CLBLAST_BUFFER_TEST_H_
#endif

View File

@ -57,7 +57,7 @@ const std::string& GetBinaryFromCache(const std::string &device_name, const Prec
}
}
binary_cache_mutex_.unlock();
throw std::runtime_error("Internal CLBlast error: Expected binary in cache, but found none.");
throw LogicError("GetBinaryFromCache: Expected binary in cache, but found none");
}
// Queries the cache and retrieves a matching program. Assumes that the match is available, throws
@ -75,7 +75,7 @@ const Program& GetProgramFromCache(const Context &context, const Precision &prec
}
}
program_cache_mutex_.unlock();
throw std::runtime_error("Internal CLBlast error: Expected program in cache, but found none.");
throw LogicError("GetProgramFromCache: Expected program in cache, but found none");
}
// Queries the cache to see whether or not the compiled kernel is already there
@ -109,14 +109,13 @@ bool ProgramIsInCache(const Context &context, const Precision &precision,
// =================================================================================================
// Clears the cache of stored binaries and programs
StatusCode CacheClearAll() {
void CacheClearAll() {
binary_cache_mutex_.lock();
binary_cache_.clear();
binary_cache_mutex_.unlock();
program_cache_mutex_.lock();
program_cache_.clear();
program_cache_mutex_.unlock();
return StatusCode::kSuccess;
}
// =================================================================================================

View File

@ -18,7 +18,7 @@
#include <vector>
#include <mutex>
#include "utilities.hpp"
#include "utilities/utilities.hpp"
namespace clblast {
// =================================================================================================
@ -89,7 +89,7 @@ bool ProgramIsInCache(const Context &context, const Precision &precision,
// =================================================================================================
// Clears the cache of stored binaries
StatusCode CacheClearAll();
void CacheClearAll();
// =================================================================================================
} // namespace clblast

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -12,8 +12,8 @@
// Portability here means that a similar header exists for CUDA with the same classes and
// interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change.
//
// This file is taken from the Claduc project <https://github.com/CNugteren/Claduc> and therefore
// contains the following header copyright notice:
// This file is taken from the CLCudaAPI project <https://github.com/CNugteren/CLCudaAPI> and
// therefore contains the following header copyright notice:
//
// =================================================================================================
//
@ -41,30 +41,52 @@
#include <string> // std::string
#include <vector> // std::vector
#include <memory> // std::shared_ptr
#include <stdexcept> // std::runtime_error
#include <numeric> // std::accumulate
#include <cstring> // std::strlen
// OpenCL
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings
#if defined(__APPLE__) || defined(__MACOSX)
#include <OpenCL/opencl.h>
#else
#include <CL/opencl.h>
#endif
// Exception classes
#include "cxpp11_common.hpp"
namespace clblast {
// =================================================================================================
// Error occurred in the C++11 OpenCL header (this file)
inline void Error(const std::string &message) {
throw std::runtime_error("Internal OpenCL error: "+message);
}
// Represents a runtime error returned by an OpenCL API function. Wraps the raw
// cl_int status together with the name of the failing call.
class CLError : public ErrorCode<DeviceError, cl_int> {
public:
explicit CLError(cl_int status, const std::string &where):
ErrorCode(status,
where,
"OpenCL error: " + where + ": " + std::to_string(static_cast<int>(status))) {
}
// Throws a CLError if 'status' indicates failure; used by the CheckError macro
static void Check(const cl_int status, const std::string &where) {
if (status != CL_SUCCESS) {
throw CLError(status, where);
}
}
// Destructor-safe variant: logs to stderr instead of throwing, since an
// exception escaping a destructor would terminate the program
static void CheckDtor(const cl_int status, const std::string &where) {
if (status != CL_SUCCESS) {
fprintf(stderr, "CLBlast: %s (ignoring)\n", CLError(status, where).what());
}
}
};
// =================================================================================================
// Error occurred in OpenCL
inline void CheckError(const cl_int status) {
if (status != CL_SUCCESS) {
throw std::runtime_error("Internal OpenCL error: "+std::to_string(status));
}
}
#define CheckError(call) CLError::Check(call, CLError::TrimCallString(#call))
// Error occurred in OpenCL (no-exception version for destructors)
#define CheckErrorDtor(call) CLError::CheckDtor(call, CLError::TrimCallString(#call))
// =================================================================================================
@ -81,7 +103,7 @@ class Event {
// Regular constructor with memory management
explicit Event():
event_(new cl_event, [](cl_event* e) {
if (*e) { CheckError(clReleaseEvent(*e)); }
if (*e) { CheckErrorDtor(clReleaseEvent(*e)); }
delete e;
}) {
*event_ = nullptr;
@ -92,19 +114,18 @@ class Event {
CheckError(clWaitForEvents(1, &(*event_)));
}
// Retrieves the elapsed time of the last recorded event. Note that no error checking is done on
// the 'clGetEventProfilingInfo' function, since there is a bug in Apple's OpenCL implementation:
// http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx
// Retrieves the elapsed time of the last recorded event.
// (Note that there is a bug in Apple's OpenCL implementation of the 'clGetEventProfilingInfo' function:
// http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx)
// However, in our case the reply size is fixed to be cl_ulong, so we are not affected.
float GetElapsedTime() const {
WaitForCompletion();
auto bytes = size_t{0};
clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes);
auto time_start = size_t{0};
clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr);
clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, 0, nullptr, &bytes);
auto time_end = size_t{0};
clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr);
return (time_end - time_start) * 1.0e-6f;
const auto bytes = sizeof(cl_ulong);
auto time_start = cl_ulong{0};
CheckError(clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr));
auto time_end = cl_ulong{0};
CheckError(clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr));
return static_cast<float>(time_end - time_start) * 1.0e-6f;
}
// Accessor to the private data-member
@ -132,10 +153,14 @@ class Platform {
explicit Platform(const size_t platform_id) {
auto num_platforms = cl_uint{0};
CheckError(clGetPlatformIDs(0, nullptr, &num_platforms));
if (num_platforms == 0) { Error("no platforms found"); }
if (num_platforms == 0) {
throw RuntimeError("Platform: no platforms found");
}
if (platform_id >= num_platforms) {
throw RuntimeError("Platform: invalid platform ID "+std::to_string(platform_id));
}
auto platforms = std::vector<cl_platform_id>(num_platforms);
CheckError(clGetPlatformIDs(num_platforms, platforms.data(), nullptr));
if (platform_id >= num_platforms) { Error("invalid platform ID "+std::to_string(platform_id)); }
platform_ = platforms[platform_id];
}
@ -152,6 +177,17 @@ class Platform {
cl_platform_id platform_;
};
// Builds and returns a list containing every OpenCL platform available on this system.
// Throws (via CheckError / the Platform constructor) if the platform query fails.
inline std::vector<Platform> GetAllPlatforms() {
  auto platform_count = cl_uint{0};
  CheckError(clGetPlatformIDs(0, nullptr, &platform_count));
  auto result = std::vector<Platform>();
  result.reserve(static_cast<size_t>(platform_count));
  for (auto id = size_t{0}; id < static_cast<size_t>(platform_count); ++id) {
    result.emplace_back(id);
  }
  return result;
}
// =================================================================================================
// C++11 version of 'cl_device_id'
@ -164,11 +200,16 @@ class Device {
// Initialize the device. Note that this constructor can throw exceptions!
explicit Device(const Platform &platform, const size_t device_id) {
auto num_devices = platform.NumDevices();
if (num_devices == 0) { Error("no devices found"); }
if (num_devices == 0) {
throw RuntimeError("Device: no devices found");
}
if (device_id >= num_devices) {
throw RuntimeError("Device: invalid device ID "+std::to_string(device_id));
}
auto devices = std::vector<cl_device_id>(num_devices);
CheckError(clGetDeviceIDs(platform(), CL_DEVICE_TYPE_ALL, static_cast<cl_uint>(num_devices),
devices.data(), nullptr));
if (device_id >= num_devices) { Error("invalid device ID "+std::to_string(device_id)); }
device_ = devices[device_id];
}
@ -201,8 +242,8 @@ class Device {
std::vector<size_t> MaxWorkItemSizes() const {
return GetInfoVector<size_t>(CL_DEVICE_MAX_WORK_ITEM_SIZES);
}
cl_ulong LocalMemSize() const {
return GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE);
unsigned long LocalMemSize() const {
return static_cast<unsigned long>(GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE));
}
std::string Capabilities() const { return GetInfoString(CL_DEVICE_EXTENSIONS); }
size_t CoreClock() const {
@ -238,9 +279,11 @@ class Device {
// Query for a specific type of device or brand
bool IsCPU() const { return Type() == "CPU"; }
bool IsGPU() const { return Type() == "GPU"; }
bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc."; }
bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc." ||
Vendor() == "AuthenticAMD";; }
bool IsNVIDIA() const { return Vendor() == "NVIDIA" || Vendor() == "NVIDIA Corporation"; }
bool IsIntel() const { return Vendor() == "Intel" || Vendor() == "GenuineIntel"; }
bool IsIntel() const { return Vendor() == "INTEL" || Vendor() == "Intel" ||
Vendor() == "GenuineIntel"; }
bool IsARM() const { return Vendor() == "ARM"; }
// Accessor to the private data-member
@ -271,7 +314,8 @@ class Device {
auto result = std::string{};
result.resize(bytes);
CheckError(clGetDeviceInfo(device_, info, bytes, &result[0], nullptr));
return std::string{result.c_str()}; // Removes any trailing '\0'-characters
result.resize(strlen(result.c_str())); // Removes any trailing '\0'-characters
return result;
}
};
@ -289,11 +333,11 @@ class Context {
// Regular constructor with memory management
explicit Context(const Device &device):
context_(new cl_context, [](cl_context* c) { CheckError(clReleaseContext(*c)); delete c; }) {
context_(new cl_context, [](cl_context* c) { CheckErrorDtor(clReleaseContext(*c)); delete c; }) {
auto status = CL_SUCCESS;
const cl_device_id dev = device();
*context_ = clCreateContext(nullptr, 1, &dev, nullptr, nullptr, &status);
CheckError(status);
CLError::Check(status, "clCreateContext");
}
// Accessor to the private data-member
@ -318,18 +362,18 @@ class Program {
// Source-based constructor with memory management
explicit Program(const Context &context, std::string source):
program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }),
program_(new cl_program, [](cl_program* p) { CheckErrorDtor(clReleaseProgram(*p)); delete p; }),
length_(source.length()),
source_(std::move(source)),
source_ptr_(&source_[0]) {
auto status = CL_SUCCESS;
*program_ = clCreateProgramWithSource(context(), 1, &source_ptr_, &length_, &status);
CheckError(status);
CLError::Check(status, "clCreateProgramWithSource");
}
// Binary-based constructor with memory management
explicit Program(const Device &device, const Context &context, const std::string& binary):
program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }),
program_(new cl_program, [](cl_program* p) { CheckErrorDtor(clReleaseProgram(*p)); delete p; }),
length_(binary.length()),
source_(binary),
source_ptr_(&source_[0]) {
@ -339,25 +383,16 @@ class Program {
*program_ = clCreateProgramWithBinary(context(), 1, &dev, &length_,
reinterpret_cast<const unsigned char**>(&source_ptr_),
&status1, &status2);
CheckError(status1);
CheckError(status2);
CLError::Check(status1, "clCreateProgramWithBinary (binary status)");
CLError::Check(status2, "clCreateProgramWithBinary");
}
// Compiles the device program and returns whether or not there where any warnings/errors
BuildStatus Build(const Device &device, std::vector<std::string> &options) {
void Build(const Device &device, std::vector<std::string> &options) {
options.push_back("-cl-std=CL1.1");
auto options_string = std::accumulate(options.begin(), options.end(), std::string{" "});
const cl_device_id dev = device();
auto status = clBuildProgram(*program_, 1, &dev, options_string.c_str(), nullptr, nullptr);
if (status == CL_BUILD_PROGRAM_FAILURE) {
return BuildStatus::kError;
}
else if (status == CL_INVALID_BINARY) {
return BuildStatus::kInvalid;
}
else {
CheckError(status);
return BuildStatus::kSuccess;
}
CheckError(clBuildProgram(*program_, 1, &dev, options_string.c_str(), nullptr, nullptr));
}
// Retrieves the warning/error message from the compiler (if any)
@ -405,24 +440,11 @@ class Queue {
// Regular constructor with memory management
explicit Queue(const Context &context, const Device &device):
queue_(new cl_command_queue, [](cl_command_queue* s) { CheckError(clReleaseCommandQueue(*s));
queue_(new cl_command_queue, [](cl_command_queue* s) { CheckErrorDtor(clReleaseCommandQueue(*s));
delete s; }) {
auto status = CL_SUCCESS;
#ifdef CL_VERSION_2_0
size_t ocl_version = device.VersionNumber();
if (ocl_version >= 200)
{
cl_queue_properties properties[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
*queue_ = clCreateCommandQueueWithProperties(context(), device(), properties, &status);
}
else
{
*queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
}
#else
*queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
#endif
CheckError(status);
*queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
CLError::Check(status, "clCreateCommandQueue");
}
// Synchronizes the queue
@ -514,7 +536,7 @@ class Buffer {
if (access_ == BufferAccess::kWriteOnly) { flags = CL_MEM_WRITE_ONLY; }
auto status = CL_SUCCESS;
*buffer_ = clCreateBuffer(context(), flags, size*sizeof(T), nullptr, &status);
CheckError(status);
CLError::Check(status, "clCreateBuffer");
}
// As above, but now with read/write access as a default
@ -535,18 +557,24 @@ class Buffer {
// Copies from device to host: reading the device buffer a-synchronously
void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
if (access_ == BufferAccess::kWriteOnly) { Error("reading from a write-only buffer"); }
if (access_ == BufferAccess::kWriteOnly) {
throw LogicError("Buffer: reading from a write-only buffer");
}
CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
host, 0, nullptr, nullptr));
}
void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host,
const size_t offset = 0) const {
if (host.size() < size) { Error("target host buffer is too small"); }
if (host.size() < size) {
throw LogicError("Buffer: target host buffer is too small");
}
ReadAsync(queue, size, host.data(), offset);
}
void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host,
const size_t offset = 0) const {
if (host.size() < size) { Error("target host buffer is too small"); }
if (host.size() < size) {
throw LogicError("Buffer: target host buffer is too small");
}
ReadAsync(queue, size, host.data(), offset);
}
@ -566,8 +594,12 @@ class Buffer {
// Copies from host to device: writing the device buffer a-synchronously
void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
if (access_ == BufferAccess::kReadOnly) { Error("writing to a read-only buffer"); }
if (GetSize() < (offset+size)*sizeof(T)) { Error("target device buffer is too small"); }
if (access_ == BufferAccess::kReadOnly) {
throw LogicError("Buffer: writing to a read-only buffer");
}
if (GetSize() < (offset+size)*sizeof(T)) {
throw LogicError("Buffer: target device buffer is too small");
}
CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
host, 0, nullptr, nullptr));
}
@ -606,8 +638,7 @@ class Buffer {
// Retrieves the actual allocated size in bytes
size_t GetSize() const {
auto bytes = size_t{0};
CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, 0, nullptr, &bytes));
const auto bytes = sizeof(size_t);
auto result = size_t{0};
CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, bytes, &result, nullptr));
return result;
@ -634,10 +665,10 @@ class Kernel {
// Regular constructor with memory management
explicit Kernel(const Program &program, const std::string &name):
kernel_(new cl_kernel, [](cl_kernel* k) { CheckError(clReleaseKernel(*k)); delete k; }) {
kernel_(new cl_kernel, [](cl_kernel* k) { CheckErrorDtor(clReleaseKernel(*k)); delete k; }) {
auto status = CL_SUCCESS;
*kernel_ = clCreateKernel(program(), name.c_str(), &status);
CheckError(status);
CLError::Check(status, "clCreateKernel");
}
// Sets a kernel argument at the indicated position
@ -658,17 +689,16 @@ class Kernel {
}
// Retrieves the amount of local memory used per work-group for this kernel
cl_ulong LocalMemUsage(const Device &device) const {
auto bytes = size_t{0};
unsigned long LocalMemUsage(const Device &device) const {
const auto bytes = sizeof(cl_ulong);
auto query = cl_kernel_work_group_info{CL_KERNEL_LOCAL_MEM_SIZE};
CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, 0, nullptr, &bytes));
auto result = cl_ulong{0};
CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, bytes, &result, nullptr));
return result;
return static_cast<unsigned long>(result);
}
// Retrieves the name of the kernel
std::string GetFunctionName() {
std::string GetFunctionName() const {
auto bytes = size_t{0};
CheckError(clGetKernelInfo(*kernel_, CL_KERNEL_FUNCTION_NAME, 0, nullptr, &bytes));
auto result = std::string{};
@ -689,6 +719,7 @@ class Kernel {
void Launch(const Queue &queue, const std::vector<size_t> &global,
const std::vector<size_t> &local, EventPointer event,
const std::vector<Event> &waitForEvents) {
// Builds a plain version of the events waiting list
auto waitForEventsPlain = std::vector<cl_event>();
for (auto &waitEvent : waitForEvents) {

View File

@ -0,0 +1,109 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Ivan Shapovalov <intelfx@intelfx.name>
//
// This file contains exception classes corresponding to 'clpp11.hpp'. It is also part of the
// CLCudaAPI project. See 'clpp11.hpp' for more details.
//
// =================================================================================================
#ifndef CLBLAST_CXPP11_COMMON_H_
#define CLBLAST_CXPP11_COMMON_H_
#include <string> // std::string
#include <stdexcept> // std::runtime_error
namespace clblast {
// =================================================================================================
// Basic exception class: represents an error that happened inside our own code
// (as opposed to an error in the C++ runtime). 'Base' is the standard exception type
// to derive from (e.g. std::runtime_error or std::logic_error); all constructor
// arguments are forwarded to it unchanged.
template <typename Base>
class Error : public Base {
 public:
  // Perfect forwarding of the constructor since "using Base::Base" is not supported by VS 2013
  template <typename... Args>
  Error(Args&&... args):
      Base(std::forward<Args>(args)...) {
  }
};
// =================================================================================================
// Represents a generic device-specific runtime error (returned by an OpenCL or CUDA API function)
class DeviceError : public Error<std::runtime_error> {
 public:
  // Perfect forwarding of the constructor since "using Error<std::runtime_error>::Error" is not
  // supported by VS 2013
  template <typename... Args>
  DeviceError(Args&&... args):
      Error<std::runtime_error>(std::forward<Args>(args)...) {
  }

  // Strips a call string such as "clCreateBuffer(...)" down to the bare function name by
  // dropping everything from the first '(' onwards; if there is no '(', the input is returned
  // unchanged. Implemented with std::string::find rather than strchr: this header includes
  // <string> and <stdexcept> but not <cstring>, so the original strchr call relied on a
  // transitive include and was not guaranteed to compile.
  static std::string TrimCallString(const char *where) {
    const auto where_string = std::string(where);
    const auto paren_position = where_string.find('(');
    return where_string.substr(0, paren_position);  // npos -> returns the whole string
  }
};
// =================================================================================================
// Represents a generic runtime error (aka environmental problem)
class RuntimeError : public Error<std::runtime_error> {
 public:
  // Builds the what() message by prefixing the caller-supplied reason with a fixed tag
  explicit RuntimeError(const std::string &reason):
      Error<std::runtime_error>(std::string("Run-time error: ") + reason) {
  }
};
// =================================================================================================
// Represents a generic logic error (aka failed assertion)
class LogicError : public Error<std::logic_error> {
 public:
  // Builds the what() message by prefixing the caller-supplied reason with a fixed tag
  explicit LogicError(const std::string &reason):
      Error<std::logic_error>(std::string("Internal logic error: ") + reason) {
  }
};
// =================================================================================================
// Internal exception base class with a status field and a subclass-specific "details" field
// which can be used to recreate an exception. 'Base' is the exception type to derive from,
// 'Status' is the raw error-code type (e.g. an OpenCL status enum or integer).
template <typename Base, typename Status>
class ErrorCode : public Base {
 public:
  // 'status' is the raw error code, 'details' is extra context kept verbatim (so the
  // exception can be reconstructed later), and 'reason' becomes the what() message.
  ErrorCode(Status status, const std::string &details, const std::string &reason):
      Base(reason),
      status_(status),
      details_(details) {
  }

  // Raw status code exactly as passed to the constructor
  Status status() const {
    return status_;
  }

  // Extra context string exactly as passed to the constructor
  const std::string& details() const {
    return details_;
  }

 private:
  const Status status_;
  const std::string details_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_CXPP11_COMMON_H_
#endif

View File

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "utilities.hpp"
#include "utilities/utilities.hpp"
#include "database/database.hpp"
#include "database/kernels/xaxpy.hpp"
@ -21,27 +21,42 @@
#include "database/kernels/xgemv_fast_rot.hpp"
#include "database/kernels/xger.hpp"
#include "database/kernels/xgemm.hpp"
#include "database/kernels/xgemm_direct.hpp"
#include "database/kernels/copy.hpp"
#include "database/kernels/pad.hpp"
#include "database/kernels/transpose.hpp"
#include "database/kernels/padtranspose.hpp"
#include "database/kernel_selection.hpp"
namespace clblast {
// =================================================================================================
// Initializes the database
const std::vector<Database::DatabaseEntry> Database::database = {
XaxpyHalf, XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble,
XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble,
XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble,
XgemvFastHalf, XgemvFastSingle, XgemvFastDouble, XgemvFastComplexSingle, XgemvFastComplexDouble,
XgemvFastRotHalf, XgemvFastRotSingle, XgemvFastRotDouble, XgemvFastRotComplexSingle, XgemvFastRotComplexDouble,
XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble,
XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble,
CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble,
PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble,
TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble,
PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble
const std::vector<const Database::DatabaseEntry*> Database::database = {
&database::XaxpyHalf, &database::XaxpySingle, &database::XaxpyDouble, &database::XaxpyComplexSingle, &database::XaxpyComplexDouble,
&database::XdotHalf, &database::XdotSingle, &database::XdotDouble, &database::XdotComplexSingle, &database::XdotComplexDouble,
&database::XgemvHalf, &database::XgemvSingle, &database::XgemvDouble, &database::XgemvComplexSingle, &database::XgemvComplexDouble,
&database::XgemvFastHalf, &database::XgemvFastSingle, &database::XgemvFastDouble, &database::XgemvFastComplexSingle, &database::XgemvFastComplexDouble,
&database::XgemvFastRotHalf, &database::XgemvFastRotSingle, &database::XgemvFastRotDouble, &database::XgemvFastRotComplexSingle, &database::XgemvFastRotComplexDouble,
&database::XgerHalf, &database::XgerSingle, &database::XgerDouble, &database::XgerComplexSingle, &database::XgerComplexDouble,
&database::XgemmHalf, &database::XgemmSingle, &database::XgemmDouble, &database::XgemmComplexSingle, &database::XgemmComplexDouble,
&database::XgemmDirectHalf, &database::XgemmDirectSingle, &database::XgemmDirectDouble, &database::XgemmDirectComplexSingle, &database::XgemmDirectComplexDouble,
&database::CopyHalf, &database::CopySingle, &database::CopyDouble, &database::CopyComplexSingle, &database::CopyComplexDouble,
&database::PadHalf, &database::PadSingle, &database::PadDouble, &database::PadComplexSingle, &database::PadComplexDouble,
&database::TransposeHalf, &database::TransposeSingle, &database::TransposeDouble, &database::TransposeComplexSingle, &database::TransposeComplexDouble,
&database::PadtransposeHalf, &database::PadtransposeSingle, &database::PadtransposeDouble, &database::PadtransposeComplexSingle, &database::PadtransposeComplexDouble,
&database::KernelSelectionHalf, &database::KernelSelectionSingle, &database::KernelSelectionDouble, &database::KernelSelectionComplexSingle, &database::KernelSelectionComplexDouble
};
// The OpenCL device vendors
const std::string Database::kDeviceVendorAll = "default";
// Alternative names for some OpenCL vendors
const std::unordered_map<std::string, std::string> Database::kVendorNames{
{ "Intel(R) Corporation", "Intel" },
{ "GenuineIntel", "Intel" },
{ "Advanced Micro Devices, Inc.", "AMD" },
{ "NVIDIA Corporation", "NVIDIA" },
};
// =================================================================================================
@ -49,7 +64,7 @@ const std::vector<Database::DatabaseEntry> Database::database = {
// Constructor, computing device properties and populating the parameter-vector from the database.
// This takes an optional overlay database in case of custom tuning or custom kernels.
Database::Database(const Queue &queue, const std::vector<std::string> &kernels,
const Precision precision, const std::vector<DatabaseEntry> &overlay):
const Precision precision, const std::vector<const DatabaseEntry*> &overlay):
parameters_{} {
// Finds information of the current device
@ -69,15 +84,15 @@ Database::Database(const Queue &queue, const std::vector<std::string> &kernels,
for (auto &kernel: kernels) {
auto search_result = ParametersPtr{};
for (auto db: { &overlay, &database }) {
search_result = Search(kernel, device_type, device_vendor, device_name, precision, *db);
for (auto &db: { database, overlay}) {
search_result = Search(kernel, device_type, device_vendor, device_name, precision, db);
if (search_result) {
parameters_.insert(search_result->begin(), search_result->end());
break;
}
}
if (!search_result) { throw std::runtime_error("Database error, could not find a suitable entry"); }
if (!search_result) { throw RuntimeErrorCode(StatusCode::kDatabaseError); }
}
}
@ -100,17 +115,17 @@ Database::ParametersPtr Database::Search(const std::string &this_kernel,
const std::string &this_vendor,
const std::string &this_device,
const Precision this_precision,
const std::vector<DatabaseEntry> &this_database) const {
const std::vector<const DatabaseEntry*> &this_database) const {
// Selects the right kernel
for (auto &db: this_database) {
if (db.kernel == this_kernel && db.precision == this_precision) {
if (db->kernel == this_kernel && db->precision == this_precision) {
// Searches for the right vendor and device type, or selects the default if unavailable. This
// assumes that the default vendor / device type is last in the database.
for (auto &vendor: db.vendors) {
for (auto &vendor: db->vendors) {
if ((vendor.name == this_vendor || vendor.name == kDeviceVendorAll) &&
(vendor.type == this_type || vendor.type == kDeviceTypeAll)) {
(vendor.type == this_type || vendor.type == database::kDeviceTypeAll)) {
// Searches for the right device. If the current device is unavailable, selects the vendor
// default parameters. This assumes the default is last in the database.

View File

@ -21,11 +21,24 @@
#include <vector>
#include <unordered_map>
#include "utilities.hpp"
#include "utilities/utilities.hpp"
namespace clblast {
// =================================================================================================
// A special namespace to hold all the global constant variables (including the database entries)
namespace database {
// The OpenCL device types
const std::string kDeviceTypeCPU = "CPU";
const std::string kDeviceTypeGPU = "GPU";
const std::string kDeviceTypeAccelerator = "accelerator";
const std::string kDeviceTypeAll = "default";
} // namespace database
// =================================================================================================
// See comment at top of file for a description of the class
class Database {
public:
@ -36,54 +49,32 @@ class Database {
// Structures for content inside the database
struct DatabaseDevice {
const std::string name;
const Parameters parameters;
std::string name;
Parameters parameters;
};
struct DatabaseVendor {
const std::string type;
const std::string name;
const std::vector<DatabaseDevice> devices;
std::string type;
std::string name;
std::vector<DatabaseDevice> devices;
};
struct DatabaseEntry {
const std::string kernel;
const Precision precision;
const std::vector<DatabaseVendor> vendors;
std::string kernel;
Precision precision;
std::vector<DatabaseVendor> vendors;
};
// The OpenCL device types
static constexpr auto kDeviceTypeCPU = "CPU";
static constexpr auto kDeviceTypeGPU = "GPU";
static constexpr auto kDeviceTypeAccelerator = "accelerator";
static constexpr auto kDeviceTypeAll = "default";
// The OpenCL device vendors
static constexpr auto kDeviceVendorAll = "default";
static const std::string kDeviceVendorAll;
// Alternative names for some OpenCL vendors
const std::unordered_map<std::string,std::string> kVendorNames {
{"Intel(R) Corporation", "Intel"},
{"GenuineIntel", "Intel"},
{"Advanced Micro Devices, Inc.", "AMD"},
{"NVIDIA Corporation", "NVIDIA"},
};
static const std::unordered_map<std::string, std::string> kVendorNames;
// The database consists of separate database entries, stored together in a vector
static const DatabaseEntry XaxpyHalf, XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble;
static const DatabaseEntry XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble;
static const DatabaseEntry XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble;
static const DatabaseEntry XgemvFastHalf, XgemvFastSingle, XgemvFastDouble, XgemvFastComplexSingle, XgemvFastComplexDouble;
static const DatabaseEntry XgemvFastRotHalf, XgemvFastRotSingle, XgemvFastRotDouble, XgemvFastRotComplexSingle, XgemvFastRotComplexDouble;
static const DatabaseEntry XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble;
static const DatabaseEntry XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble;
static const DatabaseEntry CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble;
static const DatabaseEntry PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble;
static const DatabaseEntry TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble;
static const DatabaseEntry PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble;
static const std::vector<DatabaseEntry> database;
static const std::vector<const DatabaseEntry*> database;
// The constructor with a user-provided database overlay (potentially an empty vector)
explicit Database(const Queue &queue, const std::vector<std::string> &routines,
const Precision precision, const std::vector<DatabaseEntry> &overlay);
const Precision precision, const std::vector<const DatabaseEntry*> &overlay);
// Accessor of values by key
size_t operator[](const std::string key) const { return parameters_.find(key)->second; }
@ -95,7 +86,8 @@ class Database {
// Search method for a specified database, returning pointer (possibly a nullptr)
ParametersPtr Search(const std::string &this_kernel, const std::string &this_type,
const std::string &this_vendor, const std::string &this_device,
const Precision this_precision, const std::vector<DatabaseEntry> &db) const;
const Precision this_precision,
const std::vector<const DatabaseEntry*> &db) const;
// Found parameters suitable for this device/kernel
Parameters parameters_;

View File

@ -0,0 +1,136 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This determines when to switch between the direct (for small sizes) and in-direct GEMM kernel
// with pre/post-processing kernels (for larger sizes). These can be set in a similar way as for the
// regular kernel tuning parameters: they can be specific for a certain vendor or device or can use
// some common default values.
//
// =================================================================================================
namespace clblast {
namespace database {
// =================================================================================================
// Direct-vs-indirect GEMM switching point for half precision. Per this file's header comment,
// sizes below "XGEMM_MIN_INDIRECT_SIZE" use the direct single-kernel implementation, larger
// sizes the in-direct kernel with pre/post-processing. Values are written as products
// (e.g. 384*384*384), presumably an m*n*k volume -- confirm against the kernel-selection code.
const Database::DatabaseEntry KernelSelectionHalf = {
  "KernelSelection", Precision::kHalf, {
    { // Intel GPUs: a tuned entry for Skylake GT2 plus an Intel-wide default
      kDeviceTypeGPU, "Intel", {
        { "Intel(R) HD Graphics Skylake ULT GT2", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } },
        { "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
      }
    },
    { // Fallback for all other vendors and device types
      kDeviceTypeAll, "default", {
        { "default", { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } },
      }
    },
  }
};
// =================================================================================================
// Direct-vs-indirect GEMM switching point for single precision (same thresholds and
// structure as the half-precision entry; see this file's header comment).
const Database::DatabaseEntry KernelSelectionSingle = {
  "KernelSelection", Precision::kSingle, {
    { // Intel GPUs: a tuned entry for Skylake GT2 plus an Intel-wide default
      kDeviceTypeGPU, "Intel", {
        { "Intel(R) HD Graphics Skylake ULT GT2", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } },
        { "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
      }
    },
    { // Fallback for all other vendors and device types
      kDeviceTypeAll, "default", {
        { "default", { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } },
      }
    },
  }
};
// =================================================================================================
// Direct-vs-indirect GEMM switching point for complex single precision (same thresholds
// and structure as the half-precision entry; see this file's header comment).
const Database::DatabaseEntry KernelSelectionComplexSingle = {
  "KernelSelection", Precision::kComplexSingle, {
    { // Intel GPUs: a tuned entry for Skylake GT2 plus an Intel-wide default
      kDeviceTypeGPU, "Intel", {
        { "Intel(R) HD Graphics Skylake ULT GT2", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } },
        { "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
      }
    },
    { // Fallback for all other vendors and device types
      kDeviceTypeAll, "default", {
        { "default", { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } },
      }
    },
  }
};
// =================================================================================================
// Direct-vs-indirect GEMM switching point for double precision (same thresholds and
// structure as the half-precision entry; see this file's header comment).
const Database::DatabaseEntry KernelSelectionDouble = {
  "KernelSelection", Precision::kDouble, {
    { // Intel GPUs: a tuned entry for Skylake GT2 plus an Intel-wide default
      kDeviceTypeGPU, "Intel", {
        { "Intel(R) HD Graphics Skylake ULT GT2", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } },
        { "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
      }
    },
    { // Fallback for all other vendors and device types
      kDeviceTypeAll, "default", {
        { "default", { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } },
      }
    },
  }
};
// =================================================================================================
// Direct-vs-indirect GEMM switching point for complex double precision (same thresholds
// and structure as the half-precision entry; see this file's header comment).
const Database::DatabaseEntry KernelSelectionComplexDouble = {
  "KernelSelection", Precision::kComplexDouble, {
    { // Intel GPUs: a tuned entry for Skylake GT2 plus an Intel-wide default
      kDeviceTypeGPU, "Intel", {
        { "Intel(R) HD Graphics Skylake ULT GT2", { {"XGEMM_MIN_INDIRECT_SIZE",1*1*1} } },
        { "default", { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
      }
    },
    { // NVIDIA GPUs
      kDeviceTypeGPU, "NVIDIA", {
        { "default", { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
      }
    },
    { // Fallback for all other vendors and device types
      kDeviceTypeAll, "default", {
        { "default", { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } },
      }
    },
  }
};
// =================================================================================================
} // namespace database
} // namespace clblast

View File

@ -12,20 +12,21 @@
// =================================================================================================
namespace clblast {
namespace database {
// =================================================================================================
const Database::DatabaseEntry Database::CopyHalf = {
const Database::DatabaseEntry CopyHalf = {
"Copy", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",8} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",8} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",8} } },
}
},
}
@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::CopyHalf = {
// =================================================================================================
const Database::DatabaseEntry Database::CopySingle = {
const Database::DatabaseEntry CopySingle = {
"Copy", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -42,6 +43,7 @@ const Database::DatabaseEntry Database::CopySingle = {
{ "Oland", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } },
{ "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "Tahiti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "Tonga", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
}
},
@ -64,10 +66,11 @@ const Database::DatabaseEntry Database::CopySingle = {
{ "Intel(R) HD Graphics 530", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",8} } },
{ "Iris", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Iris Pro", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
}
},
{ // Intel accelerators
@ -84,9 +87,10 @@ const Database::DatabaseEntry Database::CopySingle = {
{ "GeForce GTX 670", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "GeForce GTX 680", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "GeForce GTX 750", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "GeForce GTX 980", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } },
{ "GeForce GTX TITAN Black", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",8} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
{ "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } },
@ -103,7 +107,7 @@ const Database::DatabaseEntry Database::CopySingle = {
// =================================================================================================
const Database::DatabaseEntry Database::CopyComplexSingle = {
const Database::DatabaseEntry CopyComplexSingle = {
"Copy", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -112,6 +116,7 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
{ "Oland", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "Tonga", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
@ -128,7 +133,8 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Iris", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Iris Pro", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",4} } },
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
@ -147,8 +153,9 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
{ "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 670", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 750", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 980", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN Black", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",4} } },
{ "Tesla K40m", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
@ -165,7 +172,7 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::CopyDouble = {
const Database::DatabaseEntry CopyDouble = {
"Copy", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -174,6 +181,7 @@ const Database::DatabaseEntry Database::CopyDouble = {
{ "Oland", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",8} } },
{ "Pitcairn", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Tonga", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",4} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
}
},
@ -205,18 +213,19 @@ const Database::DatabaseEntry Database::CopyDouble = {
{ "GeForce GTX 670", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 680", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 750", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 980", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "GeForce GTX TITAN Black", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",8} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
}
},
}
@ -224,7 +233,7 @@ const Database::DatabaseEntry Database::CopyDouble = {
// =================================================================================================
const Database::DatabaseEntry Database::CopyComplexDouble = {
const Database::DatabaseEntry CopyComplexDouble = {
"Copy", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -233,6 +242,7 @@ const Database::DatabaseEntry Database::CopyComplexDouble = {
{ "Oland", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Pitcairn", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tonga", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
@ -264,9 +274,10 @@ const Database::DatabaseEntry Database::CopyComplexDouble = {
{ "GeForce GTX 670", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 680", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 750", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 980", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN Black", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
@ -282,4 +293,5 @@ const Database::DatabaseEntry Database::CopyComplexDouble = {
};
// =================================================================================================
} // namespace database
} // namespace clblast

View File

@ -12,14 +12,15 @@
// =================================================================================================
namespace clblast {
namespace database {
// =================================================================================================
const Database::DatabaseEntry Database::PadHalf = {
const Database::DatabaseEntry PadHalf = {
"Pad", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
}
},
@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::PadHalf = {
// =================================================================================================
const Database::DatabaseEntry Database::PadSingle = {
const Database::DatabaseEntry PadSingle = {
"Pad", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -42,7 +43,8 @@ const Database::DatabaseEntry Database::PadSingle = {
{ "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Pitcairn", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tonga", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
}
},
{ // ARM GPUs
@ -64,10 +66,11 @@ const Database::DatabaseEntry Database::PadSingle = {
{ "Intel(R) HD Graphics 530", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } },
{ "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Iris Pro", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } },
}
},
{ // Intel accelerators
@ -84,9 +87,10 @@ const Database::DatabaseEntry Database::PadSingle = {
{ "GeForce GTX 670", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
{ "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "GeForce GTX 750", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } },
{ "GeForce GTX 750 Ti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "GeForce GTX 750 Ti", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN Black", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Tesla K40m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
@ -103,7 +107,7 @@ const Database::DatabaseEntry Database::PadSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::PadComplexSingle = {
const Database::DatabaseEntry PadComplexSingle = {
"Pad", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -112,6 +116,7 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
{ "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tahiti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tonga", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
@ -134,7 +139,8 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
{ "Iris Pro", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
@ -154,13 +160,14 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
{ "GeForce GTX 670", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX 750", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX 750 Ti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 750 Ti", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN Black", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
}
},
{ // Default
@ -173,7 +180,7 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::PadDouble = {
const Database::DatabaseEntry PadDouble = {
"Pad", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -182,7 +189,8 @@ const Database::DatabaseEntry Database::PadDouble = {
{ "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tonga", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // ARM GPUs
@ -216,6 +224,7 @@ const Database::DatabaseEntry Database::PadDouble = {
{ "GeForce GTX 750 Ti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 980", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN Black", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
@ -232,7 +241,7 @@ const Database::DatabaseEntry Database::PadDouble = {
// =================================================================================================
const Database::DatabaseEntry Database::PadComplexDouble = {
const Database::DatabaseEntry PadComplexDouble = {
"Pad", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -241,7 +250,8 @@ const Database::DatabaseEntry Database::PadComplexDouble = {
{ "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Pitcairn", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tonga", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // ARM GPUs
@ -272,9 +282,10 @@ const Database::DatabaseEntry Database::PadComplexDouble = {
{ "GeForce GTX 670", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 680", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 750", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 750 Ti", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 750 Ti", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX TITAN Black", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tesla K40m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
@ -290,4 +301,5 @@ const Database::DatabaseEntry Database::PadComplexDouble = {
};
// =================================================================================================
} // namespace database
} // namespace clblast

View File

@ -12,14 +12,15 @@
// =================================================================================================
namespace clblast {
namespace database {
// =================================================================================================
const Database::DatabaseEntry Database::PadtransposeHalf = {
const Database::DatabaseEntry PadtransposeHalf = {
"Padtranspose", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
}
},
@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::PadtransposeHalf = {
// =================================================================================================
const Database::DatabaseEntry Database::PadtransposeSingle = {
const Database::DatabaseEntry PadtransposeSingle = {
"Padtranspose", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -42,6 +43,7 @@ const Database::DatabaseEntry Database::PadtransposeSingle = {
{ "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Tonga", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
}
},
@ -64,6 +66,7 @@ const Database::DatabaseEntry Database::PadtransposeSingle = {
{ "Intel(R) HD Graphics 530", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Iris", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Iris Pro", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
@ -87,6 +90,7 @@ const Database::DatabaseEntry Database::PadtransposeSingle = {
{ "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "GeForce GTX TITAN Black", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "Tesla K20m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
@ -103,7 +107,7 @@ const Database::DatabaseEntry Database::PadtransposeSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::PadtransposeComplexSingle = {
const Database::DatabaseEntry PadtransposeComplexSingle = {
"Padtranspose", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -112,6 +116,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = {
{ "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Tonga", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
}
},
@ -134,6 +139,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Iris", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Iris Pro", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
@ -157,6 +163,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = {
{ "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN Black", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "Tesla K20m", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
@ -173,7 +180,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::PadtransposeDouble = {
const Database::DatabaseEntry PadtransposeDouble = {
"Padtranspose", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -182,6 +189,7 @@ const Database::DatabaseEntry Database::PadtransposeDouble = {
{ "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Tonga", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
}
},
@ -216,6 +224,7 @@ const Database::DatabaseEntry Database::PadtransposeDouble = {
{ "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 980", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN Black", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "Tesla K20m", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
@ -232,7 +241,7 @@ const Database::DatabaseEntry Database::PadtransposeDouble = {
// =================================================================================================
const Database::DatabaseEntry Database::PadtransposeComplexDouble = {
const Database::DatabaseEntry PadtransposeComplexDouble = {
"Padtranspose", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -241,6 +250,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexDouble = {
{ "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
{ "Tonga", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
}
},
@ -272,9 +282,10 @@ const Database::DatabaseEntry Database::PadtransposeComplexDouble = {
{ "GeForce GTX 670", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 750", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN Black", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "Tesla K20m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
@ -290,4 +301,5 @@ const Database::DatabaseEntry Database::PadtransposeComplexDouble = {
};
// =================================================================================================
} // namespace database
} // namespace clblast

View File

@ -12,20 +12,21 @@
// =================================================================================================
namespace clblast {
namespace database {
// =================================================================================================
const Database::DatabaseEntry Database::TransposeHalf = {
const Database::DatabaseEntry TransposeHalf = {
"Transpose", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
}
},
}
@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::TransposeHalf = {
// =================================================================================================
const Database::DatabaseEntry Database::TransposeSingle = {
const Database::DatabaseEntry TransposeSingle = {
"Transpose", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -42,7 +43,8 @@ const Database::DatabaseEntry Database::TransposeSingle = {
{ "Oland", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Pitcairn", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Tonga", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
}
},
{ // ARM GPUs
@ -64,10 +66,11 @@ const Database::DatabaseEntry Database::TransposeSingle = {
{ "Intel(R) HD Graphics 530", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Iris", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Iris Pro", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
}
},
{ // Intel accelerators
@ -87,6 +90,7 @@ const Database::DatabaseEntry Database::TransposeSingle = {
{ "GeForce GTX 750 Ti", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "GeForce GTX TITAN Black", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "GeForce GTX TITAN X", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Tesla K20m", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Tesla K40m", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
@ -103,7 +107,7 @@ const Database::DatabaseEntry Database::TransposeSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::TransposeComplexSingle = {
const Database::DatabaseEntry TransposeComplexSingle = {
"Transpose", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -112,6 +116,7 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = {
{ "Oland", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Pitcairn", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Tonga", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
}
},
@ -134,7 +139,8 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "Iris", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "Iris Pro", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
@ -151,6 +157,7 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = {
{ "GeForce GTX 750 Ti", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN Black", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
@ -159,7 +166,7 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = {
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
}
},
}
@ -167,7 +174,7 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::TransposeDouble = {
const Database::DatabaseEntry TransposeDouble = {
"Transpose", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -176,6 +183,7 @@ const Database::DatabaseEntry Database::TransposeDouble = {
{ "Oland", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Tonga", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
}
},
@ -207,9 +215,10 @@ const Database::DatabaseEntry Database::TransposeDouble = {
{ "GeForce GTX 670", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "GeForce GTX 750", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 750 Ti", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX TITAN", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX TITAN Black", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
@ -226,7 +235,7 @@ const Database::DatabaseEntry Database::TransposeDouble = {
// =================================================================================================
const Database::DatabaseEntry Database::TransposeComplexDouble = {
const Database::DatabaseEntry TransposeComplexDouble = {
"Transpose", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -235,7 +244,8 @@ const Database::DatabaseEntry Database::TransposeComplexDouble = {
{ "Oland", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Tonga", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
}
},
{ // ARM GPUs
@ -263,6 +273,7 @@ const Database::DatabaseEntry Database::TransposeComplexDouble = {
{ "GeForce GTX 750 Ti", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 980", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN Black", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
@ -278,4 +289,5 @@ const Database::DatabaseEntry Database::TransposeComplexDouble = {
};
// =================================================================================================
} // namespace database
} // namespace clblast

View File

@ -12,14 +12,15 @@
// =================================================================================================
namespace clblast {
namespace database {
// =================================================================================================
const Database::DatabaseEntry Database::XaxpyHalf = {
const Database::DatabaseEntry XaxpyHalf = {
"Xaxpy", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",4}, {"WGS",512}, {"WPT",8} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",8}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",8}, {"WGS",64}, {"WPT",1} } },
}
},
@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::XaxpyHalf = {
// =================================================================================================
const Database::DatabaseEntry Database::XaxpySingle = {
const Database::DatabaseEntry XaxpySingle = {
"Xaxpy", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -42,7 +43,8 @@ const Database::DatabaseEntry Database::XaxpySingle = {
{ "Oland", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Pitcairn", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "Tahiti", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",2}, {"WGS",256}, {"WPT",1} } },
{ "Tonga", { {"VW",1}, {"WGS",256}, {"WPT",8} } },
{ "default", { {"VW",2}, {"WGS",64}, {"WPT",2} } },
}
},
{ // ARM GPUs
@ -64,10 +66,11 @@ const Database::DatabaseEntry Database::XaxpySingle = {
{ "Intel(R) HD Graphics 530", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",1}, {"WGS",512}, {"WPT",2} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",8}, {"WGS",512}, {"WPT",1} } },
{ "Iris", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Iris Pro", { {"VW",1}, {"WGS",128}, {"WPT",2} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
}
},
{ // Intel accelerators
@ -84,9 +87,10 @@ const Database::DatabaseEntry Database::XaxpySingle = {
{ "GeForce GTX 670", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "GeForce GTX 750", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "GeForce GTX 750 Ti", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX TITAN Black", { {"VW",4}, {"WGS",128}, {"WPT",4} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Tesla K20m", { {"VW",4}, {"WGS",128}, {"WPT",1} } },
{ "Tesla K40m", { {"VW",4}, {"WGS",128}, {"WPT",1} } },
@ -95,7 +99,7 @@ const Database::DatabaseEntry Database::XaxpySingle = {
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW",4}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
}
},
}
@ -103,7 +107,7 @@ const Database::DatabaseEntry Database::XaxpySingle = {
// =================================================================================================
const Database::DatabaseEntry Database::XaxpyComplexSingle = {
const Database::DatabaseEntry XaxpyComplexSingle = {
"Xaxpy", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -112,6 +116,7 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = {
{ "Oland", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Pitcairn", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Tonga", { {"VW",1}, {"WGS",256}, {"WPT",8} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
}
},
@ -134,10 +139,11 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"VW",4}, {"WGS",64}, {"WPT",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",2}, {"WGS",512}, {"WPT",1} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",4}, {"WGS",64}, {"WPT",1} } },
{ "Iris", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "Iris Pro", { {"VW",1}, {"WGS",256}, {"WPT",8} } },
{ "default", { {"VW",1}, {"WGS",256}, {"WPT",2} } },
{ "default", { {"VW",4}, {"WGS",64}, {"WPT",1} } },
}
},
{ // Intel accelerators
@ -157,6 +163,7 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = {
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX TITAN Black", { {"VW",1}, {"WGS",128}, {"WPT",2} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
{ "Tesla K20m", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Tesla K40m", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
@ -173,7 +180,7 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::XaxpyDouble = {
const Database::DatabaseEntry XaxpyDouble = {
"Xaxpy", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -182,6 +189,7 @@ const Database::DatabaseEntry Database::XaxpyDouble = {
{ "Oland", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Tonga", { {"VW",1}, {"WGS",128}, {"WPT",4} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
}
},
@ -213,18 +221,19 @@ const Database::DatabaseEntry Database::XaxpyDouble = {
{ "GeForce GTX 670", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 750", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",256}, {"WPT",2} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
{ "GeForce GTX TITAN Black", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
{ "Tesla K20m", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "Tesla K40m", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",2} } },
}
},
}
@ -232,7 +241,7 @@ const Database::DatabaseEntry Database::XaxpyDouble = {
// =================================================================================================
const Database::DatabaseEntry Database::XaxpyComplexDouble = {
const Database::DatabaseEntry XaxpyComplexDouble = {
"Xaxpy", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -241,6 +250,7 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = {
{ "Oland", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Tahiti", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Tonga", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
}
},
@ -272,9 +282,10 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = {
{ "GeForce GTX 670", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 750", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",256}, {"WPT",2} } },
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",64}, {"WPT",2} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS",64}, {"WPT",4} } },
{ "GeForce GTX TITAN Black", { {"VW",1}, {"WGS",128}, {"WPT",4} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "Tesla K20m", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Tesla K40m", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
@ -290,4 +301,5 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = {
};
// =================================================================================================
} // namespace database
} // namespace clblast

View File

@ -12,20 +12,21 @@
// =================================================================================================
namespace clblast {
namespace database {
// =================================================================================================
const Database::DatabaseEntry Database::XdotHalf = {
const Database::DatabaseEntry XdotHalf = {
"Xdot", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",32} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",32}, {"WGS2",32} } },
{ "default", { {"WGS1",32}, {"WGS2",32} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",128}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",32} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",32}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",32} } },
}
},
}
@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::XdotHalf = {
// =================================================================================================
const Database::DatabaseEntry Database::XdotSingle = {
const Database::DatabaseEntry XdotSingle = {
"Xdot", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -41,6 +42,7 @@ const Database::DatabaseEntry Database::XdotSingle = {
{ "Oland", { {"WGS1",256}, {"WGS2",32} } },
{ "Pitcairn", { {"WGS1",128}, {"WGS2",32} } },
{ "Tahiti", { {"WGS1",128}, {"WGS2",32} } },
{ "Tonga", { {"WGS1",64}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",32} } },
}
},
@ -55,7 +57,8 @@ const Database::DatabaseEntry Database::XdotSingle = {
{ "Intel(R) HD Graphics 530", { {"WGS1",64}, {"WGS2",32} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",32} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",64}, {"WGS2",32} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WGS2",32} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"WGS1",512}, {"WGS2",128} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",32}, {"WGS2",32} } },
{ "Iris Pro", { {"WGS1",512}, {"WGS2",64} } },
{ "default", { {"WGS1",64}, {"WGS2",32} } },
}
@ -68,7 +71,9 @@ const Database::DatabaseEntry Database::XdotSingle = {
{ "GeForce GTX 670", { {"WGS1",512}, {"WGS2",1024} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",128} } },
{ "GeForce GTX 750", { {"WGS1",128}, {"WGS2",32} } },
{ "GeForce GTX 750 Ti", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX 980", { {"WGS1",256}, {"WGS2",32} } },
{ "GeForce GTX TITAN Black", { {"WGS1",512}, {"WGS2",64} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",1024}, {"WGS2",32} } },
{ "default", { {"WGS1",256}, {"WGS2",256} } },
@ -84,7 +89,7 @@ const Database::DatabaseEntry Database::XdotSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::XdotComplexSingle = {
const Database::DatabaseEntry XdotComplexSingle = {
"Xdot", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -92,7 +97,8 @@ const Database::DatabaseEntry Database::XdotComplexSingle = {
{ "Oland", { {"WGS1",128}, {"WGS2",32} } },
{ "Pitcairn", { {"WGS1",256}, {"WGS2",32} } },
{ "Tahiti", { {"WGS1",64}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",32} } },
{ "Tonga", { {"WGS1",256}, {"WGS2",64} } },
{ "default", { {"WGS1",256}, {"WGS2",64} } },
}
},
{ // Intel CPUs
@ -106,7 +112,8 @@ const Database::DatabaseEntry Database::XdotComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"WGS1",256}, {"WGS2",32} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",32} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",32}, {"WGS2",32} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",32}, {"WGS2",32} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"WGS1",512}, {"WGS2",32} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",32}, {"WGS2",256} } },
{ "Iris Pro", { {"WGS1",32}, {"WGS2",32} } },
{ "default", { {"WGS1",32}, {"WGS2",32} } },
}
@ -119,7 +126,9 @@ const Database::DatabaseEntry Database::XdotComplexSingle = {
{ "GeForce GTX 670", { {"WGS1",256}, {"WGS2",32} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } },
{ "GeForce GTX 750", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX 750 Ti", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX 980", { {"WGS1",256}, {"WGS2",64} } },
{ "GeForce GTX TITAN Black", { {"WGS1",128}, {"WGS2",64} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } },
{ "default", { {"WGS1",512}, {"WGS2",64} } },
@ -127,7 +136,7 @@ const Database::DatabaseEntry Database::XdotComplexSingle = {
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",256}, {"WGS2",32} } },
{ "default", { {"WGS1",256}, {"WGS2",64} } },
}
},
}
@ -135,7 +144,7 @@ const Database::DatabaseEntry Database::XdotComplexSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::XdotDouble = {
const Database::DatabaseEntry XdotDouble = {
"Xdot", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -143,7 +152,8 @@ const Database::DatabaseEntry Database::XdotDouble = {
{ "Oland", { {"WGS1",256}, {"WGS2",32} } },
{ "Pitcairn", { {"WGS1",128}, {"WGS2",32} } },
{ "Tahiti", { {"WGS1",256}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",32} } },
{ "Tonga", { {"WGS1",128}, {"WGS2",64} } },
{ "default", { {"WGS1",128}, {"WGS2",64} } },
}
},
{ // Intel CPUs
@ -160,10 +170,12 @@ const Database::DatabaseEntry Database::XdotDouble = {
{ "GeForce GTX 670", { {"WGS1",256}, {"WGS2",32} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } },
{ "GeForce GTX 750", { {"WGS1",64}, {"WGS2",256} } },
{ "GeForce GTX 750 Ti", { {"WGS1",128}, {"WGS2",64} } },
{ "GeForce GTX 980", { {"WGS1",128}, {"WGS2",32} } },
{ "GeForce GTX TITAN Black", { {"WGS1",128}, {"WGS2",64} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } },
{ "default", { {"WGS1",256}, {"WGS2",64} } },
{ "default", { {"WGS1",128}, {"WGS2",64} } },
}
},
{ // Default
@ -176,7 +188,7 @@ const Database::DatabaseEntry Database::XdotDouble = {
// =================================================================================================
const Database::DatabaseEntry Database::XdotComplexDouble = {
const Database::DatabaseEntry XdotComplexDouble = {
"Xdot", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -184,6 +196,7 @@ const Database::DatabaseEntry Database::XdotComplexDouble = {
{ "Oland", { {"WGS1",256}, {"WGS2",32} } },
{ "Pitcairn", { {"WGS1",256}, {"WGS2",32} } },
{ "Tahiti", { {"WGS1",256}, {"WGS2",32} } },
{ "Tonga", { {"WGS1",128}, {"WGS2",64} } },
{ "default", { {"WGS1",256}, {"WGS2",32} } },
}
},
@ -201,7 +214,9 @@ const Database::DatabaseEntry Database::XdotComplexDouble = {
{ "GeForce GTX 670", { {"WGS1",512}, {"WGS2",128} } },
{ "GeForce GTX 680", { {"WGS1",256}, {"WGS2",64} } },
{ "GeForce GTX 750", { {"WGS1",256}, {"WGS2",32} } },
{ "GeForce GTX 750 Ti", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX 980", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX TITAN Black", { {"WGS1",128}, {"WGS2",32} } },
{ "GeForce GTX TITAN X", { {"WGS1",128}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",128}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",64} } },
@ -216,4 +231,5 @@ const Database::DatabaseEntry Database::XdotComplexDouble = {
};
// =================================================================================================
} // namespace database
} // namespace clblast

View File

@ -12,13 +12,20 @@
// =================================================================================================
namespace clblast {
namespace database {
// =================================================================================================
const Database::DatabaseEntry Database::XgemmHalf = {
const Database::DatabaseEntry XgemmHalf = {
"Xgemm", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
}
},
}
@ -26,7 +33,7 @@ const Database::DatabaseEntry Database::XgemmHalf = {
// =================================================================================================
const Database::DatabaseEntry Database::XgemmSingle = {
const Database::DatabaseEntry XgemmSingle = {
"Xgemm", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -35,7 +42,8 @@ const Database::DatabaseEntry Database::XgemmSingle = {
{ "Oland", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tonga", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",8} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
}
},
{ // ARM GPUs
@ -57,10 +65,11 @@ const Database::DatabaseEntry Database::XgemmSingle = {
{ "Intel(R) HD Graphics 530", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",8} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
{ "Iris", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",1} } },
{ "Iris Pro", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Iris Pro", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
}
},
{ // Intel accelerators
@ -77,18 +86,19 @@ const Database::DatabaseEntry Database::XgemmSingle = {
{ "GeForce GTX 670", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
{ "GeForce GTX 750", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",2} } },
{ "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",4} } },
{ "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",1}, {"VWM",8}, {"VWN",2} } },
{ "GeForce GTX 980", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",8} } },
{ "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX TITAN Black", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } },
{ "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
}
},
}
@ -96,7 +106,7 @@ const Database::DatabaseEntry Database::XgemmSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::XgemmComplexSingle = {
const Database::DatabaseEntry XgemmComplexSingle = {
"Xgemm", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -105,7 +115,8 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
{ "Oland", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
{ "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } },
{ "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tonga", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",2} } },
}
},
{ // ARM GPUs
@ -127,10 +138,11 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",4}, {"VWN",1} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
{ "Iris", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Iris Pro", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
}
},
{ // Intel accelerators
@ -147,18 +159,19 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
{ "GeForce GTX 670", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX 680", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX 750", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "GeForce GTX 980", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
{ "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX TITAN Black", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
{ "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "Tesla K40m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
}
},
}
@ -166,7 +179,7 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::XgemmDouble = {
const Database::DatabaseEntry XgemmDouble = {
"Xgemm", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -175,7 +188,8 @@ const Database::DatabaseEntry Database::XgemmDouble = {
{ "Oland", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tonga", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
}
},
{ // ARM GPUs
@ -189,7 +203,7 @@ const Database::DatabaseEntry Database::XgemmDouble = {
{ "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
{ "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",8} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "default", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
}
},
{ // Intel accelerators
@ -206,18 +220,19 @@ const Database::DatabaseEntry Database::XgemmDouble = {
{ "GeForce GTX 670", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
{ "GeForce GTX 750", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
{ "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
{ "GeForce GTX 750 Ti", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
{ "GeForce GTX 980", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX TITAN Black", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",16}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K40m", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
}
},
}
@ -225,7 +240,7 @@ const Database::DatabaseEntry Database::XgemmDouble = {
// =================================================================================================
const Database::DatabaseEntry Database::XgemmComplexDouble = {
const Database::DatabaseEntry XgemmComplexDouble = {
"Xgemm", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -234,7 +249,8 @@ const Database::DatabaseEntry Database::XgemmComplexDouble = {
{ "Oland", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tonga", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
}
},
{ // ARM GPUs
@ -265,21 +281,23 @@ const Database::DatabaseEntry Database::XgemmComplexDouble = {
{ "GeForce GTX 670", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",2} } },
{ "GeForce GTX 680", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX 750", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "GeForce GTX 750 Ti", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX 980", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX TITAN Black", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX TITAN X", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
}
},
}
};
// =================================================================================================
} // namespace database
} // namespace clblast

View File

@ -0,0 +1,154 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Database generator <database.py>
//
// This file populates the database with best-found tuning parameters for the 'Xgemm_Direct' kernels.
//
// =================================================================================================
namespace clblast {
namespace database {
// =================================================================================================
// Best-found tuning parameters for the direct (single-kernel, no pre-copy) GEMM
// in half precision (fp16). Entries are grouped per device vendor; each device
// name maps to its tuned parameter set, and each vendor group carries a
// "default" entry used as fallback for unlisted devices of that vendor.
// NOTE(review): the semantics of the keys (WGD, MDIMAD/MDIMCD, NDIMBD/NDIMCD,
// KWID, PADA/PADB, VWMD/VWND) are defined by the XgemmDirect kernel/tuner and
// are not visible in this generated file — confirm against the tuner sources.
const Database::DatabaseEntry XgemmDirectHalf = {
"XgemmDirect", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",8} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",8} } },
}
},
{ // Default: fallback for any vendor/device not matched above
kDeviceTypeAll, "default", {
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",8} } },
}
},
}
};
// =================================================================================================
// Best-found tuning parameters for the direct (single-kernel, no pre-copy) GEMM
// in single precision (fp32). Grouped per vendor, with per-vendor and global
// "default" fallback entries for devices/vendors not explicitly listed.
// NOTE(review): key semantics (WGD, MDIMAD/MDIMCD, NDIMBD/NDIMCD, KWID,
// PADA/PADB, VWMD/VWND) come from the XgemmDirect kernel/tuner, not visible
// here — confirm against the tuner sources before relying on them.
const Database::DatabaseEntry XgemmDirectSingle = {
"XgemmDirect", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
{ "Tonga", { {"KWID",16}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",32}, {"NDIMCD",8}, {"PADA",0}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",8} } },
{ "Iris Pro", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",8} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
{ "GeForce GTX TITAN Black", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
}
},
{ // Default: fallback for any vendor/device not matched above
kDeviceTypeAll, "default", {
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
}
},
}
};
// =================================================================================================
// Best-found tuning parameters for the direct (single-kernel, no pre-copy) GEMM
// in complex single precision. Grouped per vendor, with per-vendor and global
// "default" fallback entries for devices/vendors not explicitly listed.
// NOTE(review): key semantics are defined by the XgemmDirect kernel/tuner and
// are not visible in this generated file.
const Database::DatabaseEntry XgemmDirectComplexSingle = {
"XgemmDirect", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "Tonga", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
}
},
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "Iris Pro", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 750 Ti", { {"KWID",16}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, {"WGD",16} } },
{ "GeForce GTX TITAN Black", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } },
}
},
{ // Default: fallback for any vendor/device not matched above
kDeviceTypeAll, "default", {
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
}
};
// =================================================================================================
// Best-found tuning parameters for the direct (single-kernel, no pre-copy) GEMM
// in double precision (fp64). Grouped per vendor, with per-vendor and global
// "default" fallback entries. Note: no Intel GPU group here, unlike the other
// precisions — fp64 results fall through to the kDeviceTypeAll default on Intel.
// NOTE(review): key semantics are defined by the XgemmDirect kernel/tuner and
// are not visible in this generated file.
const Database::DatabaseEntry XgemmDirectDouble = {
"XgemmDirect", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "Tonga", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } },
{ "GeForce GTX TITAN Black", { {"KWID",8}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",0}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } },
}
},
{ // Default: fallback for any vendor/device not matched above
kDeviceTypeAll, "default", {
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
}
};
// =================================================================================================
// Best-found tuning parameters for the direct (single-kernel, no pre-copy) GEMM
// in complex double precision. Grouped per vendor, with per-vendor and global
// "default" fallback entries. As with the fp64 table, there is no Intel GPU
// group for this precision.
// NOTE(review): key semantics are defined by the XgemmDirect kernel/tuner and
// are not visible in this generated file.
const Database::DatabaseEntry XgemmDirectComplexDouble = {
"XgemmDirect", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "Tonga", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",32}, {"MDIMCD",32}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
{ "GeForce GTX TITAN Black", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",8} } },
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
{ // Default: fallback for any vendor/device not matched above
kDeviceTypeAll, "default", {
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
}
};
// =================================================================================================
} // namespace database
} // namespace clblast

View File

@ -12,14 +12,15 @@
// =================================================================================================
namespace clblast {
namespace database {
// =================================================================================================
const Database::DatabaseEntry Database::XgemvHalf = {
const Database::DatabaseEntry XgemvHalf = {
"Xgemv", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",64}, {"WPT1",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",128}, {"WPT1",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",256}, {"WPT1",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
}
},
@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::XgemvHalf = {
// =================================================================================================
const Database::DatabaseEntry Database::XgemvSingle = {
const Database::DatabaseEntry XgemvSingle = {
"Xgemv", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -42,6 +43,7 @@ const Database::DatabaseEntry Database::XgemvSingle = {
{ "Oland", { {"WGS1",128}, {"WPT1",1} } },
{ "Pitcairn", { {"WGS1",256}, {"WPT1",1} } },
{ "Tahiti", { {"WGS1",256}, {"WPT1",1} } },
{ "Tonga", { {"WGS1",128}, {"WPT1",2} } },
{ "default", { {"WGS1",128}, {"WPT1",1} } },
}
},
@ -57,10 +59,11 @@ const Database::DatabaseEntry Database::XgemvSingle = {
{ "Intel(R) HD Graphics 530", { {"WGS1",256}, {"WPT1",1} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",64}, {"WPT1",1} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",64}, {"WPT1",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WPT1",1} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"WGS1",256}, {"WPT1",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",32}, {"WPT1",1} } },
{ "Iris", { {"WGS1",64}, {"WPT1",2} } },
{ "Iris Pro", { {"WGS1",256}, {"WPT1",2} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
{ "default", { {"WGS1",128}, {"WPT1",1} } },
}
},
{ // Intel accelerators
@ -77,9 +80,10 @@ const Database::DatabaseEntry Database::XgemvSingle = {
{ "GeForce GTX 670", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 680", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX 750", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX 750 Ti", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX 750 Ti", { {"WGS1",32}, {"WPT1",1} } },
{ "GeForce GTX 980", { {"WGS1",128}, {"WPT1",1} } },
{ "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX TITAN Black", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WPT1",1} } },
{ "Tesla K20m", { {"WGS1",128}, {"WPT1",1} } },
{ "Tesla K40m", { {"WGS1",256}, {"WPT1",1} } },
@ -88,7 +92,7 @@ const Database::DatabaseEntry Database::XgemvSingle = {
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",64}, {"WPT1",1} } },
{ "default", { {"WGS1",128}, {"WPT1",1} } },
}
},
}
@ -96,7 +100,7 @@ const Database::DatabaseEntry Database::XgemvSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::XgemvComplexSingle = {
const Database::DatabaseEntry XgemvComplexSingle = {
"Xgemv", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -105,6 +109,7 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = {
{ "Oland", { {"WGS1",64}, {"WPT1",1} } },
{ "Pitcairn", { {"WGS1",64}, {"WPT1",1} } },
{ "Tahiti", { {"WGS1",64}, {"WPT1",1} } },
{ "Tonga", { {"WGS1",32}, {"WPT1",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
}
},
@ -120,6 +125,7 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"WGS1",64}, {"WPT1",1} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",64}, {"WPT1",1} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WPT1",1} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"WGS1",256}, {"WPT1",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WPT1",1} } },
{ "Iris", { {"WGS1",256}, {"WPT1",1} } },
{ "Iris Pro", { {"WGS1",64}, {"WPT1",1} } },
@ -140,8 +146,9 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = {
{ "GeForce GTX 670", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 680", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 750", { {"WGS1",128}, {"WPT1",1} } },
{ "GeForce GTX 750 Ti", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 750 Ti", { {"WGS1",32}, {"WPT1",1} } },
{ "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX TITAN Black", { {"WGS1",32}, {"WPT1",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
}
},
@ -155,7 +162,7 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::XgemvDouble = {
const Database::DatabaseEntry XgemvDouble = {
"Xgemv", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -164,6 +171,7 @@ const Database::DatabaseEntry Database::XgemvDouble = {
{ "Oland", { {"WGS1",256}, {"WPT1",1} } },
{ "Pitcairn", { {"WGS1",256}, {"WPT1",1} } },
{ "Tahiti", { {"WGS1",256}, {"WPT1",1} } },
{ "Tonga", { {"WGS1",32}, {"WPT1",1} } },
{ "default", { {"WGS1",256}, {"WPT1",1} } },
}
},
@ -188,9 +196,10 @@ const Database::DatabaseEntry Database::XgemvDouble = {
{ "GeForce GTX 670", { {"WGS1",128}, {"WPT1",1} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WPT1",1} } },
{ "GeForce GTX 750", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 750 Ti", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX 750 Ti", { {"WGS1",32}, {"WPT1",1} } },
{ "GeForce GTX 980", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX TITAN Black", { {"WGS1",32}, {"WPT1",1} } },
{ "GeForce GTX TITAN X", { {"WGS1",64}, {"WPT1",1} } },
{ "Tesla K20m", { {"WGS1",256}, {"WPT1",1} } },
{ "Tesla K40m", { {"WGS1",256}, {"WPT1",1} } },
@ -207,7 +216,7 @@ const Database::DatabaseEntry Database::XgemvDouble = {
// =================================================================================================
const Database::DatabaseEntry Database::XgemvComplexDouble = {
const Database::DatabaseEntry XgemvComplexDouble = {
"Xgemv", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -216,6 +225,7 @@ const Database::DatabaseEntry Database::XgemvComplexDouble = {
{ "Oland", { {"WGS1",256}, {"WPT1",1} } },
{ "Pitcairn", { {"WGS1",256}, {"WPT1",1} } },
{ "Tahiti", { {"WGS1",256}, {"WPT1",1} } },
{ "Tonga", { {"WGS1",64}, {"WPT1",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
}
},
@ -249,4 +259,5 @@ const Database::DatabaseEntry Database::XgemvComplexDouble = {
};
// =================================================================================================
} // namespace database
} // namespace clblast

View File

@ -12,14 +12,15 @@
// =================================================================================================
namespace clblast {
namespace database {
// =================================================================================================
const Database::DatabaseEntry Database::XgemvFastHalf = {
const Database::DatabaseEntry XgemvFastHalf = {
"XgemvFast", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW2",1}, {"WGS2",16}, {"WPT2",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",4}, {"WGS2",64}, {"WPT2",4} } },
{ "default", { {"VW2",1}, {"WGS2",16}, {"WPT2",1} } },
}
},
@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::XgemvFastHalf = {
// =================================================================================================
const Database::DatabaseEntry Database::XgemvFastSingle = {
const Database::DatabaseEntry XgemvFastSingle = {
"XgemvFast", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -42,6 +43,7 @@ const Database::DatabaseEntry Database::XgemvFastSingle = {
{ "Oland", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tahiti", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tonga", { {"VW2",1}, {"WGS2",16}, {"WPT2",4} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
@ -57,10 +59,11 @@ const Database::DatabaseEntry Database::XgemvFastSingle = {
{ "Intel(R) HD Graphics 530", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"VW2",1}, {"WGS2",64}, {"WPT2",2} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } },
{ "Iris", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } },
{ "Iris Pro", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } },
{ "default", { {"VW2",2}, {"WGS2",64}, {"WPT2",2} } },
{ "default", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } },
}
},
{ // Intel accelerators
@ -77,9 +80,10 @@ const Database::DatabaseEntry Database::XgemvFastSingle = {
{ "GeForce GTX 670", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } },
{ "GeForce GTX 680", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "GeForce GTX 750", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX 750 Ti", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX 750 Ti", { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } },
{ "GeForce GTX 980", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX TITAN", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX TITAN Black", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX TITAN X", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tesla K20m", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "Tesla K40m", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
@ -96,7 +100,7 @@ const Database::DatabaseEntry Database::XgemvFastSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::XgemvFastComplexSingle = {
const Database::DatabaseEntry XgemvFastComplexSingle = {
"XgemvFast", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -105,6 +109,7 @@ const Database::DatabaseEntry Database::XgemvFastComplexSingle = {
{ "Oland", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tahiti", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "Tonga", { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
@ -120,7 +125,8 @@ const Database::DatabaseEntry Database::XgemvFastComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW2",1}, {"WGS2",32}, {"WPT2",2} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"VW2",1}, {"WGS2",32}, {"WPT2",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",1}, {"WGS2",32}, {"WPT2",1} } },
{ "Iris", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Iris Pro", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
@ -139,7 +145,6 @@ const Database::DatabaseEntry Database::XgemvFastComplexSingle = {
{ "GeForce GTX 480", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "GeForce GTX 670", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "GeForce GTX 680", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "GeForce GTX 750 Ti", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
@ -153,7 +158,7 @@ const Database::DatabaseEntry Database::XgemvFastComplexSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::XgemvFastDouble = {
const Database::DatabaseEntry XgemvFastDouble = {
"XgemvFast", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -162,6 +167,7 @@ const Database::DatabaseEntry Database::XgemvFastDouble = {
{ "Oland", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tahiti", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tonga", { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
@ -186,9 +192,10 @@ const Database::DatabaseEntry Database::XgemvFastDouble = {
{ "GeForce GTX 670", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "GeForce GTX 680", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "GeForce GTX 750", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } },
{ "GeForce GTX 750 Ti", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX 750 Ti", { {"VW2",1}, {"WGS2",32}, {"WPT2",2} } },
{ "GeForce GTX 980", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "GeForce GTX TITAN", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX TITAN Black", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX TITAN X", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "Tesla K20m", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "Tesla K40m", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
@ -205,7 +212,7 @@ const Database::DatabaseEntry Database::XgemvFastDouble = {
// =================================================================================================
const Database::DatabaseEntry Database::XgemvFastComplexDouble = {
const Database::DatabaseEntry XgemvFastComplexDouble = {
"XgemvFast", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -214,6 +221,7 @@ const Database::DatabaseEntry Database::XgemvFastComplexDouble = {
{ "Oland", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tahiti", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tonga", { {"VW2",1}, {"WGS2",32}, {"WPT2",1} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
@ -247,4 +255,5 @@ const Database::DatabaseEntry Database::XgemvFastComplexDouble = {
};
// =================================================================================================
} // namespace database
} // namespace clblast

View File

@ -12,13 +12,20 @@
// =================================================================================================
namespace clblast {
namespace database {
// =================================================================================================
const Database::DatabaseEntry Database::XgemvFastRotHalf = {
const Database::DatabaseEntry XgemvFastRotHalf = {
"XgemvFastRot", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",8}, {"WGS3",128}, {"WPT3",32} } },
{ "default", { {"VW3",8}, {"WGS3",128}, {"WPT3",32} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
{ "default", { {"VW3",8}, {"WGS3",128}, {"WPT3",32} } },
}
},
}
@ -26,12 +33,13 @@ const Database::DatabaseEntry Database::XgemvFastRotHalf = {
// =================================================================================================
const Database::DatabaseEntry Database::XgemvFastRotSingle = {
const Database::DatabaseEntry XgemvFastRotSingle = {
"XgemvFastRot", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } },
{ "default", { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } },
{ "Tonga", { {"VW3",8}, {"WGS3",128}, {"WPT3",32} } },
{ "default", { {"VW3",8}, {"WGS3",128}, {"WPT3",32} } },
}
},
{ // Intel CPUs
@ -44,20 +52,23 @@ const Database::DatabaseEntry Database::XgemvFastRotSingle = {
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW3",4}, {"WGS3",64}, {"WPT3",16} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",4}, {"WGS3",128}, {"WPT3",16} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"VW3",2}, {"WGS3",32}, {"WPT3",16} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",4}, {"WGS3",64}, {"WPT3",16} } },
{ "Iris Pro", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
{ "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
{ "default", { {"VW3",4}, {"WGS3",64}, {"WPT3",16} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 750 Ti", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
{ "GeForce GTX TITAN", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
{ "default", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
{ "GeForce GTX TITAN Black", { {"VW3",4}, {"WGS3",128}, {"WPT3",16} } },
{ "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
{ "default", { {"VW3",8}, {"WGS3",128}, {"WPT3",32} } },
}
},
}
@ -65,12 +76,13 @@ const Database::DatabaseEntry Database::XgemvFastRotSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::XgemvFastRotComplexSingle = {
const Database::DatabaseEntry XgemvFastRotComplexSingle = {
"XgemvFastRot", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } },
{ "default", { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } },
{ "Tonga", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
{ "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
}
},
{ // Intel CPUs
@ -83,14 +95,15 @@ const Database::DatabaseEntry Database::XgemvFastRotComplexSingle = {
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW3",2}, {"WGS3",16}, {"WPT3",16} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW3",4}, {"WGS3",128}, {"WPT3",8} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",2}, {"WGS3",32}, {"WPT3",16} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"VW3",4}, {"WGS3",32}, {"WPT3",8} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",4}, {"WGS3",64}, {"WPT3",16} } },
{ "Iris Pro", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
{ "default", { {"VW3",2}, {"WGS3",32}, {"WPT3",8} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW3",2}, {"WGS3",32}, {"WPT3",16} } },
{ "default", { {"VW3",4}, {"WGS3",64}, {"WPT3",16} } },
}
},
}
@ -98,11 +111,12 @@ const Database::DatabaseEntry Database::XgemvFastRotComplexSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::XgemvFastRotDouble = {
const Database::DatabaseEntry XgemvFastRotDouble = {
"XgemvFastRot", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
{ "Tonga", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
{ "default", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
}
},
@ -114,8 +128,10 @@ const Database::DatabaseEntry Database::XgemvFastRotDouble = {
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 750 Ti", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
{ "GeForce GTX TITAN", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
{ "default", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
{ "GeForce GTX TITAN Black", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
{ "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
}
},
{ // Default
@ -128,12 +144,13 @@ const Database::DatabaseEntry Database::XgemvFastRotDouble = {
// =================================================================================================
const Database::DatabaseEntry Database::XgemvFastRotComplexDouble = {
const Database::DatabaseEntry XgemvFastRotComplexDouble = {
"XgemvFastRot", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
{ "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
{ "Tonga", { {"VW3",4}, {"WGS3",16}, {"WPT3",8} } },
{ "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",16} } },
}
},
{ // Intel CPUs
@ -151,4 +168,5 @@ const Database::DatabaseEntry Database::XgemvFastRotComplexDouble = {
};
// =================================================================================================
} // namespace database
} // namespace clblast

View File

@ -12,20 +12,21 @@
// =================================================================================================
namespace clblast {
namespace database {
// =================================================================================================
const Database::DatabaseEntry Database::XgerHalf = {
const Database::DatabaseEntry XgerHalf = {
"Xger", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } },
{ "default", { {"WGS1",4}, {"WGS2",8}, {"WPT",2} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "default", { {"WGS1",4}, {"WGS2",8}, {"WPT",2} } },
}
},
}
@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::XgerHalf = {
// =================================================================================================
const Database::DatabaseEntry Database::XgerSingle = {
const Database::DatabaseEntry XgerSingle = {
"Xger", Precision::kSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -42,7 +43,8 @@ const Database::DatabaseEntry Database::XgerSingle = {
{ "Oland", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
{ "default", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
{ "Tonga", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
{ "default", { {"WGS1",32}, {"WGS2",8}, {"WPT",1} } },
}
},
{ // ARM GPUs
@ -63,7 +65,8 @@ const Database::DatabaseEntry Database::XgerSingle = {
{ "Intel(R) HD Graphics 530", { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",8}, {"WGS2",8}, {"WPT",4} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",32}, {"WGS2",4}, {"WPT",4} } },
{ "Iris Pro", { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } },
{ "default", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
}
@ -76,8 +79,10 @@ const Database::DatabaseEntry Database::XgerSingle = {
{ "GeForce GTX 670", { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } },
{ "GeForce GTX 750", { {"WGS1",64}, {"WGS2",16}, {"WPT",4} } },
{ "GeForce GTX 750 Ti", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } },
{ "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } },
{ "GeForce GTX TITAN Black", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "default", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
}
},
{ // Default
@ -90,7 +95,7 @@ const Database::DatabaseEntry Database::XgerSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::XgerComplexSingle = {
const Database::DatabaseEntry XgerComplexSingle = {
"Xger", Precision::kComplexSingle, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -99,7 +104,8 @@ const Database::DatabaseEntry Database::XgerComplexSingle = {
{ "Oland", { {"WGS1",4}, {"WGS2",8}, {"WPT",1} } },
{ "Pitcairn", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
{ "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
{ "Tonga", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
}
},
{ // ARM GPUs
@ -120,9 +126,10 @@ const Database::DatabaseEntry Database::XgerComplexSingle = {
{ "Intel(R) HD Graphics 530", { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } },
{ "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } },
{ "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",512}, {"WGS2",1}, {"WPT",1} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } },
{ "Intel(R) HD Graphics IvyBridge M GT2", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",16}, {"WGS2",1}, {"WPT",1} } },
{ "Iris Pro", { {"WGS1",16}, {"WGS2",2}, {"WPT",4} } },
{ "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } },
{ "default", { {"WGS1",128}, {"WGS2",2}, {"WPT",2} } },
}
},
{ // NVIDIA GPUs
@ -133,13 +140,15 @@ const Database::DatabaseEntry Database::XgerComplexSingle = {
{ "GeForce GTX 670", { {"WGS1",16}, {"WGS2",32}, {"WPT",2} } },
{ "GeForce GTX 680", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "GeForce GTX 750", { {"WGS1",32}, {"WGS2",16}, {"WPT",4} } },
{ "GeForce GTX 750 Ti", { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } },
{ "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
{ "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
{ "GeForce GTX TITAN Black", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
{ "default", { {"WGS1",128}, {"WGS2",2}, {"WPT",2} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",64}, {"WGS2",4}, {"WPT",2} } },
{ "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
}
},
}
@ -147,7 +156,7 @@ const Database::DatabaseEntry Database::XgerComplexSingle = {
// =================================================================================================
const Database::DatabaseEntry Database::XgerDouble = {
const Database::DatabaseEntry XgerDouble = {
"Xger", Precision::kDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -156,7 +165,8 @@ const Database::DatabaseEntry Database::XgerDouble = {
{ "Oland", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
{ "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
{ "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
{ "Tonga", { {"WGS1",8}, {"WGS2",16}, {"WPT",2} } },
{ "default", { {"WGS1",32}, {"WGS2",8}, {"WPT",1} } },
}
},
{ // ARM GPUs
@ -180,8 +190,10 @@ const Database::DatabaseEntry Database::XgerDouble = {
{ "GeForce GTX 670", { {"WGS1",32}, {"WGS2",32}, {"WPT",2} } },
{ "GeForce GTX 680", { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } },
{ "GeForce GTX 750", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } },
{ "GeForce GTX 750 Ti", { {"WGS1",32}, {"WGS2",16}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
{ "default", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } },
{ "GeForce GTX TITAN Black", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
}
},
{ // Default
@ -194,7 +206,7 @@ const Database::DatabaseEntry Database::XgerDouble = {
// =================================================================================================
const Database::DatabaseEntry Database::XgerComplexDouble = {
const Database::DatabaseEntry XgerComplexDouble = {
"Xger", Precision::kComplexDouble, {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
@ -203,6 +215,7 @@ const Database::DatabaseEntry Database::XgerComplexDouble = {
{ "Oland", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
{ "Pitcairn", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
{ "Tonga", { {"WGS1",16}, {"WGS2",4}, {"WPT",1} } },
{ "default", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
}
},
@ -227,7 +240,9 @@ const Database::DatabaseEntry Database::XgerComplexDouble = {
{ "GeForce GTX 670", { {"WGS1",8}, {"WGS2",16}, {"WPT",2} } },
{ "GeForce GTX 680", { {"WGS1",8}, {"WGS2",16}, {"WPT",1} } },
{ "GeForce GTX 750", { {"WGS1",8}, {"WGS2",32}, {"WPT",4} } },
{ "GeForce GTX 750 Ti", { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } },
{ "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "GeForce GTX TITAN Black", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
{ "default", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
}
},
@ -240,4 +255,5 @@ const Database::DatabaseEntry Database::XgerComplexDouble = {
};
// =================================================================================================
} // namespace database
} // namespace clblast

View File

@ -204,7 +204,7 @@ R"(
#if PRECISION == 3232 || PRECISION == 6464
#define COMPLEX_CONJUGATE(value) value.x = value.x; value.y = -value.y
#else
#define COMPLEX_CONJUGATE(value) value = value
#define COMPLEX_CONJUGATE(value)
#endif
// =================================================================================================

View File

@ -0,0 +1,273 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This is a generic GEMM kernel that works for all sizes and configurations: it doesn't require any
// pre- and post-processing kernels.
//
// This kernel is separated into three files. This is part 1 out of 3.
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
// this kernel file is used outside of the CLBlast library. Note that all parameters here have a
// suffix 'D' to denote that they are for the 'direct' version of the GEMM kernel.
#ifndef WGD
#define WGD 8 // Tile-size in dimension M, N, and K (e.g. 8, 16, 32, 64)
#endif
#ifndef MDIMCD
#define MDIMCD 8 // Threads per workgroup in M-dimension (e.g. 8, 16, 32)
#endif
#ifndef NDIMCD
#define NDIMCD 8 // Threads per workgroup in N-dimension (e.g. 8, 16, 32)
#endif
#ifndef MDIMAD
#define MDIMAD 8 // Re-shaped tile dimension of matrix A: KDIMAD * MDIMAD
#endif
#ifndef NDIMBD
#define NDIMBD 8 // Re-shaped tile dimension of matrix B: KDIMBD * NDIMBD
#endif
#ifndef KWID
#define KWID 1 // Unroll factor of the WGD loop (smaller or equal than WGD)
#endif
#ifndef VWMD
#define VWMD 1 // Vector width of matrices A and C
#endif
#ifndef VWND
#define VWND 1 // Vector width of matrix B
#endif
#ifndef PADA
#define PADA 1 // Local memory padding for matrix A
#endif
#ifndef PADB
#define PADB 1 // Local memory padding for matrix B
#endif
// Helper parameters based on the above tuning parameters
#define MWID (WGD/MDIMCD) // Work per work-item (M-dimension)
#define NWID (WGD/NDIMCD) // Work per work-item (N-dimension)
#define KDIMAD ((MDIMCD*NDIMCD)/(MDIMAD)) // Re-shaped tile dimension of matrix A: KDIMAD * MDIMAD
#define KDIMBD ((MDIMCD*NDIMCD)/(NDIMBD)) // Re-shaped tile dimension of matrix B: KDIMBD * NDIMBD
#define MWAD (WGD/MDIMAD) // Amount of loads-per-thread for matrix A (M-dimension)
#define KWAD (WGD/KDIMAD) // Amount of loads-per-thread for matrix A (K-dimension)
#define KWBD (WGD/KDIMBD) // Amount of loads-per-thread for matrix B (K-dimension)
#define NWBD (WGD/NDIMBD) // Amount of loads-per-thread for matrix B (N-dimension)
// =================================================================================================
// Data-widths in dimension M
#if VWMD == 1
typedef real realMD;
#elif VWMD == 2
typedef real2 realMD;
#elif VWMD == 4
typedef real4 realMD;
#elif VWMD == 8
typedef real8 realMD;
#elif VWMD == 16
typedef real16 realMD;
#endif
// Data-widths in dimension N
#if VWND == 1
typedef real realND;
#elif VWND == 2
typedef real2 realND;
#elif VWND == 4
typedef real4 realND;
#elif VWND == 8
typedef real8 realND;
#elif VWND == 16
typedef real16 realND;
#endif
// =================================================================================================
// Zero-initializes the per-thread accumulation register tile (NWID x MWID values)
// before the K-loop of the direct GEMM kernel starts accumulating into it.
inline void InitAccRegistersDirect(real cpm[NWID][MWID]) {
  #pragma unroll
  for (int _ni = 0; _ni < NWID; _ni += 1) {
    #pragma unroll
    for (int _mi = 0; _mi < MWID; _mi += 1) {
      SetToZero(cpm[_ni][_mi]);
    }
  }
}
// =================================================================================================
// Core multiply-accumulate stage of the direct GEMM kernel: performs the rank-1
// update Cpm += Apm * Bpm over the full NWID x MWID register tile. Each element
// cpm[ni][mi] receives exactly one fused multiply-add per invocation.
inline void MultiplyAccumulateDirect(real cpm[NWID][MWID], real apm[MWID], real bpm[NWID]) {
  #pragma unroll
  for (int _mi = 0; _mi < MWID; _mi += 1) {
    #pragma unroll
    for (int _ni = 0; _ni < NWID; _ni += 1) {
      MultiplyAdd(cpm[_ni][_mi], apm[_mi], bpm[_ni]);
    }
  }
}
// =================================================================================================
// Loads global off-chip memory into thread-private register files. This function is specific for
// loading the A input matrix.
// - agms: matrix A in global memory; apm: destination registers (MWID values per work-item)
// - a_ld / a_offset: leading dimension and starting offset of A
// - idm / idk: base M-index and K-index of the values this work-item loads
// - a_transpose selects between the two index orders; a_conjugate conjugates complex data
inline void GlobalToPrivateDirectA(const __global real* restrict agms, real apm[MWID],
const int a_ld, const int a_offset, const int idm, const int idk,
const int a_transpose, const int a_conjugate) {
#pragma unroll
for (int mi=0; mi<MWID; ++mi) {
// Transposed: element (idm+mi, idk); non-transposed: element (idk, idm+mi)
const int a_index = (a_transpose) ? (idm + mi)*a_ld + idk : idk*a_ld + (idm + mi);
apm[mi] = agms[a_index + a_offset];
// COMPLEX_CONJUGATE expands to a no-op for non-complex precisions
if (a_conjugate) { COMPLEX_CONJUGATE(apm[mi]); }
}
}
// Same as above, but now for the B input matrix: loads NWID values into the 'bpm' registers,
// indexed by the base N-index 'idn' and K-index 'idk', with optional transpose/conjugation.
inline void GlobalToPrivateDirectB(const __global real* restrict bgms, real bpm[NWID],
const int b_ld, const int b_offset, const int idn, const int idk,
const int b_transpose, const int b_conjugate) {
#pragma unroll
for (int ni=0; ni<NWID; ++ni) {
// Transposed: element (idn+ni, idk); non-transposed: element (idk, idn+ni)
const int b_index = (b_transpose) ? (idn + ni)*b_ld + idk : idk*b_ld + (idn + ni);
bpm[ni] = bgms[b_index + b_offset];
// COMPLEX_CONJUGATE expands to a no-op for non-complex precisions
if (b_conjugate) { COMPLEX_CONJUGATE(bpm[ni]); }
}
}
// Loads global off-chip memory into thread-private register files. This function is specific for
// loading the A input matrix. This is the same as above but now includes a bounds check:
// elements whose M-index (idm + mi) is at or beyond kSizeM are not read from global memory;
// their registers are set to zero instead, so out-of-range positions contribute nothing.
inline void GlobalToPrivateCheckedA(const __global real* restrict agms, real apm[MWID],
const int a_ld, const int a_offset, const int idm, const int idk,
const int a_transpose, const int a_conjugate,
const int kSizeM) {
#pragma unroll
for (int mi=0; mi<MWID; ++mi) {
if (idm + mi < kSizeM) {
// Transposed: element (idm+mi, idk); non-transposed: element (idk, idm+mi)
const int a_index = (a_transpose) ? (idm + mi)*a_ld + idk : idk*a_ld + (idm + mi);
apm[mi] = agms[a_index + a_offset];
if (a_conjugate) { COMPLEX_CONJUGATE(apm[mi]); }
}
else {
// Out of bounds in the M-dimension: pad with zero
SetToZero(apm[mi]);
}
}
}
// Same as above, but now for the B input matrix: bounds-checked against kSizeN in the
// N-dimension; out-of-range registers are zeroed instead of being read from global memory.
inline void GlobalToPrivateCheckedB(const __global real* restrict bgms, real bpm[NWID],
const int b_ld, const int b_offset, const int idn, const int idk,
const int b_transpose, const int b_conjugate,
const int kSizeN) {
#pragma unroll
for (int ni=0; ni<NWID; ++ni) {
if (idn + ni < kSizeN) {
// Transposed: element (idn+ni, idk); non-transposed: element (idk, idn+ni)
const int b_index = (b_transpose) ? (idn + ni)*b_ld + idk : idk*b_ld + (idn + ni);
bpm[ni] = bgms[b_index + b_offset];
if (b_conjugate) { COMPLEX_CONJUGATE(bpm[ni]); }
}
else {
// Out of bounds in the N-dimension: pad with zero
SetToZero(bpm[ni]);
}
}
}
// =================================================================================================
// Copies one k-slice of the cached A tile from local (shared) memory into this thread's
// registers. The local tile rows are padded by PADA to give a stride of (WGD + PADA).
inline void LocalToPrivateDirectA(__local real* alm, real apm[MWID], const int kg,
                                  const int a_transpose) {
  const int mg_base = get_local_id(0)*MWID;  // first M-element owned by this thread
  #pragma unroll
  for (int m_it = 0; m_it < MWID; m_it += 1) {
    const int mg = mg_base + m_it;
    if (a_transpose) { apm[m_it] = alm[mg*(WGD + PADA) + kg]; }
    else             { apm[m_it] = alm[kg*(WGD + PADA) + mg]; }
  }
}
// Copies one k-slice of the cached B tile from local (shared) memory into this thread's
// registers. The local tile rows are padded by PADB to give a stride of (WGD + PADB).
inline void LocalToPrivateDirectB(__local real* blm, real bpm[NWID], const int kg,
                                  const int b_transpose) {
  const int ng_base = get_local_id(1)*NWID;  // first N-element owned by this thread
  #pragma unroll
  for (int n_it = 0; n_it < NWID; n_it += 1) {
    const int ng = ng_base + n_it;
    if (b_transpose) { bpm[n_it] = blm[ng*(WGD + PADB) + kg]; }
    else             { bpm[n_it] = blm[kg*(WGD + PADB) + ng]; }
  }
}
// =================================================================================================
// Writes the per-thread accumulators 'cpm' back to the global C matrix, applying the scaling
// constants: C = alpha*accumulator + beta*C. When beta is zero, C is never read, only written.
// No bounds checking: only safe for complete (non-border) output tiles.
inline void StoreResultsDirect(__global real* cgm, real cpm[NWID][MWID],
                               const int idm, const int idn,
                               const real alpha, const real beta,
                               const int c_ld, const int c_offset, const int c_transpose) {
  #pragma unroll
  for (int n_it = 0; n_it < NWID; n_it += 1) {
    #pragma unroll
    for (int m_it = 0; m_it < MWID; m_it += 1) {
      const int row = idm + m_it;
      const int col = idn + n_it;
      // Destination address, with the buffer offset folded in
      const int dst = c_offset + ((c_transpose) ? row*c_ld + col : col*c_ld + row);
      real value;
      if (IsZero(beta)) {
        Multiply(value, alpha, cpm[n_it][m_it]);  // beta == 0: skip reading C entirely
      }
      else {
        AXPBY(value, alpha, cpm[n_it][m_it], beta, cgm[dst]);
      }
      cgm[dst] = value;
    }
  }
}
// Bounds-checked variant of StoreResultsDirect: writes the per-thread accumulators back to C
// with alpha/beta scaling, skipping every element outside the kSizeM x kSizeN matrix. Used by
// the border workgroups that cover an incomplete output tile.
inline void StoreResultsChecked(__global real* cgm, real cpm[NWID][MWID],
                                const int idm, const int idn, const int kSizeM, const int kSizeN,
                                const real alpha, const real beta,
                                const int c_ld, const int c_offset, const int c_transpose) {
  #pragma unroll
  for (int n_it = 0; n_it < NWID; n_it += 1) {
    #pragma unroll
    for (int m_it = 0; m_it < MWID; m_it += 1) {
      const int row = idm + m_it;
      const int col = idn + n_it;
      if (row < kSizeM && col < kSizeN) {
        // Destination address, with the buffer offset folded in
        const int dst = c_offset + ((c_transpose) ? row*c_ld + col : col*c_ld + row);
        real value;
        if (IsZero(beta)) {
          Multiply(value, alpha, cpm[n_it][m_it]);  // beta == 0: skip reading C entirely
        }
        else {
          AXPBY(value, alpha, cpm[n_it][m_it], beta, cgm[dst]);
        }
        cgm[dst] = value;
      }
    }
  }
}
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View File

@ -0,0 +1,314 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This is part 2 of 3 of the GEMM kernel. See part 1 for more information.
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// =================================================================================================
// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
// caching the A input matrix. It loads one vector of VWMD elements (type realMD) at a time and
// therefore requires a_ld to be a multiple of VWMD; the caller falls back to GlobalToLocalScalarA
// otherwise. No bounds checking is performed.
inline void GlobalToLocalDirectA(const __global realMD* restrict agm, __local real* alm,
                                 const int a_ld, const int a_offset, const int kwg,
                                 const int a_transpose, const int a_conjugate) {
  // Determines this thread's position in the A-loading configuration (MDIMAD threads wide). If the
  // compute configuration's first dimension matches, the local IDs map directly; otherwise the
  // flattened thread ID is re-mapped onto the loading grid.
  #if MDIMCD == MDIMAD
    const int la0 = get_local_id(0);
    const int la1 = get_local_id(1);
  #else
    const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
    const int la0 = tid % MDIMAD;
    const int la1 = tid / MDIMAD;
  #endif
  #pragma unroll
  for (int mia=0; mia<MWAD/VWMD; ++mia) {
    #pragma unroll
    for (int kia=0; kia<KWAD; ++kia) {
      // Computes the indices for the global memory (vector granularity in the M-dimension)
      int mg = mia + la0*(MWAD/VWMD);
      int kg = kia + la1*KWAD;
      int idm = (a_transpose) ? mg + kwg/VWMD : mg + GetGroupID0()*(WGD/VWMD);
      int idk = (a_transpose) ? kg + GetGroupID0()*WGD : kg + kwg;
      // Loads the data from global memory into the local memory
      const realMD avec = agm[idk*(a_ld/VWMD) + idm + a_offset];
      // Scatters the vector's components into the padded local tile (row stride WGD + PADA);
      // the component names depend on the vector width VWMD
      #if VWMD == 1
         alm[kg*(WGD + PADA) + mg] = avec;
      #elif VWMD == 2
         alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.x;
         alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.y;
      #elif VWMD == 4
         alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.x;
         alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.y;
         alm[kg*(WGD + PADA) + mg*VWMD + 2] = avec.z;
         alm[kg*(WGD + PADA) + mg*VWMD + 3] = avec.w;
      #elif VWMD == 8
         alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.s0;
         alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.s1;
         alm[kg*(WGD + PADA) + mg*VWMD + 2] = avec.s2;
         alm[kg*(WGD + PADA) + mg*VWMD + 3] = avec.s3;
         alm[kg*(WGD + PADA) + mg*VWMD + 4] = avec.s4;
         alm[kg*(WGD + PADA) + mg*VWMD + 5] = avec.s5;
         alm[kg*(WGD + PADA) + mg*VWMD + 6] = avec.s6;
         alm[kg*(WGD + PADA) + mg*VWMD + 7] = avec.s7;
      #elif VWMD == 16
         alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.s0;
         alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.s1;
         alm[kg*(WGD + PADA) + mg*VWMD + 2] = avec.s2;
         alm[kg*(WGD + PADA) + mg*VWMD + 3] = avec.s3;
         alm[kg*(WGD + PADA) + mg*VWMD + 4] = avec.s4;
         alm[kg*(WGD + PADA) + mg*VWMD + 5] = avec.s5;
         alm[kg*(WGD + PADA) + mg*VWMD + 6] = avec.s6;
         alm[kg*(WGD + PADA) + mg*VWMD + 7] = avec.s7;
         alm[kg*(WGD + PADA) + mg*VWMD + 8] = avec.s8;
         alm[kg*(WGD + PADA) + mg*VWMD + 9] = avec.s9;
         alm[kg*(WGD + PADA) + mg*VWMD + 10] = avec.sA;
         alm[kg*(WGD + PADA) + mg*VWMD + 11] = avec.sB;
         alm[kg*(WGD + PADA) + mg*VWMD + 12] = avec.sC;
         alm[kg*(WGD + PADA) + mg*VWMD + 13] = avec.sD;
         alm[kg*(WGD + PADA) + mg*VWMD + 14] = avec.sE;
         alm[kg*(WGD + PADA) + mg*VWMD + 15] = avec.sF;
      #endif
      // Conjugates each stored component in-place when requested (complex cases)
      if (a_conjugate) {
        for (int vm=0; vm<VWMD; ++vm) {
          COMPLEX_CONJUGATE(alm[kg*(WGD + PADA) + mg*VWMD + vm]);
        }
      }
    }
  }
}
// Same as above, but now for the B input matrix: loads one vector of VWND elements (type realND)
// at a time and therefore requires b_ld to be a multiple of VWND; the caller falls back to
// GlobalToLocalScalarB otherwise. No bounds checking is performed.
inline void GlobalToLocalDirectB(const __global realND* restrict bgm, __local real* blm,
                                 const int b_ld, const int b_offset, const int kwg,
                                 const int b_transpose, const int b_conjugate) {
  // Determines this thread's position in the B-loading configuration (NDIMBD threads wide); the
  // flattened thread ID is re-mapped when the compute configuration differs.
  #if MDIMCD == NDIMBD
    const int lb0 = get_local_id(0);
    const int lb1 = get_local_id(1);
  #else
    const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
    const int lb0 = tid % NDIMBD;
    const int lb1 = tid / NDIMBD;
  #endif
  #pragma unroll
  for (int kib=0; kib<KWBD; ++kib) {
    #pragma unroll
    for (int nib=0; nib<NWBD/VWND; ++nib) {
      // Computes the indices for the global memory (vector granularity in the N-dimension)
      int ng = nib + lb0*(NWBD/VWND);
      int kg = kib + lb1*KWBD;
      int idn = (b_transpose) ? ng + kwg/VWND : ng + GetGroupID1()*(WGD/VWND);
      int idk = (b_transpose) ? kg + GetGroupID1()*WGD : kg + kwg;
      // Loads the data from global memory into the local memory
      const realND bvec = bgm[idk*(b_ld/VWND) + idn + b_offset];
      // Scatters the vector's components into the padded local tile (row stride WGD + PADB);
      // the component names depend on the vector width VWND
      #if VWND == 1
         blm[kg*(WGD + PADB) + ng] = bvec;
      #elif VWND == 2
         blm[kg*(WGD + PADB) + ng*VWND + 0] = bvec.x;
         blm[kg*(WGD + PADB) + ng*VWND + 1] = bvec.y;
      #elif VWND == 4
         blm[kg*(WGD + PADB) + ng*VWND + 0] = bvec.x;
         blm[kg*(WGD + PADB) + ng*VWND + 1] = bvec.y;
         blm[kg*(WGD + PADB) + ng*VWND + 2] = bvec.z;
         blm[kg*(WGD + PADB) + ng*VWND + 3] = bvec.w;
      #elif VWND == 8
         blm[kg*(WGD + PADB) + ng*VWND + 0] = bvec.s0;
         blm[kg*(WGD + PADB) + ng*VWND + 1] = bvec.s1;
         blm[kg*(WGD + PADB) + ng*VWND + 2] = bvec.s2;
         blm[kg*(WGD + PADB) + ng*VWND + 3] = bvec.s3;
         blm[kg*(WGD + PADB) + ng*VWND + 4] = bvec.s4;
         blm[kg*(WGD + PADB) + ng*VWND + 5] = bvec.s5;
         blm[kg*(WGD + PADB) + ng*VWND + 6] = bvec.s6;
         blm[kg*(WGD + PADB) + ng*VWND + 7] = bvec.s7;
      #elif VWND == 16
         blm[kg*(WGD + PADB) + ng*VWND + 0] = bvec.s0;
         blm[kg*(WGD + PADB) + ng*VWND + 1] = bvec.s1;
         blm[kg*(WGD + PADB) + ng*VWND + 2] = bvec.s2;
         blm[kg*(WGD + PADB) + ng*VWND + 3] = bvec.s3;
         blm[kg*(WGD + PADB) + ng*VWND + 4] = bvec.s4;
         blm[kg*(WGD + PADB) + ng*VWND + 5] = bvec.s5;
         blm[kg*(WGD + PADB) + ng*VWND + 6] = bvec.s6;
         blm[kg*(WGD + PADB) + ng*VWND + 7] = bvec.s7;
         blm[kg*(WGD + PADB) + ng*VWND + 8] = bvec.s8;
         blm[kg*(WGD + PADB) + ng*VWND + 9] = bvec.s9;
         blm[kg*(WGD + PADB) + ng*VWND + 10] = bvec.sA;
         blm[kg*(WGD + PADB) + ng*VWND + 11] = bvec.sB;
         blm[kg*(WGD + PADB) + ng*VWND + 12] = bvec.sC;
         blm[kg*(WGD + PADB) + ng*VWND + 13] = bvec.sD;
         blm[kg*(WGD + PADB) + ng*VWND + 14] = bvec.sE;
         blm[kg*(WGD + PADB) + ng*VWND + 15] = bvec.sF;
      #endif
      // Conjugates each stored component in-place when requested (complex cases)
      if (b_conjugate) {
        for (int vn=0; vn<VWND; ++vn) {
          COMPLEX_CONJUGATE(blm[kg*(WGD + PADB) + ng*VWND + vn]);
        }
      }
    }
  }
}
// =================================================================================================
// Caches global off-chip memory into local (shared) memory on-chip for the A input matrix.
// In contrast to GlobalToLocalDirectA, this variant uses scalar loads only and is therefore
// usable when a_ld is not a multiple of the vector width. No bounds checking is performed.
inline void GlobalToLocalScalarA(const __global real* restrict agms, __local real* alm,
                                 const int a_ld, const int a_offset, const int kwg,
                                 const int a_transpose, const int a_conjugate) {
  // Maps this thread onto the A-loading configuration (re-mapped when it differs from the
  // compute configuration)
  #if MDIMCD == MDIMAD
    const int la0 = get_local_id(0);
    const int la1 = get_local_id(1);
  #else
    const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
    const int la0 = tid % MDIMAD;
    const int la1 = tid / MDIMAD;
  #endif
  #pragma unroll
  for (int m_it = 0; m_it < MWAD; m_it += 1) {
    #pragma unroll
    for (int k_it = 0; k_it < KWAD; k_it += 1) {
      // Global-memory coordinates of this element
      const int mg = la0*MWAD + m_it;
      const int kg = la1*KWAD + k_it;
      const int idm = (a_transpose) ? mg + kwg : mg + GetGroupID0()*WGD;
      const int idk = (a_transpose) ? kg + GetGroupID0()*WGD : kg + kwg;
      // Loads, optionally conjugates, and stores into the padded local tile
      real value = agms[idk*a_ld + idm + a_offset];
      if (a_conjugate) { COMPLEX_CONJUGATE(value); }
      alm[kg*(WGD + PADA) + mg] = value;
    }
  }
}
// Scalar-load variant of the local-memory caching for the B input matrix, usable when b_ld is
// not a multiple of the vector width. No bounds checking is performed.
inline void GlobalToLocalScalarB(const __global real* restrict bgms, __local real* blm,
                                 const int b_ld, const int b_offset, const int kwg,
                                 const int b_transpose, const int b_conjugate) {
  // Maps this thread onto the B-loading configuration (re-mapped when it differs from the
  // compute configuration)
  #if MDIMCD == NDIMBD
    const int lb0 = get_local_id(0);
    const int lb1 = get_local_id(1);
  #else
    const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
    const int lb0 = tid % NDIMBD;
    const int lb1 = tid / NDIMBD;
  #endif
  #pragma unroll
  for (int k_it = 0; k_it < KWBD; k_it += 1) {
    #pragma unroll
    for (int n_it = 0; n_it < NWBD; n_it += 1) {
      // Global-memory coordinates of this element
      const int ng = lb0*NWBD + n_it;
      const int kg = lb1*KWBD + k_it;
      const int idn = (b_transpose) ? ng + kwg : ng + GetGroupID1()*WGD;
      const int idk = (b_transpose) ? kg + GetGroupID1()*WGD : kg + kwg;
      // Loads, optionally conjugates, and stores into the padded local tile
      real value = bgms[idk*b_ld + idn + b_offset];
      if (b_conjugate) { COMPLEX_CONJUGATE(value); }
      blm[kg*(WGD + PADB) + ng] = value;
    }
  }
}
// =================================================================================================
// Caches global off-chip memory into local (shared) memory on-chip for the A input matrix.
// This variant performs bounds checks (against kSizeM, or kSizeK when transposed) and uses
// scalar loads only; out-of-range elements are zeroed in the local tile.
inline void GlobalToLocalCheckedA(const __global real* restrict agms, __local real* alm,
                                  const int a_ld, const int a_offset, const int kwg,
                                  const int a_transpose, const int a_conjugate,
                                  const int kSizeM, const int kSizeK) {
  // Maps this thread onto the A-loading configuration (re-mapped when it differs from the
  // compute configuration)
  #if MDIMCD == MDIMAD
    const int la0 = get_local_id(0);
    const int la1 = get_local_id(1);
  #else
    const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
    const int la0 = tid % MDIMAD;
    const int la1 = tid / MDIMAD;
  #endif
  #pragma unroll
  for (int m_it = 0; m_it < MWAD; m_it += 1) {
    #pragma unroll
    for (int k_it = 0; k_it < KWAD; k_it += 1) {
      // Global-memory coordinates of this element
      const int mg = la0*MWAD + m_it;
      const int kg = la1*KWAD + k_it;
      const int idm = (a_transpose) ? mg + kwg : mg + GetGroupID0()*WGD;
      const int idk = (a_transpose) ? kg + GetGroupID0()*WGD : kg + kwg;
      // The fast index runs over K when transposed, over M otherwise
      const int within_bounds = (a_transpose) ? (idm < kSizeK) : (idm < kSizeM);
      if (!within_bounds) {
        SetToZero(alm[kg*(WGD + PADA) + mg]);  // zero-pad out-of-range elements
      }
      else {
        real value = agms[idk*a_ld + idm + a_offset];
        if (a_conjugate) { COMPLEX_CONJUGATE(value); }
        alm[kg*(WGD + PADA) + mg] = value;
      }
    }
  }
}
// Bounds-checked, scalar-load caching of the B input matrix into local memory (against kSizeN,
// or kSizeK when transposed); out-of-range elements are zeroed in the local tile.
inline void GlobalToLocalCheckedB(const __global real* restrict bgms, __local real* blm,
                                  const int b_ld, const int b_offset, const int kwg,
                                  const int b_transpose, const int b_conjugate,
                                  const int kSizeN, const int kSizeK) {
  // Maps this thread onto the B-loading configuration (re-mapped when it differs from the
  // compute configuration)
  #if MDIMCD == NDIMBD
    const int lb0 = get_local_id(0);
    const int lb1 = get_local_id(1);
  #else
    const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
    const int lb0 = tid % NDIMBD;
    const int lb1 = tid / NDIMBD;
  #endif
  #pragma unroll
  for (int k_it = 0; k_it < KWBD; k_it += 1) {
    #pragma unroll
    for (int n_it = 0; n_it < NWBD; n_it += 1) {
      // Global-memory coordinates of this element
      const int ng = lb0*NWBD + n_it;
      const int kg = lb1*KWBD + k_it;
      const int idn = (b_transpose) ? ng + kwg : ng + GetGroupID1()*WGD;
      const int idk = (b_transpose) ? kg + GetGroupID1()*WGD : kg + kwg;
      // The fast index runs over K when transposed, over N otherwise
      const int within_bounds = (b_transpose) ? (idn < kSizeK) : (idn < kSizeN);
      if (!within_bounds) {
        SetToZero(blm[kg*(WGD + PADB) + ng]);  // zero-pad out-of-range elements
      }
      else {
        real value = bgms[idk*b_ld + idn + b_offset];
        if (b_conjugate) { COMPLEX_CONJUGATE(value); }
        blm[kg*(WGD + PADB) + ng] = value;
      }
    }
  }
}
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View File

@ -0,0 +1,214 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This is part 3 of 3 of the GEMM kernel. See part 1 for more information.
//
// =================================================================================================
// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(
// =================================================================================================
// Main body of the kernel. This is the direct version without pre/post processing and restrictions.
// It computes C = alpha*op(A)*op(B) + beta*C in one kernel launch. Workgroups covering a complete
// WGD x WGD output tile take a fast path (vectorized loads when the leading dimension allows it);
// border workgroups take a slower, fully bounds-checked path.
inline void XgemmDirect(const int kSizeM, const int kSizeN, const int kSizeK,
                        const real_arg arg_alpha,
                        const real_arg arg_beta,
                        const __global realMD* restrict agm, const int a_offset, const int a_ld,
                        const __global realND* restrict bgm, const int b_offset, const int b_ld,
                        __global real* cgm, const int c_offset, const int c_ld,
                        __local real* alm, __local real* blm,
                        const int a_transpose, const int b_transpose, const int c_transpose,
                        const int a_conjugate, const int b_conjugate) {
  const real alpha = GetRealArg(arg_alpha);
  const real beta = GetRealArg(arg_beta);
  // Extra pointers to scalar versions of global memory (for the scalar/checked load paths)
  const __global real* restrict agms = (const __global real* restrict) agm;
  const __global real* restrict bgms = (const __global real* restrict) bgm;
  // Allocates workitem-private memory (registers)
  real apm[MWID];
  real bpm[NWID];
  real cpm[NWID][MWID];
  // Initializes the accumulation registers
  InitAccRegistersDirect(cpm);
  // The faster version of GEMM is not allowed on the (incomplete) borders. Therefore, this section
  // processes only the main parts: output blocks of WGD by WGD.
  const int idm = get_local_id(0) * MWID + GetGroupID0() * WGD;
  const int idn = get_local_id(1) * NWID + GetGroupID1() * WGD;
  if ((idm < (kSizeM/WGD)*WGD) && (idn < (kSizeN/WGD)*WGD)) {

    // Loops over all complete workgroup tiles (K-dimension)
    int kwg = 0;
    for (; kwg < (kSizeK/WGD) * WGD; kwg+=WGD) {

      // Loads data: off-chip --> local (matrix A and B). The vectorized loader is only valid
      // when the leading dimension is a multiple of the vector width.
      if (a_ld % VWMD == 0) {
        GlobalToLocalDirectA(agm, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate);
      }
      else {
        GlobalToLocalScalarA(agms, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate);
      }
      if (b_ld % VWND == 0) {
        GlobalToLocalDirectB(bgm, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate);
      }
      else {
        GlobalToLocalScalarB(bgms, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate);
      }
      // Waits until all threads have filled the local tiles before any thread reads them
      barrier(CLK_LOCAL_MEM_FENCE);

      // Loops over all workitem tiles, unrolled by a factor KWID
      for (int pwi=0; pwi<WGD; pwi+=KWID) {
        #pragma unroll
        for (int pit=0; pit<KWID; ++pit) {
          int kg = pwi + pit;

          // Loads data: local --> private (matrix A and B)
          LocalToPrivateDirectA(alm, apm, kg, a_transpose);
          LocalToPrivateDirectB(blm, bpm, kg, b_transpose);

          // Performs the accumulation (Cpm += Apm * Bpm)
          MultiplyAccumulateDirect(cpm, apm, bpm);
        }
      }
      // Ensures all reads of the local tiles are done before the next iteration overwrites them
      barrier(CLK_LOCAL_MEM_FENCE);
    }

    // Loop over the remaining part (incomplete tile in K-dimension): loads go straight from
    // global memory into registers, bypassing local memory
    for (; kwg < kSizeK; ++kwg) {

      // Loads data: off-chip --> private (matrix A and B)
      GlobalToPrivateDirectA(agms, apm, a_ld, a_offset, idm, kwg, a_transpose, a_conjugate);
      GlobalToPrivateDirectB(bgms, bpm, b_ld, b_offset, idn, kwg, b_transpose, b_conjugate);

      // Performs the accumulation (Cpm += Apm * Bpm)
      MultiplyAccumulateDirect(cpm, apm, bpm);
    }

    // Stores a tile of results and performs the multiplication with alpha and beta
    StoreResultsDirect(cgm, cpm, idm, idn, alpha, beta, c_ld, c_offset, c_transpose);
  }

  // Simple but slower version for the parts on the edge (incomplete tiles in M and N-dimensions):
  // every load and store is bounds-checked
  else {

    // Loops over all complete workgroup tiles (K-dimension)
    int kwg = 0;
    for (; kwg < (kSizeK/WGD) * WGD; kwg+=WGD) {

      // Loads data: off-chip --> local (matrix A and B)
      GlobalToLocalCheckedA(agms, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate, kSizeM, kSizeK);
      GlobalToLocalCheckedB(bgms, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate, kSizeN, kSizeK);
      // Waits until all threads have filled the local tiles before any thread reads them
      barrier(CLK_LOCAL_MEM_FENCE);

      // Loops over all workitem tiles, unrolled by a factor KWID
      for (int pwi=0; pwi<WGD; pwi+=KWID) {
        #pragma unroll
        for (int pit=0; pit<KWID; ++pit) {
          int kg = pwi + pit;

          // Loads data: local --> private (matrix A and B)
          LocalToPrivateDirectA(alm, apm, kg, a_transpose)
;
          LocalToPrivateDirectB(blm, bpm, kg, b_transpose);

          // Performs the accumulation (Cpm += Apm * Bpm)
          MultiplyAccumulateDirect(cpm, apm, bpm);
        }
      }
      // Ensures all reads of the local tiles are done before the next iteration overwrites them
      barrier(CLK_LOCAL_MEM_FENCE);
    }

    // Loop over the remaining part (incomplete tile in K-dimension)
    for (; kwg < kSizeK; ++kwg) {

      // Loads data: off-chip --> private (matrix A and B)
      GlobalToPrivateCheckedA(agms, apm, a_ld, a_offset, idm, kwg, a_transpose, a_conjugate, kSizeM);
      GlobalToPrivateCheckedB(bgms, bpm, b_ld, b_offset, idn, kwg, b_transpose, b_conjugate, kSizeN);

      // Performs the accumulation (Cpm += Apm * Bpm)
      MultiplyAccumulateDirect(cpm, apm, bpm);
    }

    // Stores a tile of results and performs the multiplication with alpha and beta
    StoreResultsChecked(cgm, cpm, idm, idn, kSizeM, kSizeN, alpha, beta, c_ld, c_offset, c_transpose);
  }
}
// =================================================================================================
// Direct GEMM entry point for [A, B] = [non-transposed, non-transposed]. Allocates the
// workgroup-shared staging tiles and forwards to the generic XgemmDirect body.
__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
__kernel void XgemmDirectNN(const int kSizeM, const int kSizeN, const int kSizeK,
                            const real_arg arg_alpha, const real_arg arg_beta,
                            const __global realMD* restrict agm, const int a_offset, const int a_ld,
                            const __global realND* restrict bgm, const int b_offset, const int b_ld,
                            __global real* cgm, const int c_offset, const int c_ld,
                            const int c_transpose, const int a_conjugate, const int b_conjugate) {
  // Local-memory tiles for A and B (padded to reduce bank conflicts)
  __local real blm[WGD * (WGD + PADB)];
  __local real alm[WGD * (WGD + PADA)];
  XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta,
              agm, a_offset, a_ld,
              bgm, b_offset, b_ld,
              cgm, c_offset, c_ld,
              alm, blm,
              0, 0, c_transpose, a_conjugate, b_conjugate);  // a_transpose=0, b_transpose=0
}
// Direct GEMM entry point for [A, B] = [non-transposed, transposed]. Allocates the
// workgroup-shared staging tiles and forwards to the generic XgemmDirect body.
__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
__kernel void XgemmDirectNT(const int kSizeM, const int kSizeN, const int kSizeK,
                            const real_arg arg_alpha, const real_arg arg_beta,
                            const __global realMD* restrict agm, const int a_offset, const int a_ld,
                            const __global realND* restrict bgm, const int b_offset, const int b_ld,
                            __global real* cgm, const int c_offset, const int c_ld,
                            const int c_transpose, const int a_conjugate, const int b_conjugate) {
  // Local-memory tiles for A and B (padded to reduce bank conflicts)
  __local real blm[WGD * (WGD + PADB)];
  __local real alm[WGD * (WGD + PADA)];
  XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta,
              agm, a_offset, a_ld,
              bgm, b_offset, b_ld,
              cgm, c_offset, c_ld,
              alm, blm,
              0, 1, c_transpose, a_conjugate, b_conjugate);  // a_transpose=0, b_transpose=1
}
// Direct GEMM entry point for [A, B] = [transposed, non-transposed]. Allocates the
// workgroup-shared staging tiles and forwards to the generic XgemmDirect body.
__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
__kernel void XgemmDirectTN(const int kSizeM, const int kSizeN, const int kSizeK,
                            const real_arg arg_alpha, const real_arg arg_beta,
                            const __global realMD* restrict agm, const int a_offset, const int a_ld,
                            const __global realND* restrict bgm, const int b_offset, const int b_ld,
                            __global real* cgm, const int c_offset, const int c_ld,
                            const int c_transpose, const int a_conjugate, const int b_conjugate) {
  // Local-memory tiles for A and B (padded to reduce bank conflicts)
  __local real blm[WGD * (WGD + PADB)];
  __local real alm[WGD * (WGD + PADA)];
  XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta,
              agm, a_offset, a_ld,
              bgm, b_offset, b_ld,
              cgm, c_offset, c_ld,
              alm, blm,
              1, 0, c_transpose, a_conjugate, b_conjugate);  // a_transpose=1, b_transpose=0
}
// Direct GEMM entry point for [A, B] = [transposed, transposed]. Allocates the
// workgroup-shared staging tiles and forwards to the generic XgemmDirect body.
__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
__kernel void XgemmDirectTT(const int kSizeM, const int kSizeN, const int kSizeK,
                            const real_arg arg_alpha, const real_arg arg_beta,
                            const __global realMD* restrict agm, const int a_offset, const int a_ld,
                            const __global realND* restrict bgm, const int b_offset, const int b_ld,
                            __global real* cgm, const int c_offset, const int c_ld,
                            const int c_transpose, const int a_conjugate, const int b_conjugate) {
  // Local-memory tiles for A and B (padded to reduce bank conflicts)
  __local real blm[WGD * (WGD + PADB)];
  __local real alm[WGD * (WGD + PADA)];
  XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta,
              agm, a_offset, a_ld,
              bgm, b_offset, b_ld,
              cgm, c_offset, c_ld,
              alm, blm,
              1, 1, c_transpose, a_conjugate, b_conjugate);  // a_transpose=1, b_transpose=1
}
// =================================================================================================
// End of the C++11 raw string literal
)"
// =================================================================================================

View File

@ -113,7 +113,7 @@ void XgemmUpper(const int kSizeN, const int kSizeK,
const real beta = GetRealArg(arg_beta);
// Skip these threads if they do not contain threads contributing to the upper-triangle
if (GetGroupID1()*NWG < GetGroupID0()*MWG) {
if ((GetGroupID1() + 1)*NWG < GetGroupID0()*MWG) {
return;
}
@ -153,7 +153,7 @@ void XgemmLower(const int kSizeN, const int kSizeK,
const real beta = GetRealArg(arg_beta);
// Skip these threads if they do not contain threads contributing to the lower-triangle
if (GetGroupID1()*NWG > GetGroupID0()*MWG) {
if (GetGroupID1()*NWG > (GetGroupID0() + 1)*MWG) {
return;
}

View File

@ -14,16 +14,18 @@
#include <string>
#include <vector>
#include <chrono>
#include <cstdlib>
#include "routine.hpp"
namespace clblast {
// =================================================================================================
// Constructor: not much here, because no status codes can be returned
// The constructor does all heavy work, errors are returned as exceptions
Routine::Routine(Queue &queue, EventPointer event, const std::string &name,
const std::vector<std::string> &routines, const Precision precision,
const std::vector<Database::DatabaseEntry> &userDatabase):
const std::vector<const Database::DatabaseEntry*> &userDatabase,
std::initializer_list<const char *> source):
precision_(precision),
routine_name_(name),
queue_(queue),
@ -32,27 +34,24 @@ Routine::Routine(Queue &queue, EventPointer event, const std::string &name,
device_(queue_.GetDevice()),
device_name_(device_.Name()),
db_(queue_, routines, precision_, userDatabase) {
}
// =================================================================================================
// Separate set-up function to allow for status codes to be returned
StatusCode Routine::SetUp() {
// Queries the cache to see whether or not the program (context-specific) is already there
if (ProgramIsInCache(context_, precision_, routine_name_)) { return StatusCode::kSuccess; }
if (ProgramIsInCache(context_, precision_, routine_name_)) { return; }
// Sets the build options from an environmental variable (if set)
auto options = std::vector<std::string>();
const auto environment_variable = std::getenv("CLBLAST_BUILD_OPTIONS");
if (environment_variable != nullptr) {
options.push_back(std::string(environment_variable));
}
// Queries the cache to see whether or not the binary (device-specific) is already there. If it
// is, a program is created and stored in the cache
if (BinaryIsInCache(device_name_, precision_, routine_name_)) {
try {
auto& binary = GetBinaryFromCache(device_name_, precision_, routine_name_);
auto program = Program(device_, context_, binary);
auto options = std::vector<std::string>();
program.Build(device_, options);
StoreProgramToCache(program, context_, precision_, routine_name_);
} catch (...) { return StatusCode::kBuildProgramFailure; }
return StatusCode::kSuccess;
auto& binary = GetBinaryFromCache(device_name_, precision_, routine_name_);
auto program = Program(device_, context_, binary);
program.Build(device_, options);
StoreProgramToCache(program, context_, precision_, routine_name_);
}
// Otherwise, the kernel will be compiled and program will be built. Both the binary and the
@ -62,48 +61,50 @@ StatusCode Routine::SetUp() {
const auto extensions = device_.Capabilities();
if (precision_ == Precision::kDouble || precision_ == Precision::kComplexDouble) {
if (extensions.find(kKhronosDoublePrecision) == std::string::npos) {
return StatusCode::kNoDoublePrecision;
throw RuntimeErrorCode(StatusCode::kNoDoublePrecision);
}
}
// As above, but for cl_khr_fp16 (half precision)
if (precision_ == Precision::kHalf) {
if (extensions.find(kKhronosHalfPrecision) == std::string::npos) {
return StatusCode::kNoHalfPrecision;
throw RuntimeErrorCode(StatusCode::kNoHalfPrecision);
}
}
// Loads the common header (typedefs and defines and such)
std::string common_header =
#include "kernels/common.opencl"
;
// Collects the parameters for this device in the form of defines, and adds the precision
auto defines = db_.GetDefines();
defines += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";
auto source_string = db_.GetDefines();
source_string += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";
// Adds the name of the routine as a define
defines += "#define ROUTINE_"+routine_name_+"\n";
source_string += "#define ROUTINE_"+routine_name_+"\n";
// For specific devices, use the non-IEE754 compilant OpenCL mad() instruction. This can improve
// performance, but might result in a reduced accuracy.
if (device_.IsAMD() && device_.IsGPU()) {
defines += "#define USE_CL_MAD 1\n";
source_string += "#define USE_CL_MAD 1\n";
}
// For specific devices, use staggered/shuffled workgroup indices.
if (device_.IsAMD() && device_.IsGPU()) {
defines += "#define USE_STAGGERED_INDICES 1\n";
source_string += "#define USE_STAGGERED_INDICES 1\n";
}
// For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
// performance through better cache behaviour
if (device_.IsARM() && device_.IsGPU()) {
defines += "#define GLOBAL_MEM_FENCE 1\n";
source_string += "#define GLOBAL_MEM_FENCE 1\n";
}
// Combines everything together into a single source string
const auto source_string = defines + common_header + source_string_;
// Loads the common header (typedefs and defines and such)
source_string +=
#include "kernels/common.opencl"
;
// Adds routine-specific code to the constructed source string
for (const char *s: source) {
source_string += s;
}
// Prints details of the routine to compile in case of debugging in verbose mode
#ifdef VERBOSE
@ -113,24 +114,21 @@ StatusCode Routine::SetUp() {
#endif
// Compiles the kernel
auto program = Program(context_, source_string);
try {
auto program = Program(context_, source_string);
auto options = std::vector<std::string>();
const auto build_status = program.Build(device_, options);
// Checks for compiler crashes/errors/warnings
if (build_status == BuildStatus::kError) {
const auto message = program.GetBuildInfo(device_);
fprintf(stdout, "OpenCL compiler error/warning: %s\n", message.c_str());
return StatusCode::kBuildProgramFailure;
program.Build(device_, options);
} catch (const CLError &e) {
if (e.status() == CL_BUILD_PROGRAM_FAILURE) {
fprintf(stdout, "OpenCL compiler error/warning: %s\n",
program.GetBuildInfo(device_).c_str());
}
if (build_status == BuildStatus::kInvalid) { return StatusCode::kInvalidBinary; }
throw;
}
// Store the compiled binary and program in the cache
const auto binary = program.GetIR();
StoreBinaryToCache(binary, device_name_, precision_, routine_name_);
StoreProgramToCache(program, context_, precision_, routine_name_);
} catch (...) { return StatusCode::kBuildProgramFailure; }
// Store the compiled binary and program in the cache
const auto binary = program.GetIR();
StoreBinaryToCache(binary, device_name_, precision_, routine_name_);
StoreProgramToCache(program, context_, precision_, routine_name_);
// Prints the elapsed compilation time in case of debugging in verbose mode
#ifdef VERBOSE
@ -138,9 +136,6 @@ StatusCode Routine::SetUp() {
const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
printf("[DEBUG] Completed compilation in %.2lf ms\n", timing);
#endif
// No errors, normal termination of this function
return StatusCode::kSuccess;
}
// =================================================================================================

View File

@ -19,9 +19,9 @@
#include <string>
#include <vector>
#include "utilities.hpp"
#include "utilities/utilities.hpp"
#include "cache.hpp"
#include "buffer_test.hpp"
#include "utilities/buffer_test.hpp"
#include "database/database.hpp"
#include "routines/common.hpp"
@ -34,21 +34,19 @@ class Routine {
// Base class constructor. The user database is an optional extra database to override the
// built-in database.
// All heavy preparation work is done inside this constructor.
explicit Routine(Queue &queue, EventPointer event, const std::string &name,
const std::vector<std::string> &routines, const Precision precision,
const std::vector<Database::DatabaseEntry> &userDatabase = {});
// Set-up phase of the kernel
StatusCode SetUp();
const std::vector<const Database::DatabaseEntry*> &userDatabase,
std::initializer_list<const char *> source);
protected:
// Non-static variable for the precision
const Precision precision_;
// The routine's name and its kernel-source in string form
// The routine's name
const std::string routine_name_;
std::string source_string_;
// The OpenCL objects, accessible only from derived classes
Queue queue_;

View File

@ -20,22 +20,26 @@ namespace clblast {
// =================================================================================================
// Enqueues a kernel, waits for completion, and checks for errors
StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
std::vector<size_t> global, const std::vector<size_t> &local,
EventPointer event, const std::vector<Event> &waitForEvents) {
void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
std::vector<size_t> global, const std::vector<size_t> &local,
EventPointer event, const std::vector<Event> &waitForEvents) {
if (!local.empty()) {
// Tests for validity of the local thread sizes
if (local.size() > device.MaxWorkItemDimensions()) {
return StatusCode::kInvalidLocalNumDimensions;
throw RuntimeErrorCode(StatusCode::kInvalidLocalNumDimensions);
}
const auto max_work_item_sizes = device.MaxWorkItemSizes();
for (auto i=size_t{0}; i<local.size(); ++i) {
if (local[i] > max_work_item_sizes[i]) { return StatusCode::kInvalidLocalThreadsDim; }
if (local[i] > max_work_item_sizes[i]) {
throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsDim);
}
}
auto local_size = size_t{1};
for (auto &item: local) { local_size *= item; }
if (local_size > device.MaxWorkGroupSize()) { return StatusCode::kInvalidLocalThreadsTotal; }
if (local_size > device.MaxWorkGroupSize()) {
throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsTotal);
}
// Make sure the global thread sizes are at least equal to the local sizes
for (auto i=size_t{0}; i<global.size(); ++i) {
@ -45,7 +49,9 @@ StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
// Tests for local memory usage
const auto local_mem_usage = kernel.LocalMemUsage(device);
if (!device.IsLocalMemoryValid(local_mem_usage)) { return StatusCode::kInvalidLocalMemUsage; }
if (!device.IsLocalMemoryValid(local_mem_usage)) {
throw RuntimeErrorCode(StatusCode::kInvalidLocalMemUsage);
}
// Prints the name of the kernel to launch in case of debugging in verbose mode
#ifdef VERBOSE
@ -55,9 +61,7 @@ StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
#endif
// Launches the kernel (and checks for launch errors)
try {
kernel.Launch(queue, global, local, event, waitForEvents);
} catch (...) { return StatusCode::kKernelLaunchError; }
kernel.Launch(queue, global, local, event, waitForEvents);
// Prints the elapsed execution time in case of debugging in verbose mode
#ifdef VERBOSE
@ -66,9 +70,6 @@ StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
printf("[DEBUG] Completed kernel in %.2lf ms\n", timing);
#endif
// No errors, normal termination of this function
return StatusCode::kSuccess;
}
// =================================================================================================

View File

@ -27,29 +27,29 @@ namespace clblast {
// =================================================================================================
// Enqueues a kernel, waits for completion, and checks for errors
StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
std::vector<size_t> global, const std::vector<size_t> &local,
EventPointer event, const std::vector<Event> &waitForEvents = {});
void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
std::vector<size_t> global, const std::vector<size_t> &local,
EventPointer event, const std::vector<Event> &waitForEvents = {});
// =================================================================================================
// Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able
// to write to symmetric and triangular matrices through optional arguments.
template <typename T>
StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device,
const Database &db,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t src_one, const size_t src_two,
const size_t src_ld, const size_t src_offset,
const Buffer<T> &src,
const size_t dest_one, const size_t dest_two,
const size_t dest_ld, const size_t dest_offset,
const Buffer<T> &dest,
const T alpha,
const Program &program, const bool do_pad,
const bool do_transpose, const bool do_conjugate,
const bool upper = false, const bool lower = false,
const bool diagonal_imag_zero = false) {
void PadCopyTransposeMatrix(Queue &queue, const Device &device,
const Database &db,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t src_one, const size_t src_two,
const size_t src_ld, const size_t src_offset,
const Buffer<T> &src,
const size_t dest_one, const size_t dest_two,
const size_t dest_ld, const size_t dest_offset,
const Buffer<T> &dest,
const T alpha,
const Program &program, const bool do_pad,
const bool do_transpose, const bool do_conjugate,
const bool upper = false, const bool lower = false,
const bool diagonal_imag_zero = false) {
// Determines whether or not the fast-version could potentially be used
auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) &&
@ -61,8 +61,8 @@ StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device,
if (do_transpose) {
if (use_fast_kernel &&
IsMultiple(src_ld, db["TRA_WPT"]) &&
IsMultiple(src_one, db["TRA_WPT"]*db["TRA_WPT"]) &&
IsMultiple(src_two, db["TRA_WPT"]*db["TRA_WPT"])) {
IsMultiple(src_one, db["TRA_WPT"]*db["TRA_DIM"]) &&
IsMultiple(src_two, db["TRA_WPT"]*db["TRA_DIM"])) {
kernel_name = "TransposeMatrixFast";
}
else {
@ -84,77 +84,75 @@ StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device,
}
// Retrieves the kernel from the compiled binary
try {
auto kernel = Kernel(program, kernel_name);
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
// Sets the kernel arguments
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(src_ld));
kernel.SetArgument(1, src());
kernel.SetArgument(2, dest());
kernel.SetArgument(3, GetRealArg(alpha));
}
else {
kernel.SetArgument(0, static_cast<int>(src_one));
kernel.SetArgument(1, static_cast<int>(src_two));
kernel.SetArgument(2, static_cast<int>(src_ld));
kernel.SetArgument(3, static_cast<int>(src_offset));
kernel.SetArgument(4, src());
kernel.SetArgument(5, static_cast<int>(dest_one));
kernel.SetArgument(6, static_cast<int>(dest_two));
kernel.SetArgument(7, static_cast<int>(dest_ld));
kernel.SetArgument(8, static_cast<int>(dest_offset));
kernel.SetArgument(9, dest());
kernel.SetArgument(10, GetRealArg(alpha));
if (do_pad) {
kernel.SetArgument(11, static_cast<int>(do_conjugate));
}
else {
kernel.SetArgument(11, static_cast<int>(upper));
kernel.SetArgument(12, static_cast<int>(lower));
kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero));
}
}
// Launches the kernel and returns the error code. Uses global and local thread sizes based on
// parameters in the database.
if (do_transpose) {
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(src_ld));
kernel.SetArgument(1, src());
kernel.SetArgument(2, dest());
kernel.SetArgument(3, GetRealArg(alpha));
const auto global = std::vector<size_t>{
dest_one / db["TRA_WPT"],
dest_two / db["TRA_WPT"]
};
const auto local = std::vector<size_t>{db["TRA_DIM"], db["TRA_DIM"]};
RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
else {
kernel.SetArgument(0, static_cast<int>(src_one));
kernel.SetArgument(1, static_cast<int>(src_two));
kernel.SetArgument(2, static_cast<int>(src_ld));
kernel.SetArgument(3, static_cast<int>(src_offset));
kernel.SetArgument(4, src());
kernel.SetArgument(5, static_cast<int>(dest_one));
kernel.SetArgument(6, static_cast<int>(dest_two));
kernel.SetArgument(7, static_cast<int>(dest_ld));
kernel.SetArgument(8, static_cast<int>(dest_offset));
kernel.SetArgument(9, dest());
kernel.SetArgument(10, GetRealArg(alpha));
if (do_pad) {
kernel.SetArgument(11, static_cast<int>(do_conjugate));
}
else {
kernel.SetArgument(11, static_cast<int>(upper));
kernel.SetArgument(12, static_cast<int>(lower));
kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero));
}
const auto global = std::vector<size_t>{
Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]),
Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"])
};
const auto local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"]};
RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
// Launches the kernel and returns the error code. Uses global and local thread sizes based on
// parameters in the database.
if (do_transpose) {
if (use_fast_kernel) {
const auto global = std::vector<size_t>{
dest_one / db["TRA_WPT"],
dest_two / db["TRA_WPT"]
};
const auto local = std::vector<size_t>{db["TRA_DIM"], db["TRA_DIM"]};
return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
else {
const auto global = std::vector<size_t>{
Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]),
Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"])
};
const auto local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"]};
return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
}
else {
if (use_fast_kernel) {
const auto global = std::vector<size_t>{
dest_one / db["COPY_VW"],
dest_two / db["COPY_WPT"]
};
const auto local = std::vector<size_t>{db["COPY_DIMX"], db["COPY_DIMY"]};
RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
else {
if (use_fast_kernel) {
const auto global = std::vector<size_t>{
dest_one / db["COPY_VW"],
dest_two / db["COPY_WPT"]
};
const auto local = std::vector<size_t>{db["COPY_DIMX"], db["COPY_DIMY"]};
return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
else {
const auto global = std::vector<size_t>{
Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]),
Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"])
};
const auto local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"]};
return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
const auto global = std::vector<size_t>{
Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]),
Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"])
};
const auto local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"]};
RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
} catch (...) { return StatusCode::kInvalidKernel; }
}
}
// =================================================================================================

View File

@ -22,74 +22,64 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xamax<T>::Xamax(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/xamax.opencl"
;
}) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xamax<T>::DoAmax(const size_t n,
const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
void Xamax<T>::DoAmax(const size_t n,
const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
// Makes sure all dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorIndex(1, imax_buffer, imax_offset);
if (ErrorIn(status)) { return status; }
TestVectorX(n, x_buffer, x_offset, x_inc);
TestVectorIndex(1, imax_buffer, imax_offset);
// Retrieves the Xamax kernels from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xamax");
auto kernel2 = Kernel(program, "XamaxEpilogue");
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xamax");
auto kernel2 = Kernel(program, "XamaxEpilogue");
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];
auto temp_buffer1 = Buffer<T>(context_, temp_size);
auto temp_buffer2 = Buffer<unsigned int>(context_, temp_size);
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];
auto temp_buffer1 = Buffer<T>(context_, temp_size);
auto temp_buffer2 = Buffer<unsigned int>(context_, temp_size);
// Sets the kernel arguments
kernel1.SetArgument(0, static_cast<int>(n));
kernel1.SetArgument(1, x_buffer());
kernel1.SetArgument(2, static_cast<int>(x_offset));
kernel1.SetArgument(3, static_cast<int>(x_inc));
kernel1.SetArgument(4, temp_buffer1());
kernel1.SetArgument(5, temp_buffer2());
// Sets the kernel arguments
kernel1.SetArgument(0, static_cast<int>(n));
kernel1.SetArgument(1, x_buffer());
kernel1.SetArgument(2, static_cast<int>(x_offset));
kernel1.SetArgument(3, static_cast<int>(x_inc));
kernel1.SetArgument(4, temp_buffer1());
kernel1.SetArgument(5, temp_buffer2());
// Event waiting list
auto eventWaitList = std::vector<Event>();
// Event waiting list
auto eventWaitList = std::vector<Event>();
// Launches the main kernel
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(kernelEvent);
// Launches the main kernel
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
eventWaitList.push_back(kernelEvent);
// Sets the arguments for the epilogue kernel
kernel2.SetArgument(0, temp_buffer1());
kernel2.SetArgument(1, temp_buffer2());
kernel2.SetArgument(2, imax_buffer());
kernel2.SetArgument(3, static_cast<int>(imax_offset));
// Sets the arguments for the epilogue kernel
kernel2.SetArgument(0, temp_buffer1());
kernel2.SetArgument(1, temp_buffer2());
kernel2.SetArgument(2, imax_buffer());
kernel2.SetArgument(3, static_cast<int>(imax_offset));
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
if (ErrorIn(status)) { return status; }
// Succesfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
}
// =================================================================================================

View File

@ -28,9 +28,9 @@ class Xamax: public Routine {
Xamax(Queue &queue, EventPointer event, const std::string &name = "AMAX");
// Templated-precision implementation of the routine
StatusCode DoAmax(const size_t n,
const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
void DoAmax(const size_t n,
const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
};
// =================================================================================================

View File

@ -22,71 +22,61 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xasum<T>::Xasum(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/xasum.opencl"
;
}) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xasum<T>::DoAsum(const size_t n,
const Buffer<T> &asum_buffer, const size_t asum_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
void Xasum<T>::DoAsum(const size_t n,
const Buffer<T> &asum_buffer, const size_t asum_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
// Makes sure all dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorScalar(1, asum_buffer, asum_offset);
if (ErrorIn(status)) { return status; }
TestVectorX(n, x_buffer, x_offset, x_inc);
TestVectorScalar(1, asum_buffer, asum_offset);
// Retrieves the Xasum kernels from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xasum");
auto kernel2 = Kernel(program, "XasumEpilogue");
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xasum");
auto kernel2 = Kernel(program, "XasumEpilogue");
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];
auto temp_buffer = Buffer<T>(context_, temp_size);
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];
auto temp_buffer = Buffer<T>(context_, temp_size);
// Sets the kernel arguments
kernel1.SetArgument(0, static_cast<int>(n));
kernel1.SetArgument(1, x_buffer());
kernel1.SetArgument(2, static_cast<int>(x_offset));
kernel1.SetArgument(3, static_cast<int>(x_inc));
kernel1.SetArgument(4, temp_buffer());
// Sets the kernel arguments
kernel1.SetArgument(0, static_cast<int>(n));
kernel1.SetArgument(1, x_buffer());
kernel1.SetArgument(2, static_cast<int>(x_offset));
kernel1.SetArgument(3, static_cast<int>(x_inc));
kernel1.SetArgument(4, temp_buffer());
// Event waiting list
auto eventWaitList = std::vector<Event>();
// Event waiting list
auto eventWaitList = std::vector<Event>();
// Launches the main kernel
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(kernelEvent);
// Launches the main kernel
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
eventWaitList.push_back(kernelEvent);
// Sets the arguments for the epilogue kernel
kernel2.SetArgument(0, temp_buffer());
kernel2.SetArgument(1, asum_buffer());
kernel2.SetArgument(2, static_cast<int>(asum_offset));
// Sets the arguments for the epilogue kernel
kernel2.SetArgument(0, temp_buffer());
kernel2.SetArgument(1, asum_buffer());
kernel2.SetArgument(2, static_cast<int>(asum_offset));
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
if (ErrorIn(status)) { return status; }
// Succesfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
}
// =================================================================================================

View File

@ -28,9 +28,9 @@ class Xasum: public Routine {
Xasum(Queue &queue, EventPointer event, const std::string &name = "ASUM");
// Templated-precision implementation of the routine
StatusCode DoAsum(const size_t n,
const Buffer<T> &asum_buffer, const size_t asum_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
void DoAsum(const size_t n,
const Buffer<T> &asum_buffer, const size_t asum_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
};
// =================================================================================================

View File

@ -22,29 +22,26 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xaxpy<T>::Xaxpy(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/level1.opencl"
#include "../../kernels/level1/xaxpy.opencl"
;
}) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
void Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// Makes sure all dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorY(n, y_buffer, y_offset, y_inc);
if (ErrorIn(status)) { return status; }
TestVectorX(n, x_buffer, x_offset, x_inc);
TestVectorY(n, y_buffer, y_offset, y_inc);
// Determines whether or not the fast-version can be used
bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
@ -55,45 +52,39 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
auto kernel_name = (use_fast_kernel) ? "XaxpyFast" : "Xaxpy";
// Retrieves the Xaxpy kernel from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, GetRealArg(alpha));
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, y_buffer());
}
else {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, GetRealArg(alpha));
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, static_cast<int>(x_offset));
kernel.SetArgument(4, static_cast<int>(x_inc));
kernel.SetArgument(5, y_buffer());
kernel.SetArgument(6, static_cast<int>(y_offset));
kernel.SetArgument(7, static_cast<int>(y_inc));
}
// Sets the kernel arguments
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, GetRealArg(alpha));
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, y_buffer());
}
else {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, GetRealArg(alpha));
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, static_cast<int>(x_offset));
kernel.SetArgument(4, static_cast<int>(x_inc));
kernel.SetArgument(5, y_buffer());
kernel.SetArgument(6, static_cast<int>(y_offset));
kernel.SetArgument(7, static_cast<int>(y_inc));
}
// Launches the kernel
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
if (ErrorIn(status)) { return status; }
// Succesfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the kernel
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
}
// =================================================================================================

View File

@ -28,9 +28,9 @@ class Xaxpy: public Routine {
Xaxpy(Queue &queue, EventPointer event, const std::string &name = "AXPY");
// Templated-precision implementation of the routine
StatusCode DoAxpy(const size_t n, const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
void DoAxpy(const size_t n, const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================

View File

@ -22,29 +22,26 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xcopy<T>::Xcopy(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/level1.opencl"
#include "../../kernels/level1/xcopy.opencl"
;
}) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xcopy<T>::DoCopy(const size_t n,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
void Xcopy<T>::DoCopy(const size_t n,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// Makes sure all dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorY(n, y_buffer, y_offset, y_inc);
if (ErrorIn(status)) { return status; }
TestVectorX(n, x_buffer, x_offset, x_inc);
TestVectorY(n, y_buffer, y_offset, y_inc);
// Determines whether or not the fast-version can be used
bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
@ -55,43 +52,37 @@ StatusCode Xcopy<T>::DoCopy(const size_t n,
auto kernel_name = (use_fast_kernel) ? "XcopyFast" : "Xcopy";
// Retrieves the Xcopy kernel from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, x_buffer());
kernel.SetArgument(2, y_buffer());
}
else {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, x_buffer());
kernel.SetArgument(2, static_cast<int>(x_offset));
kernel.SetArgument(3, static_cast<int>(x_inc));
kernel.SetArgument(4, y_buffer());
kernel.SetArgument(5, static_cast<int>(y_offset));
kernel.SetArgument(6, static_cast<int>(y_inc));
}
// Sets the kernel arguments
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, x_buffer());
kernel.SetArgument(2, y_buffer());
}
else {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, x_buffer());
kernel.SetArgument(2, static_cast<int>(x_offset));
kernel.SetArgument(3, static_cast<int>(x_inc));
kernel.SetArgument(4, y_buffer());
kernel.SetArgument(5, static_cast<int>(y_offset));
kernel.SetArgument(6, static_cast<int>(y_inc));
}
// Launches the kernel
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
if (ErrorIn(status)) { return status; }
// Succesfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the kernel
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
}
// =================================================================================================

View File

@ -28,9 +28,9 @@ class Xcopy: public Routine {
Xcopy(Queue &queue, EventPointer event, const std::string &name = "COPY");
// Templated-precision implementation of the routine
StatusCode DoCopy(const size_t n,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
void DoCopy(const size_t n,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================

View File

@ -22,79 +22,68 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xdot<T>::Xdot(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/xdot.opencl"
;
}) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xdot<T>::DoDot(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const bool do_conjugate) {
void Xdot<T>::DoDot(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const bool do_conjugate) {
// Makes sure all dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorY(n, y_buffer, y_offset, y_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorScalar(1, dot_buffer, dot_offset);
if (ErrorIn(status)) { return status; }
TestVectorX(n, x_buffer, x_offset, x_inc);
TestVectorY(n, y_buffer, y_offset, y_inc);
TestVectorScalar(1, dot_buffer, dot_offset);
// Retrieves the Xdot kernels from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xdot");
auto kernel2 = Kernel(program, "XdotEpilogue");
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xdot");
auto kernel2 = Kernel(program, "XdotEpilogue");
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];
auto temp_buffer = Buffer<T>(context_, temp_size);
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];
auto temp_buffer = Buffer<T>(context_, temp_size);
// Sets the kernel arguments
kernel1.SetArgument(0, static_cast<int>(n));
kernel1.SetArgument(1, x_buffer());
kernel1.SetArgument(2, static_cast<int>(x_offset));
kernel1.SetArgument(3, static_cast<int>(x_inc));
kernel1.SetArgument(4, y_buffer());
kernel1.SetArgument(5, static_cast<int>(y_offset));
kernel1.SetArgument(6, static_cast<int>(y_inc));
kernel1.SetArgument(7, temp_buffer());
kernel1.SetArgument(8, static_cast<int>(do_conjugate));
// Sets the kernel arguments
kernel1.SetArgument(0, static_cast<int>(n));
kernel1.SetArgument(1, x_buffer());
kernel1.SetArgument(2, static_cast<int>(x_offset));
kernel1.SetArgument(3, static_cast<int>(x_inc));
kernel1.SetArgument(4, y_buffer());
kernel1.SetArgument(5, static_cast<int>(y_offset));
kernel1.SetArgument(6, static_cast<int>(y_inc));
kernel1.SetArgument(7, temp_buffer());
kernel1.SetArgument(8, static_cast<int>(do_conjugate));
// Event waiting list
auto eventWaitList = std::vector<Event>();
// Event waiting list
auto eventWaitList = std::vector<Event>();
// Launches the main kernel
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(kernelEvent);
// Launches the main kernel
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
eventWaitList.push_back(kernelEvent);
// Sets the arguments for the epilogue kernel
kernel2.SetArgument(0, temp_buffer());
kernel2.SetArgument(1, dot_buffer());
kernel2.SetArgument(2, static_cast<int>(dot_offset));
// Sets the arguments for the epilogue kernel
kernel2.SetArgument(0, temp_buffer());
kernel2.SetArgument(1, dot_buffer());
kernel2.SetArgument(2, static_cast<int>(dot_offset));
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
if (ErrorIn(status)) { return status; }
// Succesfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
}
// =================================================================================================

View File

@ -28,11 +28,11 @@ class Xdot: public Routine {
Xdot(Queue &queue, EventPointer event, const std::string &name = "DOT");
// Templated-precision implementation of the routine
StatusCode DoDot(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const bool do_conjugate = false);
void DoDot(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const bool do_conjugate = false);
};
// =================================================================================================

View File

@ -29,14 +29,14 @@ Xdotc<T>::Xdotc(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
StatusCode Xdotc<T>::DoDotc(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
return DoDot(n, dot_buffer, dot_offset,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
true);
void Xdotc<T>::DoDotc(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
DoDot(n, dot_buffer, dot_offset,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
true);
}
// =================================================================================================

View File

@ -31,10 +31,10 @@ class Xdotc: public Xdot<T> {
Xdotc(Queue &queue, EventPointer event, const std::string &name = "DOTC");
// Templated-precision implementation of the routine
StatusCode DoDotc(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
void DoDotc(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================

View File

@ -28,14 +28,14 @@ Xdotu<T>::Xdotu(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
StatusCode Xdotu<T>::DoDotu(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
return DoDot(n, dot_buffer, dot_offset,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
false);
void Xdotu<T>::DoDotu(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
DoDot(n, dot_buffer, dot_offset,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
false);
}
// =================================================================================================

View File

@ -31,10 +31,10 @@ class Xdotu: public Xdot<T> {
Xdotu(Queue &queue, EventPointer event, const std::string &name = "DOTU");
// Templated-precision implementation of the routine
StatusCode DoDotu(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
void DoDotu(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================

View File

@ -35,10 +35,10 @@ class Xmax: public Xamax<T> {
// Forwards to the regular absolute version. The implementation difference is realised in the
// kernel through a pre-processor macro based on the name of the routine.
StatusCode DoMax(const size_t n,
const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
return DoAmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc);
void DoMax(const size_t n,
const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
DoAmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc);
}
};

View File

@ -35,10 +35,10 @@ class Xmin: public Xamax<T> {
// Forwards to the regular max-absolute version. The implementation difference is realised in the
// kernel through a pre-processor macro based on the name of the routine.
StatusCode DoMin(const size_t n,
const Buffer<unsigned int> &imin_buffer, const size_t imin_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
return DoAmax(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc);
void DoMin(const size_t n,
const Buffer<unsigned int> &imin_buffer, const size_t imin_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
DoAmax(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc);
}
};

View File

@ -22,71 +22,61 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xnrm2<T>::Xnrm2(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/xnrm2.opencl"
;
}) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xnrm2<T>::DoNrm2(const size_t n,
const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
void Xnrm2<T>::DoNrm2(const size_t n,
const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
// Makes sure all dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorScalar(1, nrm2_buffer, nrm2_offset);
if (ErrorIn(status)) { return status; }
TestVectorX(n, x_buffer, x_offset, x_inc);
TestVectorScalar(1, nrm2_buffer, nrm2_offset);
// Retrieves the Xnrm2 kernels from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xnrm2");
auto kernel2 = Kernel(program, "Xnrm2Epilogue");
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xnrm2");
auto kernel2 = Kernel(program, "Xnrm2Epilogue");
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];
auto temp_buffer = Buffer<T>(context_, temp_size);
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];
auto temp_buffer = Buffer<T>(context_, temp_size);
// Sets the kernel arguments
kernel1.SetArgument(0, static_cast<int>(n));
kernel1.SetArgument(1, x_buffer());
kernel1.SetArgument(2, static_cast<int>(x_offset));
kernel1.SetArgument(3, static_cast<int>(x_inc));
kernel1.SetArgument(4, temp_buffer());
// Sets the kernel arguments
kernel1.SetArgument(0, static_cast<int>(n));
kernel1.SetArgument(1, x_buffer());
kernel1.SetArgument(2, static_cast<int>(x_offset));
kernel1.SetArgument(3, static_cast<int>(x_inc));
kernel1.SetArgument(4, temp_buffer());
// Event waiting list
auto eventWaitList = std::vector<Event>();
// Event waiting list
auto eventWaitList = std::vector<Event>();
// Launches the main kernel
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(kernelEvent);
// Launches the main kernel
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
eventWaitList.push_back(kernelEvent);
// Sets the arguments for the epilogue kernel
kernel2.SetArgument(0, temp_buffer());
kernel2.SetArgument(1, nrm2_buffer());
kernel2.SetArgument(2, static_cast<int>(nrm2_offset));
// Sets the arguments for the epilogue kernel
kernel2.SetArgument(0, temp_buffer());
kernel2.SetArgument(1, nrm2_buffer());
kernel2.SetArgument(2, static_cast<int>(nrm2_offset));
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
if (ErrorIn(status)) { return status; }
// Succesfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
}
// =================================================================================================

View File

@ -28,9 +28,9 @@ class Xnrm2: public Routine {
Xnrm2(Queue &queue, EventPointer event, const std::string &name = "NRM2");
// Templated-precision implementation of the routine
StatusCode DoNrm2(const size_t n,
const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
void DoNrm2(const size_t n,
const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
};
// =================================================================================================

View File

@ -22,26 +22,24 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xscal<T>::Xscal(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/level1.opencl"
#include "../../kernels/level1/xscal.opencl"
;
}) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xscal<T>::DoScal(const size_t n, const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
void Xscal<T>::DoScal(const size_t n, const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
// Makes sure all dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vector for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
TestVectorX(n, x_buffer, x_offset, x_inc);
// Determines whether or not the fast-version can be used
bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
@ -51,41 +49,35 @@ StatusCode Xscal<T>::DoScal(const size_t n, const T alpha,
auto kernel_name = (use_fast_kernel) ? "XscalFast" : "Xscal";
// Retrieves the Xscal kernel from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, alpha);
kernel.SetArgument(2, x_buffer());
}
else {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, alpha);
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, static_cast<int>(x_offset));
kernel.SetArgument(4, static_cast<int>(x_inc));
}
// Sets the kernel arguments
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, GetRealArg(alpha));
kernel.SetArgument(2, x_buffer());
}
else {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, GetRealArg(alpha));
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, static_cast<int>(x_offset));
kernel.SetArgument(4, static_cast<int>(x_inc));
}
// Launches the kernel
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
if (ErrorIn(status)) { return status; }
// Succesfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the kernel
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
}
// =================================================================================================

View File

@ -28,8 +28,8 @@ class Xscal: public Routine {
Xscal(Queue &queue, EventPointer event, const std::string &name = "SCAL");
// Templated-precision implementation of the routine
StatusCode DoScal(const size_t n, const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
void DoScal(const size_t n, const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
};
// =================================================================================================

View File

@ -35,10 +35,10 @@ class Xsum: public Xasum<T> {
// Forwards to the regular absolute version. The implementation difference is realised in the
// kernel through a pre-processor macro based on the name of the routine.
StatusCode DoSum(const size_t n,
const Buffer<T> &sum_buffer, const size_t sum_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
return DoAsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc);
void DoSum(const size_t n,
const Buffer<T> &sum_buffer, const size_t sum_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
DoAsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc);
}
};

View File

@ -22,29 +22,26 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xswap<T>::Xswap(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/level1.opencl"
#include "../../kernels/level1/xswap.opencl"
;
}) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xswap<T>::DoSwap(const size_t n,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
void Xswap<T>::DoSwap(const size_t n,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// Makes sure all dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorY(n, y_buffer, y_offset, y_inc);
if (ErrorIn(status)) { return status; }
TestVectorX(n, x_buffer, x_offset, x_inc);
TestVectorY(n, y_buffer, y_offset, y_inc);
// Determines whether or not the fast-version can be used
bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
@ -55,43 +52,37 @@ StatusCode Xswap<T>::DoSwap(const size_t n,
auto kernel_name = (use_fast_kernel) ? "XswapFast" : "Xswap";
// Retrieves the Xswap kernel from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, x_buffer());
kernel.SetArgument(2, y_buffer());
}
else {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, x_buffer());
kernel.SetArgument(2, static_cast<int>(x_offset));
kernel.SetArgument(3, static_cast<int>(x_inc));
kernel.SetArgument(4, y_buffer());
kernel.SetArgument(5, static_cast<int>(y_offset));
kernel.SetArgument(6, static_cast<int>(y_inc));
}
// Sets the kernel arguments
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, x_buffer());
kernel.SetArgument(2, y_buffer());
}
else {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, x_buffer());
kernel.SetArgument(2, static_cast<int>(x_offset));
kernel.SetArgument(3, static_cast<int>(x_inc));
kernel.SetArgument(4, y_buffer());
kernel.SetArgument(5, static_cast<int>(y_offset));
kernel.SetArgument(6, static_cast<int>(y_inc));
}
// Launches the kernel
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
if (ErrorIn(status)) { return status; }
// Succesfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the kernel
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
}
// =================================================================================================

Some files were not shown because too many files have changed in this diff Show More