Merge branch 'development' into netlib_blas_api

Conflicts:
	scripts/generator/generator.py
	scripts/generator/generator/routine.py
pull/125/head
Cedric Nugteren 2016-10-25 09:34:24 +02:00
commit 3b65eace0a
153 changed files with 8633 additions and 7576 deletions

View File

@ -1,8 +1,11 @@
Development version (next release)
- Updated to version 8.0 of the CLCudaAPI C++11 OpenCL header
- Changed the enums in the C API to avoid potential name clashes with external code
- Greatly improved the way exceptions are handled in the library (thanks to 'intelfx')
- Improved performance of GEMM kernels for small sizes by using a direct single-kernel implementation
- Fixed a bug in the tests and samples related to waiting for an invalid event
- Fixed a bug in the SYRK/SYR2K/HERK/HER2K routines that would occur with specific tuning parameters
- Added support for compilation under Visual Studio 2013 (MSVC++ 12.0)
- Added an option to set OpenCL compiler options through the env variable CLBLAST_BUILD_OPTIONS
- Added an option to run tuned kernels multiple times to average execution times

View File

@ -69,9 +69,7 @@ endif()
if(MSVC)
if(BUILD_SHARED_LIBS)
add_definitions(" /DCLBLAST_DLL")
else(BUILD_SHARED_LIBS)
add_definitions(" /DCLBLAST_STATIC")
endif(BUILD_SHARED_LIBS)
endif()
endif(MSVC)
# C++ compiler settings
@ -167,11 +165,12 @@ set(PRECISIONS 32 64 3232 6464 16)
set(SOURCES
src/database/database.cpp
src/routines/common.cpp
src/utilities/clblast_exceptions.cpp
src/utilities/utilities.cpp
src/cache.cpp
src/clblast.cpp
src/clblast_c.cpp
src/routine.cpp
src/utilities.cpp
)
foreach(ROUTINE ${LEVEL1_ROUTINES})
set(SOURCES ${SOURCES} src/routines/level1/${ROUTINE}.cpp)
@ -191,7 +190,7 @@ if(BUILD_SHARED_LIBS)
add_library(clblast SHARED ${SOURCES})
else(BUILD_SHARED_LIBS)
add_library(clblast STATIC ${SOURCES})
endif(BUILD_SHARED_LIBS)
endif()
target_link_libraries(clblast ${OPENCL_LIBRARIES})
@ -206,7 +205,7 @@ target_include_directories(clblast PUBLIC
if(MSVC)
if(BUILD_SHARED_LIBS)
target_compile_definitions(clblast PRIVATE COMPILING_DLL=1) # requires at least CMake 2.8.11
endif(BUILD_SHARED_LIBS)
endif()
endif()
# Installs the library
@ -218,9 +217,17 @@ install(FILES include/clblast_half.h DESTINATION include)
# Installs the config for find_package in dependent projects
install(EXPORT CLBlast DESTINATION lib/cmake/CLBLast FILE CLBlastConfig.cmake)
# Installs the pkg-config file on UNIX-like systems (not only Linux)
if(UNIX)
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/clblast.pc.in"
"${CMAKE_CURRENT_BINARY_DIR}/clblast.pc" @ONLY IMMEDIATE)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/clblast.pc
DESTINATION lib/pkgconfig)
endif()
# ==================================================================================================
# Sets a default platform ($DEVICEPLATFORM) and device ($CLBLAST_DEVICE) to run tuners and tests on
# Sets a default platform ($CLBLAST_PLATFORM) and device ($CLBLAST_DEVICE) to run tuners and tests
set(DEVICEPLATFORM )
if(DEFINED ENV{CLBLAST_DEVICE})
set(DEVICEPLATFORM ${DEVICEPLATFORM} -device $ENV{CLBLAST_DEVICE})
@ -229,6 +236,12 @@ if(DEFINED ENV{CLBLAST_PLATFORM})
set(DEVICEPLATFORM ${DEVICEPLATFORM} -platform $ENV{CLBLAST_PLATFORM})
endif()
# Optionally also provides other options to the tests such as -full_test ($CLBLAST_TEST_ARGUMENTS)
set(TEST_ARGUMENTS )
if(DEFINED ENV{CLBLAST_TEST_ARGUMENTS})
set(TEST_ARGUMENTS $ENV{CLBLAST_TEST_ARGUMENTS})
endif()
# ==================================================================================================
# This section contains all the code related to the examples
@ -262,7 +275,7 @@ if(TUNERS)
# Visual Studio requires the sources of non-exported objects/libraries
set(TUNERS_COMMON )
if(MSVC)
set(TUNERS_COMMON ${TUNERS_COMMON} src/utilities.cpp)
set(TUNERS_COMMON ${TUNERS_COMMON} src/utilities/utilities.cpp)
endif()
# Adds tuning executables
@ -298,7 +311,7 @@ if(CLIENTS OR TESTS)
find_package(Threads)
set(REF_LIBRARIES ${REF_LIBRARIES} ${CLBLAS_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
set(REF_INCLUDES ${REF_INCLUDES} ${CLBLAS_INCLUDE_DIRS})
if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
if(MSVC)
add_definitions(" /DCLBLAST_REF_CLBLAS")
else()
add_definitions(" -DCLBLAST_REF_CLBLAS")
@ -307,7 +320,7 @@ if(CLIENTS OR TESTS)
if(CBLAS_FOUND)
set(REF_INCLUDES ${REF_INCLUDES} ${CBLAS_INCLUDE_DIRS})
set(REF_LIBRARIES ${REF_LIBRARIES} ${CBLAS_LIBRARIES})
if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
if(MSVC)
add_definitions(" /DCLBLAST_REF_CBLAS")
else()
add_definitions(" -DCLBLAST_REF_CBLAS")
@ -325,7 +338,7 @@ if(CLIENTS)
# Visual Studio requires the sources of non-exported objects/libraries
set(CLIENTS_COMMON )
if(MSVC)
set(CLIENTS_COMMON ${CLIENTS_COMMON} src/utilities.cpp test/performance/client.cpp)
set(CLIENTS_COMMON ${CLIENTS_COMMON} src/utilities/utilities.cpp test/performance/client.cpp)
else()
# Creates the common performance-tests objects (requires CMake 2.8.8)
add_library(test_performance_common OBJECT test/performance/client.cpp)
@ -372,7 +385,7 @@ if(TESTS)
# Visual Studio requires the sources of non-exported objects/libraries
set(TESTS_COMMON )
if(MSVC)
set(TESTS_COMMON ${TESTS_COMMON} src/utilities.cpp
set(TESTS_COMMON ${TESTS_COMMON} src/utilities/utilities.cpp
test/correctness/tester.cpp test/correctness/testblas.cpp)
else()
# Creates the common correctness-tests objects (requires CMake 2.8.8)
@ -405,14 +418,14 @@ if(TESTS)
target_link_libraries(clblast_test_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
install(TARGETS clblast_test_${ROUTINE} DESTINATION bin)
target_include_directories(clblast_test_${ROUTINE} PUBLIC ${clblast_SOURCE_DIR} ${REF_INCLUDES})
add_test(clblast_test_${ROUTINE} clblast_test_${ROUTINE} ${DEVICEPLATFORM})
add_test(clblast_test_${ROUTINE} clblast_test_${ROUTINE} ${DEVICEPLATFORM} ${TEST_ARGUMENTS})
endforeach()
# Adds 'alltests' target: runs all tests
set(ALLTESTS )
set(ALLTESTSDEPENDS )
foreach(ROUTINE ${ROUTINES})
set(ALLTESTS ${ALLTESTS} COMMAND clblast_test_${ROUTINE} ${DEVICEPLATFORM})
set(ALLTESTS ${ALLTESTS} COMMAND clblast_test_${ROUTINE} ${DEVICEPLATFORM} ${TEST_ARGUMENTS})
set(ALLTESTSDEPENDS clblast_test_${ROUTINE})
endforeach()
add_custom_target(alltests ${ALLTESTS} DEPENDS ${ALLTESTSDEPENDS})

20
CONTRIBUTING.md 100644
View File

@ -0,0 +1,20 @@
CLBlast: Contributing guidelines
================
For information about the CLBlast library, see the [README](README.md) file instead.
Tuning results
-------------
A [dedicated GitHub issue](https://github.com/CNugteren/CLBlast/issues/1) is available to post new tuning results. If you compiled with the tuners (see the [README](README.md) for instructions), ran one of the tuners on your device (or all perhaps?), and feel that these results should be included in the next release of CLBlast, please post them there. You can do this by attaching the JSON files to the issue (archived in a .ZIP file).
Code improvements and additions
-------------
Pull requests are welcome as long as they:
* Contain unit additions or modifications
* Follow the CLBlast coding style, which is loosely based on the [Google C++ style guide](https://google-styleguide.googlecode.com/svn/trunk/cppguide.html) and the Effective C++ books by Scott Meyers. We use a tab-size of 2 spaces and a max-width of 100 characters.
* Are made against the `development` branch.

214
LICENSE
View File

@ -1,21 +1,201 @@
MIT License
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
Copyright (c) 2016 Cedric Nugteren
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
1. Definitions.
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "{}"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2015 Cedric Nugteren
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@ -111,8 +111,9 @@ The CLBlast library will be tuned in the future for the most commonly used OpenC
- GeForce GTX 750 Ti
- GeForce GTX 980
- GeForce GTX 1070
- GeForce GTX Titan
- GeForce GTX Titan X
- GeForce GTX TITAN
- GeForce GTX TITAN Black
- GeForce GTX TITAN X
- Tesla K20m
- Tesla K40m
* AMD GPUs:
@ -121,6 +122,7 @@ The CLBlast library will be tuned in the future for the most commonly used OpenC
- Oland
- Pitcairn
- Tahiti
- Tonga
* Intel GPUs:
- HD Graphics 530
- HD Graphics 5500 BroadWell U-Processor GT2
@ -175,7 +177,7 @@ To build these tests, another BLAS library is needed to serve as a reference. Th
Afterwards, executables in the form of `clblast_test_xxxxx` are available, in which `xxxxx` is the name of a routine (e.g. `xgemm`). Note that CLBlast is tested for correctness against [clBLAS](http://github.com/clMathLibraries/clBLAS) and/or a regular CPU BLAS library. If both are installed on your system, setting the command-line option `-clblas 1` or `-cblas 1` will select the library to test against for the `clblast_test_xxxxx` executables. All tests have a `-verbose` option to enable additional diagnostic output. They also have a `-full_test` option to increase coverage further.
All tests can be run directly together in one go through the `make alltests` target or using CTest (`make test` or `ctest`). In the latter case the output is less verbose. Both cases allow you to set the default device and platform to non-zero by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables before running CMake.
All tests can be run directly together in one go through the `make alltests` target or using CTest (`make test` or `ctest`). In the latter case the output is less verbose. Both cases allow you to set the default device and platform to non-zero by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environment variables before running CMake. Further options (e.g. `-full_test`) can be supplied through the `CLBLAST_TEST_ARGUMENTS` environment variable.
Compiling the performance tests/clients (optional)
@ -284,7 +286,7 @@ The `samples/haxpy.c` example shows how to use these convencience functions when
Contributing
-------------
Contributions are welcome in the form of tuning results for OpenCL devices previously untested. Furthermore, merge requests are welcome as long as they contain unit additions or modifications. Furthermore, they should follow the CLBlast coding style, which is based on the [Google C++ style guide](https://google-styleguide.googlecode.com/svn/trunk/cppguide.html) and the Effective C++ books by Scott Meyers.
Contributions are welcome in the form of tuning results for OpenCL devices previously untested or pull requests. See [the contributing guidelines](CONTRIBUTING.md) for more details.
The contributing authors (code, pull requests, testing) so far are:
@ -296,6 +298,7 @@ The contributing authors (code, pull requests, testing) so far are:
* [Gian-Carlo Pascutto](https://github.com/gcp)
* [Ivan Shapovalov](https://github.com/intelfx)
* [Dimitri Van Assche](https://github.com/dvasschemacq)
* [Shehzan Mohammed](https://shehzan10.github.io)
Tuning and testing on a variety of OpenCL devices was made possible by:
@ -303,6 +306,7 @@ Tuning and testing on a variety of OpenCL devices was made possible by:
* [ASCI DAS4 and DAS5](http://www.cs.vu.nl/das4/)
* [dividiti](http://www.dividiti.com)
* [SURFsara HPC center](http://www.surfsara.com)
* [ArrayFire](http://arrayfire.org)
Support us

10
clblast.pc.in 100644
View File

@ -0,0 +1,10 @@
# pkg-config template for CLBlast. The @...@ placeholders are filled in at configure time;
# the ${...} references are left untouched and are resolved by pkg-config itself.
prefix=@CMAKE_INSTALL_PREFIX@
exec_prefix=${prefix}
includedir=${prefix}/include
libdir=${exec_prefix}/lib
# Package metadata reported by pkg-config
Name: CLBlast
Description: CLBlast is a modern, lightweight, performant and tunable OpenCL BLAS library written in C++11
Version: @clblast_VERSION_MAJOR@.@clblast_VERSION_MINOR@.@clblast_VERSION_PATCH@
# Linker and compiler flags handed to projects that depend on CLBlast
Libs: -L${libdir} -lclblast
Cflags: -I${includedir}

File diff suppressed because it is too large Load Diff

View File

@ -46,14 +46,34 @@ enum class StatusCode {
// Status codes in common with the OpenCL standard
kSuccess = 0, // CL_SUCCESS
kOpenCLCompilerNotAvailable= -3, // CL_COMPILER_NOT_AVAILABLE
kTempBufferAllocFailure = -4, // CL_MEM_OBJECT_ALLOCATION_FAILURE
kBuildProgramFailure = -11, // CL_BUILD_PROGRAM_FAILURE: OpenCL compilation error
kOpenCLOutOfResources = -5, // CL_OUT_OF_RESOURCES
kOpenCLOutOfHostMemory = -6, // CL_OUT_OF_HOST_MEMORY
kOpenCLBuildProgramFailure = -11, // CL_BUILD_PROGRAM_FAILURE: OpenCL compilation error
kInvalidValue = -30, // CL_INVALID_VALUE
kInvalidCommandQueue = -36, // CL_INVALID_COMMAND_QUEUE
kInvalidMemObject = -38, // CL_INVALID_MEM_OBJECT
kInvalidBinary = -42, // CL_INVALID_BINARY
kInvalidBuildOptions = -43, // CL_INVALID_BUILD_OPTIONS
kInvalidProgram = -44, // CL_INVALID_PROGRAM
kInvalidProgramExecutable = -45, // CL_INVALID_PROGRAM_EXECUTABLE
kInvalidKernelName = -46, // CL_INVALID_KERNEL_NAME
kInvalidKernelDefinition = -47, // CL_INVALID_KERNEL_DEFINITION
kInvalidKernel = -48, // CL_INVALID_KERNEL
kInvalidArgIndex = -49, // CL_INVALID_ARG_INDEX
kInvalidArgValue = -50, // CL_INVALID_ARG_VALUE
kInvalidArgSize = -51, // CL_INVALID_ARG_SIZE
kInvalidKernelArgs = -52, // CL_INVALID_KERNEL_ARGS
kInvalidLocalNumDimensions = -53, // CL_INVALID_WORK_DIMENSION: Too many thread dimensions
kInvalidLocalThreadsTotal = -54, // CL_INVALID_WORK_GROUP_SIZE: Too many threads in total
kInvalidLocalThreadsDim = -55, // CL_INVALID_WORK_ITEM_SIZE: ... or for a specific dimension
kInvalidTempBufferSize = -61, // CL_INVALID_BUFFER_SIZE
kInvalidGlobalOffset = -56, // CL_INVALID_GLOBAL_OFFSET
kInvalidEventWaitList = -57, // CL_INVALID_EVENT_WAIT_LIST
kInvalidEvent = -58, // CL_INVALID_EVENT
kInvalidOperation = -59, // CL_INVALID_OPERATION
kInvalidBufferSize = -61, // CL_INVALID_BUFFER_SIZE
kInvalidGlobalWorkSize = -63, // CL_INVALID_GLOBAL_WORK_SIZE
// Status codes in common with the clBLAS library
kNotImplemented = -1024, // Routine or functionality not implemented yet
@ -75,13 +95,14 @@ enum class StatusCode {
kInsufficientMemoryY = -1007, // Vector Y's OpenCL buffer is too small
// Custom additional status codes for CLBlast
kKernelLaunchError = -2048, // Problem occurred when enqueuing the kernel
kKernelRunError = -2047, // Problem occurred while running the kernel
kInvalidLocalMemUsage = -2046, // Not enough local memory available on this device
kNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device
kNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device
kInvalidVectorScalar = -2043, // The unit-sized vector is not a valid OpenCL buffer
kInsufficientMemoryScalar = -2042, // The unit-sized vector's OpenCL buffer is too small
kDatabaseError = -2041, // Entry for the device was not found in the database
kUnknownError = -2040, // A catch-all error code representing an unspecified error
kUnexpectedError = -2039, // A catch-all error code representing an unexpected exception
};
// Matrix layout and transpose types

File diff suppressed because it is too large Load Diff

View File

@ -106,13 +106,13 @@ void run_example_routine(const cl_device_id device) {
clock_t start = clock();
// Calls an example routine
StatusCode status = CLBlastSasum(n,
device_output, 0,
device_input, 0, 1,
&queue, &event);
CLBlastStatusCode status = CLBlastSasum(n,
device_output, 0,
device_input, 0, 1,
&queue, &event);
// Wait for completion
if (status == kSuccess) {
if (status == CLBlastSuccess) {
clWaitForEvents(1, &event);
clReleaseEvent(event);
}

View File

@ -74,17 +74,17 @@ int main(void) {
clEnqueueWriteBuffer(queue, device_y, CL_TRUE, 0, m*sizeof(double), host_y, 0, NULL, NULL);
// Call the DGEMV routine.
StatusCode status = CLBlastDgemv(kRowMajor, kNo,
m, n,
alpha,
device_a, 0, a_ld,
device_x, 0, 1,
beta,
device_y, 0, 1,
&queue, &event);
CLBlastStatusCode status = CLBlastDgemv(CLBlastLayoutRowMajor, CLBlastTransposeNo,
m, n,
alpha,
device_a, 0, a_ld,
device_x, 0, 1,
beta,
device_y, 0, 1,
&queue, &event);
// Wait for completion
if (status == kSuccess) {
if (status == CLBlastSuccess) {
clWaitForEvents(1, &event);
clReleaseEvent(event);
}

View File

@ -71,13 +71,13 @@ int main(void) {
clEnqueueWriteBuffer(queue, device_b, CL_TRUE, 0, n*sizeof(cl_half), host_b, 0, NULL, NULL);
// Call the HAXPY routine.
StatusCode status = CLBlastHaxpy(n, alpha,
device_a, 0, 1,
device_b, 0, 1,
&queue, &event);
CLBlastStatusCode status = CLBlastHaxpy(n, alpha,
device_a, 0, 1,
device_b, 0, 1,
&queue, &event);
// Wait for completion
if (status == kSuccess) {
if (status == CLBlastSuccess) {
clWaitForEvents(1, &event);
clReleaseEvent(event);
}

View File

@ -67,13 +67,13 @@ int main(void) {
clEnqueueWriteBuffer(queue, device_output, CL_TRUE, 0, 1*sizeof(float), host_output, 0, NULL, NULL);
// Call the SASUM routine.
StatusCode status = CLBlastSasum(n,
device_output, 0,
device_input, 0, 1,
&queue, &event);
CLBlastStatusCode status = CLBlastSasum(n,
device_output, 0,
device_input, 0, 1,
&queue, &event);
// Wait for completion
if (status == kSuccess) {
if (status == CLBlastSuccess) {
clWaitForEvents(1, &event);
clReleaseEvent(event);
}

View File

@ -77,17 +77,18 @@ int main(void) {
clEnqueueWriteBuffer(queue, device_c, CL_TRUE, 0, m*n*sizeof(float), host_c, 0, NULL, NULL);
// Call the SGEMM routine.
StatusCode status = CLBlastSgemm(kRowMajor, kNo, kNo,
m, n, k,
alpha,
device_a, 0, a_ld,
device_b, 0, b_ld,
beta,
device_c, 0, c_ld,
&queue, &event);
CLBlastStatusCode status = CLBlastSgemm(CLBlastLayoutRowMajor,
CLBlastTransposeNo, CLBlastTransposeNo,
m, n, k,
alpha,
device_a, 0, a_ld,
device_b, 0, b_ld,
beta,
device_c, 0, c_ld,
&queue, &event);
// Wait for completion
if (status == kSuccess) {
if (status == CLBlastSuccess) {
clWaitForEvents(1, &event);
clReleaseEvent(event);
}

View File

@ -31,9 +31,18 @@ import generator.doc as doc
from generator.routine import Routine
from generator.datatype import H, S, D, C, Z, Sc, Dz, iH, iS, iD, iC, iZ, Css, Zdd, Ccs, Zzd, T, Tc, TU
HEADER_LINES = [96, 73, 97, 22, 29, 41, 43, 1]
FOOTER_LINES = [17, 75, 19, 14, 6, 6, 10, 1]
FILES = [
"/include/clblast.h",
"/src/clblast.cpp",
"/include/clblast_c.h",
"/src/clblast_c.cpp",
"/test/wrapper_clblas.hpp",
"/test/wrapper_cblas.hpp",
"/include/clblast_blas.h",
"/src/clblast_blas.cpp",
]
HEADER_LINES = [117, 73, 118, 22, 29, 41, 43, 1]
FOOTER_LINES = [17, 80, 19, 18, 6, 6, 10, 1]
# Different possibilities for requirements
ald_m = "The value of `a_ld` must be at least `m`."
@ -126,35 +135,23 @@ def main(argv):
cl_args = parser.parse_args(argv)
library_root = cl_args.clblast_root
# Sets all the files the output
files = [
library_root + "/include/clblast.h",
library_root + "/src/clblast.cpp",
library_root + "/include/clblast_c.h",
library_root + "/src/clblast_c.cpp",
library_root + "/test/wrapper_clblas.hpp",
library_root + "/test/wrapper_cblas.hpp",
library_root + "/include/clblast_blas.h",
library_root + "/src/clblast_blas.cpp",
]
# Checks whether the command-line arguments are valid; exits otherwise
for f in files:
if not os.path.isfile(f):
for f in FILES:
if not os.path.isfile(library_root + f):
print("[ERROR] The path '" + library_root + "' does not point to the root of the CLBlast library")
sys.exit()
# Iterates over all regular files to output
for i in range(0, len(files)):
for i in range(0, len(FILES)):
# Stores the header and the footer of the original file
with open(files[i]) as f:
with open(library_root + FILES[i]) as f:
original = f.readlines()
file_header = original[:HEADER_LINES[i]]
file_footer = original[-FOOTER_LINES[i]:]
# Re-writes the body of the file
with open(files[i], "w") as f:
with open(library_root + FILES[i], "w") as f:
body = ""
levels = [1, 2, 3] if (i == 4 or i == 5) else [1, 2, 3, 4]
for level in levels:

View File

@ -45,17 +45,18 @@ def clblast_h(routine):
def clblast_cc(routine):
"""The C++ API implementation (.cpp)"""
indent1 = " " * (20 + routine.length())
indent1 = " " * (15 + routine.length())
result = NL + "// " + routine.description + ": " + routine.short_names() + NL
if routine.implemented:
result += routine.routine_header_cpp(12, "") + " {" + NL
result += " auto queue_cpp = Queue(*queue);" + NL
result += " auto routine = X" + routine.name + "<" + routine.template.template + ">(queue_cpp, event);" + NL
result += " auto status = routine.SetUp();" + NL
result += " if (status != StatusCode::kSuccess) { return status; }" + NL
result += " return routine.Do" + routine.name.capitalize() + "("
result += " try {" + NL
result += " auto queue_cpp = Queue(*queue);" + NL
result += " auto routine = X" + routine.name + "<" + routine.template.template + ">(queue_cpp, event);" + NL
result += " routine.Do" + routine.name.capitalize() + "("
result += ("," + NL + indent1).join([a for a in routine.arguments_clcudaapi()])
result += ");" + NL
result += " return StatusCode::kSuccess;" + NL
result += " } catch (...) { return DispatchException(); }" + NL
else:
result += routine.routine_header_type_cpp(12) + " {" + NL
result += " return StatusCode::kNotImplemented;" + NL
@ -72,7 +73,7 @@ def clblast_c_h(routine):
"""The C API header (.h)"""
result = NL + "// " + routine.description + ": " + routine.short_names() + NL
for flavour in routine.flavours:
result += routine.routine_header_c(flavour, 31, " PUBLIC_API") + ";" + NL
result += routine.routine_header_c(flavour, 38, " PUBLIC_API") + ";" + NL
return result
@ -81,12 +82,16 @@ def clblast_c_cc(routine):
result = NL + "// " + routine.name.upper() + NL
for flavour in routine.flavours:
template = "<" + flavour.template + ">" if routine.no_scalars() else ""
indent = " " * (26 + routine.length() + len(template))
result += routine.routine_header_c(flavour, 20, "") + " {" + NL
result += " auto status = clblast::" + routine.name.capitalize() + template + "("
indent = " " * (16 + routine.length() + len(template))
result += routine.routine_header_c(flavour, 27, "") + " {" + NL
result += " try {" + NL
result += " return static_cast<CLBlastStatusCode>(" + NL
result += " clblast::" + routine.name.capitalize() + template + "("
result += ("," + NL + indent).join([a for a in routine.arguments_cast(flavour, indent)])
result += "," + NL + indent + "queue, event);"
result += NL + " return static_cast<StatusCode>(status);" + NL + "}" + NL
result += "," + NL + indent + "queue, event)" + NL
result += " );" + NL
result += " } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); }" + NL
result += "}" + NL
return result

View File

@ -32,7 +32,7 @@ def generate(routine):
result += "C API:" + NL
result += "```" + NL
for flavour in routine.flavours:
result += routine.routine_header_c(flavour, 20, "") + NL
result += routine.routine_header_c(flavour, 27, "") + NL
result += "```" + NL + NL
# Routine arguments

View File

@ -390,6 +390,13 @@ class Routine:
return [", ".join(definitions)]
return []
def options_def_c(self):
    """Returns the routine's option arguments as declared in the C API.

    The result is a list holding a single comma-separated string in which each option is
    written as 'const CLBlast' + convert.option_to_clblast(option) + ' ' + option. An empty
    list is returned when the routine has no options.
    """
    if not self.options:
        return []
    declarations = []
    for option in self.options:
        declarations.append("const CLBlast" + convert.option_to_clblast(option) + " " + option)
    return [", ".join(declarations)]
def options_def_wrapper_clblas(self):
"""As above, but now using clBLAS data-types"""
if self.options:
@ -505,6 +512,17 @@ class Routine:
list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.scalar_buffers_second()])) +
list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()])))
def arguments_def_c(self, flavour):
    """Builds the flat list of argument definitions used by the C API.

    The pieces are concatenated in this order: options (via options_def_c), sizes, the first
    scalar buffers, the 'alpha' scalar, the first buffers, the 'beta' scalar, the second
    buffers, the second scalar buffers and finally the remaining scalars.
    """
    def buffer_definitions(names):
        # Flattens the per-buffer definition lists into a single list
        return list(chain.from_iterable([self.buffer_def(name) for name in names]))

    options_and_sizes = self.options_def_c() + self.sizes_def()
    leading_scalar_buffers = buffer_definitions(self.scalar_buffers_first())
    alpha = self.scalar_def("alpha", flavour)
    leading_buffers = buffer_definitions(self.buffers_first())
    beta = self.scalar_def("beta", flavour)
    trailing_buffers = buffer_definitions(self.buffers_second())
    trailing_scalar_buffers = buffer_definitions(self.scalar_buffers_second())
    remaining_scalars = list(chain.from_iterable(
        [self.scalar_def(scalar, flavour) for scalar in self.other_scalars()]))
    return (options_and_sizes + leading_scalar_buffers + alpha + leading_buffers + beta +
            trailing_buffers + trailing_scalar_buffers + remaining_scalars)
def arguments_def_wrapper_clblas(self, flavour):
"""As above, but clBLAS wrapper plain data-types"""
return (self.options_def_wrapper_clblas() + self.sizes_def() +
@ -575,8 +593,8 @@ class Routine:
def routine_header_c(self, flavour, spaces, extra_qualifier):
"""As above, but now for C"""
indent = " " * (spaces + self.length())
result = "StatusCode" + extra_qualifier + " CLBlast" + flavour.name + self.name + "("
result += (",\n" + indent).join([a for a in self.arguments_def(flavour)])
result = "CLBlastStatusCode" + extra_qualifier + " CLBlast" + flavour.name + self.name + "("
result += (",\n" + indent).join([a for a in self.arguments_def_c(flavour)])
result += ",\n" + indent + "cl_command_queue* queue, cl_event* event)"
return result

View File

@ -1,121 +0,0 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file implements the tests for the OpenCL buffers (matrices and vectors). These tests are
// templated and thus header-only.
//
// =================================================================================================
#ifndef CLBLAST_BUFFER_TEST_H_
#define CLBLAST_BUFFER_TEST_H_
#include "clblast.h"
namespace clblast {
// =================================================================================================
// Validates matrix 'A'. Returns kInvalidLeadDimA when 'ld' is smaller than 'one',
// kInsufficientMemoryA when buffer.GetSize() is less than the size needed for
// (ld * (two - 1) + one + offset) elements of type T, and kSuccess otherwise.
template <typename T>
StatusCode TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer,
                       const size_t offset, const size_t ld) {
  if (ld < one) { return StatusCode::kInvalidLeadDimA; }
  auto status = StatusCode::kSuccess;
  try {
    const auto num_elements = ld * (two - 1) + one + offset;
    const auto bytes_needed = num_elements * sizeof(T);
    if (buffer.GetSize() < bytes_needed) { status = StatusCode::kInsufficientMemoryA; }
  } catch (...) {
    // Any exception raised during the size check is reported as an invalid matrix
    status = StatusCode::kInvalidMatrixA;
  }
  return status;
}
// Validates matrix 'B'. Returns kInvalidLeadDimB when 'ld' is smaller than 'one',
// kInsufficientMemoryB when buffer.GetSize() is less than the size needed for
// (ld * (two - 1) + one + offset) elements of type T, and kSuccess otherwise.
template <typename T>
StatusCode TestMatrixB(const size_t one, const size_t two, const Buffer<T> &buffer,
                       const size_t offset, const size_t ld) {
  if (ld < one) { return StatusCode::kInvalidLeadDimB; }
  auto status = StatusCode::kSuccess;
  try {
    const auto num_elements = ld * (two - 1) + one + offset;
    const auto bytes_needed = num_elements * sizeof(T);
    if (buffer.GetSize() < bytes_needed) { status = StatusCode::kInsufficientMemoryB; }
  } catch (...) {
    // Any exception raised during the size check is reported as an invalid matrix
    status = StatusCode::kInvalidMatrixB;
  }
  return status;
}
// Validates matrix 'C'. Returns kInvalidLeadDimC when 'ld' is smaller than 'one',
// kInsufficientMemoryC when buffer.GetSize() is less than the size needed for
// (ld * (two - 1) + one + offset) elements of type T, and kSuccess otherwise.
template <typename T>
StatusCode TestMatrixC(const size_t one, const size_t two, const Buffer<T> &buffer,
                       const size_t offset, const size_t ld) {
  if (ld < one) { return StatusCode::kInvalidLeadDimC; }
  auto status = StatusCode::kSuccess;
  try {
    const auto num_elements = ld * (two - 1) + one + offset;
    const auto bytes_needed = num_elements * sizeof(T);
    if (buffer.GetSize() < bytes_needed) { status = StatusCode::kInsufficientMemoryC; }
  } catch (...) {
    // Any exception raised during the size check is reported as an invalid matrix
    status = StatusCode::kInvalidMatrixC;
  }
  return status;
}
// Validates matrix 'AP'. Returns kInsufficientMemoryA when buffer.GetSize() is less than the
// size needed for ((n * (n + 1)) / 2 + offset) elements of type T, and kSuccess otherwise.
template <typename T>
StatusCode TestMatrixAP(const size_t n, const Buffer<T> &buffer, const size_t offset) {
  auto status = StatusCode::kSuccess;
  try {
    const auto num_elements = ((n * (n + 1)) / 2) + offset;
    const auto bytes_needed = num_elements * sizeof(T);
    if (buffer.GetSize() < bytes_needed) { status = StatusCode::kInsufficientMemoryA; }
  } catch (...) {
    // Any exception raised during the size check is reported as an invalid matrix
    status = StatusCode::kInvalidMatrixA;
  }
  return status;
}
// =================================================================================================
// Validates vector 'X'. Returns kInvalidIncrementX when 'inc' is zero, kInsufficientMemoryX when
// buffer.GetSize() is less than the size needed for ((n - 1) * inc + 1 + offset) elements of
// type T, and kSuccess otherwise.
template <typename T>
StatusCode TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset,
                       const size_t inc) {
  if (inc == 0) { return StatusCode::kInvalidIncrementX; }
  auto status = StatusCode::kSuccess;
  try {
    const auto num_elements = (n - 1) * inc + 1 + offset;
    const auto bytes_needed = num_elements * sizeof(T);
    if (buffer.GetSize() < bytes_needed) { status = StatusCode::kInsufficientMemoryX; }
  } catch (...) {
    // Any exception raised during the size check is reported as an invalid vector
    status = StatusCode::kInvalidVectorX;
  }
  return status;
}
// Validates vector 'Y'. Returns kInvalidIncrementY when 'inc' is zero, kInsufficientMemoryY when
// buffer.GetSize() is less than the size needed for ((n - 1) * inc + 1 + offset) elements of
// type T, and kSuccess otherwise.
template <typename T>
StatusCode TestVectorY(const size_t n, const Buffer<T> &buffer, const size_t offset,
                       const size_t inc) {
  if (inc == 0) { return StatusCode::kInvalidIncrementY; }
  auto status = StatusCode::kSuccess;
  try {
    const auto num_elements = (n - 1) * inc + 1 + offset;
    const auto bytes_needed = num_elements * sizeof(T);
    if (buffer.GetSize() < bytes_needed) { status = StatusCode::kInsufficientMemoryY; }
  } catch (...) {
    // Any exception raised during the size check is reported as an invalid vector
    status = StatusCode::kInvalidVectorY;
  }
  return status;
}
// =================================================================================================
// Validates vector 'scalar'. Returns kInsufficientMemoryScalar when buffer.GetSize() is less
// than the size needed for (n + offset) elements of type T, and kSuccess otherwise.
template <typename T>
StatusCode TestVectorScalar(const size_t n, const Buffer<T> &buffer, const size_t offset) {
  auto status = StatusCode::kSuccess;
  try {
    const auto num_elements = n + offset;
    const auto bytes_needed = num_elements * sizeof(T);
    if (buffer.GetSize() < bytes_needed) { status = StatusCode::kInsufficientMemoryScalar; }
  } catch (...) {
    // Any exception raised during the size check is reported as an invalid vector
    status = StatusCode::kInvalidVectorScalar;
  }
  return status;
}
// Validates vector 'index'. Returns kInsufficientMemoryScalar when buffer.GetSize() is less
// than the size needed for (n + offset) elements of type T, and kSuccess otherwise. Note that
// the status codes reported here are the 'Scalar' ones, exactly as for TestVectorScalar.
template <typename T>
StatusCode TestVectorIndex(const size_t n, const Buffer<T> &buffer, const size_t offset) {
  auto status = StatusCode::kSuccess;
  try {
    const auto num_elements = n + offset;
    const auto bytes_needed = num_elements * sizeof(T);
    if (buffer.GetSize() < bytes_needed) { status = StatusCode::kInsufficientMemoryScalar; }
  } catch (...) {
    // Any exception raised during the size check is reported as an invalid vector
    status = StatusCode::kInvalidVectorScalar;
  }
  return status;
}
// =================================================================================================
} // namespace clblast
// CLBLAST_BUFFER_TEST_H_
#endif

View File

@ -57,7 +57,7 @@ const std::string& GetBinaryFromCache(const std::string &device_name, const Prec
}
}
binary_cache_mutex_.unlock();
throw std::runtime_error("Internal CLBlast error: Expected binary in cache, but found none.");
throw LogicError("GetBinaryFromCache: Expected binary in cache, but found none");
}
// Queries the cache and retrieves a matching program. Assumes that the match is available, throws
@ -75,7 +75,7 @@ const Program& GetProgramFromCache(const Context &context, const Precision &prec
}
}
program_cache_mutex_.unlock();
throw std::runtime_error("Internal CLBlast error: Expected program in cache, but found none.");
throw LogicError("GetProgramFromCache: Expected program in cache, but found none");
}
// Queries the cache to see whether or not the compiled kernel is already there
@ -109,14 +109,13 @@ bool ProgramIsInCache(const Context &context, const Precision &precision,
// =================================================================================================
// Clears the cache of stored binaries and programs
StatusCode CacheClearAll() {
void CacheClearAll() {
binary_cache_mutex_.lock();
binary_cache_.clear();
binary_cache_mutex_.unlock();
program_cache_mutex_.lock();
program_cache_.clear();
program_cache_mutex_.unlock();
return StatusCode::kSuccess;
}
// =================================================================================================

View File

@ -18,7 +18,7 @@
#include <vector>
#include <mutex>
#include "utilities.hpp"
#include "utilities/utilities.hpp"
namespace clblast {
// =================================================================================================
@ -89,7 +89,7 @@ bool ProgramIsInCache(const Context &context, const Precision &precision,
// =================================================================================================
// Clears the cache of stored binaries
StatusCode CacheClearAll();
void CacheClearAll();
// =================================================================================================
} // namespace clblast

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -41,8 +41,8 @@
#include <string> // std::string
#include <vector> // std::vector
#include <memory> // std::shared_ptr
#include <stdexcept> // std::runtime_error
#include <numeric> // std::accumulate
#include <cstring> // std::strlen
// OpenCL
#if defined(__APPLE__) || defined(__MACOSX)
@ -51,20 +51,41 @@
#include <CL/opencl.h>
#endif
// Exception classes
#include "cxpp11_common.hpp"
namespace clblast {
// =================================================================================================
// Error occurred in the C++11 OpenCL header (this file)
inline void Error(const std::string &message) {
throw std::runtime_error("Internal OpenCL error: "+message);
}
// Exception type for a failing OpenCL API call: stores the returned status code together with
// the location ('where') it came from
class CLError : public ErrorCode<DeviceError, cl_int> {
 public:
  // The message has the form "OpenCL error: <where>: <status as integer>"
  explicit CLError(cl_int status, const std::string &where)
      : ErrorCode<DeviceError, cl_int>(
            status, where,
            "OpenCL error: " + where + ": " + std::to_string(static_cast<int>(status))) { }

  // Throws a CLError unless 'status' equals CL_SUCCESS
  static void Check(const cl_int status, const std::string &where) {
    if (status == CL_SUCCESS) { return; }
    throw CLError(status, where);
  }

  // Non-throwing variant: prints the error message to stderr instead of throwing
  static void CheckDtor(const cl_int status, const std::string &where) {
    if (status == CL_SUCCESS) { return; }
    const CLError error(status, where);
    fprintf(stderr, "CLBlast: %s (ignoring)\n", error.what());
  }
};
// =================================================================================================
// Error occurred in OpenCL
inline void CheckError(const cl_int status) {
if (status != CL_SUCCESS) {
throw std::runtime_error("Internal OpenCL error: "+std::to_string(status));
}
}
#define CheckError(call) CLError::Check(call, CLError::TrimCallString(#call))
// Error occurred in OpenCL (no-exception version for destructors)
#define CheckErrorDtor(call) CLError::CheckDtor(call, CLError::TrimCallString(#call))
// =================================================================================================
@ -81,7 +102,7 @@ class Event {
// Regular constructor with memory management
explicit Event():
event_(new cl_event, [](cl_event* e) {
if (*e) { CheckError(clReleaseEvent(*e)); }
if (*e) { CheckErrorDtor(clReleaseEvent(*e)); }
delete e;
}) {
*event_ = nullptr;
@ -92,16 +113,17 @@ class Event {
CheckError(clWaitForEvents(1, &(*event_)));
}
// Retrieves the elapsed time of the last recorded event. Note that no error checking is done on
// the 'clGetEventProfilingInfo' function, since there is a bug in Apple's OpenCL implementation:
// http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx
// Retrieves the elapsed time of the last recorded event.
// (Note that there is a bug in Apple's OpenCL implementation of the 'clGetEventProfilingInfo' function:
// http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx)
// However, in our case the reply size is fixed to be cl_ulong, so we are not affected.
float GetElapsedTime() const {
WaitForCompletion();
const auto bytes = sizeof(cl_ulong);
auto time_start = cl_ulong{0};
clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr);
CheckError(clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr));
auto time_end = cl_ulong{0};
clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr);
CheckError(clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr));
return static_cast<float>(time_end - time_start) * 1.0e-6f;
}
@ -130,10 +152,14 @@ class Platform {
explicit Platform(const size_t platform_id) {
auto num_platforms = cl_uint{0};
CheckError(clGetPlatformIDs(0, nullptr, &num_platforms));
if (num_platforms == 0) { Error("no platforms found"); }
if (num_platforms == 0) {
throw RuntimeError("Platform: no platforms found");
}
if (platform_id >= num_platforms) {
throw RuntimeError("Platform: invalid platform ID "+std::to_string(platform_id));
}
auto platforms = std::vector<cl_platform_id>(num_platforms);
CheckError(clGetPlatformIDs(num_platforms, platforms.data(), nullptr));
if (platform_id >= num_platforms) { Error("invalid platform ID "+std::to_string(platform_id)); }
platform_ = platforms[platform_id];
}
@ -173,11 +199,16 @@ class Device {
// Initialize the device. Note that this constructor can throw exceptions!
explicit Device(const Platform &platform, const size_t device_id) {
auto num_devices = platform.NumDevices();
if (num_devices == 0) { Error("no devices found"); }
if (num_devices == 0) {
throw RuntimeError("Device: no devices found");
}
if (device_id >= num_devices) {
throw RuntimeError("Device: invalid device ID "+std::to_string(device_id));
}
auto devices = std::vector<cl_device_id>(num_devices);
CheckError(clGetDeviceIDs(platform(), CL_DEVICE_TYPE_ALL, static_cast<cl_uint>(num_devices),
devices.data(), nullptr));
if (device_id >= num_devices) { Error("invalid device ID "+std::to_string(device_id)); }
device_ = devices[device_id];
}
@ -282,7 +313,8 @@ class Device {
auto result = std::string{};
result.resize(bytes);
CheckError(clGetDeviceInfo(device_, info, bytes, &result[0], nullptr));
return std::string{result.c_str()}; // Removes any trailing '\0'-characters
result.resize(strlen(result.c_str())); // Removes any trailing '\0'-characters
return result;
}
};
@ -300,11 +332,11 @@ class Context {
// Regular constructor with memory management
explicit Context(const Device &device):
context_(new cl_context, [](cl_context* c) { CheckError(clReleaseContext(*c)); delete c; }) {
context_(new cl_context, [](cl_context* c) { CheckErrorDtor(clReleaseContext(*c)); delete c; }) {
auto status = CL_SUCCESS;
const cl_device_id dev = device();
*context_ = clCreateContext(nullptr, 1, &dev, nullptr, nullptr, &status);
CheckError(status);
CLError::Check(status, "clCreateContext");
}
// Accessor to the private data-member
@ -329,18 +361,18 @@ class Program {
// Source-based constructor with memory management
explicit Program(const Context &context, std::string source):
program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }),
program_(new cl_program, [](cl_program* p) { CheckErrorDtor(clReleaseProgram(*p)); delete p; }),
length_(source.length()),
source_(std::move(source)),
source_ptr_(&source_[0]) {
auto status = CL_SUCCESS;
*program_ = clCreateProgramWithSource(context(), 1, &source_ptr_, &length_, &status);
CheckError(status);
CLError::Check(status, "clCreateProgramWithSource");
}
// Binary-based constructor with memory management
explicit Program(const Device &device, const Context &context, const std::string& binary):
program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }),
program_(new cl_program, [](cl_program* p) { CheckErrorDtor(clReleaseProgram(*p)); delete p; }),
length_(binary.length()),
source_(binary),
source_ptr_(&source_[0]) {
@ -350,25 +382,15 @@ class Program {
*program_ = clCreateProgramWithBinary(context(), 1, &dev, &length_,
reinterpret_cast<const unsigned char**>(&source_ptr_),
&status1, &status2);
CheckError(status1);
CheckError(status2);
CLError::Check(status1, "clCreateProgramWithBinary (binary status)");
CLError::Check(status2, "clCreateProgramWithBinary");
}
// Compiles the device program and returns whether or not there were any warnings/errors
BuildStatus Build(const Device &device, std::vector<std::string> &options) {
void Build(const Device &device, std::vector<std::string> &options) {
auto options_string = std::accumulate(options.begin(), options.end(), std::string{" "});
const cl_device_id dev = device();
auto status = clBuildProgram(*program_, 1, &dev, options_string.c_str(), nullptr, nullptr);
if (status == CL_BUILD_PROGRAM_FAILURE) {
return BuildStatus::kError;
}
else if (status == CL_INVALID_BINARY) {
return BuildStatus::kInvalid;
}
else {
CheckError(status);
return BuildStatus::kSuccess;
}
CheckError(clBuildProgram(*program_, 1, &dev, options_string.c_str(), nullptr, nullptr));
}
// Retrieves the warning/error message from the compiler (if any)
@ -416,7 +438,7 @@ class Queue {
// Regular constructor with memory management
explicit Queue(const Context &context, const Device &device):
queue_(new cl_command_queue, [](cl_command_queue* s) { CheckError(clReleaseCommandQueue(*s));
queue_(new cl_command_queue, [](cl_command_queue* s) { CheckErrorDtor(clReleaseCommandQueue(*s));
delete s; }) {
auto status = CL_SUCCESS;
#ifdef CL_VERSION_2_0
@ -425,15 +447,17 @@ class Queue {
{
cl_queue_properties properties[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
*queue_ = clCreateCommandQueueWithProperties(context(), device(), properties, &status);
CLError::Check(status, "clCreateCommandQueueWithProperties");
}
else
{
*queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
CLError::Check(status, "clCreateCommandQueue");
}
#else
*queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
CLError::Check(status, "clCreateCommandQueue");
#endif
CheckError(status);
}
// Synchronizes the queue
@ -525,7 +549,7 @@ class Buffer {
if (access_ == BufferAccess::kWriteOnly) { flags = CL_MEM_WRITE_ONLY; }
auto status = CL_SUCCESS;
*buffer_ = clCreateBuffer(context(), flags, size*sizeof(T), nullptr, &status);
CheckError(status);
CLError::Check(status, "clCreateBuffer");
}
// As above, but now with read/write access as a default
@ -546,18 +570,24 @@ class Buffer {
// Copies from device to host: reading the device buffer a-synchronously
void ReadAsync(const Queue &queue, const size_t size, T* host, const size_t offset = 0) const {
if (access_ == BufferAccess::kWriteOnly) { Error("reading from a write-only buffer"); }
if (access_ == BufferAccess::kWriteOnly) {
throw LogicError("Buffer: reading from a write-only buffer");
}
CheckError(clEnqueueReadBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
host, 0, nullptr, nullptr));
}
void ReadAsync(const Queue &queue, const size_t size, std::vector<T> &host,
const size_t offset = 0) const {
if (host.size() < size) { Error("target host buffer is too small"); }
if (host.size() < size) {
throw LogicError("Buffer: target host buffer is too small");
}
ReadAsync(queue, size, host.data(), offset);
}
void ReadAsync(const Queue &queue, const size_t size, BufferHost<T> &host,
const size_t offset = 0) const {
if (host.size() < size) { Error("target host buffer is too small"); }
if (host.size() < size) {
throw LogicError("Buffer: target host buffer is too small");
}
ReadAsync(queue, size, host.data(), offset);
}
@ -577,8 +607,12 @@ class Buffer {
// Copies from host to device: writing the device buffer a-synchronously
void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) {
if (access_ == BufferAccess::kReadOnly) { Error("writing to a read-only buffer"); }
if (GetSize() < (offset+size)*sizeof(T)) { Error("target device buffer is too small"); }
if (access_ == BufferAccess::kReadOnly) {
throw LogicError("Buffer: writing to a read-only buffer");
}
if (GetSize() < (offset+size)*sizeof(T)) {
throw LogicError("Buffer: target device buffer is too small");
}
CheckError(clEnqueueWriteBuffer(queue(), *buffer_, CL_FALSE, offset*sizeof(T), size*sizeof(T),
host, 0, nullptr, nullptr));
}
@ -644,10 +678,10 @@ class Kernel {
// Regular constructor with memory management
explicit Kernel(const Program &program, const std::string &name):
kernel_(new cl_kernel, [](cl_kernel* k) { CheckError(clReleaseKernel(*k)); delete k; }) {
kernel_(new cl_kernel, [](cl_kernel* k) { CheckErrorDtor(clReleaseKernel(*k)); delete k; }) {
auto status = CL_SUCCESS;
*kernel_ = clCreateKernel(program(), name.c_str(), &status);
CheckError(status);
CLError::Check(status, "clCreateKernel");
}
// Sets a kernel argument at the indicated position

View File

@ -0,0 +1,109 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Ivan Shapovalov <intelfx@intelfx.name>
//
// This file contains exception classes corresponding to 'clpp11.hpp'. It is also part of the
// CLCudaAPI project. See 'clpp11.hpp' for more details.
//
// =================================================================================================
#ifndef CLBLAST_CXPP11_COMMON_H_
#define CLBLAST_CXPP11_COMMON_H_
#include <cstring> // std::strchr
#include <stdexcept> // std::runtime_error
#include <string> // std::string
#include <utility> // std::forward
namespace clblast {
// =================================================================================================
// Basic exception wrapper: marks an error as having happened inside our own code (as opposed to
// an error in the C++ runtime), while keeping the exception type 'Base' as its parent class
template <typename Base>
class Error : public Base {
 public:
  // Passes every constructor argument on to 'Base' unchanged. A forwarding constructor is used
  // because "using Base::Base" is not supported by VS 2013.
  template <typename... Params>
  Error(Params&&... params): Base(std::forward<Params>(params)...) { }
};
// =================================================================================================
// Represents a generic device-specific runtime error (returned by an OpenCL or CUDA API function)
class DeviceError : public Error<std::runtime_error> {
 public:
  // Perfect forwarding of the constructor since "using Error<std::runtime_error>::Error" is not
  // supported by VS 2013
  template <typename... Args>
  DeviceError(Args&&... args):
      Error<std::runtime_error>(std::forward<Args>(args)...) {
  }

  // Returns the part of 'where' that precedes its first '(' character, or the whole string when
  // it contains no '('. For a stringified call such as "clFoo(a, b)" this yields "clFoo".
  // A null pointer yields an empty string.
  static std::string TrimCallString(const char *where) {
    if (where == nullptr) { return std::string{}; }
    // std::strchr is declared in <cstring>; the qualified name does not depend on the
    // implementation also exposing the function in the global namespace
    const char *paren = std::strchr(where, '(');
    if (paren) {
      return std::string(where, paren);
    }
    return std::string(where);
  }
};
// =================================================================================================
// Signals a generic run-time error, i.e. a problem with the environment
class RuntimeError : public Error<std::runtime_error> {
 public:
  // The stored message is 'reason' prefixed with "Run-time error: "
  explicit RuntimeError(const std::string &reason)
      : Error<std::runtime_error>(std::string{"Run-time error: "} + reason) { }
};
// =================================================================================================
// Signals a generic logic error, i.e. a failed assertion
class LogicError : public Error<std::logic_error> {
 public:
  // The stored message is 'reason' prefixed with "Internal logic error: "
  explicit LogicError(const std::string &reason)
      : Error<std::logic_error>(std::string{"Internal logic error: "} + reason) { }
};
// =================================================================================================
// Internal exception base class that carries, next to the message handed to 'Base', a status
// value and a subclass-specific "details" string; both can be used to recreate the exception
template <typename Base, typename Status>
class ErrorCode : public Base {
 public:
  // 'reason' becomes the message of 'Base'; 'status' and 'details' are stored as given
  ErrorCode(Status status, const std::string &details, const std::string &reason)
      : Base(reason), status_value_(status), details_text_(details) { }

  // Read-only access to the stored status value
  Status status() const { return status_value_; }

  // Read-only access to the stored details string
  const std::string& details() const { return details_text_; }

 private:
  const Status status_value_;
  const std::string details_text_;
};
// =================================================================================================
} // namespace clblast
// CLBLAST_CXPP11_COMMON_H_
#endif

View File

@ -11,7 +11,7 @@
//
// =================================================================================================
#include "utilities.hpp"
#include "utilities/utilities.hpp"
#include "database/database.hpp"
#include "database/kernels/xaxpy.hpp"
@ -92,7 +92,7 @@ Database::Database(const Queue &queue, const std::vector<std::string> &kernels,
}
}
if (!search_result) { throw std::runtime_error("Database error, could not find a suitable entry"); }
if (!search_result) { throw RuntimeErrorCode(StatusCode::kDatabaseError); }
}
}

View File

@ -21,7 +21,7 @@
#include <vector>
#include <unordered_map>
#include "utilities.hpp"
#include "utilities/utilities.hpp"
namespace clblast {
// =================================================================================================

View File

@ -43,6 +43,7 @@ const Database::DatabaseEntry CopySingle = {
{ "Oland", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } },
{ "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "Tahiti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "Tonga", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
}
},
@ -89,6 +90,7 @@ const Database::DatabaseEntry CopySingle = {
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "GeForce GTX 980", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } },
{ "GeForce GTX TITAN Black", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",8} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
{ "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } },
@ -114,6 +116,7 @@ const Database::DatabaseEntry CopyComplexSingle = {
{ "Oland", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "Tonga", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
@ -152,6 +155,7 @@ const Database::DatabaseEntry CopyComplexSingle = {
{ "GeForce GTX 750", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 980", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN Black", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",4} } },
{ "Tesla K40m", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
@ -177,6 +181,7 @@ const Database::DatabaseEntry CopyDouble = {
{ "Oland", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",8} } },
{ "Pitcairn", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Tonga", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",4} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
}
},
@ -211,15 +216,16 @@ const Database::DatabaseEntry CopyDouble = {
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX 980", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "GeForce GTX TITAN Black", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",8} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
}
},
}
@ -236,6 +242,7 @@ const Database::DatabaseEntry CopyComplexDouble = {
{ "Oland", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Pitcairn", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tonga", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
}
},
@ -270,6 +277,7 @@ const Database::DatabaseEntry CopyComplexDouble = {
{ "GeForce GTX 750 Ti", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX 980", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "GeForce GTX TITAN Black", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
{ "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
{ "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },

View File

@ -43,7 +43,8 @@ const Database::DatabaseEntry PadSingle = {
{ "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Pitcairn", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tonga", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
}
},
{ // ARM GPUs
@ -89,6 +90,7 @@ const Database::DatabaseEntry PadSingle = {
{ "GeForce GTX 750 Ti", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN Black", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Tesla K40m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
@ -114,6 +116,7 @@ const Database::DatabaseEntry PadComplexSingle = {
{ "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tahiti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tonga", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
@ -160,10 +163,11 @@ const Database::DatabaseEntry PadComplexSingle = {
{ "GeForce GTX 750 Ti", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN Black", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
}
},
{ // Default
@ -185,7 +189,8 @@ const Database::DatabaseEntry PadDouble = {
{ "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tonga", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // ARM GPUs
@ -219,6 +224,7 @@ const Database::DatabaseEntry PadDouble = {
{ "GeForce GTX 750 Ti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 980", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN Black", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
@ -244,7 +250,8 @@ const Database::DatabaseEntry PadComplexDouble = {
{ "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "Pitcairn", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tonga", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
}
},
{ // ARM GPUs
@ -278,6 +285,7 @@ const Database::DatabaseEntry PadComplexDouble = {
{ "GeForce GTX 750 Ti", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "GeForce GTX TITAN", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "GeForce GTX TITAN Black", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
{ "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
{ "Tesla K20m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
{ "Tesla K40m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },

View File

@ -43,6 +43,7 @@ const Database::DatabaseEntry PadtransposeSingle = {
{ "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Tonga", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
}
},
@ -89,6 +90,7 @@ const Database::DatabaseEntry PadtransposeSingle = {
{ "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "GeForce GTX TITAN Black", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "Tesla K20m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
@ -114,6 +116,7 @@ const Database::DatabaseEntry PadtransposeComplexSingle = {
{ "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Tonga", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
}
},
@ -160,6 +163,7 @@ const Database::DatabaseEntry PadtransposeComplexSingle = {
{ "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN Black", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "Tesla K20m", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
@ -185,6 +189,7 @@ const Database::DatabaseEntry PadtransposeDouble = {
{ "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
{ "Tonga", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
}
},
@ -219,6 +224,7 @@ const Database::DatabaseEntry PadtransposeDouble = {
{ "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 980", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN Black", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "Tesla K20m", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
@ -244,6 +250,7 @@ const Database::DatabaseEntry PadtransposeComplexDouble = {
{ "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
{ "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
{ "Tonga", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
}
},
@ -278,6 +285,7 @@ const Database::DatabaseEntry PadtransposeComplexDouble = {
{ "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
{ "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN Black", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
{ "Tesla K20m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
{ "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },

View File

@ -43,7 +43,8 @@ const Database::DatabaseEntry TransposeSingle = {
{ "Oland", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Pitcairn", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Tonga", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
}
},
{ // ARM GPUs
@ -89,6 +90,7 @@ const Database::DatabaseEntry TransposeSingle = {
{ "GeForce GTX 750 Ti", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "GeForce GTX TITAN Black", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "GeForce GTX TITAN X", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Tesla K20m", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "Tesla K40m", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
@ -114,6 +116,7 @@ const Database::DatabaseEntry TransposeComplexSingle = {
{ "Oland", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Pitcairn", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Tonga", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
}
},
@ -154,6 +157,7 @@ const Database::DatabaseEntry TransposeComplexSingle = {
{ "GeForce GTX 750 Ti", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN Black", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
@ -162,7 +166,7 @@ const Database::DatabaseEntry TransposeComplexSingle = {
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
}
},
}
@ -179,6 +183,7 @@ const Database::DatabaseEntry TransposeDouble = {
{ "Oland", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "Tonga", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
}
},
@ -213,6 +218,7 @@ const Database::DatabaseEntry TransposeDouble = {
{ "GeForce GTX 750 Ti", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX TITAN", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX TITAN Black", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
@ -238,7 +244,8 @@ const Database::DatabaseEntry TransposeComplexDouble = {
{ "Oland", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
{ "Tonga", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
{ "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
}
},
{ // ARM GPUs
@ -266,6 +273,7 @@ const Database::DatabaseEntry TransposeComplexDouble = {
{ "GeForce GTX 750 Ti", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX 980", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN Black", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
{ "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },

View File

@ -43,7 +43,8 @@ const Database::DatabaseEntry XaxpySingle = {
{ "Oland", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Pitcairn", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "Tahiti", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",2}, {"WGS",256}, {"WPT",1} } },
{ "Tonga", { {"VW",1}, {"WGS",256}, {"WPT",8} } },
{ "default", { {"VW",2}, {"WGS",64}, {"WPT",2} } },
}
},
{ // ARM GPUs
@ -89,6 +90,7 @@ const Database::DatabaseEntry XaxpySingle = {
{ "GeForce GTX 750 Ti", { {"VW",2}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX TITAN Black", { {"VW",4}, {"WGS",128}, {"WPT",4} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Tesla K20m", { {"VW",4}, {"WGS",128}, {"WPT",1} } },
{ "Tesla K40m", { {"VW",4}, {"WGS",128}, {"WPT",1} } },
@ -97,7 +99,7 @@ const Database::DatabaseEntry XaxpySingle = {
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW",4}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",4}, {"WGS",256}, {"WPT",1} } },
}
},
}
@ -114,6 +116,7 @@ const Database::DatabaseEntry XaxpyComplexSingle = {
{ "Oland", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Pitcairn", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Tonga", { {"VW",1}, {"WGS",256}, {"WPT",8} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
}
},
@ -160,6 +163,7 @@ const Database::DatabaseEntry XaxpyComplexSingle = {
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX TITAN Black", { {"VW",1}, {"WGS",128}, {"WPT",2} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
{ "Tesla K20m", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Tesla K40m", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
@ -185,6 +189,7 @@ const Database::DatabaseEntry XaxpyDouble = {
{ "Oland", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Tonga", { {"VW",1}, {"WGS",128}, {"WPT",4} } },
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
}
},
@ -219,15 +224,16 @@ const Database::DatabaseEntry XaxpyDouble = {
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",256}, {"WPT",2} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
{ "GeForce GTX TITAN Black", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } },
{ "Tesla K20m", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "Tesla K40m", { {"VW",2}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "default", { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",2} } },
}
},
}
@ -244,6 +250,7 @@ const Database::DatabaseEntry XaxpyComplexDouble = {
{ "Oland", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
{ "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Tahiti", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
{ "Tonga", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } },
}
},
@ -278,6 +285,7 @@ const Database::DatabaseEntry XaxpyComplexDouble = {
{ "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",64}, {"WPT",2} } },
{ "GeForce GTX 980", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"VW",1}, {"WGS",64}, {"WPT",4} } },
{ "GeForce GTX TITAN Black", { {"VW",1}, {"WGS",128}, {"WPT",4} } },
{ "GeForce GTX TITAN X", { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
{ "Tesla K20m", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
{ "Tesla K40m", { {"VW",1}, {"WGS",64}, {"WPT",1} } },

View File

@ -42,6 +42,7 @@ const Database::DatabaseEntry XdotSingle = {
{ "Oland", { {"WGS1",256}, {"WGS2",32} } },
{ "Pitcairn", { {"WGS1",128}, {"WGS2",32} } },
{ "Tahiti", { {"WGS1",128}, {"WGS2",32} } },
{ "Tonga", { {"WGS1",64}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",32} } },
}
},
@ -72,6 +73,7 @@ const Database::DatabaseEntry XdotSingle = {
{ "GeForce GTX 750", { {"WGS1",128}, {"WGS2",32} } },
{ "GeForce GTX 750 Ti", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX 980", { {"WGS1",256}, {"WGS2",32} } },
{ "GeForce GTX TITAN Black", { {"WGS1",512}, {"WGS2",64} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",1024}, {"WGS2",32} } },
{ "default", { {"WGS1",256}, {"WGS2",256} } },
@ -95,7 +97,8 @@ const Database::DatabaseEntry XdotComplexSingle = {
{ "Oland", { {"WGS1",128}, {"WGS2",32} } },
{ "Pitcairn", { {"WGS1",256}, {"WGS2",32} } },
{ "Tahiti", { {"WGS1",64}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",32} } },
{ "Tonga", { {"WGS1",256}, {"WGS2",64} } },
{ "default", { {"WGS1",256}, {"WGS2",64} } },
}
},
{ // Intel CPUs
@ -125,6 +128,7 @@ const Database::DatabaseEntry XdotComplexSingle = {
{ "GeForce GTX 750", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX 750 Ti", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX 980", { {"WGS1",256}, {"WGS2",64} } },
{ "GeForce GTX TITAN Black", { {"WGS1",128}, {"WGS2",64} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } },
{ "default", { {"WGS1",512}, {"WGS2",64} } },
@ -148,7 +152,8 @@ const Database::DatabaseEntry XdotDouble = {
{ "Oland", { {"WGS1",256}, {"WGS2",32} } },
{ "Pitcairn", { {"WGS1",128}, {"WGS2",32} } },
{ "Tahiti", { {"WGS1",256}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",32} } },
{ "Tonga", { {"WGS1",128}, {"WGS2",64} } },
{ "default", { {"WGS1",128}, {"WGS2",64} } },
}
},
{ // Intel CPUs
@ -167,9 +172,10 @@ const Database::DatabaseEntry XdotDouble = {
{ "GeForce GTX 750", { {"WGS1",64}, {"WGS2",256} } },
{ "GeForce GTX 750 Ti", { {"WGS1",128}, {"WGS2",64} } },
{ "GeForce GTX 980", { {"WGS1",128}, {"WGS2",32} } },
{ "GeForce GTX TITAN Black", { {"WGS1",128}, {"WGS2",64} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } },
{ "default", { {"WGS1",256}, {"WGS2",64} } },
{ "default", { {"WGS1",128}, {"WGS2",64} } },
}
},
{ // Default
@ -190,6 +196,7 @@ const Database::DatabaseEntry XdotComplexDouble = {
{ "Oland", { {"WGS1",256}, {"WGS2",32} } },
{ "Pitcairn", { {"WGS1",256}, {"WGS2",32} } },
{ "Tahiti", { {"WGS1",256}, {"WGS2",32} } },
{ "Tonga", { {"WGS1",128}, {"WGS2",64} } },
{ "default", { {"WGS1",256}, {"WGS2",32} } },
}
},
@ -209,6 +216,7 @@ const Database::DatabaseEntry XdotComplexDouble = {
{ "GeForce GTX 750", { {"WGS1",256}, {"WGS2",32} } },
{ "GeForce GTX 750 Ti", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX 980", { {"WGS1",64}, {"WGS2",32} } },
{ "GeForce GTX TITAN Black", { {"WGS1",128}, {"WGS2",32} } },
{ "GeForce GTX TITAN X", { {"WGS1",128}, {"WGS2",32} } },
{ "Tesla K20m", { {"WGS1",128}, {"WGS2",32} } },
{ "default", { {"WGS1",128}, {"WGS2",64} } },

View File

@ -36,6 +36,7 @@ const Database::DatabaseEntry XgemmSingle = {
{ "Oland", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
{ "Tonga", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",8} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
@ -82,6 +83,7 @@ const Database::DatabaseEntry XgemmSingle = {
{ "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",1}, {"VWM",8}, {"VWN",2} } },
{ "GeForce GTX 980", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",8} } },
{ "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX TITAN Black", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } },
{ "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
@ -107,6 +109,7 @@ const Database::DatabaseEntry XgemmComplexSingle = {
{ "Oland", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
{ "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } },
{ "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
{ "Tonga", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
@ -153,6 +156,7 @@ const Database::DatabaseEntry XgemmComplexSingle = {
{ "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "GeForce GTX 980", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
{ "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX TITAN Black", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
{ "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "Tesla K40m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
@ -178,6 +182,7 @@ const Database::DatabaseEntry XgemmDouble = {
{ "Oland", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
{ "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
{ "Tonga", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
@ -212,6 +217,7 @@ const Database::DatabaseEntry XgemmDouble = {
{ "GeForce GTX 750 Ti", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
{ "GeForce GTX 980", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
{ "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX TITAN Black", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",16}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K40m", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
@ -237,6 +243,7 @@ const Database::DatabaseEntry XgemmComplexDouble = {
{ "Oland", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tonga", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
@ -270,6 +277,7 @@ const Database::DatabaseEntry XgemmComplexDouble = {
{ "GeForce GTX 750", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
{ "GeForce GTX 750 Ti", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX 980", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
{ "GeForce GTX TITAN Black", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
{ "GeForce GTX TITAN X", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K20m", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
{ "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },

View File

@ -19,7 +19,7 @@ const Database::DatabaseEntry XgemmDirectHalf = {
"XgemmDirect", Precision::kHalf, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
}
},
}
@ -32,7 +32,8 @@ const Database::DatabaseEntry XgemmDirectSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
{ "Tonga", { {"KWID",16}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",32}, {"NDIMCD",8}, {"PADA",0}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
{ // Intel GPUs
@ -44,12 +45,13 @@ const Database::DatabaseEntry XgemmDirectSingle = {
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
{ "GeForce GTX TITAN Black", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
}
},
}
@ -62,7 +64,8 @@ const Database::DatabaseEntry XgemmDirectComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "Tonga", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
}
},
{ // Intel GPUs
@ -74,12 +77,13 @@ const Database::DatabaseEntry XgemmDirectComplexSingle = {
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 750 Ti", { {"KWID",16}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, {"WGD",16} } },
{ "default", { {"KWID",16}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, {"WGD",16} } },
{ "GeForce GTX TITAN Black", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
}
},
}
@ -92,18 +96,20 @@ const Database::DatabaseEntry XgemmDirectDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "Tonga", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } },
{ "GeForce GTX TITAN Black", { {"KWID",8}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",0}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
}
@ -116,18 +122,20 @@ const Database::DatabaseEntry XgemmDirectComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "Tonga", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
{ // NVIDIA GPUs
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 750 Ti", { {"KWID",2}, {"MDIMAD",32}, {"MDIMCD",32}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
{ "default", { {"KWID",2}, {"MDIMAD",32}, {"MDIMCD",32}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
{ "GeForce GTX TITAN Black", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",8} } },
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } },
{ "default", { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
}
},
}

View File

@ -43,6 +43,7 @@ const Database::DatabaseEntry XgemvSingle = {
{ "Oland", { {"WGS1",128}, {"WPT1",1} } },
{ "Pitcairn", { {"WGS1",256}, {"WPT1",1} } },
{ "Tahiti", { {"WGS1",256}, {"WPT1",1} } },
{ "Tonga", { {"WGS1",128}, {"WPT1",2} } },
{ "default", { {"WGS1",128}, {"WPT1",1} } },
}
},
@ -82,6 +83,7 @@ const Database::DatabaseEntry XgemvSingle = {
{ "GeForce GTX 750 Ti", { {"WGS1",32}, {"WPT1",1} } },
{ "GeForce GTX 980", { {"WGS1",128}, {"WPT1",1} } },
{ "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX TITAN Black", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX TITAN X", { {"WGS1",256}, {"WPT1",1} } },
{ "Tesla K20m", { {"WGS1",128}, {"WPT1",1} } },
{ "Tesla K40m", { {"WGS1",256}, {"WPT1",1} } },
@ -107,6 +109,7 @@ const Database::DatabaseEntry XgemvComplexSingle = {
{ "Oland", { {"WGS1",64}, {"WPT1",1} } },
{ "Pitcairn", { {"WGS1",64}, {"WPT1",1} } },
{ "Tahiti", { {"WGS1",64}, {"WPT1",1} } },
{ "Tonga", { {"WGS1",32}, {"WPT1",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
}
},
@ -145,6 +148,7 @@ const Database::DatabaseEntry XgemvComplexSingle = {
{ "GeForce GTX 750", { {"WGS1",128}, {"WPT1",1} } },
{ "GeForce GTX 750 Ti", { {"WGS1",32}, {"WPT1",1} } },
{ "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX TITAN Black", { {"WGS1",32}, {"WPT1",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
}
},
@ -167,6 +171,7 @@ const Database::DatabaseEntry XgemvDouble = {
{ "Oland", { {"WGS1",256}, {"WPT1",1} } },
{ "Pitcairn", { {"WGS1",256}, {"WPT1",1} } },
{ "Tahiti", { {"WGS1",256}, {"WPT1",1} } },
{ "Tonga", { {"WGS1",32}, {"WPT1",1} } },
{ "default", { {"WGS1",256}, {"WPT1",1} } },
}
},
@ -194,6 +199,7 @@ const Database::DatabaseEntry XgemvDouble = {
{ "GeForce GTX 750 Ti", { {"WGS1",32}, {"WPT1",1} } },
{ "GeForce GTX 980", { {"WGS1",64}, {"WPT1",1} } },
{ "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } },
{ "GeForce GTX TITAN Black", { {"WGS1",32}, {"WPT1",1} } },
{ "GeForce GTX TITAN X", { {"WGS1",64}, {"WPT1",1} } },
{ "Tesla K20m", { {"WGS1",256}, {"WPT1",1} } },
{ "Tesla K40m", { {"WGS1",256}, {"WPT1",1} } },
@ -219,6 +225,7 @@ const Database::DatabaseEntry XgemvComplexDouble = {
{ "Oland", { {"WGS1",256}, {"WPT1",1} } },
{ "Pitcairn", { {"WGS1",256}, {"WPT1",1} } },
{ "Tahiti", { {"WGS1",256}, {"WPT1",1} } },
{ "Tonga", { {"WGS1",64}, {"WPT1",1} } },
{ "default", { {"WGS1",64}, {"WPT1",1} } },
}
},

View File

@ -43,6 +43,7 @@ const Database::DatabaseEntry XgemvFastSingle = {
{ "Oland", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tahiti", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tonga", { {"VW2",1}, {"WGS2",16}, {"WPT2",4} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
@ -82,6 +83,7 @@ const Database::DatabaseEntry XgemvFastSingle = {
{ "GeForce GTX 750 Ti", { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } },
{ "GeForce GTX 980", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX TITAN", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX TITAN Black", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX TITAN X", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tesla K20m", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "Tesla K40m", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
@ -107,6 +109,7 @@ const Database::DatabaseEntry XgemvFastComplexSingle = {
{ "Oland", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tahiti", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "Tonga", { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
@ -164,6 +167,7 @@ const Database::DatabaseEntry XgemvFastDouble = {
{ "Oland", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tahiti", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tonga", { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},
@ -191,6 +195,7 @@ const Database::DatabaseEntry XgemvFastDouble = {
{ "GeForce GTX 750 Ti", { {"VW2",1}, {"WGS2",32}, {"WPT2",2} } },
{ "GeForce GTX 980", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "GeForce GTX TITAN", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX TITAN Black", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "GeForce GTX TITAN X", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "Tesla K20m", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
{ "Tesla K40m", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
@ -216,6 +221,7 @@ const Database::DatabaseEntry XgemvFastComplexDouble = {
{ "Oland", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
{ "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tahiti", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
{ "Tonga", { {"VW2",1}, {"WGS2",32}, {"WPT2",1} } },
{ "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
}
},

View File

@ -32,7 +32,8 @@ const Database::DatabaseEntry XgemvFastRotSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } },
{ "default", { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } },
{ "Tonga", { {"VW3",8}, {"WGS3",128}, {"WPT3",32} } },
{ "default", { {"VW3",8}, {"WGS3",128}, {"WPT3",32} } },
}
},
{ // Intel CPUs
@ -55,6 +56,7 @@ const Database::DatabaseEntry XgemvFastRotSingle = {
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 750 Ti", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
{ "GeForce GTX TITAN", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
{ "GeForce GTX TITAN Black", { {"VW3",4}, {"WGS3",128}, {"WPT3",16} } },
{ "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
}
},
@ -73,7 +75,8 @@ const Database::DatabaseEntry XgemvFastRotComplexSingle = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } },
{ "default", { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } },
{ "Tonga", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
{ "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",32} } },
}
},
{ // Intel CPUs
@ -107,6 +110,7 @@ const Database::DatabaseEntry XgemvFastRotDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
{ "Tonga", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
{ "default", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
}
},
@ -120,6 +124,7 @@ const Database::DatabaseEntry XgemvFastRotDouble = {
kDeviceTypeGPU, "NVIDIA", {
{ "GeForce GTX 750 Ti", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
{ "GeForce GTX TITAN", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
{ "GeForce GTX TITAN Black", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
{ "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
}
},
@ -138,7 +143,8 @@ const Database::DatabaseEntry XgemvFastRotComplexDouble = {
{ // AMD GPUs
kDeviceTypeGPU, "AMD", {
{ "AMD Radeon R9 M370X Compute Engine", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
{ "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
{ "Tonga", { {"VW3",4}, {"WGS3",16}, {"WPT3",8} } },
{ "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",16} } },
}
},
{ // Intel CPUs

View File

@ -43,7 +43,8 @@ const Database::DatabaseEntry XgerSingle = {
{ "Oland", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
{ "default", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
{ "Tonga", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
{ "default", { {"WGS1",32}, {"WGS2",8}, {"WPT",1} } },
}
},
{ // ARM GPUs
@ -80,6 +81,7 @@ const Database::DatabaseEntry XgerSingle = {
{ "GeForce GTX 750", { {"WGS1",64}, {"WGS2",16}, {"WPT",4} } },
{ "GeForce GTX 750 Ti", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } },
{ "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "GeForce GTX TITAN Black", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "default", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
}
},
@ -102,7 +104,8 @@ const Database::DatabaseEntry XgerComplexSingle = {
{ "Oland", { {"WGS1",4}, {"WGS2",8}, {"WPT",1} } },
{ "Pitcairn", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
{ "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
{ "Tonga", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
}
},
{ // ARM GPUs
@ -139,12 +142,13 @@ const Database::DatabaseEntry XgerComplexSingle = {
{ "GeForce GTX 750", { {"WGS1",32}, {"WGS2",16}, {"WPT",4} } },
{ "GeForce GTX 750 Ti", { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } },
{ "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
{ "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
{ "GeForce GTX TITAN Black", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
{ "default", { {"WGS1",128}, {"WGS2",2}, {"WPT",2} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",64}, {"WGS2",4}, {"WPT",2} } },
{ "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
}
},
}
@ -161,7 +165,8 @@ const Database::DatabaseEntry XgerDouble = {
{ "Oland", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
{ "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
{ "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
{ "Tonga", { {"WGS1",8}, {"WGS2",16}, {"WPT",2} } },
{ "default", { {"WGS1",32}, {"WGS2",8}, {"WPT",1} } },
}
},
{ // ARM GPUs
@ -187,6 +192,7 @@ const Database::DatabaseEntry XgerDouble = {
{ "GeForce GTX 750", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } },
{ "GeForce GTX 750 Ti", { {"WGS1",32}, {"WGS2",16}, {"WPT",1} } },
{ "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
{ "GeForce GTX TITAN Black", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
}
},
@ -209,6 +215,7 @@ const Database::DatabaseEntry XgerComplexDouble = {
{ "Oland", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
{ "Pitcairn", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
{ "Tahiti", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
{ "Tonga", { {"WGS1",16}, {"WGS2",4}, {"WPT",1} } },
{ "default", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
}
},
@ -235,6 +242,7 @@ const Database::DatabaseEntry XgerComplexDouble = {
{ "GeForce GTX 750", { {"WGS1",8}, {"WGS2",32}, {"WPT",4} } },
{ "GeForce GTX 750 Ti", { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } },
{ "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
{ "GeForce GTX TITAN Black", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
{ "default", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
}
},

View File

@ -113,7 +113,7 @@ void XgemmUpper(const int kSizeN, const int kSizeK,
const real beta = GetRealArg(arg_beta);
// Skip this workgroup if none of its threads contribute to the upper-triangle
if (GetGroupID1()*NWG < GetGroupID0()*MWG) {
if ((GetGroupID1() + 1)*NWG < GetGroupID0()*MWG) {
return;
}
@ -153,7 +153,7 @@ void XgemmLower(const int kSizeN, const int kSizeK,
const real beta = GetRealArg(arg_beta);
// Skip this workgroup if none of its threads contribute to the lower-triangle
if (GetGroupID1()*NWG > GetGroupID0()*MWG) {
if (GetGroupID1()*NWG > (GetGroupID0() + 1)*MWG) {
return;
}

View File

@ -21,10 +21,11 @@
namespace clblast {
// =================================================================================================
// Constructor: not much here, because no status codes can be returned
// The constructor does all the heavy work; errors are reported by throwing exceptions
Routine::Routine(Queue &queue, EventPointer event, const std::string &name,
const std::vector<std::string> &routines, const Precision precision,
const std::vector<const Database::DatabaseEntry*> &userDatabase):
const std::vector<const Database::DatabaseEntry*> &userDatabase,
std::initializer_list<const char *> source):
precision_(precision),
routine_name_(name),
queue_(queue),
@ -33,15 +34,9 @@ Routine::Routine(Queue &queue, EventPointer event, const std::string &name,
device_(queue_.GetDevice()),
device_name_(device_.Name()),
db_(queue_, routines, precision_, userDatabase) {
}
// =================================================================================================
// Separate set-up function to allow for status codes to be returned
StatusCode Routine::SetUp() {
// Queries the cache to see whether or not the program (context-specific) is already there
if (ProgramIsInCache(context_, precision_, routine_name_)) { return StatusCode::kSuccess; }
if (ProgramIsInCache(context_, precision_, routine_name_)) { return; }
// Sets the build options from an environmental variable (if set)
auto options = std::vector<std::string>();
@ -53,13 +48,10 @@ StatusCode Routine::SetUp() {
// Queries the cache to see whether or not the binary (device-specific) is already there. If it
// is, a program is created and stored in the cache
if (BinaryIsInCache(device_name_, precision_, routine_name_)) {
try {
auto& binary = GetBinaryFromCache(device_name_, precision_, routine_name_);
auto program = Program(device_, context_, binary);
program.Build(device_, options);
StoreProgramToCache(program, context_, precision_, routine_name_);
} catch (...) { return StatusCode::kBuildProgramFailure; }
return StatusCode::kSuccess;
auto& binary = GetBinaryFromCache(device_name_, precision_, routine_name_);
auto program = Program(device_, context_, binary);
program.Build(device_, options);
StoreProgramToCache(program, context_, precision_, routine_name_);
}
// Otherwise, the kernel will be compiled and program will be built. Both the binary and the
@ -69,48 +61,50 @@ StatusCode Routine::SetUp() {
const auto extensions = device_.Capabilities();
if (precision_ == Precision::kDouble || precision_ == Precision::kComplexDouble) {
if (extensions.find(kKhronosDoublePrecision) == std::string::npos) {
return StatusCode::kNoDoublePrecision;
throw RuntimeErrorCode(StatusCode::kNoDoublePrecision);
}
}
// As above, but for cl_khr_fp16 (half precision)
if (precision_ == Precision::kHalf) {
if (extensions.find(kKhronosHalfPrecision) == std::string::npos) {
return StatusCode::kNoHalfPrecision;
throw RuntimeErrorCode(StatusCode::kNoHalfPrecision);
}
}
// Loads the common header (typedefs and defines and such)
std::string common_header =
#include "kernels/common.opencl"
;
// Collects the parameters for this device in the form of defines, and adds the precision
auto defines = db_.GetDefines();
defines += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";
auto source_string = db_.GetDefines();
source_string += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";
// Adds the name of the routine as a define
defines += "#define ROUTINE_"+routine_name_+"\n";
source_string += "#define ROUTINE_"+routine_name_+"\n";
// For specific devices, use the non-IEEE754 compliant OpenCL mad() instruction. This can improve
// performance, but might result in a reduced accuracy.
if (device_.IsAMD() && device_.IsGPU()) {
defines += "#define USE_CL_MAD 1\n";
source_string += "#define USE_CL_MAD 1\n";
}
// For specific devices, use staggered/shuffled workgroup indices.
if (device_.IsAMD() && device_.IsGPU()) {
defines += "#define USE_STAGGERED_INDICES 1\n";
source_string += "#define USE_STAGGERED_INDICES 1\n";
}
// For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
// performance through better cache behaviour
if (device_.IsARM() && device_.IsGPU()) {
defines += "#define GLOBAL_MEM_FENCE 1\n";
source_string += "#define GLOBAL_MEM_FENCE 1\n";
}
// Combines everything together into a single source string
const auto source_string = defines + common_header + source_string_;
// Loads the common header (typedefs and defines and such)
source_string +=
#include "kernels/common.opencl"
;
// Adds routine-specific code to the constructed source string
for (const char *s: source) {
source_string += s;
}
// Prints details of the routine to compile in case of debugging in verbose mode
#ifdef VERBOSE
@ -120,23 +114,21 @@ StatusCode Routine::SetUp() {
#endif
// Compiles the kernel
auto program = Program(context_, source_string);
try {
auto program = Program(context_, source_string);
const auto build_status = program.Build(device_, options);
// Checks for compiler crashes/errors/warnings
if (build_status == BuildStatus::kError) {
const auto message = program.GetBuildInfo(device_);
fprintf(stdout, "OpenCL compiler error/warning: %s\n", message.c_str());
return StatusCode::kBuildProgramFailure;
program.Build(device_, options);
} catch (const CLError &e) {
if (e.status() == CL_BUILD_PROGRAM_FAILURE) {
fprintf(stdout, "OpenCL compiler error/warning: %s\n",
program.GetBuildInfo(device_).c_str());
}
if (build_status == BuildStatus::kInvalid) { return StatusCode::kInvalidBinary; }
throw;
}
// Store the compiled binary and program in the cache
const auto binary = program.GetIR();
StoreBinaryToCache(binary, device_name_, precision_, routine_name_);
StoreProgramToCache(program, context_, precision_, routine_name_);
} catch (...) { return StatusCode::kBuildProgramFailure; }
// Store the compiled binary and program in the cache
const auto binary = program.GetIR();
StoreBinaryToCache(binary, device_name_, precision_, routine_name_);
StoreProgramToCache(program, context_, precision_, routine_name_);
// Prints the elapsed compilation time in case of debugging in verbose mode
#ifdef VERBOSE
@ -144,9 +136,6 @@ StatusCode Routine::SetUp() {
const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
printf("[DEBUG] Completed compilation in %.2lf ms\n", timing);
#endif
// No errors, normal termination of this function
return StatusCode::kSuccess;
}
// =================================================================================================

View File

@ -19,9 +19,9 @@
#include <string>
#include <vector>
#include "utilities.hpp"
#include "utilities/utilities.hpp"
#include "cache.hpp"
#include "buffer_test.hpp"
#include "utilities/buffer_test.hpp"
#include "database/database.hpp"
#include "routines/common.hpp"
@ -34,21 +34,19 @@ class Routine {
// Base class constructor. The user database is an optional extra database to override the
// built-in database.
// All heavy preparation work is done inside this constructor.
explicit Routine(Queue &queue, EventPointer event, const std::string &name,
const std::vector<std::string> &routines, const Precision precision,
const std::vector<const Database::DatabaseEntry*> &userDatabase = {});
// Set-up phase of the kernel
StatusCode SetUp();
const std::vector<const Database::DatabaseEntry*> &userDatabase,
std::initializer_list<const char *> source);
protected:
// Non-static variable for the precision
const Precision precision_;
// The routine's name and its kernel-source in string form
// The routine's name
const std::string routine_name_;
std::string source_string_;
// The OpenCL objects, accessible only from derived classes
Queue queue_;

View File

@ -20,22 +20,26 @@ namespace clblast {
// =================================================================================================
// Enqueues a kernel, waits for completion, and checks for errors
StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
std::vector<size_t> global, const std::vector<size_t> &local,
EventPointer event, const std::vector<Event> &waitForEvents) {
void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
std::vector<size_t> global, const std::vector<size_t> &local,
EventPointer event, const std::vector<Event> &waitForEvents) {
if (!local.empty()) {
// Tests for validity of the local thread sizes
if (local.size() > device.MaxWorkItemDimensions()) {
return StatusCode::kInvalidLocalNumDimensions;
throw RuntimeErrorCode(StatusCode::kInvalidLocalNumDimensions);
}
const auto max_work_item_sizes = device.MaxWorkItemSizes();
for (auto i=size_t{0}; i<local.size(); ++i) {
if (local[i] > max_work_item_sizes[i]) { return StatusCode::kInvalidLocalThreadsDim; }
if (local[i] > max_work_item_sizes[i]) {
throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsDim);
}
}
auto local_size = size_t{1};
for (auto &item: local) { local_size *= item; }
if (local_size > device.MaxWorkGroupSize()) { return StatusCode::kInvalidLocalThreadsTotal; }
if (local_size > device.MaxWorkGroupSize()) {
throw RuntimeErrorCode(StatusCode::kInvalidLocalThreadsTotal);
}
// Make sure the global thread sizes are at least equal to the local sizes
for (auto i=size_t{0}; i<global.size(); ++i) {
@ -45,7 +49,9 @@ StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
// Tests for local memory usage
const auto local_mem_usage = kernel.LocalMemUsage(device);
if (!device.IsLocalMemoryValid(local_mem_usage)) { return StatusCode::kInvalidLocalMemUsage; }
if (!device.IsLocalMemoryValid(local_mem_usage)) {
throw RuntimeErrorCode(StatusCode::kInvalidLocalMemUsage);
}
// Prints the name of the kernel to launch in case of debugging in verbose mode
#ifdef VERBOSE
@ -55,9 +61,7 @@ StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
#endif
// Launches the kernel (and checks for launch errors)
try {
kernel.Launch(queue, global, local, event, waitForEvents);
} catch (...) { return StatusCode::kKernelLaunchError; }
kernel.Launch(queue, global, local, event, waitForEvents);
// Prints the elapsed execution time in case of debugging in verbose mode
#ifdef VERBOSE
@ -66,9 +70,6 @@ StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
printf("[DEBUG] Completed kernel in %.2lf ms\n", timing);
#endif
// No errors, normal termination of this function
return StatusCode::kSuccess;
}
// =================================================================================================

View File

@ -27,29 +27,29 @@ namespace clblast {
// =================================================================================================
// Enqueues a kernel, waits for completion, and checks for errors
StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
std::vector<size_t> global, const std::vector<size_t> &local,
EventPointer event, const std::vector<Event> &waitForEvents = {});
void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
std::vector<size_t> global, const std::vector<size_t> &local,
EventPointer event, const std::vector<Event> &waitForEvents = {});
// =================================================================================================
// Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able
// to write to symmetric and triangular matrices through optional arguments.
template <typename T>
StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device,
const Database &db,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t src_one, const size_t src_two,
const size_t src_ld, const size_t src_offset,
const Buffer<T> &src,
const size_t dest_one, const size_t dest_two,
const size_t dest_ld, const size_t dest_offset,
const Buffer<T> &dest,
const T alpha,
const Program &program, const bool do_pad,
const bool do_transpose, const bool do_conjugate,
const bool upper = false, const bool lower = false,
const bool diagonal_imag_zero = false) {
void PadCopyTransposeMatrix(Queue &queue, const Device &device,
const Database &db,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t src_one, const size_t src_two,
const size_t src_ld, const size_t src_offset,
const Buffer<T> &src,
const size_t dest_one, const size_t dest_two,
const size_t dest_ld, const size_t dest_offset,
const Buffer<T> &dest,
const T alpha,
const Program &program, const bool do_pad,
const bool do_transpose, const bool do_conjugate,
const bool upper = false, const bool lower = false,
const bool diagonal_imag_zero = false) {
// Determines whether or not the fast-version could potentially be used
auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) &&
@ -61,8 +61,8 @@ StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device,
if (do_transpose) {
if (use_fast_kernel &&
IsMultiple(src_ld, db["TRA_WPT"]) &&
IsMultiple(src_one, db["TRA_WPT"]*db["TRA_WPT"]) &&
IsMultiple(src_two, db["TRA_WPT"]*db["TRA_WPT"])) {
IsMultiple(src_one, db["TRA_WPT"]*db["TRA_DIM"]) &&
IsMultiple(src_two, db["TRA_WPT"]*db["TRA_DIM"])) {
kernel_name = "TransposeMatrixFast";
}
else {
@ -84,77 +84,75 @@ StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device,
}
// Retrieves the kernel from the compiled binary
try {
auto kernel = Kernel(program, kernel_name);
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
// Sets the kernel arguments
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(src_ld));
kernel.SetArgument(1, src());
kernel.SetArgument(2, dest());
kernel.SetArgument(3, GetRealArg(alpha));
}
else {
kernel.SetArgument(0, static_cast<int>(src_one));
kernel.SetArgument(1, static_cast<int>(src_two));
kernel.SetArgument(2, static_cast<int>(src_ld));
kernel.SetArgument(3, static_cast<int>(src_offset));
kernel.SetArgument(4, src());
kernel.SetArgument(5, static_cast<int>(dest_one));
kernel.SetArgument(6, static_cast<int>(dest_two));
kernel.SetArgument(7, static_cast<int>(dest_ld));
kernel.SetArgument(8, static_cast<int>(dest_offset));
kernel.SetArgument(9, dest());
kernel.SetArgument(10, GetRealArg(alpha));
if (do_pad) {
kernel.SetArgument(11, static_cast<int>(do_conjugate));
}
else {
kernel.SetArgument(11, static_cast<int>(upper));
kernel.SetArgument(12, static_cast<int>(lower));
kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero));
}
}
// Launches the kernel and returns the error code. Uses global and local thread sizes based on
// parameters in the database.
if (do_transpose) {
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(src_ld));
kernel.SetArgument(1, src());
kernel.SetArgument(2, dest());
kernel.SetArgument(3, GetRealArg(alpha));
const auto global = std::vector<size_t>{
dest_one / db["TRA_WPT"],
dest_two / db["TRA_WPT"]
};
const auto local = std::vector<size_t>{db["TRA_DIM"], db["TRA_DIM"]};
RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
else {
kernel.SetArgument(0, static_cast<int>(src_one));
kernel.SetArgument(1, static_cast<int>(src_two));
kernel.SetArgument(2, static_cast<int>(src_ld));
kernel.SetArgument(3, static_cast<int>(src_offset));
kernel.SetArgument(4, src());
kernel.SetArgument(5, static_cast<int>(dest_one));
kernel.SetArgument(6, static_cast<int>(dest_two));
kernel.SetArgument(7, static_cast<int>(dest_ld));
kernel.SetArgument(8, static_cast<int>(dest_offset));
kernel.SetArgument(9, dest());
kernel.SetArgument(10, GetRealArg(alpha));
if (do_pad) {
kernel.SetArgument(11, static_cast<int>(do_conjugate));
}
else {
kernel.SetArgument(11, static_cast<int>(upper));
kernel.SetArgument(12, static_cast<int>(lower));
kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero));
}
const auto global = std::vector<size_t>{
Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]),
Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"])
};
const auto local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"]};
RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
// Launches the kernel and returns the error code. Uses global and local thread sizes based on
// parameters in the database.
if (do_transpose) {
if (use_fast_kernel) {
const auto global = std::vector<size_t>{
dest_one / db["TRA_WPT"],
dest_two / db["TRA_WPT"]
};
const auto local = std::vector<size_t>{db["TRA_DIM"], db["TRA_DIM"]};
return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
else {
const auto global = std::vector<size_t>{
Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]),
Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"])
};
const auto local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"]};
return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
}
else {
if (use_fast_kernel) {
const auto global = std::vector<size_t>{
dest_one / db["COPY_VW"],
dest_two / db["COPY_WPT"]
};
const auto local = std::vector<size_t>{db["COPY_DIMX"], db["COPY_DIMY"]};
RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
else {
if (use_fast_kernel) {
const auto global = std::vector<size_t>{
dest_one / db["COPY_VW"],
dest_two / db["COPY_WPT"]
};
const auto local = std::vector<size_t>{db["COPY_DIMX"], db["COPY_DIMY"]};
return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
else {
const auto global = std::vector<size_t>{
Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]),
Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"])
};
const auto local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"]};
return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
const auto global = std::vector<size_t>{
Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]),
Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"])
};
const auto local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"]};
RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
} catch (...) { return StatusCode::kInvalidKernel; }
}
}
// =================================================================================================

View File

@ -22,74 +22,64 @@ namespace clblast {
// Constructor: forwards to base class constructor
// NOTE(review): this is a rendered diff hunk, so two variants of the constructor are interleaved
// and the text is not compilable as shown. In one variant Routine receives five arguments and the
// #include'd kernel file is assigned to source_string_ inside the body; in the other the
// #include'd kernel file is handed to Routine inside a trailing brace-enclosed argument that
// follows an empty `{}`. Presumably the first variant is the pre-change side -- confirm upstream.
template <typename T>
Xamax<T>::Xamax(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/xamax.opencl"
;
}) {
}
// =================================================================================================
// The main routine
// Runs the "Xamax" kernel followed by the "XamaxEpilogue" kernel from the cached program.
//   n                          number of elements; n == 0 is rejected with kInvalidDimension
//   imax_buffer, imax_offset   index output; passed to the epilogue kernel only
//   x_buffer, x_offset, x_inc  input vector description; passed to the main kernel only
// The main kernel also receives two temporaries of 2*db_["WGS2"] elements each (one of T, one of
// unsigned int); the epilogue kernel receives the same two temporaries.
// NOTE(review): this is a rendered diff hunk -- every statement appears twice and the text is not
// compilable as shown. One variant returns StatusCode, checks ErrorIn(status) after each call and
// maps any exception to kInvalidKernel; the other returns void, throws BLASError for n == 0 and
// ignores the return values of the Test*/RunKernel calls. Presumably the StatusCode variant is the
// pre-change side -- confirm upstream.
template <typename T>
StatusCode Xamax<T>::DoAmax(const size_t n,
const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
void Xamax<T>::DoAmax(const size_t n,
const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
// Makes sure all dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorIndex(1, imax_buffer, imax_offset);
if (ErrorIn(status)) { return status; }
TestVectorX(n, x_buffer, x_offset, x_inc);
TestVectorIndex(1, imax_buffer, imax_offset);
// Retrieves the Xamax kernels from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xamax");
auto kernel2 = Kernel(program, "XamaxEpilogue");
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xamax");
auto kernel2 = Kernel(program, "XamaxEpilogue");
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];
auto temp_buffer1 = Buffer<T>(context_, temp_size);
auto temp_buffer2 = Buffer<unsigned int>(context_, temp_size);
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];
auto temp_buffer1 = Buffer<T>(context_, temp_size);
auto temp_buffer2 = Buffer<unsigned int>(context_, temp_size);
// Sets the kernel arguments
kernel1.SetArgument(0, static_cast<int>(n));
kernel1.SetArgument(1, x_buffer());
kernel1.SetArgument(2, static_cast<int>(x_offset));
kernel1.SetArgument(3, static_cast<int>(x_inc));
kernel1.SetArgument(4, temp_buffer1());
kernel1.SetArgument(5, temp_buffer2());
// Sets the kernel arguments
kernel1.SetArgument(0, static_cast<int>(n));
kernel1.SetArgument(1, x_buffer());
kernel1.SetArgument(2, static_cast<int>(x_offset));
kernel1.SetArgument(3, static_cast<int>(x_inc));
kernel1.SetArgument(4, temp_buffer1());
kernel1.SetArgument(5, temp_buffer2());
// Event waiting list
auto eventWaitList = std::vector<Event>();
// Event waiting list
auto eventWaitList = std::vector<Event>();
// Launches the main kernel
// Global size is WGS1*temp_size with a local size of WGS1 (presumably temp_size work-groups).
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(kernelEvent);
// Launches the main kernel
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
// The main kernel's event is handed to the epilogue launch below via eventWaitList
eventWaitList.push_back(kernelEvent);
// Sets the arguments for the epilogue kernel
kernel2.SetArgument(0, temp_buffer1());
kernel2.SetArgument(1, temp_buffer2());
kernel2.SetArgument(2, imax_buffer());
kernel2.SetArgument(3, static_cast<int>(imax_offset));
// Sets the arguments for the epilogue kernel
kernel2.SetArgument(0, temp_buffer1());
kernel2.SetArgument(1, temp_buffer2());
kernel2.SetArgument(2, imax_buffer());
kernel2.SetArgument(3, static_cast<int>(imax_offset));
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the epilogue kernel
// Global and local size are both WGS2; the launch signals the routine's own event_
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
}
// =================================================================================================

View File

@ -28,9 +28,9 @@ class Xamax: public Routine {
Xamax(Queue &queue, EventPointer event, const std::string &name = "AMAX");
// Templated-precision implementation of the routine
// NOTE(review): rendered diff -- the declaration is listed twice with identical parameters; only
// the return type differs (StatusCode vs. void). Presumably StatusCode is the pre-change side.
StatusCode DoAmax(const size_t n,
const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
void DoAmax(const size_t n,
const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
};
// =================================================================================================

View File

@ -22,71 +22,61 @@ namespace clblast {
// Constructor: forwards to base class constructor
// NOTE(review): this is a rendered diff hunk, so two variants of the constructor are interleaved
// and the text is not compilable as shown. In one variant Routine receives five arguments and the
// #include'd kernel file is assigned to source_string_ inside the body; in the other the
// #include'd kernel file is handed to Routine inside a trailing brace-enclosed argument that
// follows an empty `{}`. Presumably the first variant is the pre-change side -- confirm upstream.
template <typename T>
Xasum<T>::Xasum(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/xasum.opencl"
;
}) {
}
// =================================================================================================
// The main routine
// Runs the "Xasum" kernel followed by the "XasumEpilogue" kernel from the cached program.
//   n                          number of elements; n == 0 is rejected with kInvalidDimension
//   asum_buffer, asum_offset   result location; passed to the epilogue kernel only
//   x_buffer, x_offset, x_inc  input vector description; passed to the main kernel only
// The main kernel also receives a temporary Buffer<T> of 2*db_["WGS2"] elements, which is then
// passed on to the epilogue kernel.
// NOTE(review): this is a rendered diff hunk -- every statement appears twice and the text is not
// compilable as shown. One variant returns StatusCode, checks ErrorIn(status) after each call and
// maps any exception to kInvalidKernel; the other returns void, throws BLASError for n == 0 and
// ignores the return values of the Test*/RunKernel calls. Presumably the StatusCode variant is the
// pre-change side -- confirm upstream.
template <typename T>
StatusCode Xasum<T>::DoAsum(const size_t n,
const Buffer<T> &asum_buffer, const size_t asum_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
void Xasum<T>::DoAsum(const size_t n,
const Buffer<T> &asum_buffer, const size_t asum_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
// Makes sure all dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorScalar(1, asum_buffer, asum_offset);
if (ErrorIn(status)) { return status; }
TestVectorX(n, x_buffer, x_offset, x_inc);
TestVectorScalar(1, asum_buffer, asum_offset);
// Retrieves the Xasum kernels from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xasum");
auto kernel2 = Kernel(program, "XasumEpilogue");
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xasum");
auto kernel2 = Kernel(program, "XasumEpilogue");
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];
auto temp_buffer = Buffer<T>(context_, temp_size);
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];
auto temp_buffer = Buffer<T>(context_, temp_size);
// Sets the kernel arguments
kernel1.SetArgument(0, static_cast<int>(n));
kernel1.SetArgument(1, x_buffer());
kernel1.SetArgument(2, static_cast<int>(x_offset));
kernel1.SetArgument(3, static_cast<int>(x_inc));
kernel1.SetArgument(4, temp_buffer());
// Sets the kernel arguments
kernel1.SetArgument(0, static_cast<int>(n));
kernel1.SetArgument(1, x_buffer());
kernel1.SetArgument(2, static_cast<int>(x_offset));
kernel1.SetArgument(3, static_cast<int>(x_inc));
kernel1.SetArgument(4, temp_buffer());
// Event waiting list
auto eventWaitList = std::vector<Event>();
// Event waiting list
auto eventWaitList = std::vector<Event>();
// Launches the main kernel
// Global size is WGS1*temp_size with a local size of WGS1 (presumably temp_size work-groups).
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(kernelEvent);
// Launches the main kernel
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
// The main kernel's event is handed to the epilogue launch below via eventWaitList
eventWaitList.push_back(kernelEvent);
// Sets the arguments for the epilogue kernel
kernel2.SetArgument(0, temp_buffer());
kernel2.SetArgument(1, asum_buffer());
kernel2.SetArgument(2, static_cast<int>(asum_offset));
// Sets the arguments for the epilogue kernel
kernel2.SetArgument(0, temp_buffer());
kernel2.SetArgument(1, asum_buffer());
kernel2.SetArgument(2, static_cast<int>(asum_offset));
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the epilogue kernel
// Global and local size are both WGS2; the launch signals the routine's own event_
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
}
// =================================================================================================

View File

@ -28,9 +28,9 @@ class Xasum: public Routine {
Xasum(Queue &queue, EventPointer event, const std::string &name = "ASUM");
// Templated-precision implementation of the routine
// NOTE(review): rendered diff -- the declaration is listed twice with identical parameters; only
// the return type differs (StatusCode vs. void). Presumably StatusCode is the pre-change side.
StatusCode DoAsum(const size_t n,
const Buffer<T> &asum_buffer, const size_t asum_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
void DoAsum(const size_t n,
const Buffer<T> &asum_buffer, const size_t asum_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
};
// =================================================================================================

View File

@ -22,29 +22,26 @@ namespace clblast {
// Constructor: forwards to base class constructor
// NOTE(review): this is a rendered diff hunk, so two variants of the constructor are interleaved
// and the text is not compilable as shown. In one variant Routine receives five arguments and the
// two #include'd kernel files are assigned to source_string_ inside the body; in the other the
// same two files are handed to Routine inside a trailing brace-enclosed argument that follows an
// empty `{}`. Presumably the first variant is the pre-change side -- confirm upstream.
template <typename T>
Xaxpy<T>::Xaxpy(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/level1.opencl"
#include "../../kernels/level1/xaxpy.opencl"
;
}) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
void Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// Makes sure all dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorY(n, y_buffer, y_offset, y_inc);
if (ErrorIn(status)) { return status; }
TestVectorX(n, x_buffer, x_offset, x_inc);
TestVectorY(n, y_buffer, y_offset, y_inc);
// Determines whether or not the fast-version can be used
bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
@ -55,45 +52,39 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
auto kernel_name = (use_fast_kernel) ? "XaxpyFast" : "Xaxpy";
// Retrieves the Xaxpy kernel from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, GetRealArg(alpha));
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, y_buffer());
}
else {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, GetRealArg(alpha));
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, static_cast<int>(x_offset));
kernel.SetArgument(4, static_cast<int>(x_inc));
kernel.SetArgument(5, y_buffer());
kernel.SetArgument(6, static_cast<int>(y_offset));
kernel.SetArgument(7, static_cast<int>(y_inc));
}
// Sets the kernel arguments
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, GetRealArg(alpha));
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, y_buffer());
}
else {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, GetRealArg(alpha));
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, static_cast<int>(x_offset));
kernel.SetArgument(4, static_cast<int>(x_inc));
kernel.SetArgument(5, y_buffer());
kernel.SetArgument(6, static_cast<int>(y_offset));
kernel.SetArgument(7, static_cast<int>(y_inc));
}
// Launches the kernel
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
if (ErrorIn(status)) { return status; }
// Successfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the kernel
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
}
// =================================================================================================

View File

@ -28,9 +28,9 @@ class Xaxpy: public Routine {
Xaxpy(Queue &queue, EventPointer event, const std::string &name = "AXPY");
// Templated-precision implementation of the routine
// NOTE(review): rendered diff -- the declaration is listed twice with identical parameters; only
// the return type differs (StatusCode vs. void). Presumably StatusCode is the pre-change side.
StatusCode DoAxpy(const size_t n, const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
void DoAxpy(const size_t n, const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================

View File

@ -22,29 +22,26 @@ namespace clblast {
// Constructor: forwards to base class constructor
// NOTE(review): this is a rendered diff hunk, so two variants of the constructor are interleaved
// and the text is not compilable as shown. In one variant Routine receives five arguments and the
// two #include'd kernel files are assigned to source_string_ inside the body; in the other the
// same two files are handed to Routine inside a trailing brace-enclosed argument that follows an
// empty `{}`. Presumably the first variant is the pre-change side -- confirm upstream.
template <typename T>
Xcopy<T>::Xcopy(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/level1.opencl"
#include "../../kernels/level1/xcopy.opencl"
;
}) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xcopy<T>::DoCopy(const size_t n,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
void Xcopy<T>::DoCopy(const size_t n,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// Makes sure all dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorY(n, y_buffer, y_offset, y_inc);
if (ErrorIn(status)) { return status; }
TestVectorX(n, x_buffer, x_offset, x_inc);
TestVectorY(n, y_buffer, y_offset, y_inc);
// Determines whether or not the fast-version can be used
bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
@ -55,43 +52,37 @@ StatusCode Xcopy<T>::DoCopy(const size_t n,
auto kernel_name = (use_fast_kernel) ? "XcopyFast" : "Xcopy";
// Retrieves the Xcopy kernel from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, x_buffer());
kernel.SetArgument(2, y_buffer());
}
else {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, x_buffer());
kernel.SetArgument(2, static_cast<int>(x_offset));
kernel.SetArgument(3, static_cast<int>(x_inc));
kernel.SetArgument(4, y_buffer());
kernel.SetArgument(5, static_cast<int>(y_offset));
kernel.SetArgument(6, static_cast<int>(y_inc));
}
// Sets the kernel arguments
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, x_buffer());
kernel.SetArgument(2, y_buffer());
}
else {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, x_buffer());
kernel.SetArgument(2, static_cast<int>(x_offset));
kernel.SetArgument(3, static_cast<int>(x_inc));
kernel.SetArgument(4, y_buffer());
kernel.SetArgument(5, static_cast<int>(y_offset));
kernel.SetArgument(6, static_cast<int>(y_inc));
}
// Launches the kernel
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
if (ErrorIn(status)) { return status; }
// Successfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the kernel
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
}
// =================================================================================================

View File

@ -28,9 +28,9 @@ class Xcopy: public Routine {
Xcopy(Queue &queue, EventPointer event, const std::string &name = "COPY");
// Templated-precision implementation of the routine
// NOTE(review): rendered diff -- the declaration is listed twice with identical parameters; only
// the return type differs (StatusCode vs. void). Presumably StatusCode is the pre-change side.
StatusCode DoCopy(const size_t n,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
void DoCopy(const size_t n,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================

View File

@ -22,79 +22,68 @@ namespace clblast {
// Constructor: forwards to base class constructor
// NOTE(review): this is a rendered diff hunk, so two variants of the constructor are interleaved
// and the text is not compilable as shown. In one variant Routine receives five arguments and the
// #include'd kernel file is assigned to source_string_ inside the body; in the other the
// #include'd kernel file is handed to Routine inside a trailing brace-enclosed argument that
// follows an empty `{}`. Presumably the first variant is the pre-change side -- confirm upstream.
template <typename T>
Xdot<T>::Xdot(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/xdot.opencl"
;
}) {
}
// =================================================================================================
// The main routine
// Runs the "Xdot" kernel followed by the "XdotEpilogue" kernel from the cached program.
//   n                          number of elements; n == 0 is rejected with kInvalidDimension
//   dot_buffer, dot_offset     result location; passed to the epilogue kernel only
//   x_buffer, x_offset, x_inc  first input vector description; passed to the main kernel
//   y_buffer, y_offset, y_inc  second input vector description; passed to the main kernel
//   do_conjugate               forwarded to the main kernel as an int (argument 8)
// The main kernel also receives a temporary Buffer<T> of 2*db_["WGS2"] elements, which is then
// passed on to the epilogue kernel.
// NOTE(review): this is a rendered diff hunk -- every statement appears twice and the text is not
// compilable as shown. One variant returns StatusCode, checks ErrorIn(status) after each call and
// maps any exception to kInvalidKernel; the other returns void, throws BLASError for n == 0 and
// ignores the return values of the Test*/RunKernel calls. Presumably the StatusCode variant is the
// pre-change side -- confirm upstream.
template <typename T>
StatusCode Xdot<T>::DoDot(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const bool do_conjugate) {
void Xdot<T>::DoDot(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const bool do_conjugate) {
// Makes sure all dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorY(n, y_buffer, y_offset, y_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorScalar(1, dot_buffer, dot_offset);
if (ErrorIn(status)) { return status; }
TestVectorX(n, x_buffer, x_offset, x_inc);
TestVectorY(n, y_buffer, y_offset, y_inc);
TestVectorScalar(1, dot_buffer, dot_offset);
// Retrieves the Xdot kernels from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xdot");
auto kernel2 = Kernel(program, "XdotEpilogue");
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xdot");
auto kernel2 = Kernel(program, "XdotEpilogue");
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];
auto temp_buffer = Buffer<T>(context_, temp_size);
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];
auto temp_buffer = Buffer<T>(context_, temp_size);
// Sets the kernel arguments
kernel1.SetArgument(0, static_cast<int>(n));
kernel1.SetArgument(1, x_buffer());
kernel1.SetArgument(2, static_cast<int>(x_offset));
kernel1.SetArgument(3, static_cast<int>(x_inc));
kernel1.SetArgument(4, y_buffer());
kernel1.SetArgument(5, static_cast<int>(y_offset));
kernel1.SetArgument(6, static_cast<int>(y_inc));
kernel1.SetArgument(7, temp_buffer());
kernel1.SetArgument(8, static_cast<int>(do_conjugate));
// Sets the kernel arguments
kernel1.SetArgument(0, static_cast<int>(n));
kernel1.SetArgument(1, x_buffer());
kernel1.SetArgument(2, static_cast<int>(x_offset));
kernel1.SetArgument(3, static_cast<int>(x_inc));
kernel1.SetArgument(4, y_buffer());
kernel1.SetArgument(5, static_cast<int>(y_offset));
kernel1.SetArgument(6, static_cast<int>(y_inc));
kernel1.SetArgument(7, temp_buffer());
kernel1.SetArgument(8, static_cast<int>(do_conjugate));
// Event waiting list
auto eventWaitList = std::vector<Event>();
// Event waiting list
auto eventWaitList = std::vector<Event>();
// Launches the main kernel
// Global size is WGS1*temp_size with a local size of WGS1 (presumably temp_size work-groups).
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(kernelEvent);
// Launches the main kernel
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
// The main kernel's event is handed to the epilogue launch below via eventWaitList
eventWaitList.push_back(kernelEvent);
// Sets the arguments for the epilogue kernel
kernel2.SetArgument(0, temp_buffer());
kernel2.SetArgument(1, dot_buffer());
kernel2.SetArgument(2, static_cast<int>(dot_offset));
// Sets the arguments for the epilogue kernel
kernel2.SetArgument(0, temp_buffer());
kernel2.SetArgument(1, dot_buffer());
kernel2.SetArgument(2, static_cast<int>(dot_offset));
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the epilogue kernel
// Global and local size are both WGS2; the launch signals the routine's own event_
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
}
// =================================================================================================

View File

@ -28,11 +28,11 @@ class Xdot: public Routine {
Xdot(Queue &queue, EventPointer event, const std::string &name = "DOT");
// Templated-precision implementation of the routine
// do_conjugate defaults to false when the caller omits it.
// NOTE(review): rendered diff -- the declaration is listed twice with identical parameters; only
// the return type differs (StatusCode vs. void). Presumably StatusCode is the pre-change side.
StatusCode DoDot(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const bool do_conjugate = false);
void DoDot(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const bool do_conjugate = false);
};
// =================================================================================================

View File

@ -29,14 +29,14 @@ Xdotc<T>::Xdotc(Queue &queue, EventPointer event, const std::string &name):
// The main routine
// Thin wrapper: forwards every argument unchanged to DoDot and passes `true` as the final
// (do_conjugate) argument.
// NOTE(review): rendered diff hunk -- two variants are interleaved and the text is not compilable
// as shown. One returns the StatusCode produced by DoDot; the other returns void and simply calls
// DoDot. Presumably the StatusCode variant is the pre-change side -- confirm upstream.
template <typename T>
StatusCode Xdotc<T>::DoDotc(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
return DoDot(n, dot_buffer, dot_offset,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
true);
void Xdotc<T>::DoDotc(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
DoDot(n, dot_buffer, dot_offset,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
true);
}
// =================================================================================================

View File

@ -31,10 +31,10 @@ class Xdotc: public Xdot<T> {
Xdotc(Queue &queue, EventPointer event, const std::string &name = "DOTC");
// Templated-precision implementation of the routine
// NOTE(review): rendered diff -- the declaration is listed twice with identical parameters; only
// the return type differs (StatusCode vs. void). Presumably StatusCode is the pre-change side.
StatusCode DoDotc(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
void DoDotc(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================

View File

@ -28,14 +28,14 @@ Xdotu<T>::Xdotu(Queue &queue, EventPointer event, const std::string &name):
// The main routine: forwards every argument to DoDot and passes `false` as the trailing argument
// (the `do_conjugate` parameter in DoDot's declaration).
// NOTE(review): the definition appears twice below, first returning StatusCode (propagating
// DoDot's result) and then returning void (plain call). This looks like the before/after of one
// change listed without diff markers -- confirm which variant is current.
template <typename T>
StatusCode Xdotu<T>::DoDotu(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
return DoDot(n, dot_buffer, dot_offset,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
false);
void Xdotu<T>::DoDotu(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
DoDot(n, dot_buffer, dot_offset,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
false);
}
// =================================================================================================

View File

@ -31,10 +31,10 @@ class Xdotu: public Xdot<T> {
Xdotu(Queue &queue, EventPointer event, const std::string &name = "DOTU");
// Templated-precision implementation of the routine. Takes the element count `n`, the `dot`
// buffer with its offset, and buffer/offset/increment triples for x and y.
// NOTE(review): declared twice below (StatusCode and void return types); presumably the
// before/after of one change listed without diff markers -- confirm which is current.
StatusCode DoDotu(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
void DoDotu(const size_t n,
const Buffer<T> &dot_buffer, const size_t dot_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================

View File

@ -35,10 +35,10 @@ class Xmax: public Xamax<T> {
// Forwards to the regular absolute version. The implementation difference is realised in the
// kernel through a pre-processor macro based on the name of the routine.
// DoMax hands all six arguments, unchanged and in the same order, to DoAmax.
// NOTE(review): two variants are listed back to back (one returns DoAmax's StatusCode, one is
// void and only calls DoAmax); presumably the before/after of one change -- confirm.
StatusCode DoMax(const size_t n,
const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
return DoAmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc);
void DoMax(const size_t n,
const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
DoAmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc);
}
};

View File

@ -35,10 +35,10 @@ class Xmin: public Xamax<T> {
// Forwards to the regular max-absolute version. The implementation difference is realised in the
// kernel through a pre-processor macro based on the name of the routine.
// DoMin hands all six arguments, unchanged and in the same order, to DoAmax.
// NOTE(review): two variants are listed back to back (one returns DoAmax's StatusCode, one is
// void and only calls DoAmax); presumably the before/after of one change -- confirm.
StatusCode DoMin(const size_t n,
const Buffer<unsigned int> &imin_buffer, const size_t imin_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
return DoAmax(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc);
void DoMin(const size_t n,
const Buffer<unsigned int> &imin_buffer, const size_t imin_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
DoAmax(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc);
}
};

View File

@ -22,71 +22,61 @@ namespace clblast {
// Constructor: forwards to base class constructor, passing the queue, event and routine name,
// the list {"Xdot"} and PrecisionValue<T>().
// NOTE(review): two variants are interleaved below. One assigns the #include'd kernel text to
// source_string_ inside the constructor body; the other passes it to Routine as an extra braced
// argument after an empty `{}`. Presumably the before/after of one change -- confirm.
template <typename T>
Xnrm2<T>::Xnrm2(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/xnrm2.opencl"
;
}) {
}
// =================================================================================================
// The main routine: launches the "Xnrm2" kernel followed by the "Xnrm2Epilogue" kernel. The
// first kernel receives n, the x buffer/offset/increment and a temporary buffer of
// 2*db_["WGS2"] elements; the epilogue receives that temporary buffer plus nrm2_buffer and
// nrm2_offset. The first kernel's event is placed in the wait list given to the epilogue launch.
// NOTE(review): two variants of the body are interleaved below. The StatusCode variant returns
// kInvalidDimension for n == 0, returns any failing status from the Test*/RunKernel calls, and
// maps every exception to kInvalidKernel. The void variant throws
// BLASError(StatusCode::kInvalidDimension) for n == 0 and has no local try/catch. Presumably the
// before/after of one change listed without diff markers -- confirm which is current.
template <typename T>
StatusCode Xnrm2<T>::DoNrm2(const size_t n,
const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
void Xnrm2<T>::DoNrm2(const size_t n,
const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
// Makes sure all dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorScalar(1, nrm2_buffer, nrm2_offset);
if (ErrorIn(status)) { return status; }
TestVectorX(n, x_buffer, x_offset, x_inc);
TestVectorScalar(1, nrm2_buffer, nrm2_offset);
// Retrieves the Xnrm2 kernels from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xnrm2");
auto kernel2 = Kernel(program, "Xnrm2Epilogue");
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel1 = Kernel(program, "Xnrm2");
auto kernel2 = Kernel(program, "Xnrm2Epilogue");
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];
auto temp_buffer = Buffer<T>(context_, temp_size);
// Creates the buffer for intermediate values
auto temp_size = 2*db_["WGS2"];
auto temp_buffer = Buffer<T>(context_, temp_size);
// Sets the kernel arguments
kernel1.SetArgument(0, static_cast<int>(n));
kernel1.SetArgument(1, x_buffer());
kernel1.SetArgument(2, static_cast<int>(x_offset));
kernel1.SetArgument(3, static_cast<int>(x_inc));
kernel1.SetArgument(4, temp_buffer());
// Sets the kernel arguments
kernel1.SetArgument(0, static_cast<int>(n));
kernel1.SetArgument(1, x_buffer());
kernel1.SetArgument(2, static_cast<int>(x_offset));
kernel1.SetArgument(3, static_cast<int>(x_inc));
kernel1.SetArgument(4, temp_buffer());
// Event waiting list
auto eventWaitList = std::vector<Event>();
// Event waiting list
auto eventWaitList = std::vector<Event>();
// Launches the main kernel
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(kernelEvent);
// Launches the main kernel
auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
auto local1 = std::vector<size_t>{db_["WGS1"]};
auto kernelEvent = Event();
RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
eventWaitList.push_back(kernelEvent);
// Sets the arguments for the epilogue kernel
kernel2.SetArgument(0, temp_buffer());
kernel2.SetArgument(1, nrm2_buffer());
kernel2.SetArgument(2, static_cast<int>(nrm2_offset));
// Sets the arguments for the epilogue kernel
kernel2.SetArgument(0, temp_buffer());
kernel2.SetArgument(1, nrm2_buffer());
kernel2.SetArgument(2, static_cast<int>(nrm2_offset));
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the epilogue kernel
auto global2 = std::vector<size_t>{db_["WGS2"]};
auto local2 = std::vector<size_t>{db_["WGS2"]};
RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
}
// =================================================================================================

View File

@ -28,9 +28,9 @@ class Xnrm2: public Routine {
Xnrm2(Queue &queue, EventPointer event, const std::string &name = "NRM2");
// Templated-precision implementation of the routine. Takes the element count `n`, the `nrm2`
// buffer with its offset, and the buffer/offset/increment triple for x.
// NOTE(review): declared twice below (StatusCode and void return types); presumably the
// before/after of one change listed without diff markers -- confirm which is current.
StatusCode DoNrm2(const size_t n,
const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
void DoNrm2(const size_t n,
const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
};
// =================================================================================================

View File

@ -22,26 +22,24 @@ namespace clblast {
// Constructor: forwards to base class constructor, passing the queue, event and routine name,
// the list {"Xaxpy"} and PrecisionValue<T>(). The kernel text comes from two #include'd files
// (level1.opencl followed by xscal.opencl).
// NOTE(review): two variants are interleaved below. One assigns the #include'd kernel text to
// source_string_ inside the constructor body; the other passes it to Routine as an extra braced
// argument after an empty `{}`. Presumably the before/after of one change -- confirm.
template <typename T>
Xscal<T>::Xscal(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/level1.opencl"
#include "../../kernels/level1/xscal.opencl"
;
}) {
}
// =================================================================================================
// The main routine: validates n and the x vector, selects either the "XscalFast" or the "Xscal"
// kernel based on use_fast_kernel, sets the kernel arguments and launches it. The fast kernel
// takes only n, alpha and the x buffer; the regular kernel additionally takes x_offset and x_inc.
// NOTE(review): this listing spans two hunks (the `@ ... @` line below is a hunk header and part
// of the use_fast_kernel condition is not shown), and two variants of the body are interleaved.
// The StatusCode variant returns error codes and maps every exception to kInvalidKernel; the
// void variant throws BLASError(StatusCode::kInvalidDimension) for n == 0 and has no local
// try/catch. Presumably the before/after of one change -- confirm which is current.
template <typename T>
StatusCode Xscal<T>::DoScal(const size_t n, const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
void Xscal<T>::DoScal(const size_t n, const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
// Makes sure all dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vector for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
TestVectorX(n, x_buffer, x_offset, x_inc);
// Determines whether or not the fast-version can be used
bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
@ -51,41 +49,35 @@ StatusCode Xscal<T>::DoScal(const size_t n, const T alpha,
auto kernel_name = (use_fast_kernel) ? "XscalFast" : "Xscal";
// Retrieves the Xscal kernel from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, alpha);
kernel.SetArgument(2, x_buffer());
}
else {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, alpha);
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, static_cast<int>(x_offset));
kernel.SetArgument(4, static_cast<int>(x_inc));
}
// Sets the kernel arguments
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, alpha);
kernel.SetArgument(2, x_buffer());
}
else {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, alpha);
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, static_cast<int>(x_offset));
kernel.SetArgument(4, static_cast<int>(x_inc));
}
// Launches the kernel
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
if (ErrorIn(status)) { return status; }
// Successfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the kernel
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
}
// =================================================================================================

View File

@ -28,8 +28,8 @@ class Xscal: public Routine {
Xscal(Queue &queue, EventPointer event, const std::string &name = "SCAL");
// Templated-precision implementation of the routine. Takes the element count `n`, the scalar
// `alpha`, and the buffer/offset/increment triple for x.
// NOTE(review): declared twice below (StatusCode and void return types); presumably the
// before/after of one change listed without diff markers -- confirm which is current.
StatusCode DoScal(const size_t n, const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
void DoScal(const size_t n, const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
};
// =================================================================================================

View File

@ -35,10 +35,10 @@ class Xsum: public Xasum<T> {
// Forwards to the regular absolute version. The implementation difference is realised in the
// kernel through a pre-processor macro based on the name of the routine.
// DoSum hands all six arguments, unchanged and in the same order, to DoAsum.
// NOTE(review): two variants are listed back to back (one returns DoAsum's StatusCode, one is
// void and only calls DoAsum); presumably the before/after of one change -- confirm.
StatusCode DoSum(const size_t n,
const Buffer<T> &sum_buffer, const size_t sum_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
return DoAsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc);
void DoSum(const size_t n,
const Buffer<T> &sum_buffer, const size_t sum_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
DoAsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc);
}
};

View File

@ -22,29 +22,26 @@ namespace clblast {
// Constructor: forwards to base class constructor, passing the queue, event and routine name,
// the list {"Xaxpy"} and PrecisionValue<T>(). The kernel text comes from two #include'd files
// (level1.opencl followed by xswap.opencl).
// NOTE(review): two variants are interleaved below. One assigns the #include'd kernel text to
// source_string_ inside the constructor body; the other passes it to Routine as an extra braced
// argument after an empty `{}`. Presumably the before/after of one change -- confirm.
template <typename T>
Xswap<T>::Xswap(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level1/level1.opencl"
#include "../../kernels/level1/xswap.opencl"
;
}) {
}
// =================================================================================================
// The main routine: validates n and both vectors, selects either the "XswapFast" or the "Xswap"
// kernel based on use_fast_kernel, sets the kernel arguments and launches it. The fast kernel
// takes only n and the two buffers; the regular kernel additionally takes the offsets and
// increments of x and y.
// NOTE(review): this listing spans two hunks (the `@ ... @` line below is a hunk header and part
// of the use_fast_kernel condition is not shown), and two variants of the body are interleaved.
// The StatusCode variant returns error codes and maps every exception to kInvalidKernel; the
// void variant throws BLASError(StatusCode::kInvalidDimension) for n == 0 and has no local
// try/catch. Presumably the before/after of one change -- confirm which is current.
template <typename T>
StatusCode Xswap<T>::DoSwap(const size_t n,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
void Xswap<T>::DoSwap(const size_t n,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// Makes sure all dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the vectors for validity
auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorY(n, y_buffer, y_offset, y_inc);
if (ErrorIn(status)) { return status; }
TestVectorX(n, x_buffer, x_offset, x_inc);
TestVectorY(n, y_buffer, y_offset, y_inc);
// Determines whether or not the fast-version can be used
bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
@ -55,43 +52,37 @@ StatusCode Xswap<T>::DoSwap(const size_t n,
auto kernel_name = (use_fast_kernel) ? "XswapFast" : "Xswap";
// Retrieves the Xswap kernel from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, x_buffer());
kernel.SetArgument(2, y_buffer());
}
else {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, x_buffer());
kernel.SetArgument(2, static_cast<int>(x_offset));
kernel.SetArgument(3, static_cast<int>(x_inc));
kernel.SetArgument(4, y_buffer());
kernel.SetArgument(5, static_cast<int>(y_offset));
kernel.SetArgument(6, static_cast<int>(y_inc));
}
// Sets the kernel arguments
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, x_buffer());
kernel.SetArgument(2, y_buffer());
}
else {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, x_buffer());
kernel.SetArgument(2, static_cast<int>(x_offset));
kernel.SetArgument(3, static_cast<int>(x_inc));
kernel.SetArgument(4, y_buffer());
kernel.SetArgument(5, static_cast<int>(y_offset));
kernel.SetArgument(6, static_cast<int>(y_inc));
}
// Launches the kernel
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
status = RunKernel(kernel, queue_, device_, global, local, event_);
}
if (ErrorIn(status)) { return status; }
// Successfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the kernel
if (use_fast_kernel) {
auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
auto local = std::vector<size_t>{db_["WGS"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
else {
auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
auto local = std::vector<size_t>{db_["WGS"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
}
// =================================================================================================

View File

@ -28,9 +28,9 @@ class Xswap: public Routine {
Xswap(Queue &queue, EventPointer event, const std::string &name = "SWAP");
// Templated-precision implementation of the routine. Takes the element count `n` and
// buffer/offset/increment triples for x and y.
// NOTE(review): declared twice below (StatusCode and void return types); presumably the
// before/after of one change listed without diff markers -- confirm which is current.
StatusCode DoSwap(const size_t n,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
void DoSwap(const size_t n,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================

View File

@ -29,13 +29,13 @@ Xgbmv<T>::Xgbmv(Queue &queue, EventPointer event, const std::string &name):
// The main routine: forwards to MatVec with both fast-kernel flags set to false, `parameter` 0,
// `packed` false, and the band counts kl_real/ku_real.
// NOTE(review): this listing spans two hunks (the `@ ... @` line below is a hunk header); the
// lines that compute kl_real and ku_real from `rotated`, kl and ku are not shown here. Two
// variants are interleaved: one returns MatVec's StatusCode, the other is void and only calls
// MatVec. Presumably the before/after of one change -- confirm which is current.
template <typename T>
StatusCode Xgbmv<T>::DoGbmv(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n, const size_t kl, const size_t ku,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
void Xgbmv<T>::DoGbmv(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n, const size_t kl, const size_t ku,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// Reverses the upper and lower band count
auto rotated = (layout == Layout::kRowMajor);
@ -46,13 +46,13 @@ StatusCode Xgbmv<T>::DoGbmv(const Layout layout, const Transpose a_transpose,
// The specific hermitian matrix-accesses are implemented in the kernel guarded by the
// ROUTINE_GBMV define.
bool fast_kernels = false;
return MatVec(layout, a_transpose,
m, n, alpha,
a_buffer, a_offset, a_ld,
x_buffer, x_offset, x_inc, beta,
y_buffer, y_offset, y_inc,
fast_kernels, fast_kernels,
0, false, kl_real, ku_real);
MatVec(layout, a_transpose,
m, n, alpha,
a_buffer, a_offset, a_ld,
x_buffer, x_offset, x_inc, beta,
y_buffer, y_offset, y_inc,
fast_kernels, fast_kernels,
0, false, kl_real, ku_real);
}
// =================================================================================================

View File

@ -33,13 +33,13 @@ class Xgbmv: public Xgemv<T> {
Xgbmv(Queue &queue, EventPointer event, const std::string &name = "GBMV");
// Templated-precision implementation of the routine. Takes the layout and transpose option, the
// sizes m and n, the band counts kl and ku, the scalars alpha and beta, the matrix buffer with
// offset and leading dimension, and buffer/offset/increment triples for x and y.
// NOTE(review): declared twice below (StatusCode and void return types); presumably the
// before/after of one change listed without diff markers -- confirm which is current.
StatusCode DoGbmv(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n, const size_t kl, const size_t ku,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
void DoGbmv(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n, const size_t kl, const size_t ku,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================

View File

@ -22,52 +22,51 @@ namespace clblast {
// Constructor: forwards to base class constructor, passing the queue, event and routine name,
// the list {"Pad", "Xgemv", "XgemvFast", "XgemvFastRot"} and PrecisionValue<T>(). The kernel
// text comes from two #include'd files (xgemv.opencl followed by xgemv_fast.opencl).
// NOTE(review): two variants are interleaved below. One assigns the #include'd kernel text to
// source_string_ inside the constructor body; the other passes it to Routine as an extra braced
// argument after an empty `{}`. Presumably the before/after of one change -- confirm.
template <typename T>
Xgemv<T>::Xgemv(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Pad", "Xgemv", "XgemvFast", "XgemvFastRot"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Pad", "Xgemv", "XgemvFast", "XgemvFastRot"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level2/xgemv.opencl"
#include "../../kernels/level2/xgemv_fast.opencl"
;
}) {
}
// =================================================================================================
// The main routine: forwards to MatVec with both fast-kernel flags set to true and with
// `parameter` 0, `packed` false, and kl = ku = 0.
// NOTE(review): two variants are interleaved below: one returns MatVec's StatusCode, the other
// is void and only calls MatVec. Presumably the before/after of one change listed without diff
// markers -- confirm which is current.
template <typename T>
StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
void Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// Performs the matrix-vector multiplication
return MatVec(layout, a_transpose,
m, n, alpha,
a_buffer, a_offset, a_ld,
x_buffer, x_offset, x_inc, beta,
y_buffer, y_offset, y_inc,
true, true,
0, false, 0, 0); // N/A for this routine
MatVec(layout, a_transpose,
m, n, alpha,
a_buffer, a_offset, a_ld,
x_buffer, x_offset, x_inc, beta,
y_buffer, y_offset, y_inc,
true, true,
0, false, 0, 0); // N/A for this routine
}
// =================================================================================================
// The generic implementation, also suited for other (non general) matrix-vector multiplications.
// Steps visible here: rejects m == 0 or n == 0; validates the matrix (TestMatrixAP when `packed`
// is set, TestMatrixA otherwise) and the x and y vectors; narrows fast_kernel based on the
// offsets and layout flags; fetches the kernel named kernel_name; sets kernel arguments 0-17;
// and launches a one-dimensional kernel with global_size and local_size.
// NOTE(review): this listing spans three hunks (the `@ ... @` lines below are hunk headers), so
// the code that computes a_one, a_two, a_rotated, m_real, n_real, kernel_name, global_size and
// local_size is not shown. Two variants of the body are interleaved: the StatusCode variant
// returns error codes and maps every exception to kInvalidKernel; the void variant throws
// BLASError(StatusCode::kInvalidDimension) for zero sizes and has no local try/catch. Presumably
// the before/after of one change -- confirm which is current.
template <typename T>
StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
bool fast_kernel, bool fast_kernel_rot,
const size_t parameter, const bool packed,
const size_t kl, const size_t ku) {
void Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
bool fast_kernel, bool fast_kernel_rot,
const size_t parameter, const bool packed,
const size_t kl, const size_t ku) {
// Makes sure all dimensions are larger than zero
if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; }
if (m == 0 || n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Computes whether or not the matrix has an alternative layout (row or column-major).
auto a_altlayout = (layout == Layout::kRowMajor);
@ -91,14 +90,10 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
auto a_conjugate = (a_transpose == Transpose::kConjugate);
// Tests the matrix and the vectors for validity
auto status = StatusCode::kSuccess;
if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); }
else { status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); }
if (ErrorIn(status)) { return status; }
status = TestVectorX(n_real, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorY(m_real, y_buffer, y_offset, y_inc);
if (ErrorIn(status)) { return status; }
if (packed) { TestMatrixAP(n, a_buffer, a_offset); }
else { TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); }
TestVectorX(n_real, x_buffer, x_offset, x_inc);
TestVectorY(m_real, y_buffer, y_offset, y_inc);
// Determines whether or not the fast-version can be used
fast_kernel = fast_kernel && (a_offset == 0) && (a_rotated == 0) && (a_conjugate == 0) &&
@ -127,39 +122,33 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
}
// Retrieves the Xgemv kernel from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(m_real));
kernel.SetArgument(1, static_cast<int>(n_real));
kernel.SetArgument(2, GetRealArg(alpha));
kernel.SetArgument(3, GetRealArg(beta));
kernel.SetArgument(4, static_cast<int>(a_rotated));
kernel.SetArgument(5, a_buffer());
kernel.SetArgument(6, static_cast<int>(a_offset));
kernel.SetArgument(7, static_cast<int>(a_ld));
kernel.SetArgument(8, x_buffer());
kernel.SetArgument(9, static_cast<int>(x_offset));
kernel.SetArgument(10, static_cast<int>(x_inc));
kernel.SetArgument(11, y_buffer());
kernel.SetArgument(12, static_cast<int>(y_offset));
kernel.SetArgument(13, static_cast<int>(y_inc));
kernel.SetArgument(14, static_cast<int>(a_conjugate));
kernel.SetArgument(15, static_cast<int>(parameter)); // extra parameter used for symm/herm
kernel.SetArgument(16, static_cast<int>(kl)); // only used for banded matrices
kernel.SetArgument(17, static_cast<int>(ku)); // only used for banded matrices
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(m_real));
kernel.SetArgument(1, static_cast<int>(n_real));
kernel.SetArgument(2, GetRealArg(alpha));
kernel.SetArgument(3, GetRealArg(beta));
kernel.SetArgument(4, static_cast<int>(a_rotated));
kernel.SetArgument(5, a_buffer());
kernel.SetArgument(6, static_cast<int>(a_offset));
kernel.SetArgument(7, static_cast<int>(a_ld));
kernel.SetArgument(8, x_buffer());
kernel.SetArgument(9, static_cast<int>(x_offset));
kernel.SetArgument(10, static_cast<int>(x_inc));
kernel.SetArgument(11, y_buffer());
kernel.SetArgument(12, static_cast<int>(y_offset));
kernel.SetArgument(13, static_cast<int>(y_inc));
kernel.SetArgument(14, static_cast<int>(a_conjugate));
kernel.SetArgument(15, static_cast<int>(parameter)); // extra parameter used for symm/herm
kernel.SetArgument(16, static_cast<int>(kl)); // only used for banded matrices
kernel.SetArgument(17, static_cast<int>(ku)); // only used for banded matrices
// Launches the kernel
auto global = std::vector<size_t>{global_size};
auto local = std::vector<size_t>{local_size};
status = RunKernel(kernel, queue_, device_, global, local, event_);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the kernel
auto global = std::vector<size_t>{global_size};
auto local = std::vector<size_t>{local_size};
RunKernel(kernel, queue_, device_, global, local, event_);
}
// =================================================================================================

View File

@ -28,25 +28,25 @@ class Xgemv: public Routine {
Xgemv(Queue &queue, EventPointer event, const std::string &name = "GEMV");
// Templated-precision implementation of the routine. Takes the layout and transpose option, the
// sizes m and n, the scalars alpha and beta, the matrix buffer with offset and leading
// dimension, and buffer/offset/increment triples for x and y.
// NOTE(review): declared twice below (StatusCode and void return types); presumably the
// before/after of one change listed without diff markers -- confirm which is current.
StatusCode DoGemv(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
void DoGemv(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
// Generic version used also for other matrix-vector multiplications. On top of the DoGemv
// arguments it takes two fast-kernel flags (by value, non-const), an extra `parameter`, a
// `packed` flag, and the band counts kl and ku.
// NOTE(review): declared twice below (StatusCode and void return types); presumably the
// before/after of one change listed without diff markers -- confirm which is current.
StatusCode MatVec(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
bool fast_kernel, bool fast_kernel_rot,
const size_t parameter, const bool packed,
const size_t kl, const size_t ku);
void MatVec(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
bool fast_kernel, bool fast_kernel_rot,
const size_t parameter, const bool packed,
const size_t kl, const size_t ku);
};
// =================================================================================================

View File

@ -22,26 +22,25 @@ namespace clblast {
// Constructor: forwards to base class constructor, passing the queue, event and routine name,
// the list {"Xger"} and PrecisionValue<T>(). The kernel text comes from two #include'd files
// (level2.opencl followed by xger.opencl).
// NOTE(review): two variants are interleaved below. One assigns the #include'd kernel text to
// source_string_ inside the constructor body; the other passes it to Routine as an extra braced
// argument after an empty `{}`. Presumably the before/after of one change -- confirm.
template <typename T>
Xger<T>::Xger(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xger"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Xger"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level2/level2.opencl"
#include "../../kernels/level2/xger.opencl"
;
}) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xger<T>::DoGer(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
void Xger<T>::DoGer(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
// Makes sure all dimensions are larger than zero
if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; }
if (m == 0 || n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Computes whether or not the matrix has an alternative layout (row or column-major).
const auto a_is_rowmajor = (layout == Layout::kRowMajor);
@ -49,44 +48,35 @@ StatusCode Xger<T>::DoGer(const Layout layout,
const auto a_two = (a_is_rowmajor) ? m : n;
// Tests the matrix and the vectors for validity
auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
if (ErrorIn(status)) { return status; }
status = TestVectorX(m, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorY(n, y_buffer, y_offset, y_inc);
if (ErrorIn(status)) { return status; }
TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
TestVectorX(m, x_buffer, x_offset, x_inc);
TestVectorY(n, y_buffer, y_offset, y_inc);
// Retrieves the kernel from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, "Xger");
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, "Xger");
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(a_one));
kernel.SetArgument(1, static_cast<int>(a_two));
kernel.SetArgument(2, GetRealArg(alpha));
kernel.SetArgument(3, x_buffer());
kernel.SetArgument(4, static_cast<int>(x_offset));
kernel.SetArgument(5, static_cast<int>(x_inc));
kernel.SetArgument(6, y_buffer());
kernel.SetArgument(7, static_cast<int>(y_offset));
kernel.SetArgument(8, static_cast<int>(y_inc));
kernel.SetArgument(9, a_buffer());
kernel.SetArgument(10, static_cast<int>(a_offset));
kernel.SetArgument(11, static_cast<int>(a_ld));
kernel.SetArgument(12, static_cast<int>(a_is_rowmajor));
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(a_one));
kernel.SetArgument(1, static_cast<int>(a_two));
kernel.SetArgument(2, GetRealArg(alpha));
kernel.SetArgument(3, x_buffer());
kernel.SetArgument(4, static_cast<int>(x_offset));
kernel.SetArgument(5, static_cast<int>(x_inc));
kernel.SetArgument(6, y_buffer());
kernel.SetArgument(7, static_cast<int>(y_offset));
kernel.SetArgument(8, static_cast<int>(y_inc));
kernel.SetArgument(9, a_buffer());
kernel.SetArgument(10, static_cast<int>(a_offset));
kernel.SetArgument(11, static_cast<int>(a_ld));
kernel.SetArgument(12, static_cast<int>(a_is_rowmajor));
// Launches the kernel
auto a_one_ceiled = Ceil(CeilDiv(a_one, db_["WPT"]), db_["WGS1"]);
auto a_two_ceiled = Ceil(CeilDiv(a_two, db_["WPT"]), db_["WGS2"]);
auto global = std::vector<size_t>{a_one_ceiled, a_two_ceiled};
auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
status = RunKernel(kernel, queue_, device_, global, local, event_);
if (ErrorIn(status)) { return status; }
// Succesfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the kernel
auto a_one_ceiled = Ceil(CeilDiv(a_one, db_["WPT"]), db_["WGS1"]);
auto a_two_ceiled = Ceil(CeilDiv(a_two, db_["WPT"]), db_["WGS2"]);
auto global = std::vector<size_t>{a_one_ceiled, a_two_ceiled};
auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
// =================================================================================================

View File

@ -28,12 +28,12 @@ class Xger: public Routine {
Xger(Queue &queue, EventPointer event, const std::string &name = "GER");
// Templated-precision implementation of the routine
StatusCode DoGer(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
void DoGer(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
};
// =================================================================================================

View File

@ -28,19 +28,19 @@ Xgerc<T>::Xgerc(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
StatusCode Xgerc<T>::DoGerc(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
void Xgerc<T>::DoGerc(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
// Regular Ger operation on complex data, plus conjugation in the kernel guarded by the
// ROUTINE_GERC guard.
return DoGer(layout, m, n, alpha,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
a_buffer, a_offset, a_ld);
DoGer(layout, m, n, alpha,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
a_buffer, a_offset, a_ld);
}
// =================================================================================================

View File

@ -31,12 +31,12 @@ class Xgerc: public Xger<T> {
Xgerc(Queue &queue, EventPointer event, const std::string &name = "GERC");
// Templated-precision implementation of the routine
StatusCode DoGerc(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
void DoGerc(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
};
// =================================================================================================

View File

@ -28,18 +28,18 @@ Xgeru<T>::Xgeru(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
StatusCode Xgeru<T>::DoGeru(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
void Xgeru<T>::DoGeru(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld) {
// Regular Ger operation on complex data
return DoGer(layout, m, n, alpha,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
a_buffer, a_offset, a_ld);
DoGer(layout, m, n, alpha,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
a_buffer, a_offset, a_ld);
}
// =================================================================================================

View File

@ -31,12 +31,12 @@ class Xgeru: public Xger<T> {
Xgeru(Queue &queue, EventPointer event, const std::string &name = "GERU");
// Templated-precision implementation of the routine
StatusCode DoGeru(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
void DoGeru(const Layout layout,
const size_t m, const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
};
// =================================================================================================

View File

@ -29,13 +29,13 @@ Xhbmv<T>::Xhbmv(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
StatusCode Xhbmv<T>::DoHbmv(const Layout layout, const Triangle triangle,
const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
void Xhbmv<T>::DoHbmv(const Layout layout, const Triangle triangle,
const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// The data is either in the upper or lower triangle
size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@ -45,13 +45,13 @@ StatusCode Xhbmv<T>::DoHbmv(const Layout layout, const Triangle triangle,
// The specific hermitian banded matrix-accesses are implemented in the kernel guarded by the
// ROUTINE_HBMV define.
bool fast_kernels = false;
return MatVec(layout, Transpose::kNo,
n, n, alpha,
a_buffer, a_offset, a_ld,
x_buffer, x_offset, x_inc, beta,
y_buffer, y_offset, y_inc,
fast_kernels, fast_kernels,
is_upper, false, k, 0);
MatVec(layout, Transpose::kNo,
n, n, alpha,
a_buffer, a_offset, a_ld,
x_buffer, x_offset, x_inc, beta,
y_buffer, y_offset, y_inc,
fast_kernels, fast_kernels,
is_upper, false, k, 0);
}
// =================================================================================================

View File

@ -33,13 +33,13 @@ class Xhbmv: public Xgemv<T> {
Xhbmv(Queue &queue, EventPointer event, const std::string &name = "HBMV");
// Templated-precision implementation of the routine
StatusCode DoHbmv(const Layout layout, const Triangle triangle,
const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
void DoHbmv(const Layout layout, const Triangle triangle,
const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================

View File

@ -29,13 +29,13 @@ Xhemv<T>::Xhemv(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
StatusCode Xhemv<T>::DoHemv(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
void Xhemv<T>::DoHemv(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// The data is either in the upper or lower triangle
size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@ -45,13 +45,13 @@ StatusCode Xhemv<T>::DoHemv(const Layout layout, const Triangle triangle,
// The specific hermitian matrix-accesses are implemented in the kernel guarded by the
// ROUTINE_HEMV define.
bool fast_kernels = false;
return MatVec(layout, Transpose::kNo,
n, n, alpha,
a_buffer, a_offset, a_ld,
x_buffer, x_offset, x_inc, beta,
y_buffer, y_offset, y_inc,
fast_kernels, fast_kernels,
is_upper, false, 0, 0);
MatVec(layout, Transpose::kNo,
n, n, alpha,
a_buffer, a_offset, a_ld,
x_buffer, x_offset, x_inc, beta,
y_buffer, y_offset, y_inc,
fast_kernels, fast_kernels,
is_upper, false, 0, 0);
}
// =================================================================================================

View File

@ -33,13 +33,13 @@ class Xhemv: public Xgemv<T> {
Xhemv(Queue &queue, EventPointer event, const std::string &name = "HEMV");
// Templated-precision implementation of the routine
StatusCode DoHemv(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
void DoHemv(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================

View File

@ -21,11 +21,10 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T, typename U>
Xher<T,U>::Xher(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xger"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Xger"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level2/level2.opencl"
#include "../../kernels/level2/xher.opencl"
;
}) {
}
// =================================================================================================
@ -41,15 +40,15 @@ template <> half Xher<half,half>::GetAlpha(const half alpha) { return alpha; }
// The main routine
template <typename T, typename U>
StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
const size_t n,
const U alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const bool packed) {
void Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
const size_t n,
const U alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const bool packed) {
// Makes sure the dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// The data is either in the upper or lower triangle
const auto is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@ -57,47 +56,38 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
const auto is_rowmajor = (layout == Layout::kRowMajor);
// Tests the matrix and the vectors for validity
auto status = StatusCode::kSuccess;
if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); }
else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld); }
if (ErrorIn(status)) { return status; }
status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
if (packed) { TestMatrixAP(n, a_buffer, a_offset); }
else { TestMatrixA(n, n, a_buffer, a_offset, a_ld); }
TestVectorX(n, x_buffer, x_offset, x_inc);
// If alpha is zero an update is not required
if (alpha == U{0}) { return StatusCode::kSuccess; }
if (alpha == U{0}) { return; }
// Creates a matching version of alpha
const auto matching_alpha = GetAlpha(alpha);
// Retrieves the kernel from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, "Xher");
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, "Xher");
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, GetRealArg(matching_alpha));
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, static_cast<int>(x_offset));
kernel.SetArgument(4, static_cast<int>(x_inc));
kernel.SetArgument(5, a_buffer());
kernel.SetArgument(6, static_cast<int>(a_offset));
kernel.SetArgument(7, static_cast<int>(a_ld));
kernel.SetArgument(8, static_cast<int>(is_upper));
kernel.SetArgument(9, static_cast<int>(is_rowmajor));
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, GetRealArg(matching_alpha));
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, static_cast<int>(x_offset));
kernel.SetArgument(4, static_cast<int>(x_inc));
kernel.SetArgument(5, a_buffer());
kernel.SetArgument(6, static_cast<int>(a_offset));
kernel.SetArgument(7, static_cast<int>(a_ld));
kernel.SetArgument(8, static_cast<int>(is_upper));
kernel.SetArgument(9, static_cast<int>(is_rowmajor));
// Launches the kernel
auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]);
auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
auto global = std::vector<size_t>{global_one, global_two};
auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
status = RunKernel(kernel, queue_, device_, global, local, event_);
if (ErrorIn(status)) { return status; }
// Succesfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the kernel
auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]);
auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
auto global = std::vector<size_t>{global_one, global_two};
auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
// =================================================================================================

View File

@ -31,12 +31,12 @@ class Xher: public Routine {
T GetAlpha(const U alpha);
// Templated-precision implementation of the routine
StatusCode DoHer(const Layout layout, const Triangle triangle,
const size_t n,
const U alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const bool packed = false);
void DoHer(const Layout layout, const Triangle triangle,
const size_t n,
const U alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const bool packed = false);
};
// =================================================================================================

View File

@ -21,27 +21,26 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
Xher2<T>::Xher2(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Xger"}, PrecisionValue<T>()) {
source_string_ =
Routine(queue, event, name, {"Xger"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level2/level2.opencl"
#include "../../kernels/level2/xher2.opencl"
;
}) {
}
// =================================================================================================
// The main routine
template <typename T>
StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const bool packed) {
void Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const bool packed) {
// Makes sure the dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// The data is either in the upper or lower triangle
const auto is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@ -49,46 +48,36 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
const auto is_rowmajor = (layout == Layout::kRowMajor);
// Tests the matrix and the vectors for validity
auto status = StatusCode::kSuccess;
if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); }
else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld); }
if (ErrorIn(status)) { return status; }
status = TestVectorX(n, x_buffer, x_offset, x_inc);
if (ErrorIn(status)) { return status; }
status = TestVectorY(n, y_buffer, y_offset, y_inc);
if (ErrorIn(status)) { return status; }
if (packed) { TestMatrixAP(n, a_buffer, a_offset); }
else { TestMatrixA(n, n, a_buffer, a_offset, a_ld); }
TestVectorX(n, x_buffer, x_offset, x_inc);
TestVectorY(n, y_buffer, y_offset, y_inc);
// Retrieves the kernel from the compiled binary
try {
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, "Xher2");
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
auto kernel = Kernel(program, "Xher2");
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, GetRealArg(alpha));
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, static_cast<int>(x_offset));
kernel.SetArgument(4, static_cast<int>(x_inc));
kernel.SetArgument(5, y_buffer());
kernel.SetArgument(6, static_cast<int>(y_offset));
kernel.SetArgument(7, static_cast<int>(y_inc));
kernel.SetArgument(8, a_buffer());
kernel.SetArgument(9, static_cast<int>(a_offset));
kernel.SetArgument(10, static_cast<int>(a_ld));
kernel.SetArgument(11, static_cast<int>(is_upper));
kernel.SetArgument(12, static_cast<int>(is_rowmajor));
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, GetRealArg(alpha));
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, static_cast<int>(x_offset));
kernel.SetArgument(4, static_cast<int>(x_inc));
kernel.SetArgument(5, y_buffer());
kernel.SetArgument(6, static_cast<int>(y_offset));
kernel.SetArgument(7, static_cast<int>(y_inc));
kernel.SetArgument(8, a_buffer());
kernel.SetArgument(9, static_cast<int>(a_offset));
kernel.SetArgument(10, static_cast<int>(a_ld));
kernel.SetArgument(11, static_cast<int>(is_upper));
kernel.SetArgument(12, static_cast<int>(is_rowmajor));
// Launches the kernel
auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]);
auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
auto global = std::vector<size_t>{global_one, global_two};
auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
status = RunKernel(kernel, queue_, device_, global, local, event_);
if (ErrorIn(status)) { return status; }
// Succesfully finished the computation
return StatusCode::kSuccess;
} catch (...) { return StatusCode::kInvalidKernel; }
// Launches the kernel
auto global_one = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS1"]);
auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
auto global = std::vector<size_t>{global_one, global_two};
auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
RunKernel(kernel, queue_, device_, global, local, event_);
}
// =================================================================================================

View File

@ -28,13 +28,13 @@ class Xher2: public Routine {
Xher2(Queue &queue, EventPointer event, const std::string &name = "HER2");
// Templated-precision implementation of the routine
StatusCode DoHer2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const bool packed = false);
void DoHer2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const bool packed = false);
};
// =================================================================================================

View File

@ -29,13 +29,13 @@ Xhpmv<T>::Xhpmv(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
StatusCode Xhpmv<T>::DoHpmv(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &ap_buffer, const size_t ap_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
void Xhpmv<T>::DoHpmv(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &ap_buffer, const size_t ap_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// The data is either in the upper or lower triangle
size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@ -45,13 +45,13 @@ StatusCode Xhpmv<T>::DoHpmv(const Layout layout, const Triangle triangle,
// The specific hermitian packed matrix-accesses are implemented in the kernel guarded by the
// ROUTINE_HPMV define.
bool fast_kernels = false;
return MatVec(layout, Transpose::kNo,
n, n, alpha,
ap_buffer, ap_offset, n,
x_buffer, x_offset, x_inc, beta,
y_buffer, y_offset, y_inc,
fast_kernels, fast_kernels,
is_upper, true, 0, 0);
MatVec(layout, Transpose::kNo,
n, n, alpha,
ap_buffer, ap_offset, n,
x_buffer, x_offset, x_inc, beta,
y_buffer, y_offset, y_inc,
fast_kernels, fast_kernels,
is_upper, true, 0, 0);
}
// =================================================================================================

View File

@ -33,13 +33,13 @@ class Xhpmv: public Xgemv<T> {
Xhpmv(Queue &queue, EventPointer event, const std::string &name = "HPMV");
// Templated-precision implementation of the routine
StatusCode DoHpmv(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &ap_buffer, const size_t ap_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
void DoHpmv(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &ap_buffer, const size_t ap_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================

View File

@ -28,17 +28,17 @@ Xhpr<T,U>::Xhpr(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T, typename U>
StatusCode Xhpr<T,U>::DoHpr(const Layout layout, const Triangle triangle,
const size_t n,
const U alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &ap_buffer, const size_t ap_offset) {
void Xhpr<T,U>::DoHpr(const Layout layout, const Triangle triangle,
const size_t n,
const U alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &ap_buffer, const size_t ap_offset) {
// Specific Xhpr functionality is implemented in the kernel using defines
return DoHer(layout, triangle, n, alpha,
x_buffer, x_offset, x_inc,
ap_buffer, ap_offset, n,
true); // packed matrix
DoHer(layout, triangle, n, alpha,
x_buffer, x_offset, x_inc,
ap_buffer, ap_offset, n,
true); // packed matrix
}
// =================================================================================================

View File

@ -31,11 +31,11 @@ class Xhpr: public Xher<T,U> {
Xhpr(Queue &queue, EventPointer event, const std::string &name = "HPR");
// Templated-precision implementation of the routine
StatusCode DoHpr(const Layout layout, const Triangle triangle,
const size_t n,
const U alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &ap_buffer, const size_t ap_offset);
void DoHpr(const Layout layout, const Triangle triangle,
const size_t n,
const U alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &ap_buffer, const size_t ap_offset);
};
// =================================================================================================

View File

@ -28,19 +28,19 @@ Xhpr2<T>::Xhpr2(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
StatusCode Xhpr2<T>::DoHpr2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &ap_buffer, const size_t ap_offset) {
void Xhpr2<T>::DoHpr2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &ap_buffer, const size_t ap_offset) {
// Specific Xhpr2 functionality is implemented in the kernel using defines
return DoHer2(layout, triangle, n, alpha,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
ap_buffer, ap_offset, n,
true); // packed matrix
DoHer2(layout, triangle, n, alpha,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
ap_buffer, ap_offset, n,
true); // packed matrix
}
// =================================================================================================

View File

@ -31,12 +31,12 @@ class Xhpr2: public Xher2<T> {
Xhpr2(Queue &queue, EventPointer event, const std::string &name = "HPR2");
// Templated-precision implementation of the routine
StatusCode DoHpr2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &ap_buffer, const size_t ap_offset);
void DoHpr2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &ap_buffer, const size_t ap_offset);
};
// =================================================================================================

View File

@ -29,13 +29,13 @@ Xsbmv<T>::Xsbmv(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
StatusCode Xsbmv<T>::DoSbmv(const Layout layout, const Triangle triangle,
const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
void Xsbmv<T>::DoSbmv(const Layout layout, const Triangle triangle,
const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// The data is either in the upper or lower triangle
size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@ -45,13 +45,13 @@ StatusCode Xsbmv<T>::DoSbmv(const Layout layout, const Triangle triangle,
// The specific symmetric banded matrix-accesses are implemented in the kernel guarded by the
// ROUTINE_SBMV define.
bool fast_kernels = false;
return MatVec(layout, Transpose::kNo,
n, n, alpha,
a_buffer, a_offset, a_ld,
x_buffer, x_offset, x_inc, beta,
y_buffer, y_offset, y_inc,
fast_kernels, fast_kernels,
is_upper, false, k, 0);
MatVec(layout, Transpose::kNo,
n, n, alpha,
a_buffer, a_offset, a_ld,
x_buffer, x_offset, x_inc, beta,
y_buffer, y_offset, y_inc,
fast_kernels, fast_kernels,
is_upper, false, k, 0);
}
// =================================================================================================

View File

@ -33,13 +33,13 @@ class Xsbmv: public Xgemv<T> {
Xsbmv(Queue &queue, EventPointer event, const std::string &name = "SBMV");
// Templated-precision implementation of the routine
StatusCode DoSbmv(const Layout layout, const Triangle triangle,
const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
void DoSbmv(const Layout layout, const Triangle triangle,
const size_t n, const size_t k,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================

View File

@ -29,13 +29,13 @@ Xspmv<T>::Xspmv(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
StatusCode Xspmv<T>::DoSpmv(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &ap_buffer, const size_t ap_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
void Xspmv<T>::DoSpmv(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &ap_buffer, const size_t ap_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// The data is either in the upper or lower triangle
size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@ -45,13 +45,13 @@ StatusCode Xspmv<T>::DoSpmv(const Layout layout, const Triangle triangle,
// The specific symmetric packed matrix-accesses are implemented in the kernel guarded by the
// ROUTINE_SPMV define.
bool fast_kernels = false;
return MatVec(layout, Transpose::kNo,
n, n, alpha,
ap_buffer, ap_offset, n,
x_buffer, x_offset, x_inc, beta,
y_buffer, y_offset, y_inc,
fast_kernels, fast_kernels,
is_upper, true, 0, 0);
MatVec(layout, Transpose::kNo,
n, n, alpha,
ap_buffer, ap_offset, n,
x_buffer, x_offset, x_inc, beta,
y_buffer, y_offset, y_inc,
fast_kernels, fast_kernels,
is_upper, true, 0, 0);
}
// =================================================================================================

View File

@ -33,13 +33,13 @@ class Xspmv: public Xgemv<T> {
Xspmv(Queue &queue, EventPointer event, const std::string &name = "SPMV");
// Templated-precision implementation of the routine
StatusCode DoSpmv(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &ap_buffer, const size_t ap_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
void DoSpmv(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &ap_buffer, const size_t ap_offset,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
};
// =================================================================================================

View File

@ -28,17 +28,17 @@ Xspr<T>::Xspr(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
StatusCode Xspr<T>::DoSpr(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &ap_buffer, const size_t ap_offset) {
void Xspr<T>::DoSpr(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &ap_buffer, const size_t ap_offset) {
// Specific Xspr functionality is implemented in the kernel using defines
return DoHer(layout, triangle, n, alpha,
x_buffer, x_offset, x_inc,
ap_buffer, ap_offset, n,
true); // packed matrix
DoHer(layout, triangle, n, alpha,
x_buffer, x_offset, x_inc,
ap_buffer, ap_offset, n,
true); // packed matrix
}
// =================================================================================================

View File

@ -31,11 +31,11 @@ class Xspr: public Xher<T,T> {
Xspr(Queue &queue, EventPointer event, const std::string &name = "SPR");
// Templated-precision implementation of the routine
StatusCode DoSpr(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &ap_buffer, const size_t ap_offset);
void DoSpr(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &ap_buffer, const size_t ap_offset);
};
// =================================================================================================

View File

@ -28,19 +28,19 @@ Xspr2<T>::Xspr2(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
StatusCode Xspr2<T>::DoSpr2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &ap_buffer, const size_t ap_offset) {
void Xspr2<T>::DoSpr2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &ap_buffer, const size_t ap_offset) {
// Specific Xspr2 functionality is implemented in the kernel using defines
return DoHer2(layout, triangle, n, alpha,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
ap_buffer, ap_offset, n,
true); // packed matrix
DoHer2(layout, triangle, n, alpha,
x_buffer, x_offset, x_inc,
y_buffer, y_offset, y_inc,
ap_buffer, ap_offset, n,
true); // packed matrix
}
// =================================================================================================

View File

@ -31,12 +31,12 @@ class Xspr2: public Xher2<T> {
Xspr2(Queue &queue, EventPointer event, const std::string &name = "SPR2");
// Templated-precision implementation of the routine
StatusCode DoSpr2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &ap_buffer, const size_t ap_offset);
void DoSpr2(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
const Buffer<T> &ap_buffer, const size_t ap_offset);
};
// =================================================================================================

View File

@ -29,13 +29,13 @@ Xsymv<T>::Xsymv(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
StatusCode Xsymv<T>::DoSymv(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
void Xsymv<T>::DoSymv(const Layout layout, const Triangle triangle,
const size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// The data is either in the upper or lower triangle
size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
@ -45,13 +45,13 @@ StatusCode Xsymv<T>::DoSymv(const Layout layout, const Triangle triangle,
// The specific symmetric matrix-accesses are implemented in the kernel guarded by the
// ROUTINE_SYMV define.
bool fast_kernels = false;
return MatVec(layout, Transpose::kNo,
n, n, alpha,
a_buffer, a_offset, a_ld,
x_buffer, x_offset, x_inc, beta,
y_buffer, y_offset, y_inc,
fast_kernels, fast_kernels,
is_upper, false, 0, 0);
MatVec(layout, Transpose::kNo,
n, n, alpha,
a_buffer, a_offset, a_ld,
x_buffer, x_offset, x_inc, beta,
y_buffer, y_offset, y_inc,
fast_kernels, fast_kernels,
is_upper, false, 0, 0);
}
// =================================================================================================

Some files were not shown because too many files have changed in this diff Show More