Merge branch 'half_precision' into development

This commit is contained in:
Cedric Nugteren 2016-05-30 11:11:28 +02:00
commit 61105e3810
180 changed files with 4111 additions and 1215 deletions

View file

@ -1,6 +1,15 @@
Development version (next release)
-
- Added support for half-precision floating-point (fp16) in the library
- Added half-precision routines:
* Level-1: HSWAP/HSCAL/HCOPY/HAXPY/HDOT/HNRM2/HASUM/HSUM/iHAMAX/iHMAX/iHMIN
* Level-2: HGEMV/HGBMV/HHEMV/HHBMV/HHPMV/HSYMV/HSBMV/HSPMV/HTRMV/HTBMV/HTPMV/HGER/HSYR/HSPR/HSYR2/HSPR2
* Level-3: HGEMM/HSYMM/HSYRK/HSYR2K/HTRMM
Version 0.7.1
- Improved performance of large power-of-2 xGEMM kernels for AMD GPUs
- Fixed a bug in the xGEMM routine related to the event incorrectly set
- Made MSVC link the run-time libraries statically
Version 0.7.1
- Improved performance of large power-of-2 xGEMM kernels for AMD GPUs

View file

@ -125,7 +125,7 @@ include_directories(${clblast_SOURCE_DIR}/include ${OPENCL_INCLUDE_DIRS})
# Sets the supported routines and the used kernels. New routines and kernels should be added here.
set(KERNELS copy pad transpose padtranspose xaxpy xdot xger xgemm xgemv)
set(SAMPLE_PROGRAMS_CPP sgemm)
set(SAMPLE_PROGRAMS_C sasum dgemv sgemm cache)
set(SAMPLE_PROGRAMS_C sasum dgemv sgemm haxpy cache)
set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv
xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
@ -156,6 +156,7 @@ target_link_libraries(clblast ${OPENCL_LIBRARIES})
install(TARGETS clblast DESTINATION lib)
install(FILES include/clblast.h DESTINATION include)
install(FILES include/clblast_c.h DESTINATION include)
install(FILES include/clblast_half.h DESTINATION include)
# ==================================================================================================

127
README.md
View file

@ -20,6 +20,7 @@ Use CLBlast instead of clBLAS:
* When you are still running on OpenCL 1.1 hardware.
* When you value an organized and modern C++ codebase.
* When you target Intel CPUs and GPUs or embedded devices.
* When you can benefit from the increased performance of half-precision fp16 data-types.
Use CLBlast instead of cuBLAS:
@ -127,7 +128,7 @@ If your device is not (yet) among this list or if you want to tune CLBlast for s
cmake -DTUNERS=ON ..
Note that CLBlast's tuners are based on the CLTune auto-tuning library, which has to be installed separately (version 1.7.0 or higher). CLTune is available from GitHub.
Note that CLBlast's tuners are based on the CLTune auto-tuning library, which has to be installed separately (version 2.3.1 or higher). CLTune is available from GitHub.
Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels correspond to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance. Running `make alltuners` runs all tuners for all precisions in one go. You can set the default device and platform for `alltuners` by setting the `DEFAULT_DEVICE` and `DEFAULT_PLATFORM` environment variables before running CMake.
@ -177,64 +178,70 @@ These graphs can be generated automatically on your own device. First, compile C
Supported routines
-------------
CLBlast is in active development but already supports almost all the BLAS routines. The supported routines are marked with '✔' in the following tables. Routines marked with '-' do not exist: they are not part of BLAS at all.
CLBlast is in active development but already supports almost all the BLAS routines. The supported routines are marked with '✔' in the following tables. Routines marked with '-' do not exist: they are not part of BLAS at all. The different data-types supported by the library are:
| Level-1 | S | D | C | Z |
| ---------|---|---|---|---|
| xSWAP | ✔ | ✔ | ✔ | ✔ |
| xSCAL | ✔ | ✔ | ✔ | ✔ |
| xCOPY | ✔ | ✔ | ✔ | ✔ |
| xAXPY | ✔ | ✔ | ✔ | ✔ |
| xDOT | ✔ | ✔ | - | - |
| xDOTU | - | - | ✔ | ✔ |
| xDOTC | - | - | ✔ | ✔ |
| xNRM2 | ✔ | ✔ | ✔ | ✔ |
| xASUM | ✔ | ✔ | ✔ | ✔ |
| IxAMAX | ✔ | ✔ | ✔ | ✔ |
* __S:__ Single-precision 32-bit floating-point (`float`).
* __D:__ Double-precision 64-bit floating-point (`double`).
* __C:__ Complex single-precision 2x32-bit floating-point (`std::complex<float>`).
* __Z:__ Complex double-precision 2x64-bit floating-point (`std::complex<double>`).
* __H:__ Half-precision 16-bit floating-point (`cl_half`). See section 'Half precision' for more information.
| Level-2 | S | D | C | Z |
| ---------|---|---|---|---|
| xGEMV | ✔ | ✔ | ✔ | ✔ |
| xGBMV | ✔ | ✔ | ✔ | ✔ |
| xHEMV | - | - | ✔ | ✔ |
| xHBMV | - | - | ✔ | ✔ |
| xHPMV | - | - | ✔ | ✔ |
| xSYMV | ✔ | ✔ | - | - |
| xSBMV | ✔ | ✔ | - | - |
| xSPMV | ✔ | ✔ | - | - |
| xTRMV | ✔ | ✔ | ✔ | ✔ |
| xTBMV | ✔ | ✔ | ✔ | ✔ |
| xTPMV | ✔ | ✔ | ✔ | ✔ |
| xGER | ✔ | ✔ | - | - |
| xGERU | - | - | ✔ | ✔ |
| xGERC | - | - | ✔ | ✔ |
| xHER | - | - | ✔ | ✔ |
| xHPR | - | - | ✔ | ✔ |
| xHER2 | - | - | ✔ | ✔ |
| xHPR2 | - | - | ✔ | ✔ |
| xSYR | ✔ | ✔ | - | - |
| xSPR | ✔ | ✔ | - | - |
| xSYR2 | ✔ | ✔ | - | - |
| xSPR2 | ✔ | ✔ | - | - |
| Level-1 | S | D | C | Z | H |
| ---------|---|---|---|---|---|
| xSWAP | ✔ | ✔ | ✔ | ✔ | ✔ |
| xSCAL | ✔ | ✔ | ✔ | ✔ | ✔ |
| xCOPY | ✔ | ✔ | ✔ | ✔ | ✔ |
| xAXPY | ✔ | ✔ | ✔ | ✔ | ✔ |
| xDOT | ✔ | ✔ | - | - | ✔ |
| xDOTU | - | - | ✔ | ✔ | - |
| xDOTC | - | - | ✔ | ✔ | - |
| xNRM2 | ✔ | ✔ | ✔ | ✔ | ✔ |
| xASUM | ✔ | ✔ | ✔ | ✔ | ✔ |
| IxAMAX | ✔ | ✔ | ✔ | ✔ | ✔ |
| Level-3 | S | D | C | Z |
| ---------|---|---|---|---|
| xGEMM | ✔ | ✔ | ✔ | ✔ |
| xSYMM | ✔ | ✔ | ✔ | ✔ |
| xHEMM | - | - | ✔ | ✔ |
| xSYRK | ✔ | ✔ | ✔ | ✔ |
| xHERK | - | - | ✔ | ✔ |
| xSYR2K | ✔ | ✔ | ✔ | ✔ |
| xHER2K | - | - | ✔ | ✔ |
| xTRMM | ✔ | ✔ | ✔ | ✔ |
| Level-2 | S | D | C | Z | H |
| ---------|---|---|---|---|---|
| xGEMV | ✔ | ✔ | ✔ | ✔ | ✔ |
| xGBMV | ✔ | ✔ | ✔ | ✔ | ✔ |
| xHEMV | - | - | ✔ | ✔ | - |
| xHBMV | - | - | ✔ | ✔ | - |
| xHPMV | - | - | ✔ | ✔ | - |
| xSYMV | ✔ | ✔ | - | - | ✔ |
| xSBMV | ✔ | ✔ | - | - | ✔ |
| xSPMV | ✔ | ✔ | - | - | ✔ |
| xTRMV | ✔ | ✔ | ✔ | ✔ | ✔ |
| xTBMV | ✔ | ✔ | ✔ | ✔ | ✔ |
| xTPMV | ✔ | ✔ | ✔ | ✔ | ✔ |
| xGER | ✔ | ✔ | - | - | ✔ |
| xGERU | - | - | ✔ | ✔ | - |
| xGERC | - | - | ✔ | ✔ | - |
| xHER | - | - | ✔ | ✔ | - |
| xHPR | - | - | ✔ | ✔ | - |
| xHER2 | - | - | ✔ | ✔ | - |
| xHPR2 | - | - | ✔ | ✔ | - |
| xSYR | ✔ | ✔ | - | - | ✔ |
| xSPR | ✔ | ✔ | - | - | ✔ |
| xSYR2 | ✔ | ✔ | - | - | ✔ |
| xSPR2 | ✔ | ✔ | - | - | ✔ |
| Level-3 | S | D | C | Z | H |
| ---------|---|---|---|---|---|
| xGEMM | ✔ | ✔ | ✔ | ✔ | ✔ |
| xSYMM | ✔ | ✔ | ✔ | ✔ | ✔ |
| xHEMM | - | - | ✔ | ✔ | - |
| xSYRK | ✔ | ✔ | ✔ | ✔ | ✔ |
| xHERK | - | - | ✔ | ✔ | - |
| xSYR2K | ✔ | ✔ | ✔ | ✔ | ✔ |
| xHER2K | - | - | ✔ | ✔ | - |
| xTRMM | ✔ | ✔ | ✔ | ✔ | ✔ |
In addition, some non-BLAS routines are also supported by CLBlast. They are experimental and should be used with care:
| Additional | S | D | C | Z |
| -----------|---|---|---|---|
| xSUM | ✔ | ✔ | ✔ | ✔ |
| IxMAX | ✔ | ✔ | ✔ | ✔ |
| IxMIN | ✔ | ✔ | ✔ | ✔ |
| Additional | S | D | C | Z | H |
| -----------|---|---|---|---|---|
| xSUM | ✔ | ✔ | ✔ | ✔ | ✔ |
| IxMAX | ✔ | ✔ | ✔ | ✔ | ✔ |
| IxMIN | ✔ | ✔ | ✔ | ✔ | ✔ |
Some BLAS routines are not supported yet by CLBlast. They are shown in the following table:
@ -250,6 +257,19 @@ Some BLAS routines are not supported yet by CLBlast. They are shown in the follo
| xTRSM | | | | |
Half precision (fp16)
-------------
The half-precision fp16 format is a 16-bit floating-point data-type. Some OpenCL devices support the `cl_khr_fp16` extension, reducing storage and bandwidth requirements by a factor of 2 compared to single-precision floating-point. In case the hardware also accelerates arithmetic on half-precision data-types, this can also greatly improve compute performance of e.g. level-3 routines such as GEMM. Devices which can benefit from this include, among others, Intel GPUs, ARM Mali GPUs, and NVIDIA's latest Pascal GPUs. Half-precision is of particular interest to the deep-learning community, in which convolutional neural networks can be processed much faster at a minor accuracy loss.
Since there is no half-precision data-type in C or C++, OpenCL provides the `cl_half` type for the host device. Unfortunately, internally this translates to a 16-bit integer, so computations on the host using this data-type should be avoided. For convenience, CLBlast provides the `clblast_half.h` header (C99 and C++ compatible), defining the `half` type as a short-hand for `cl_half` and the following basic functions:
* `half FloatToHalf(const float value)`: Converts a 32-bit floating-point value to a 16-bit floating-point value.
* `float HalfToFloat(const half value)`: Converts a 16-bit floating-point value to a 32-bit floating-point value.
The `/samples` folder contains examples of how to use these convenience functions when calling one of the half-precision BLAS routines.
Contributing
-------------
@ -270,6 +290,7 @@ Tuning and testing on a variety of OpenCL devices was made possible by:
* [dividiti](http://www.dividiti.com)
* [SURFsara HPC center](http://www.surfsara.com)
Support us
-------------

View file

@ -34,6 +34,10 @@ StatusCode CLBlastZswap(const size_t n,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event)
StatusCode CLBlastHswap(const size_t n,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event)
```
Arguments to SWAP:
@ -82,6 +86,10 @@ StatusCode CLBlastZscal(const size_t n,
const cl_double2 alpha,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event)
StatusCode CLBlastHscal(const size_t n,
const cl_half alpha,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event)
```
Arguments to SCAL:
@ -128,6 +136,10 @@ StatusCode CLBlastZcopy(const size_t n,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event)
StatusCode CLBlastHcopy(const size_t n,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event)
```
Arguments to COPY:
@ -181,6 +193,11 @@ StatusCode CLBlastZaxpy(const size_t n,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event)
StatusCode CLBlastHaxpy(const size_t n,
const cl_half alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event)
```
Arguments to AXPY:
@ -225,6 +242,11 @@ StatusCode CLBlastDdot(const size_t n,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event)
StatusCode CLBlastHdot(const size_t n,
cl_mem dot_buffer, const size_t dot_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event)
```
Arguments to DOT:
@ -371,6 +393,10 @@ StatusCode CLBlastDznrm2(const size_t n,
cl_mem nrm2_buffer, const size_t nrm2_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event)
StatusCode CLBlastHnrm2(const size_t n,
cl_mem nrm2_buffer, const size_t nrm2_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event)
```
Arguments to NRM2:
@ -420,6 +446,10 @@ StatusCode CLBlastDzasum(const size_t n,
cl_mem asum_buffer, const size_t asum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event)
StatusCode CLBlastHasum(const size_t n,
cl_mem asum_buffer, const size_t asum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event)
```
Arguments to ASUM:
@ -469,6 +499,10 @@ StatusCode CLBlastDzsum(const size_t n,
cl_mem sum_buffer, const size_t sum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event)
StatusCode CLBlastHsum(const size_t n,
cl_mem sum_buffer, const size_t sum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event)
```
Arguments to SUM:
@ -518,6 +552,10 @@ StatusCode CLBlastiZamax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event)
StatusCode CLBlastiHamax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event)
```
Arguments to AMAX:
@ -567,6 +605,10 @@ StatusCode CLBlastiZmax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event)
StatusCode CLBlastiHmax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event)
```
Arguments to MAX:
@ -616,6 +658,10 @@ StatusCode CLBlastiZmin(const size_t n,
cl_mem imin_buffer, const size_t imin_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event)
StatusCode CLBlastiHmin(const size_t n,
cl_mem imin_buffer, const size_t imin_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event)
```
Arguments to MIN:
@ -685,6 +731,14 @@ StatusCode CLBlastZgemv(const Layout layout, const Transpose a_transpose,
const cl_double2 beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event)
StatusCode CLBlastHgemv(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n,
const cl_half alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_half beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event)
```
Arguments to GEMV:
@ -761,6 +815,14 @@ StatusCode CLBlastZgbmv(const Layout layout, const Transpose a_transpose,
const cl_double2 beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event)
StatusCode CLBlastHgbmv(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n, const size_t kl, const size_t ku,
const cl_half alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_half beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event)
```
Arguments to GBMV:
@ -1000,6 +1062,14 @@ StatusCode CLBlastDsymv(const Layout layout, const Triangle triangle,
const double beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event)
StatusCode CLBlastHsymv(const Layout layout, const Triangle triangle,
const size_t n,
const cl_half alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_half beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event)
```
Arguments to SYMV:
@ -1059,6 +1129,14 @@ StatusCode CLBlastDsbmv(const Layout layout, const Triangle triangle,
const double beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event)
StatusCode CLBlastHsbmv(const Layout layout, const Triangle triangle,
const size_t n, const size_t k,
const cl_half alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_half beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event)
```
Arguments to SBMV:
@ -1119,6 +1197,14 @@ StatusCode CLBlastDspmv(const Layout layout, const Triangle triangle,
const double beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event)
StatusCode CLBlastHspmv(const Layout layout, const Triangle triangle,
const size_t n,
const cl_half alpha,
const cl_mem ap_buffer, const size_t ap_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_half beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event)
```
Arguments to SPMV:
@ -1178,6 +1264,11 @@ StatusCode CLBlastZtrmv(const Layout layout, const Triangle triangle, const Tran
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event)
StatusCode CLBlastHtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event)
```
Arguments to TRMV:
@ -1235,6 +1326,11 @@ StatusCode CLBlastZtbmv(const Layout layout, const Triangle triangle, const Tran
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event)
StatusCode CLBlastHtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t n, const size_t k,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event)
```
Arguments to TBMV:
@ -1293,6 +1389,11 @@ StatusCode CLBlastZtpmv(const Layout layout, const Triangle triangle, const Tran
const cl_mem ap_buffer, const size_t ap_offset,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event)
StatusCode CLBlastHtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const cl_mem ap_buffer, const size_t ap_offset,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event)
```
Arguments to TPMV:
@ -1345,6 +1446,13 @@ StatusCode CLBlastDger(const Layout layout,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event)
StatusCode CLBlastHger(const Layout layout,
const size_t m, const size_t n,
const cl_half alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event)
```
Arguments to GER:
@ -1713,6 +1821,12 @@ StatusCode CLBlastDsyr(const Layout layout, const Triangle triangle,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event)
StatusCode CLBlastHsyr(const Layout layout, const Triangle triangle,
const size_t n,
const cl_half alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event)
```
Arguments to SYR:
@ -1762,6 +1876,12 @@ StatusCode CLBlastDspr(const Layout layout, const Triangle triangle,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem ap_buffer, const size_t ap_offset,
cl_command_queue* queue, cl_event* event)
StatusCode CLBlastHspr(const Layout layout, const Triangle triangle,
const size_t n,
const cl_half alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem ap_buffer, const size_t ap_offset,
cl_command_queue* queue, cl_event* event)
```
Arguments to SPR:
@ -1813,6 +1933,13 @@ StatusCode CLBlastDsyr2(const Layout layout, const Triangle triangle,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event)
StatusCode CLBlastHsyr2(const Layout layout, const Triangle triangle,
const size_t n,
const cl_half alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event)
```
Arguments to SYR2:
@ -1868,6 +1995,13 @@ StatusCode CLBlastDspr2(const Layout layout, const Triangle triangle,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem ap_buffer, const size_t ap_offset,
cl_command_queue* queue, cl_event* event)
StatusCode CLBlastHspr2(const Layout layout, const Triangle triangle,
const size_t n,
const cl_half alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem ap_buffer, const size_t ap_offset,
cl_command_queue* queue, cl_event* event)
```
Arguments to SPR2:
@ -1941,6 +2075,14 @@ StatusCode CLBlastZgemm(const Layout layout, const Transpose a_transpose, const
const cl_double2 beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event)
StatusCode CLBlastHgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
const size_t m, const size_t n, const size_t k,
const cl_half alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
const cl_half beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event)
```
Arguments to GEMM:
@ -2019,6 +2161,14 @@ StatusCode CLBlastZsymm(const Layout layout, const Side side, const Triangle tri
const cl_double2 beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event)
StatusCode CLBlastHsymm(const Layout layout, const Side side, const Triangle triangle,
const size_t m, const size_t n,
const cl_half alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
const cl_half beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event)
```
Arguments to SYMM:
@ -2152,6 +2302,13 @@ StatusCode CLBlastZsyrk(const Layout layout, const Triangle triangle, const Tran
const cl_double2 beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event)
StatusCode CLBlastHsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
const size_t n, const size_t k,
const cl_half alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const cl_half beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event)
```
Arguments to SYRK:
@ -2281,6 +2438,14 @@ StatusCode CLBlastZsyr2k(const Layout layout, const Triangle triangle, const Tra
const cl_double2 beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event)
StatusCode CLBlastHsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
const size_t n, const size_t k,
const cl_half alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
const cl_half beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event)
```
Arguments to SYR2K:
@ -2409,6 +2574,12 @@ StatusCode CLBlastZtrmm(const Layout layout, const Side side, const Triangle tri
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event)
StatusCode CLBlastHtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t m, const size_t n,
const cl_half alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event)
```
Arguments to TRMM:

View file

@ -121,28 +121,28 @@ StatusCode Rotm(const size_t n,
cl_mem sparam_buffer, const size_t sparam_offset,
cl_command_queue* queue, cl_event* event = nullptr);
// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP
// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP
template <typename T>
StatusCode Swap(const size_t n,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event = nullptr);
// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL
// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL
template <typename T>
StatusCode Scal(const size_t n,
const T alpha,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event = nullptr);
// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY
// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY
template <typename T>
StatusCode Copy(const size_t n,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event = nullptr);
// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY
// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY
template <typename T>
StatusCode Axpy(const size_t n,
const T alpha,
@ -150,7 +150,7 @@ StatusCode Axpy(const size_t n,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event = nullptr);
// Dot product of two vectors: SDOT/DDOT
// Dot product of two vectors: SDOT/DDOT/HDOT
template <typename T>
StatusCode Dot(const size_t n,
cl_mem dot_buffer, const size_t dot_offset,
@ -174,42 +174,42 @@ StatusCode Dotc(const size_t n,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event = nullptr);
// Euclidean norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2
// Euclidean norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2
template <typename T>
StatusCode Nrm2(const size_t n,
cl_mem nrm2_buffer, const size_t nrm2_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event = nullptr);
// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM
// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM
template <typename T>
StatusCode Asum(const size_t n,
cl_mem asum_buffer, const size_t asum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event = nullptr);
// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM
// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM
template <typename T>
StatusCode Sum(const size_t n,
cl_mem sum_buffer, const size_t sum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event = nullptr);
// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX
// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX
template <typename T>
StatusCode Amax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event = nullptr);
// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX
// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX
template <typename T>
StatusCode Max(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event = nullptr);
// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN
// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN
template <typename T>
StatusCode Min(const size_t n,
cl_mem imin_buffer, const size_t imin_offset,
@ -220,7 +220,7 @@ StatusCode Min(const size_t n,
// BLAS level-2 (matrix-vector) routines
// =================================================================================================
// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV
// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV
template <typename T>
StatusCode Gemv(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n,
@ -231,7 +231,7 @@ StatusCode Gemv(const Layout layout, const Transpose a_transpose,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event = nullptr);
// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV
// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV
template <typename T>
StatusCode Gbmv(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n, const size_t kl, const size_t ku,
@ -275,7 +275,7 @@ StatusCode Hpmv(const Layout layout, const Triangle triangle,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event = nullptr);
// Symmetric matrix-vector multiplication: SSYMV/DSYMV
// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV
template <typename T>
StatusCode Symv(const Layout layout, const Triangle triangle,
const size_t n,
@ -286,7 +286,7 @@ StatusCode Symv(const Layout layout, const Triangle triangle,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event = nullptr);
// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV
// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV
template <typename T>
StatusCode Sbmv(const Layout layout, const Triangle triangle,
const size_t n, const size_t k,
@ -297,7 +297,7 @@ StatusCode Sbmv(const Layout layout, const Triangle triangle,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event = nullptr);
// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV
// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV
template <typename T>
StatusCode Spmv(const Layout layout, const Triangle triangle,
const size_t n,
@ -308,7 +308,7 @@ StatusCode Spmv(const Layout layout, const Triangle triangle,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event = nullptr);
// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV
// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV
template <typename T>
StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
@ -316,7 +316,7 @@ StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event = nullptr);
// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV
// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV
template <typename T>
StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t n, const size_t k,
@ -324,7 +324,7 @@ StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event = nullptr);
// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV
// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV
template <typename T>
StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
@ -356,7 +356,7 @@ StatusCode Tpsv(const Layout layout, const Triangle triangle, const Transpose a_
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event = nullptr);
// General rank-1 matrix update: SGER/DGER
// General rank-1 matrix update: SGER/DGER/HGER
template <typename T>
StatusCode Ger(const Layout layout,
const size_t m, const size_t n,
@ -424,7 +424,7 @@ StatusCode Hpr2(const Layout layout, const Triangle triangle,
cl_mem ap_buffer, const size_t ap_offset,
cl_command_queue* queue, cl_event* event = nullptr);
// Symmetric rank-1 matrix update: SSYR/DSYR
// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR
template <typename T>
StatusCode Syr(const Layout layout, const Triangle triangle,
const size_t n,
@ -433,7 +433,7 @@ StatusCode Syr(const Layout layout, const Triangle triangle,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event = nullptr);
// Symmetric packed rank-1 matrix update: SSPR/DSPR
// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR
template <typename T>
StatusCode Spr(const Layout layout, const Triangle triangle,
const size_t n,
@ -442,7 +442,7 @@ StatusCode Spr(const Layout layout, const Triangle triangle,
cl_mem ap_buffer, const size_t ap_offset,
cl_command_queue* queue, cl_event* event = nullptr);
// Symmetric rank-2 matrix update: SSYR2/DSYR2
// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2
template <typename T>
StatusCode Syr2(const Layout layout, const Triangle triangle,
const size_t n,
@ -452,7 +452,7 @@ StatusCode Syr2(const Layout layout, const Triangle triangle,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event = nullptr);
// Symmetric packed rank-2 matrix update: SSPR2/DSPR2
// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2
template <typename T>
StatusCode Spr2(const Layout layout, const Triangle triangle,
const size_t n,
@ -466,7 +466,7 @@ StatusCode Spr2(const Layout layout, const Triangle triangle,
// BLAS level-3 (matrix-matrix) routines
// =================================================================================================
// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM
// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM
template <typename T>
StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
const size_t m, const size_t n, const size_t k,
@ -477,7 +477,7 @@ StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpos
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event = nullptr);
// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM
// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM
template <typename T>
StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
const size_t m, const size_t n,
@ -499,7 +499,7 @@ StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event = nullptr);
// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK
// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK
template <typename T>
StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
const size_t n, const size_t k,
@ -519,7 +519,7 @@ StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event = nullptr);
// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K
// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K
template <typename T>
StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
const size_t n, const size_t k,
@ -541,7 +541,7 @@ StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose a
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event = nullptr);
// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM
// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM
template <typename T>
StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t m, const size_t n,
@ -550,7 +550,7 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, c
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event = nullptr);
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM
template <typename T>
StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t m, const size_t n,

View file

@ -148,7 +148,7 @@ StatusCode PUBLIC_API CLBlastDrotm(const size_t n,
cl_mem sparam_buffer, const size_t sparam_offset,
cl_command_queue* queue, cl_event* event);
// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP
// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP
StatusCode PUBLIC_API CLBlastSswap(const size_t n,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
@ -165,8 +165,12 @@ StatusCode PUBLIC_API CLBlastZswap(const size_t n,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
StatusCode PUBLIC_API CLBlastHswap(const size_t n,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL
// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL
StatusCode PUBLIC_API CLBlastSscal(const size_t n,
const float alpha,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
@ -183,8 +187,12 @@ StatusCode PUBLIC_API CLBlastZscal(const size_t n,
const cl_double2 alpha,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
StatusCode PUBLIC_API CLBlastHscal(const size_t n,
const cl_half alpha,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY
// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY
StatusCode PUBLIC_API CLBlastScopy(const size_t n,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
@ -201,8 +209,12 @@ StatusCode PUBLIC_API CLBlastZcopy(const size_t n,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
StatusCode PUBLIC_API CLBlastHcopy(const size_t n,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY
// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY
StatusCode PUBLIC_API CLBlastSaxpy(const size_t n,
const float alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
@ -223,8 +235,13 @@ StatusCode PUBLIC_API CLBlastZaxpy(const size_t n,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
StatusCode PUBLIC_API CLBlastHaxpy(const size_t n,
const cl_half alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
// Dot product of two vectors: SDOT/DDOT
// Dot product of two vectors: SDOT/DDOT/HDOT
StatusCode PUBLIC_API CLBlastSdot(const size_t n,
cl_mem dot_buffer, const size_t dot_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
@ -235,6 +252,11 @@ StatusCode PUBLIC_API CLBlastDdot(const size_t n,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
StatusCode PUBLIC_API CLBlastHdot(const size_t n,
cl_mem dot_buffer, const size_t dot_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
// Dot product of two complex vectors: CDOTU/ZDOTU
StatusCode PUBLIC_API CLBlastCdotu(const size_t n,
@ -260,7 +282,7 @@ StatusCode PUBLIC_API CLBlastZdotc(const size_t n,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2
// Euclidean norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2
StatusCode PUBLIC_API CLBlastSnrm2(const size_t n,
cl_mem nrm2_buffer, const size_t nrm2_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
@ -277,8 +299,12 @@ StatusCode PUBLIC_API CLBlastDznrm2(const size_t n,
cl_mem nrm2_buffer, const size_t nrm2_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
StatusCode PUBLIC_API CLBlastHnrm2(const size_t n,
cl_mem nrm2_buffer, const size_t nrm2_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM
// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM
StatusCode PUBLIC_API CLBlastSasum(const size_t n,
cl_mem asum_buffer, const size_t asum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
@ -295,8 +321,12 @@ StatusCode PUBLIC_API CLBlastDzasum(const size_t n,
cl_mem asum_buffer, const size_t asum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
StatusCode PUBLIC_API CLBlastHasum(const size_t n,
cl_mem asum_buffer, const size_t asum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM
// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM
StatusCode PUBLIC_API CLBlastSsum(const size_t n,
cl_mem sum_buffer, const size_t sum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
@ -313,8 +343,12 @@ StatusCode PUBLIC_API CLBlastDzsum(const size_t n,
cl_mem sum_buffer, const size_t sum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
StatusCode PUBLIC_API CLBlastHsum(const size_t n,
cl_mem sum_buffer, const size_t sum_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX
// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX
StatusCode PUBLIC_API CLBlastiSamax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
@ -331,8 +365,12 @@ StatusCode PUBLIC_API CLBlastiZamax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
StatusCode PUBLIC_API CLBlastiHamax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX
// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX
StatusCode PUBLIC_API CLBlastiSmax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
@ -349,8 +387,12 @@ StatusCode PUBLIC_API CLBlastiZmax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
StatusCode PUBLIC_API CLBlastiHmax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN
// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN
StatusCode PUBLIC_API CLBlastiSmin(const size_t n,
cl_mem imin_buffer, const size_t imin_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
@ -367,12 +409,16 @@ StatusCode PUBLIC_API CLBlastiZmin(const size_t n,
cl_mem imin_buffer, const size_t imin_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
StatusCode PUBLIC_API CLBlastiHmin(const size_t n,
cl_mem imin_buffer, const size_t imin_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
// =================================================================================================
// BLAS level-2 (matrix-vector) routines
// =================================================================================================
// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV
// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV
StatusCode PUBLIC_API CLBlastSgemv(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n,
const float alpha,
@ -405,8 +451,16 @@ StatusCode PUBLIC_API CLBlastZgemv(const Layout layout, const Transpose a_transp
const cl_double2 beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
StatusCode PUBLIC_API CLBlastHgemv(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n,
const cl_half alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_half beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV
// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV
StatusCode PUBLIC_API CLBlastSgbmv(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n, const size_t kl, const size_t ku,
const float alpha,
@ -439,6 +493,14 @@ StatusCode PUBLIC_API CLBlastZgbmv(const Layout layout, const Transpose a_transp
const cl_double2 beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
StatusCode PUBLIC_API CLBlastHgbmv(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n, const size_t kl, const size_t ku,
const cl_half alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_half beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
// Hermitian matrix-vector multiplication: CHEMV/ZHEMV
StatusCode PUBLIC_API CLBlastChemv(const Layout layout, const Triangle triangle,
@ -494,7 +556,7 @@ StatusCode PUBLIC_API CLBlastZhpmv(const Layout layout, const Triangle triangle,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
// Symmetric matrix-vector multiplication: SSYMV/DSYMV
// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV
StatusCode PUBLIC_API CLBlastSsymv(const Layout layout, const Triangle triangle,
const size_t n,
const float alpha,
@ -511,8 +573,16 @@ StatusCode PUBLIC_API CLBlastDsymv(const Layout layout, const Triangle triangle,
const double beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
StatusCode PUBLIC_API CLBlastHsymv(const Layout layout, const Triangle triangle,
const size_t n,
const cl_half alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_half beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV
// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV
StatusCode PUBLIC_API CLBlastSsbmv(const Layout layout, const Triangle triangle,
const size_t n, const size_t k,
const float alpha,
@ -529,8 +599,16 @@ StatusCode PUBLIC_API CLBlastDsbmv(const Layout layout, const Triangle triangle,
const double beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
StatusCode PUBLIC_API CLBlastHsbmv(const Layout layout, const Triangle triangle,
const size_t n, const size_t k,
const cl_half alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_half beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV
// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV
StatusCode PUBLIC_API CLBlastSspmv(const Layout layout, const Triangle triangle,
const size_t n,
const float alpha,
@ -547,8 +625,16 @@ StatusCode PUBLIC_API CLBlastDspmv(const Layout layout, const Triangle triangle,
const double beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
StatusCode PUBLIC_API CLBlastHspmv(const Layout layout, const Triangle triangle,
const size_t n,
const cl_half alpha,
const cl_mem ap_buffer, const size_t ap_offset,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_half beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);
// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV
// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV
StatusCode PUBLIC_API CLBlastStrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
@ -569,8 +655,13 @@ StatusCode PUBLIC_API CLBlastZtrmv(const Layout layout, const Triangle triangle,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
StatusCode PUBLIC_API CLBlastHtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV
// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV
StatusCode PUBLIC_API CLBlastStbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t n, const size_t k,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
@ -591,8 +682,13 @@ StatusCode PUBLIC_API CLBlastZtbmv(const Layout layout, const Triangle triangle,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
StatusCode PUBLIC_API CLBlastHtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t n, const size_t k,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV
// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV
StatusCode PUBLIC_API CLBlastStpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const cl_mem ap_buffer, const size_t ap_offset,
@ -613,6 +709,11 @@ StatusCode PUBLIC_API CLBlastZtpmv(const Layout layout, const Triangle triangle,
const cl_mem ap_buffer, const size_t ap_offset,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
StatusCode PUBLIC_API CLBlastHtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const cl_mem ap_buffer, const size_t ap_offset,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV
StatusCode PUBLIC_API CLBlastStrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
@ -680,7 +781,7 @@ StatusCode PUBLIC_API CLBlastZtpsv(const Layout layout, const Triangle triangle,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
// General rank-1 matrix update: SGER/DGER
// General rank-1 matrix update: SGER/DGER/HGER
StatusCode PUBLIC_API CLBlastSger(const Layout layout,
const size_t m, const size_t n,
const float alpha,
@ -695,6 +796,13 @@ StatusCode PUBLIC_API CLBlastDger(const Layout layout,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event);
StatusCode PUBLIC_API CLBlastHger(const Layout layout,
const size_t m, const size_t n,
const cl_half alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event);
// General rank-1 complex matrix update: CGERU/ZGERU
StatusCode PUBLIC_API CLBlastCgeru(const Layout layout,
@ -788,7 +896,7 @@ StatusCode PUBLIC_API CLBlastZhpr2(const Layout layout, const Triangle triangle,
cl_mem ap_buffer, const size_t ap_offset,
cl_command_queue* queue, cl_event* event);
// Symmetric rank-1 matrix update: SSYR/DSYR
// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR
StatusCode PUBLIC_API CLBlastSsyr(const Layout layout, const Triangle triangle,
const size_t n,
const float alpha,
@ -801,8 +909,14 @@ StatusCode PUBLIC_API CLBlastDsyr(const Layout layout, const Triangle triangle,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event);
StatusCode PUBLIC_API CLBlastHsyr(const Layout layout, const Triangle triangle,
const size_t n,
const cl_half alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event);
// Symmetric packed rank-1 matrix update: SSPR/DSPR
// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR
StatusCode PUBLIC_API CLBlastSspr(const Layout layout, const Triangle triangle,
const size_t n,
const float alpha,
@ -815,8 +929,14 @@ StatusCode PUBLIC_API CLBlastDspr(const Layout layout, const Triangle triangle,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem ap_buffer, const size_t ap_offset,
cl_command_queue* queue, cl_event* event);
StatusCode PUBLIC_API CLBlastHspr(const Layout layout, const Triangle triangle,
const size_t n,
const cl_half alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem ap_buffer, const size_t ap_offset,
cl_command_queue* queue, cl_event* event);
// Symmetric rank-2 matrix update: SSYR2/DSYR2
// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2
StatusCode PUBLIC_API CLBlastSsyr2(const Layout layout, const Triangle triangle,
const size_t n,
const float alpha,
@ -831,8 +951,15 @@ StatusCode PUBLIC_API CLBlastDsyr2(const Layout layout, const Triangle triangle,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event);
StatusCode PUBLIC_API CLBlastHsyr2(const Layout layout, const Triangle triangle,
const size_t n,
const cl_half alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event);
// Symmetric packed rank-2 matrix update: SSPR2/DSPR2
// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2
StatusCode PUBLIC_API CLBlastSspr2(const Layout layout, const Triangle triangle,
const size_t n,
const float alpha,
@ -847,12 +974,19 @@ StatusCode PUBLIC_API CLBlastDspr2(const Layout layout, const Triangle triangle,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem ap_buffer, const size_t ap_offset,
cl_command_queue* queue, cl_event* event);
StatusCode PUBLIC_API CLBlastHspr2(const Layout layout, const Triangle triangle,
const size_t n,
const cl_half alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem ap_buffer, const size_t ap_offset,
cl_command_queue* queue, cl_event* event);
// =================================================================================================
// BLAS level-3 (matrix-matrix) routines
// =================================================================================================
// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM
// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM
StatusCode PUBLIC_API CLBlastSgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
const size_t m, const size_t n, const size_t k,
const float alpha,
@ -885,8 +1019,16 @@ StatusCode PUBLIC_API CLBlastZgemm(const Layout layout, const Transpose a_transp
const cl_double2 beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
StatusCode PUBLIC_API CLBlastHgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
const size_t m, const size_t n, const size_t k,
const cl_half alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
const cl_half beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM
// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM
StatusCode PUBLIC_API CLBlastSsymm(const Layout layout, const Side side, const Triangle triangle,
const size_t m, const size_t n,
const float alpha,
@ -919,6 +1061,14 @@ StatusCode PUBLIC_API CLBlastZsymm(const Layout layout, const Side side, const T
const cl_double2 beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
StatusCode PUBLIC_API CLBlastHsymm(const Layout layout, const Side side, const Triangle triangle,
const size_t m, const size_t n,
const cl_half alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
const cl_half beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM
StatusCode PUBLIC_API CLBlastChemm(const Layout layout, const Side side, const Triangle triangle,
@ -938,7 +1088,7 @@ StatusCode PUBLIC_API CLBlastZhemm(const Layout layout, const Side side, const T
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK
// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK
StatusCode PUBLIC_API CLBlastSsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
const size_t n, const size_t k,
const float alpha,
@ -967,6 +1117,13 @@ StatusCode PUBLIC_API CLBlastZsyrk(const Layout layout, const Triangle triangle,
const cl_double2 beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
StatusCode PUBLIC_API CLBlastHsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
const size_t n, const size_t k,
const cl_half alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const cl_half beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
// Rank-K update of a Hermitian matrix: CHERK/ZHERK
StatusCode PUBLIC_API CLBlastCherk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
@ -984,7 +1141,7 @@ StatusCode PUBLIC_API CLBlastZherk(const Layout layout, const Triangle triangle,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K
// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K
StatusCode PUBLIC_API CLBlastSsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
const size_t n, const size_t k,
const float alpha,
@ -1017,6 +1174,14 @@ StatusCode PUBLIC_API CLBlastZsyr2k(const Layout layout, const Triangle triangle
const cl_double2 beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
StatusCode PUBLIC_API CLBlastHsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
const size_t n, const size_t k,
const cl_half alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
const cl_half beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
// Rank-2K update of a Hermitian matrix: CHER2K/ZHER2K
StatusCode PUBLIC_API CLBlastCher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
@ -1036,7 +1201,7 @@ StatusCode PUBLIC_API CLBlastZher2k(const Layout layout, const Triangle triangle
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);
// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM
// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM
StatusCode PUBLIC_API CLBlastStrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t m, const size_t n,
const float alpha,
@ -1061,8 +1226,14 @@ StatusCode PUBLIC_API CLBlastZtrmm(const Layout layout, const Side side, const T
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event);
// Half-precision (HTRMM) entry point of the triangular matrix-matrix multiplication. The scalar
// alpha is passed as cl_half; a_buffer is a read-only input (const cl_mem) and b_buffer is the
// non-const matrix argument. Returns a StatusCode.
// NOTE(review): alpha is presumably a raw fp16 bit pattern (e.g. as produced by FloatToHalf() in
// clblast_half.h) rather than a numeric value — confirm against the implementation.
StatusCode PUBLIC_API CLBlastHtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t m, const size_t n,
const cl_half alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event);
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM
StatusCode PUBLIC_API CLBlastStrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t m, const size_t n,
const float alpha,
@ -1087,6 +1258,12 @@ StatusCode PUBLIC_API CLBlastZtrsm(const Layout layout, const Side side, const T
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event);
// Half-precision (HTRSM) entry point of the triangular solver. The scalar alpha is passed as
// cl_half; a_buffer is a read-only input (const cl_mem) and b_buffer is the non-const matrix
// argument. Returns a StatusCode.
// NOTE(review): the changelog for this release lists HGEMM/HSYMM/HSYRK/HSYR2K/HTRMM as the added
// half-precision level-3 routines but not HTRSM — confirm that this entry point is implemented.
StatusCode PUBLIC_API CLBlastHtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t m, const size_t n,
const cl_half alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event);
// =================================================================================================

256
include/clblast_half.h Normal file
View file

@ -0,0 +1,256 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file provides simple conversion operations between fp16 (half) and fp32 (float). These
// conversion functions are based on ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf and
// are also part of the C++ half-precision header (http://half.sourceforge.net/).
//
// This file is pure C99.
//
// =================================================================================================
#ifndef CLBLAST_HALF_H_
#define CLBLAST_HALF_H_
// Includes the normal OpenCL C header
#if defined(__APPLE__) || defined(__MACOSX)
#include <OpenCL/opencl.h>
#else
#include <CL/opencl.h>
#endif
// =================================================================================================
// Host data-type for half-precision floating-point (16-bit). This is based on the OpenCL type,
// which is a typedef for unsigned short. A value of this type therefore holds the raw 16-bit
// pattern of an fp16 number: ordinary C arithmetic on it is integer arithmetic, so convert with
// FloatToHalf() and HalfToFloat() instead of operating on it directly.
typedef cl_half half;
// 32-bit union for conversions: gives access to the same storage either as a float (f32) or as an
// unsigned integer holding its bit pattern (i32). Used by FloatToHalf() and HalfToFloat().
// NOTE(review): assumes that both unsigned int and float are exactly 32 bits wide — confirm for
// all supported host platforms.
typedef union ConversionBits_ {
unsigned int i32; // the bit pattern, viewed as an integer
float f32; // the same bits, viewed as a single-precision value
} ConversionBits;
// =================================================================================================
// Converts an IEEE-compliant single-precision value to half-precision floating-point. This function
// applies simple truncation (round toward zero, but with overflows set to infinity) as rounding
// mode. The conversion is a pure table lookup, indexed by the upper 9 bits of the input (the sign
// bit followed by the 8-bit exponent):
//  - base_table holds the half-precision sign and exponent bits for that index (zero for values
//    too small to represent, the infinity pattern for values too large)
//  - shift_table holds how far the 23-bit mantissa is shifted right to obtain the half-precision
//    mantissa bits (24 discards it completely, 13 keeps the upper 10 bits)
// Note that, because of the truncation, a NaN whose payload only occupies the lower 13 mantissa
// bits is converted to infinity.
//
// The function is 'static inline' rather than plain 'inline': in C99 a plain 'inline' definition
// does not provide an external definition, which results in an undefined reference at link-time
// whenever the compiler decides not to inline a call (e.g. when compiling without optimisation).
static inline half FloatToHalf(const float value) {
  static const unsigned short base_table[512] = {
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100,
    0x0200, 0x0400, 0x0800, 0x0C00, 0x1000, 0x1400, 0x1800, 0x1C00, 0x2000, 0x2400, 0x2800, 0x2C00, 0x3000, 0x3400, 0x3800, 0x3C00,
    0x4000, 0x4400, 0x4800, 0x4C00, 0x5000, 0x5400, 0x5800, 0x5C00, 0x6000, 0x6400, 0x6800, 0x6C00, 0x7000, 0x7400, 0x7800, 0x7C00,
    0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
    0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
    0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
    0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
    0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
    0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
    0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100,
    0x8200, 0x8400, 0x8800, 0x8C00, 0x9000, 0x9400, 0x9800, 0x9C00, 0xA000, 0xA400, 0xA800, 0xAC00, 0xB000, 0xB400, 0xB800, 0xBC00,
    0xC000, 0xC400, 0xC800, 0xCC00, 0xD000, 0xD400, 0xD800, 0xDC00, 0xE000, 0xE400, 0xE800, 0xEC00, 0xF000, 0xF400, 0xF800, 0xFC00,
    0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
    0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
    0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
    0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
    0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
    0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
    0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00
  };
  static const unsigned char shift_table[512] = {
    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13,
    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13
  };

  // Re-interprets the bits of the input as an integer
  ConversionBits bits;
  bits.f32 = value;

  // The sign and exponent bits together select the table entries
  const unsigned int index = bits.i32 >> 23;

  // Combines the sign/exponent bits with the truncated mantissa
  const unsigned short halfbits = base_table[index] +
                                  (unsigned short)((bits.i32 & 0x7FFFFF) >> shift_table[index]);
  return halfbits;
}
// Converts a half-precision value to IEEE-compliant single-precision floating-point. The
// conversion is a pure table lookup, indexed by the upper 6 bits of the input (the sign bit
// followed by the 5-bit exponent) and by its 10-bit mantissa:
//  - offset_table selects one of the two halves of the mantissa table: offset 0 when the exponent
//    bits are all zero (zero and subnormal numbers), offset 1024 otherwise
//  - mantissa_table holds the single-precision bits for the mantissa. Its first 1024 entries are
//    the already normalised values of the half-precision subnormals, its last 1024 entries are
//    the mantissa shifted into place plus an exponent correction
//  - exponent_table holds the single-precision sign and exponent contribution. The entries for an
//    all-ones exponent (index 31 and 63) are chosen such that the sum becomes infinity or NaN
// The result is the sum of the mantissa and the exponent table entries.
//
// The function is 'static inline' rather than plain 'inline': in C99 a plain 'inline' definition
// does not provide an external definition, which results in an undefined reference at link-time
// whenever the compiler decides not to inline a call (e.g. when compiling without optimisation).
static inline float HalfToFloat(const half value) {
  static const unsigned int mantissa_table[2048] = {
    0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34A00000, 0x34C00000, 0x34E00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, 0x35400000, 0x35500000, 0x35600000, 0x35700000,
    0x35800000, 0x35880000, 0x35900000, 0x35980000, 0x35A00000, 0x35A80000, 0x35B00000, 0x35B80000, 0x35C00000, 0x35C80000, 0x35D00000, 0x35D80000, 0x35E00000, 0x35E80000, 0x35F00000, 0x35F80000,
    0x36000000, 0x36040000, 0x36080000, 0x360C0000, 0x36100000, 0x36140000, 0x36180000, 0x361C0000, 0x36200000, 0x36240000, 0x36280000, 0x362C0000, 0x36300000, 0x36340000, 0x36380000, 0x363C0000,
    0x36400000, 0x36440000, 0x36480000, 0x364C0000, 0x36500000, 0x36540000, 0x36580000, 0x365C0000, 0x36600000, 0x36640000, 0x36680000, 0x366C0000, 0x36700000, 0x36740000, 0x36780000, 0x367C0000,
    0x36800000, 0x36820000, 0x36840000, 0x36860000, 0x36880000, 0x368A0000, 0x368C0000, 0x368E0000, 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369A0000, 0x369C0000, 0x369E0000,
    0x36A00000, 0x36A20000, 0x36A40000, 0x36A60000, 0x36A80000, 0x36AA0000, 0x36AC0000, 0x36AE0000, 0x36B00000, 0x36B20000, 0x36B40000, 0x36B60000, 0x36B80000, 0x36BA0000, 0x36BC0000, 0x36BE0000,
    0x36C00000, 0x36C20000, 0x36C40000, 0x36C60000, 0x36C80000, 0x36CA0000, 0x36CC0000, 0x36CE0000, 0x36D00000, 0x36D20000, 0x36D40000, 0x36D60000, 0x36D80000, 0x36DA0000, 0x36DC0000, 0x36DE0000,
    0x36E00000, 0x36E20000, 0x36E40000, 0x36E60000, 0x36E80000, 0x36EA0000, 0x36EC0000, 0x36EE0000, 0x36F00000, 0x36F20000, 0x36F40000, 0x36F60000, 0x36F80000, 0x36FA0000, 0x36FC0000, 0x36FE0000,
    0x37000000, 0x37010000, 0x37020000, 0x37030000, 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, 0x370A0000, 0x370B0000, 0x370C0000, 0x370D0000, 0x370E0000, 0x370F0000,
    0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371A0000, 0x371B0000, 0x371C0000, 0x371D0000, 0x371E0000, 0x371F0000,
    0x37200000, 0x37210000, 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, 0x37280000, 0x37290000, 0x372A0000, 0x372B0000, 0x372C0000, 0x372D0000, 0x372E0000, 0x372F0000,
    0x37300000, 0x37310000, 0x37320000, 0x37330000, 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, 0x373A0000, 0x373B0000, 0x373C0000, 0x373D0000, 0x373E0000, 0x373F0000,
    0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000, 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374A0000, 0x374B0000, 0x374C0000, 0x374D0000, 0x374E0000, 0x374F0000,
    0x37500000, 0x37510000, 0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, 0x37580000, 0x37590000, 0x375A0000, 0x375B0000, 0x375C0000, 0x375D0000, 0x375E0000, 0x375F0000,
    0x37600000, 0x37610000, 0x37620000, 0x37630000, 0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000, 0x376A0000, 0x376B0000, 0x376C0000, 0x376D0000, 0x376E0000, 0x376F0000,
    0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, 0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377A0000, 0x377B0000, 0x377C0000, 0x377D0000, 0x377E0000, 0x377F0000,
    0x37800000, 0x37808000, 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, 0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000, 0x37870000, 0x37878000,
    0x37880000, 0x37888000, 0x37890000, 0x37898000, 0x378A0000, 0x378A8000, 0x378B0000, 0x378B8000, 0x378C0000, 0x378C8000, 0x378D0000, 0x378D8000, 0x378E0000, 0x378E8000, 0x378F0000, 0x378F8000,
    0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, 0x37960000, 0x37968000, 0x37970000, 0x37978000,
    0x37980000, 0x37988000, 0x37990000, 0x37998000, 0x379A0000, 0x379A8000, 0x379B0000, 0x379B8000, 0x379C0000, 0x379C8000, 0x379D0000, 0x379D8000, 0x379E0000, 0x379E8000, 0x379F0000, 0x379F8000,
    0x37A00000, 0x37A08000, 0x37A10000, 0x37A18000, 0x37A20000, 0x37A28000, 0x37A30000, 0x37A38000, 0x37A40000, 0x37A48000, 0x37A50000, 0x37A58000, 0x37A60000, 0x37A68000, 0x37A70000, 0x37A78000,
    0x37A80000, 0x37A88000, 0x37A90000, 0x37A98000, 0x37AA0000, 0x37AA8000, 0x37AB0000, 0x37AB8000, 0x37AC0000, 0x37AC8000, 0x37AD0000, 0x37AD8000, 0x37AE0000, 0x37AE8000, 0x37AF0000, 0x37AF8000,
    0x37B00000, 0x37B08000, 0x37B10000, 0x37B18000, 0x37B20000, 0x37B28000, 0x37B30000, 0x37B38000, 0x37B40000, 0x37B48000, 0x37B50000, 0x37B58000, 0x37B60000, 0x37B68000, 0x37B70000, 0x37B78000,
    0x37B80000, 0x37B88000, 0x37B90000, 0x37B98000, 0x37BA0000, 0x37BA8000, 0x37BB0000, 0x37BB8000, 0x37BC0000, 0x37BC8000, 0x37BD0000, 0x37BD8000, 0x37BE0000, 0x37BE8000, 0x37BF0000, 0x37BF8000,
    0x37C00000, 0x37C08000, 0x37C10000, 0x37C18000, 0x37C20000, 0x37C28000, 0x37C30000, 0x37C38000, 0x37C40000, 0x37C48000, 0x37C50000, 0x37C58000, 0x37C60000, 0x37C68000, 0x37C70000, 0x37C78000,
    0x37C80000, 0x37C88000, 0x37C90000, 0x37C98000, 0x37CA0000, 0x37CA8000, 0x37CB0000, 0x37CB8000, 0x37CC0000, 0x37CC8000, 0x37CD0000, 0x37CD8000, 0x37CE0000, 0x37CE8000, 0x37CF0000, 0x37CF8000,
    0x37D00000, 0x37D08000, 0x37D10000, 0x37D18000, 0x37D20000, 0x37D28000, 0x37D30000, 0x37D38000, 0x37D40000, 0x37D48000, 0x37D50000, 0x37D58000, 0x37D60000, 0x37D68000, 0x37D70000, 0x37D78000,
    0x37D80000, 0x37D88000, 0x37D90000, 0x37D98000, 0x37DA0000, 0x37DA8000, 0x37DB0000, 0x37DB8000, 0x37DC0000, 0x37DC8000, 0x37DD0000, 0x37DD8000, 0x37DE0000, 0x37DE8000, 0x37DF0000, 0x37DF8000,
    0x37E00000, 0x37E08000, 0x37E10000, 0x37E18000, 0x37E20000, 0x37E28000, 0x37E30000, 0x37E38000, 0x37E40000, 0x37E48000, 0x37E50000, 0x37E58000, 0x37E60000, 0x37E68000, 0x37E70000, 0x37E78000,
    0x37E80000, 0x37E88000, 0x37E90000, 0x37E98000, 0x37EA0000, 0x37EA8000, 0x37EB0000, 0x37EB8000, 0x37EC0000, 0x37EC8000, 0x37ED0000, 0x37ED8000, 0x37EE0000, 0x37EE8000, 0x37EF0000, 0x37EF8000,
    0x37F00000, 0x37F08000, 0x37F10000, 0x37F18000, 0x37F20000, 0x37F28000, 0x37F30000, 0x37F38000, 0x37F40000, 0x37F48000, 0x37F50000, 0x37F58000, 0x37F60000, 0x37F68000, 0x37F70000, 0x37F78000,
    0x37F80000, 0x37F88000, 0x37F90000, 0x37F98000, 0x37FA0000, 0x37FA8000, 0x37FB0000, 0x37FB8000, 0x37FC0000, 0x37FC8000, 0x37FD0000, 0x37FD8000, 0x37FE0000, 0x37FE8000, 0x37FF0000, 0x37FF8000,
    0x38000000, 0x38004000, 0x38008000, 0x3800C000, 0x38010000, 0x38014000, 0x38018000, 0x3801C000, 0x38020000, 0x38024000, 0x38028000, 0x3802C000, 0x38030000, 0x38034000, 0x38038000, 0x3803C000,
    0x38040000, 0x38044000, 0x38048000, 0x3804C000, 0x38050000, 0x38054000, 0x38058000, 0x3805C000, 0x38060000, 0x38064000, 0x38068000, 0x3806C000, 0x38070000, 0x38074000, 0x38078000, 0x3807C000,
    0x38080000, 0x38084000, 0x38088000, 0x3808C000, 0x38090000, 0x38094000, 0x38098000, 0x3809C000, 0x380A0000, 0x380A4000, 0x380A8000, 0x380AC000, 0x380B0000, 0x380B4000, 0x380B8000, 0x380BC000,
    0x380C0000, 0x380C4000, 0x380C8000, 0x380CC000, 0x380D0000, 0x380D4000, 0x380D8000, 0x380DC000, 0x380E0000, 0x380E4000, 0x380E8000, 0x380EC000, 0x380F0000, 0x380F4000, 0x380F8000, 0x380FC000,
    0x38100000, 0x38104000, 0x38108000, 0x3810C000, 0x38110000, 0x38114000, 0x38118000, 0x3811C000, 0x38120000, 0x38124000, 0x38128000, 0x3812C000, 0x38130000, 0x38134000, 0x38138000, 0x3813C000,
    0x38140000, 0x38144000, 0x38148000, 0x3814C000, 0x38150000, 0x38154000, 0x38158000, 0x3815C000, 0x38160000, 0x38164000, 0x38168000, 0x3816C000, 0x38170000, 0x38174000, 0x38178000, 0x3817C000,
    0x38180000, 0x38184000, 0x38188000, 0x3818C000, 0x38190000, 0x38194000, 0x38198000, 0x3819C000, 0x381A0000, 0x381A4000, 0x381A8000, 0x381AC000, 0x381B0000, 0x381B4000, 0x381B8000, 0x381BC000,
    0x381C0000, 0x381C4000, 0x381C8000, 0x381CC000, 0x381D0000, 0x381D4000, 0x381D8000, 0x381DC000, 0x381E0000, 0x381E4000, 0x381E8000, 0x381EC000, 0x381F0000, 0x381F4000, 0x381F8000, 0x381FC000,
    0x38200000, 0x38204000, 0x38208000, 0x3820C000, 0x38210000, 0x38214000, 0x38218000, 0x3821C000, 0x38220000, 0x38224000, 0x38228000, 0x3822C000, 0x38230000, 0x38234000, 0x38238000, 0x3823C000,
    0x38240000, 0x38244000, 0x38248000, 0x3824C000, 0x38250000, 0x38254000, 0x38258000, 0x3825C000, 0x38260000, 0x38264000, 0x38268000, 0x3826C000, 0x38270000, 0x38274000, 0x38278000, 0x3827C000,
    0x38280000, 0x38284000, 0x38288000, 0x3828C000, 0x38290000, 0x38294000, 0x38298000, 0x3829C000, 0x382A0000, 0x382A4000, 0x382A8000, 0x382AC000, 0x382B0000, 0x382B4000, 0x382B8000, 0x382BC000,
    0x382C0000, 0x382C4000, 0x382C8000, 0x382CC000, 0x382D0000, 0x382D4000, 0x382D8000, 0x382DC000, 0x382E0000, 0x382E4000, 0x382E8000, 0x382EC000, 0x382F0000, 0x382F4000, 0x382F8000, 0x382FC000,
    0x38300000, 0x38304000, 0x38308000, 0x3830C000, 0x38310000, 0x38314000, 0x38318000, 0x3831C000, 0x38320000, 0x38324000, 0x38328000, 0x3832C000, 0x38330000, 0x38334000, 0x38338000, 0x3833C000,
    0x38340000, 0x38344000, 0x38348000, 0x3834C000, 0x38350000, 0x38354000, 0x38358000, 0x3835C000, 0x38360000, 0x38364000, 0x38368000, 0x3836C000, 0x38370000, 0x38374000, 0x38378000, 0x3837C000,
    0x38380000, 0x38384000, 0x38388000, 0x3838C000, 0x38390000, 0x38394000, 0x38398000, 0x3839C000, 0x383A0000, 0x383A4000, 0x383A8000, 0x383AC000, 0x383B0000, 0x383B4000, 0x383B8000, 0x383BC000,
    0x383C0000, 0x383C4000, 0x383C8000, 0x383CC000, 0x383D0000, 0x383D4000, 0x383D8000, 0x383DC000, 0x383E0000, 0x383E4000, 0x383E8000, 0x383EC000, 0x383F0000, 0x383F4000, 0x383F8000, 0x383FC000,
    0x38400000, 0x38404000, 0x38408000, 0x3840C000, 0x38410000, 0x38414000, 0x38418000, 0x3841C000, 0x38420000, 0x38424000, 0x38428000, 0x3842C000, 0x38430000, 0x38434000, 0x38438000, 0x3843C000,
    0x38440000, 0x38444000, 0x38448000, 0x3844C000, 0x38450000, 0x38454000, 0x38458000, 0x3845C000, 0x38460000, 0x38464000, 0x38468000, 0x3846C000, 0x38470000, 0x38474000, 0x38478000, 0x3847C000,
    0x38480000, 0x38484000, 0x38488000, 0x3848C000, 0x38490000, 0x38494000, 0x38498000, 0x3849C000, 0x384A0000, 0x384A4000, 0x384A8000, 0x384AC000, 0x384B0000, 0x384B4000, 0x384B8000, 0x384BC000,
    0x384C0000, 0x384C4000, 0x384C8000, 0x384CC000, 0x384D0000, 0x384D4000, 0x384D8000, 0x384DC000, 0x384E0000, 0x384E4000, 0x384E8000, 0x384EC000, 0x384F0000, 0x384F4000, 0x384F8000, 0x384FC000,
    0x38500000, 0x38504000, 0x38508000, 0x3850C000, 0x38510000, 0x38514000, 0x38518000, 0x3851C000, 0x38520000, 0x38524000, 0x38528000, 0x3852C000, 0x38530000, 0x38534000, 0x38538000, 0x3853C000,
    0x38540000, 0x38544000, 0x38548000, 0x3854C000, 0x38550000, 0x38554000, 0x38558000, 0x3855C000, 0x38560000, 0x38564000, 0x38568000, 0x3856C000, 0x38570000, 0x38574000, 0x38578000, 0x3857C000,
    0x38580000, 0x38584000, 0x38588000, 0x3858C000, 0x38590000, 0x38594000, 0x38598000, 0x3859C000, 0x385A0000, 0x385A4000, 0x385A8000, 0x385AC000, 0x385B0000, 0x385B4000, 0x385B8000, 0x385BC000,
    0x385C0000, 0x385C4000, 0x385C8000, 0x385CC000, 0x385D0000, 0x385D4000, 0x385D8000, 0x385DC000, 0x385E0000, 0x385E4000, 0x385E8000, 0x385EC000, 0x385F0000, 0x385F4000, 0x385F8000, 0x385FC000,
    0x38600000, 0x38604000, 0x38608000, 0x3860C000, 0x38610000, 0x38614000, 0x38618000, 0x3861C000, 0x38620000, 0x38624000, 0x38628000, 0x3862C000, 0x38630000, 0x38634000, 0x38638000, 0x3863C000,
    0x38640000, 0x38644000, 0x38648000, 0x3864C000, 0x38650000, 0x38654000, 0x38658000, 0x3865C000, 0x38660000, 0x38664000, 0x38668000, 0x3866C000, 0x38670000, 0x38674000, 0x38678000, 0x3867C000,
    0x38680000, 0x38684000, 0x38688000, 0x3868C000, 0x38690000, 0x38694000, 0x38698000, 0x3869C000, 0x386A0000, 0x386A4000, 0x386A8000, 0x386AC000, 0x386B0000, 0x386B4000, 0x386B8000, 0x386BC000,
    0x386C0000, 0x386C4000, 0x386C8000, 0x386CC000, 0x386D0000, 0x386D4000, 0x386D8000, 0x386DC000, 0x386E0000, 0x386E4000, 0x386E8000, 0x386EC000, 0x386F0000, 0x386F4000, 0x386F8000, 0x386FC000,
    0x38700000, 0x38704000, 0x38708000, 0x3870C000, 0x38710000, 0x38714000, 0x38718000, 0x3871C000, 0x38720000, 0x38724000, 0x38728000, 0x3872C000, 0x38730000, 0x38734000, 0x38738000, 0x3873C000,
    0x38740000, 0x38744000, 0x38748000, 0x3874C000, 0x38750000, 0x38754000, 0x38758000, 0x3875C000, 0x38760000, 0x38764000, 0x38768000, 0x3876C000, 0x38770000, 0x38774000, 0x38778000, 0x3877C000,
    0x38780000, 0x38784000, 0x38788000, 0x3878C000, 0x38790000, 0x38794000, 0x38798000, 0x3879C000, 0x387A0000, 0x387A4000, 0x387A8000, 0x387AC000, 0x387B0000, 0x387B4000, 0x387B8000, 0x387BC000,
    0x387C0000, 0x387C4000, 0x387C8000, 0x387CC000, 0x387D0000, 0x387D4000, 0x387D8000, 0x387DC000, 0x387E0000, 0x387E4000, 0x387E8000, 0x387EC000, 0x387F0000, 0x387F4000, 0x387F8000, 0x387FC000,
    0x38000000, 0x38002000, 0x38004000, 0x38006000, 0x38008000, 0x3800A000, 0x3800C000, 0x3800E000, 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801A000, 0x3801C000, 0x3801E000,
    0x38020000, 0x38022000, 0x38024000, 0x38026000, 0x38028000, 0x3802A000, 0x3802C000, 0x3802E000, 0x38030000, 0x38032000, 0x38034000, 0x38036000, 0x38038000, 0x3803A000, 0x3803C000, 0x3803E000,
    0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804A000, 0x3804C000, 0x3804E000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, 0x38058000, 0x3805A000, 0x3805C000, 0x3805E000,
    0x38060000, 0x38062000, 0x38064000, 0x38066000, 0x38068000, 0x3806A000, 0x3806C000, 0x3806E000, 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807A000, 0x3807C000, 0x3807E000,
    0x38080000, 0x38082000, 0x38084000, 0x38086000, 0x38088000, 0x3808A000, 0x3808C000, 0x3808E000, 0x38090000, 0x38092000, 0x38094000, 0x38096000, 0x38098000, 0x3809A000, 0x3809C000, 0x3809E000,
    0x380A0000, 0x380A2000, 0x380A4000, 0x380A6000, 0x380A8000, 0x380AA000, 0x380AC000, 0x380AE000, 0x380B0000, 0x380B2000, 0x380B4000, 0x380B6000, 0x380B8000, 0x380BA000, 0x380BC000, 0x380BE000,
    0x380C0000, 0x380C2000, 0x380C4000, 0x380C6000, 0x380C8000, 0x380CA000, 0x380CC000, 0x380CE000, 0x380D0000, 0x380D2000, 0x380D4000, 0x380D6000, 0x380D8000, 0x380DA000, 0x380DC000, 0x380DE000,
    0x380E0000, 0x380E2000, 0x380E4000, 0x380E6000, 0x380E8000, 0x380EA000, 0x380EC000, 0x380EE000, 0x380F0000, 0x380F2000, 0x380F4000, 0x380F6000, 0x380F8000, 0x380FA000, 0x380FC000, 0x380FE000,
    0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810A000, 0x3810C000, 0x3810E000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, 0x38118000, 0x3811A000, 0x3811C000, 0x3811E000,
    0x38120000, 0x38122000, 0x38124000, 0x38126000, 0x38128000, 0x3812A000, 0x3812C000, 0x3812E000, 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813A000, 0x3813C000, 0x3813E000,
    0x38140000, 0x38142000, 0x38144000, 0x38146000, 0x38148000, 0x3814A000, 0x3814C000, 0x3814E000, 0x38150000, 0x38152000, 0x38154000, 0x38156000, 0x38158000, 0x3815A000, 0x3815C000, 0x3815E000,
    0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816A000, 0x3816C000, 0x3816E000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, 0x38178000, 0x3817A000, 0x3817C000, 0x3817E000,
    0x38180000, 0x38182000, 0x38184000, 0x38186000, 0x38188000, 0x3818A000, 0x3818C000, 0x3818E000, 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819A000, 0x3819C000, 0x3819E000,
    0x381A0000, 0x381A2000, 0x381A4000, 0x381A6000, 0x381A8000, 0x381AA000, 0x381AC000, 0x381AE000, 0x381B0000, 0x381B2000, 0x381B4000, 0x381B6000, 0x381B8000, 0x381BA000, 0x381BC000, 0x381BE000,
    0x381C0000, 0x381C2000, 0x381C4000, 0x381C6000, 0x381C8000, 0x381CA000, 0x381CC000, 0x381CE000, 0x381D0000, 0x381D2000, 0x381D4000, 0x381D6000, 0x381D8000, 0x381DA000, 0x381DC000, 0x381DE000,
    0x381E0000, 0x381E2000, 0x381E4000, 0x381E6000, 0x381E8000, 0x381EA000, 0x381EC000, 0x381EE000, 0x381F0000, 0x381F2000, 0x381F4000, 0x381F6000, 0x381F8000, 0x381FA000, 0x381FC000, 0x381FE000,
    0x38200000, 0x38202000, 0x38204000, 0x38206000, 0x38208000, 0x3820A000, 0x3820C000, 0x3820E000, 0x38210000, 0x38212000, 0x38214000, 0x38216000, 0x38218000, 0x3821A000, 0x3821C000, 0x3821E000,
    0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822A000, 0x3822C000, 0x3822E000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, 0x38238000, 0x3823A000, 0x3823C000, 0x3823E000,
    0x38240000, 0x38242000, 0x38244000, 0x38246000, 0x38248000, 0x3824A000, 0x3824C000, 0x3824E000, 0x38250000,0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825A000, 0x3825C000, 0x3825E000,
    0x38260000, 0x38262000, 0x38264000, 0x38266000, 0x38268000, 0x3826A000, 0x3826C000, 0x3826E000, 0x38270000, 0x38272000, 0x38274000, 0x38276000, 0x38278000, 0x3827A000, 0x3827C000, 0x3827E000,
    0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828A000, 0x3828C000, 0x3828E000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, 0x38298000, 0x3829A000, 0x3829C000, 0x3829E000,
    0x382A0000, 0x382A2000, 0x382A4000, 0x382A6000, 0x382A8000, 0x382AA000, 0x382AC000, 0x382AE000, 0x382B0000, 0x382B2000, 0x382B4000, 0x382B6000, 0x382B8000, 0x382BA000, 0x382BC000, 0x382BE000,
    0x382C0000, 0x382C2000, 0x382C4000, 0x382C6000, 0x382C8000, 0x382CA000, 0x382CC000, 0x382CE000, 0x382D0000, 0x382D2000, 0x382D4000, 0x382D6000, 0x382D8000, 0x382DA000, 0x382DC000, 0x382DE000,
    0x382E0000, 0x382E2000, 0x382E4000, 0x382E6000, 0x382E8000, 0x382EA000, 0x382EC000, 0x382EE000, 0x382F0000, 0x382F2000, 0x382F4000, 0x382F6000, 0x382F8000, 0x382FA000, 0x382FC000, 0x382FE000,
    0x38300000, 0x38302000, 0x38304000, 0x38306000, 0x38308000, 0x3830A000, 0x3830C000, 0x3830E000, 0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831A000, 0x3831C000, 0x3831E000,
    0x38320000, 0x38322000, 0x38324000, 0x38326000, 0x38328000, 0x3832A000, 0x3832C000, 0x3832E000, 0x38330000, 0x38332000, 0x38334000, 0x38336000, 0x38338000, 0x3833A000, 0x3833C000, 0x3833E000,
    0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834A000, 0x3834C000, 0x3834E000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, 0x38358000, 0x3835A000, 0x3835C000, 0x3835E000,
    0x38360000, 0x38362000, 0x38364000, 0x38366000, 0x38368000, 0x3836A000, 0x3836C000, 0x3836E000, 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837A000, 0x3837C000, 0x3837E000,
    0x38380000, 0x38382000, 0x38384000, 0x38386000, 0x38388000, 0x3838A000, 0x3838C000, 0x3838E000, 0x38390000, 0x38392000, 0x38394000, 0x38396000, 0x38398000, 0x3839A000, 0x3839C000, 0x3839E000,
    0x383A0000, 0x383A2000, 0x383A4000, 0x383A6000, 0x383A8000, 0x383AA000, 0x383AC000, 0x383AE000, 0x383B0000, 0x383B2000, 0x383B4000, 0x383B6000, 0x383B8000, 0x383BA000, 0x383BC000, 0x383BE000,
    0x383C0000, 0x383C2000, 0x383C4000, 0x383C6000, 0x383C8000, 0x383CA000, 0x383CC000, 0x383CE000, 0x383D0000, 0x383D2000, 0x383D4000, 0x383D6000, 0x383D8000, 0x383DA000, 0x383DC000, 0x383DE000,
    0x383E0000, 0x383E2000, 0x383E4000, 0x383E6000, 0x383E8000, 0x383EA000, 0x383EC000, 0x383EE000, 0x383F0000, 0x383F2000, 0x383F4000, 0x383F6000, 0x383F8000, 0x383FA000, 0x383FC000, 0x383FE000,
    0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840A000, 0x3840C000, 0x3840E000, 0x38410000, 0x38412000, 0x38414000, 0x38416000, 0x38418000, 0x3841A000, 0x3841C000, 0x3841E000,
    0x38420000, 0x38422000, 0x38424000, 0x38426000, 0x38428000, 0x3842A000, 0x3842C000, 0x3842E000, 0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843A000, 0x3843C000, 0x3843E000,
    0x38440000, 0x38442000, 0x38444000, 0x38446000, 0x38448000, 0x3844A000, 0x3844C000, 0x3844E000, 0x38450000, 0x38452000, 0x38454000, 0x38456000, 0x38458000, 0x3845A000, 0x3845C000, 0x3845E000,
    0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846A000, 0x3846C000, 0x3846E000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, 0x38478000, 0x3847A000, 0x3847C000, 0x3847E000,
    0x38480000, 0x38482000, 0x38484000, 0x38486000, 0x38488000, 0x3848A000, 0x3848C000, 0x3848E000, 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849A000, 0x3849C000, 0x3849E000,
    0x384A0000, 0x384A2000, 0x384A4000, 0x384A6000, 0x384A8000, 0x384AA000, 0x384AC000, 0x384AE000, 0x384B0000, 0x384B2000, 0x384B4000, 0x384B6000, 0x384B8000, 0x384BA000, 0x384BC000, 0x384BE000,
    0x384C0000, 0x384C2000, 0x384C4000, 0x384C6000, 0x384C8000, 0x384CA000, 0x384CC000, 0x384CE000, 0x384D0000, 0x384D2000, 0x384D4000, 0x384D6000, 0x384D8000, 0x384DA000, 0x384DC000, 0x384DE000,
    0x384E0000, 0x384E2000, 0x384E4000, 0x384E6000, 0x384E8000, 0x384EA000, 0x384EC000, 0x384EE000, 0x384F0000, 0x384F2000, 0x384F4000, 0x384F6000, 0x384F8000, 0x384FA000, 0x384FC000, 0x384FE000,
    0x38500000, 0x38502000, 0x38504000, 0x38506000, 0x38508000, 0x3850A000, 0x3850C000, 0x3850E000, 0x38510000, 0x38512000, 0x38514000, 0x38516000, 0x38518000, 0x3851A000, 0x3851C000, 0x3851E000,
    0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852A000, 0x3852C000, 0x3852E000, 0x38530000, 0x38532000, 0x38534000, 0x38536000, 0x38538000, 0x3853A000, 0x3853C000, 0x3853E000,
    0x38540000, 0x38542000, 0x38544000, 0x38546000, 0x38548000, 0x3854A000, 0x3854C000, 0x3854E000, 0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855A000, 0x3855C000, 0x3855E000,
    0x38560000, 0x38562000, 0x38564000, 0x38566000, 0x38568000, 0x3856A000, 0x3856C000, 0x3856E000, 0x38570000, 0x38572000, 0x38574000, 0x38576000, 0x38578000, 0x3857A000, 0x3857C000, 0x3857E000,
    0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858A000, 0x3858C000, 0x3858E000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, 0x38598000, 0x3859A000, 0x3859C000, 0x3859E000,
    0x385A0000, 0x385A2000, 0x385A4000, 0x385A6000, 0x385A8000, 0x385AA000, 0x385AC000, 0x385AE000, 0x385B0000, 0x385B2000, 0x385B4000, 0x385B6000, 0x385B8000, 0x385BA000, 0x385BC000, 0x385BE000,
    0x385C0000, 0x385C2000, 0x385C4000, 0x385C6000, 0x385C8000, 0x385CA000, 0x385CC000, 0x385CE000, 0x385D0000, 0x385D2000, 0x385D4000, 0x385D6000, 0x385D8000, 0x385DA000, 0x385DC000, 0x385DE000,
    0x385E0000, 0x385E2000, 0x385E4000, 0x385E6000, 0x385E8000, 0x385EA000, 0x385EC000, 0x385EE000, 0x385F0000, 0x385F2000, 0x385F4000, 0x385F6000, 0x385F8000, 0x385FA000, 0x385FC000, 0x385FE000,
    0x38600000, 0x38602000, 0x38604000, 0x38606000, 0x38608000, 0x3860A000, 0x3860C000, 0x3860E000, 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861A000, 0x3861C000, 0x3861E000,
    0x38620000, 0x38622000, 0x38624000, 0x38626000, 0x38628000, 0x3862A000, 0x3862C000, 0x3862E000, 0x38630000, 0x38632000, 0x38634000, 0x38636000, 0x38638000, 0x3863A000, 0x3863C000, 0x3863E000,
    0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864A000, 0x3864C000, 0x3864E000, 0x38650000, 0x38652000, 0x38654000, 0x38656000, 0x38658000, 0x3865A000, 0x3865C000, 0x3865E000,
    0x38660000, 0x38662000, 0x38664000, 0x38666000, 0x38668000, 0x3866A000, 0x3866C000, 0x3866E000, 0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867A000, 0x3867C000, 0x3867E000,
    0x38680000, 0x38682000, 0x38684000, 0x38686000, 0x38688000, 0x3868A000, 0x3868C000, 0x3868E000, 0x38690000, 0x38692000, 0x38694000, 0x38696000, 0x38698000, 0x3869A000, 0x3869C000, 0x3869E000,
    0x386A0000, 0x386A2000, 0x386A4000, 0x386A6000, 0x386A8000, 0x386AA000, 0x386AC000, 0x386AE000, 0x386B0000, 0x386B2000, 0x386B4000, 0x386B6000, 0x386B8000, 0x386BA000, 0x386BC000, 0x386BE000,
    0x386C0000, 0x386C2000, 0x386C4000, 0x386C6000, 0x386C8000, 0x386CA000, 0x386CC000, 0x386CE000, 0x386D0000, 0x386D2000, 0x386D4000, 0x386D6000, 0x386D8000, 0x386DA000, 0x386DC000, 0x386DE000,
    0x386E0000, 0x386E2000, 0x386E4000, 0x386E6000, 0x386E8000, 0x386EA000, 0x386EC000, 0x386EE000, 0x386F0000, 0x386F2000, 0x386F4000, 0x386F6000, 0x386F8000, 0x386FA000, 0x386FC000, 0x386FE000,
    0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870A000, 0x3870C000, 0x3870E000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, 0x38718000, 0x3871A000, 0x3871C000, 0x3871E000,
    0x38720000, 0x38722000, 0x38724000, 0x38726000, 0x38728000, 0x3872A000, 0x3872C000, 0x3872E000, 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873A000, 0x3873C000, 0x3873E000,
    0x38740000, 0x38742000, 0x38744000, 0x38746000, 0x38748000, 0x3874A000, 0x3874C000, 0x3874E000, 0x38750000, 0x38752000, 0x38754000, 0x38756000, 0x38758000, 0x3875A000, 0x3875C000, 0x3875E000,
    0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876A000, 0x3876C000, 0x3876E000, 0x38770000, 0x38772000, 0x38774000, 0x38776000, 0x38778000, 0x3877A000, 0x3877C000, 0x3877E000,
    0x38780000, 0x38782000, 0x38784000, 0x38786000, 0x38788000, 0x3878A000, 0x3878C000, 0x3878E000, 0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879A000, 0x3879C000, 0x3879E000,
    0x387A0000, 0x387A2000, 0x387A4000, 0x387A6000, 0x387A8000, 0x387AA000, 0x387AC000, 0x387AE000, 0x387B0000, 0x387B2000, 0x387B4000, 0x387B6000, 0x387B8000, 0x387BA000, 0x387BC000, 0x387BE000,
    0x387C0000, 0x387C2000, 0x387C4000, 0x387C6000, 0x387C8000, 0x387CA000, 0x387CC000, 0x387CE000, 0x387D0000, 0x387D2000, 0x387D4000, 0x387D6000, 0x387D8000, 0x387DA000, 0x387DC000, 0x387DE000,
    0x387E0000, 0x387E2000, 0x387E4000, 0x387E6000, 0x387E8000, 0x387EA000, 0x387EC000, 0x387EE000, 0x387F0000, 0x387F2000, 0x387F4000, 0x387F6000, 0x387F8000, 0x387FA000, 0x387FC000, 0x387FE000
  };
  static const unsigned int exponent_table[64] = {
    0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, 0x06000000, 0x06800000, 0x07000000, 0x07800000,
    0x08000000, 0x08800000, 0x09000000, 0x09800000, 0x0A000000, 0x0A800000, 0x0B000000, 0x0B800000, 0x0C000000, 0x0C800000, 0x0D000000, 0x0D800000, 0x0E000000, 0x0E800000, 0x0F000000, 0x47800000,
    0x80000000, 0x80800000, 0x81000000, 0x81800000, 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000,
    0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8A000000, 0x8A800000, 0x8B000000, 0x8B800000, 0x8C000000, 0x8C800000, 0x8D000000, 0x8D800000, 0x8E000000, 0x8E800000, 0x8F000000, 0xC7800000
  };
  static const unsigned short offset_table[64] = {
    0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024,
    0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024
  };

  // The sign and exponent bits together select the exponent and offset table entries
  const unsigned int index = value >> 10;

  // Sums the mantissa and exponent contributions and re-interprets the result as a float
  ConversionBits bits;
  bits.i32 = mantissa_table[offset_table[index] + (value & 0x3FF)] +
             exponent_table[index];
  return bits.f32;
}
// =================================================================================================
// CLBLAST_HALF_H_
#endif

View file

@ -67,15 +67,15 @@ class Database {
};
// The database consists of separate database entries, stored together in a vector
static const DatabaseEntry XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble;
static const DatabaseEntry XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble;
static const DatabaseEntry XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble;
static const DatabaseEntry XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble;
static const DatabaseEntry XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble;
static const DatabaseEntry CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble;
static const DatabaseEntry PadSingle, PadDouble, PadComplexSingle, PadComplexDouble;
static const DatabaseEntry TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble;
static const DatabaseEntry PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble;
static const DatabaseEntry XaxpyHalf, XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble;
static const DatabaseEntry XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble;
static const DatabaseEntry XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble;
static const DatabaseEntry XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble;
static const DatabaseEntry XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble;
static const DatabaseEntry CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble;
static const DatabaseEntry PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble;
static const DatabaseEntry TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble;
static const DatabaseEntry PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble;
static const std::vector<DatabaseEntry> database;
// The constructor

View file

@ -14,6 +14,24 @@
namespace clblast {
// =================================================================================================
// Database entry holding the tuning parameters of the "Copy" kernel for half precision
// (Precision::kHalf). The entry lists groups keyed by device type and vendor string; each group
// lists rows keyed by device name, and every row maps parameter names (COPY_DIMX, COPY_DIMY,
// COPY_VW, COPY_WPT) to integer values. All rows in this entry currently hold the same values.
// NOTE(review): the "default" rows are presumably the fallback used when no exact vendor/device
// match is found -- confirm against the Database lookup code.
const Database::DatabaseEntry Database::CopyHalf = {
"Copy", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::CopySingle = {
"Copy", Precision::kSingle, {
{ // AMD GPUs

View file

@ -14,6 +14,24 @@
namespace clblast {
// =================================================================================================
// Database entry holding the tuning parameters of the "Pad" kernel for half precision
// (Precision::kHalf). Groups are keyed by device type and vendor string; rows inside a group are
// keyed by device name and map parameter names (PAD_DIMX, PAD_DIMY, PAD_WPTX, PAD_WPTY) to
// integer values. All rows in this entry currently hold the same values.
// NOTE(review): the "default" rows are presumably the fallback used when no exact vendor/device
// match is found -- confirm against the Database lookup code.
const Database::DatabaseEntry Database::PadHalf = {
"Pad", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::PadSingle = {
"Pad", Precision::kSingle, {
{ // AMD GPUs

View file

@ -14,6 +14,24 @@
namespace clblast {
// =================================================================================================
// Database entry holding the tuning parameters of the "Padtranspose" kernel for half precision
// (Precision::kHalf). Groups are keyed by device type and vendor string; rows inside a group are
// keyed by device name and map parameter names (PADTRA_PAD, PADTRA_TILE, PADTRA_WPT) to integer
// values. All rows in this entry currently hold the same values.
// NOTE(review): the "default" rows are presumably the fallback used when no exact vendor/device
// match is found -- confirm against the Database lookup code.
const Database::DatabaseEntry Database::PadtransposeHalf = {
"Padtranspose", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::PadtransposeSingle = {
"Padtranspose", Precision::kSingle, {
{ // AMD GPUs

View file

@ -14,6 +14,24 @@
namespace clblast {
// =================================================================================================
// Database entry holding the tuning parameters of the "Transpose" kernel for half precision
// (Precision::kHalf). Groups are keyed by device type and vendor string; rows inside a group are
// keyed by device name and map parameter names (TRA_DIM, TRA_PAD, TRA_SHUFFLE, TRA_WPT) to
// integer values. All rows in this entry currently hold the same values.
// NOTE(review): the "default" rows are presumably the fallback used when no exact vendor/device
// match is found -- confirm against the Database lookup code.
const Database::DatabaseEntry Database::TransposeHalf = {
"Transpose", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::TransposeSingle = {
"Transpose", Precision::kSingle, {
{ // AMD GPUs

View file

@ -14,6 +14,24 @@
namespace clblast {
// =================================================================================================
// Database entry holding the tuning parameters of the "Xaxpy" kernel for half precision
// (Precision::kHalf). Groups are keyed by device type and vendor string; rows inside a group are
// keyed by device name and map parameter names (VW, WGS, WPT) to integer values. All rows in this
// entry currently hold the same values.
// NOTE(review): the "default" rows are presumably the fallback used when no exact vendor/device
// match is found -- confirm against the Database lookup code.
const Database::DatabaseEntry Database::XaxpyHalf = {
"Xaxpy", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",4}, {"WGS",512}, {"WPT",8} } },
{ "default", { {"VW",4}, {"WGS",512}, {"WPT",8} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"VW",4}, {"WGS",512}, {"WPT",8} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::XaxpySingle = {
"Xaxpy", Precision::kSingle, {
{ // AMD GPUs

View file

@ -14,6 +14,24 @@
namespace clblast {
// =================================================================================================
// Database entry holding the tuning parameters of the "Xdot" kernel for half precision
// (Precision::kHalf). Groups are keyed by device type and vendor string; rows inside a group are
// keyed by device name and map parameter names (WGS1, WGS2) to integer values. All rows in this
// entry currently hold the same values.
// NOTE(review): the "default" rows are presumably the fallback used when no exact vendor/device
// match is found -- confirm against the Database lookup code.
const Database::DatabaseEntry Database::XdotHalf = {
"Xdot", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",32}, {"WGS2",32} } },
{ "default", { {"WGS1",32}, {"WGS2",32} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",32}, {"WGS2",32} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::XdotSingle = {
"Xdot", Precision::kSingle, {
{ // AMD GPUs

View file

@ -14,6 +14,18 @@
namespace clblast {
// =================================================================================================
// Database entry holding the tuning parameters of the "Xgemm" kernel for half precision
// (Precision::kHalf). Unlike the other half-precision entries, this one contains only a single
// generic group (kDeviceTypeAll / "default") with one "default" row; no vendor- or device-specific
// rows are present. The row maps the Xgemm parameter names (KWG, KWI, MDIMA, ...) to integers.
// NOTE(review): presumably this row is what every device falls back to for half-precision GEMM --
// confirm against the Database lookup code.
const Database::DatabaseEntry Database::XgemmHalf = {
"Xgemm", Precision::kHalf, {
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::XgemmSingle = {
"Xgemm", Precision::kSingle, {
{ // AMD GPUs

View file

@ -14,6 +14,24 @@
namespace clblast {
// =================================================================================================
// Database entry holding the tuning parameters of the "Xgemv" kernel for half precision
// (Precision::kHalf). Groups are keyed by device type and vendor string; rows inside a group are
// keyed by device name and map parameter names to integer values. The names carry suffixes 1, 2
// and 3 (WGS1/WPT1, VW2/WGS2/WPT2, VW3/WGS3/WPT3). All rows in this entry currently hold the same
// values.
// NOTE(review): the numbered suffixes presumably select between kernel variants, and the
// "default" rows are presumably the fallback when no exact match exists -- confirm both.
const Database::DatabaseEntry Database::XgemvHalf = {
"Xgemv", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
{ "default", { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::XgemvSingle = {
"Xgemv", Precision::kSingle, {
{ // AMD GPUs

View file

@ -14,6 +14,24 @@
namespace clblast {
// =================================================================================================
// Database entry holding the tuning parameters of the "Xger" kernel for half precision
// (Precision::kHalf). Groups are keyed by device type and vendor string; rows inside a group are
// keyed by device name and map parameter names (WGS1, WGS2, WPT) to integer values. All rows in
// this entry currently hold the same values.
// NOTE(review): the "default" rows are presumably the fallback used when no exact vendor/device
// match is found -- confirm against the Database lookup code.
const Database::DatabaseEntry Database::XgerHalf = {
"Xger", Precision::kHalf, {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
{ "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
}
},
}
};
// =================================================================================================
const Database::DatabaseEntry Database::XgerSingle = {
"Xger", Precision::kSingle, {
{ // AMD GPUs

View file

@ -29,6 +29,7 @@ class Xaxpy: public Routine<T> {
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorY;

View file

@ -29,6 +29,7 @@ class Xgemv: public Routine<T> {
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorY;

View file

@ -29,6 +29,7 @@ class Xger: public Routine<T> {
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorY;

View file

@ -29,6 +29,7 @@ class Xher: public Routine<T> {
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestMatrixA;

View file

@ -29,6 +29,7 @@ class Xher2: public Routine<T> {
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorY;

View file

@ -20,6 +20,8 @@
#include <cltune.h>
#include "internal/utilities.h"
namespace clblast {
// =================================================================================================

View file

@ -22,6 +22,7 @@
#include <complex>
#include "clblast.h"
#include "clblast_half.h"
#include "internal/clpp11.h"
namespace clblast {
@ -94,6 +95,16 @@ constexpr auto kArgNoAbbreviations = "no_abbrv";
// =================================================================================================
// Returns a scalar with a default value
template <typename T>
T GetScalar();
// Returns a scalar of value 1
template <typename T>
T ConstantOne();
// =================================================================================================
// Structure containing all possible arguments for test clients, including their default values
template <typename T>
struct Arguments {
@ -124,8 +135,8 @@ struct Arguments {
size_t nrm2_offset = 0;
size_t asum_offset = 0;
size_t imax_offset = 0;
T alpha = T{1.0};
T beta = T{1.0};
T alpha = ConstantOne<T>();
T beta = ConstantOne<T>();
size_t x_size = 1;
size_t y_size = 1;
size_t a_size = 1;
@ -202,9 +213,13 @@ void PopulateVector(std::vector<T> &vector);
// =================================================================================================
// Returns a scalar with a default value
template <typename T>
T GetScalar();
// Conversion between half and single-precision
std::vector<float> HalfToFloatBuffer(const std::vector<half>& source);
void FloatToHalfBuffer(std::vector<half>& result, const std::vector<float>& source);
// As above, but now for OpenCL data-types instead of std::vectors
Buffer<float> HalfToFloatBuffer(const Buffer<half>& source, cl_command_queue queue_raw);
void FloatToHalfBuffer(Buffer<half>& result, const Buffer<float>& source, cl_command_queue queue_raw);
// =================================================================================================

105
samples/haxpy.c Normal file
View file

@ -0,0 +1,105 @@
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file demonstrates the use of the HAXPY routine. It demonstrates the use of half-precision.
//
// Note that this example is meant for illustration purposes only. CLBlast provides other programs
// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
//
// =================================================================================================
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
// Includes the CLBlast library (C interface)
#include <clblast_c.h>
// Includes the float-to-half and half-to-float conversion utilities
#include <clblast_half.h>
// =================================================================================================
// Example use of the half-precision routine HAXPY. Runs b = alpha * a + b on vectors of 'n'
// half-precision values using the first device of the first OpenCL platform, then prints the first
// element of the result. Host-side values are converted with FloatToHalf/HalfToFloat. Always
// returns 0; the CLBlast status code is reported on standard output instead.
int main(void) {

  // OpenCL platform/device settings
  const size_t platform_id = 0;
  const size_t device_id = 0;

  // Example HAXPY arguments
  const size_t n = 8192;
  const cl_half alpha = FloatToHalf(0.5f);

  // Initializes the OpenCL platform
  cl_uint num_platforms;
  clGetPlatformIDs(0, NULL, &num_platforms);
  cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
  clGetPlatformIDs(num_platforms, platforms, NULL);
  cl_platform_id platform = platforms[platform_id];

  // Initializes the OpenCL device
  cl_uint num_devices;
  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
  cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
  cl_device_id device = devices[device_id];

  // Creates the OpenCL context, queue, and an event
  cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
  cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL);
  cl_event event = NULL;

  // Populate host vectors with some example data
  cl_half* host_a = (cl_half*)malloc(sizeof(cl_half)*n);
  cl_half* host_b = (cl_half*)malloc(sizeof(cl_half)*n);
  for (size_t i=0; i<n; ++i) { host_a[i] = FloatToHalf(2.2f); }
  for (size_t i=0; i<n; ++i) { host_b[i] = FloatToHalf(0.4f); }
  printf("Input values at index 0: alpha * a[0] + b[0] == %.3lf * %.3lf + %.3lf\n",
         HalfToFloat(alpha), HalfToFloat(host_a[0]), HalfToFloat(host_b[0]));

  // Copy the vectors to the device
  cl_mem device_a = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(cl_half), NULL, NULL);
  cl_mem device_b = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(cl_half), NULL, NULL);
  clEnqueueWriteBuffer(queue, device_a, CL_TRUE, 0, n*sizeof(cl_half), host_a, 0, NULL, NULL);
  clEnqueueWriteBuffer(queue, device_b, CL_TRUE, 0, n*sizeof(cl_half), host_b, 0, NULL, NULL);

  // Call the HAXPY routine.
  StatusCode status = CLBlastHaxpy(n, alpha,
                                   device_a, 0, 1,
                                   device_b, 0, 1,
                                   &queue, &event);

  // Wait for completion. The event is only meaningful when the call succeeded; waiting on the
  // initial NULL handle would just produce an OpenCL error. The event is released afterwards so
  // that the sample does not leak it.
  if (status == 0 && event != NULL) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }

  // Copies the result back to the host
  clEnqueueReadBuffer(queue, device_b, CL_TRUE, 0, n*sizeof(cl_half), host_b, 0, NULL, NULL);

  // Example completed. See "clblast_c.h" for status codes (0 -> success).
  printf("Completed HAXPY with status %d\n", status);

  // Prints the first output value
  if (status == 0) {
    printf("Output value at index 0: b[0] = %.3lf\n", HalfToFloat(host_b[0]));
  }

  // Clean-up
  free(platforms);
  free(devices);
  free(host_a);
  free(host_b);
  clReleaseMemObject(device_a);
  clReleaseMemObject(device_b);
  clReleaseCommandQueue(queue);
  clReleaseContext(context);
  return 0;
}
// =================================================================================================

View file

@ -189,13 +189,20 @@ def GetFooter():
# The start of a new C++ precision entry
def GetPrecision(family, precision):
    """Returns the opening lines of the C++ database entry for one kernel family and precision.

    Args:
        family: Name of the kernel family (e.g. "xgemm"); it is title-cased for the C++ identifier
            and for the quoted kernel name.
        precision: Precision identifier as a string: "16" (half), "32" (single), "64" (double),
            "3232" (complex single) or "6464" (complex double).

    Returns:
        A string starting the definition 'const Database::DatabaseEntry Database::<Family><Precision>'.

    Any other precision value prints an error message and terminates the script with a non-zero
    exit status, so that a failed generation run is not reported as a success.
    """
    # Maps the precision identifier onto the suffix used in the C++ names
    precision_names = {
        "16": "Half",
        "32": "Single",
        "64": "Double",
        "3232": "ComplexSingle",
        "6464": "ComplexDouble",
    }
    if precision not in precision_names:
        print("[ERROR] Unknown precision")
        sys.exit(1)
    precisionstring = precision_names[precision]
    return("\n\nconst Database::DatabaseEntry Database::%s%s = {\n \"%s\", Precision::k%s, {\n"
           % (family.title(), precisionstring, family.title(), precisionstring))

View file

@ -13,10 +13,13 @@
# ==================================================================================================
# Short-hands for data-types
# Plain type names: half, single and double precision, followed by the two-component
# (complex) variants of single and double precision
HLF = "half"
FLT = "float"
DBL = "double"
FLT2 = "float2"
DBL2 = "double2"
# 'cl_'-prefixed spellings of the half and two-component types.
# NOTE(review): presumably these are the names used in the C API / OpenCL host code, as opposed to
# the plain names above -- confirm against how DataType consumes them.
HCL = "cl_half"
F2CL = "cl_float2"
D2CL = "cl_double2"

View file

@ -28,11 +28,12 @@ import os.path
# Local files
from routine import Routine
from datatype import DataType, FLT, DBL, FLT2, DBL2, F2CL, D2CL
from datatype import DataType, HLF, FLT, DBL, FLT2, DBL2, HCL, F2CL, D2CL
# ==================================================================================================
# Regular data-types
H = DataType("H", "H", HLF, [HLF, HLF, HCL, HCL], HLF ) # half (16)
S = DataType("S", "S", FLT, [FLT, FLT, FLT, FLT], FLT ) # single (32)
D = DataType("D", "D", DBL, [DBL, DBL, DBL, DBL], DBL ) # double (64)
C = DataType("C", "C", FLT2, [FLT2, FLT2, F2CL, F2CL], FLT2) # single-complex (3232)
@ -41,6 +42,7 @@ Z = DataType("Z", "Z", DBL2, [DBL2, DBL2, D2CL, D2CL], DBL2) # double-complex (6
# Special cases
Sc = DataType("C", "Sc", FLT2, [FLT2, FLT2, FLT2, FLT2], FLT2) # As C, but with real output
Dz = DataType("Z", "Dz", DBL2, [DBL2, DBL2, DBL2, DBL2], DBL2) # As Z, but with real output
iH = DataType("H", "iH", HLF, [HLF, HLF, HLF, HLF], HLF ) # As H, but with integer output
iS = DataType("S", "iS", FLT, [FLT, FLT, FLT, FLT], FLT ) # As S, but with integer output
iD = DataType("D", "iD", DBL, [DBL, DBL, DBL, DBL], DBL ) # As D, but with integer output
iC = DataType("C", "iC", FLT2, [FLT2, FLT2, F2CL, F2CL], FLT2) # As C, but with integer output
@ -60,62 +62,62 @@ TU = DataType("TU", "typename T, typename U", "T,U", ["T", "U", "T", "U"], "T")
# Populates a list of routines
routines = [
[ # Level 1: vector-vector
Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation", "", []),
Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], [], "", "Generate modified givens plane rotation", "", []),
Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation", "", []),
Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [], "", "Apply modified givens plane rotation", "", []),
Routine(True, True, "1", "swap", T, [S,D,C,Z], ["n"], [], [], ["x","y"], [], "", "Swap two vectors", "Interchanges the contents of vectors x and y.", []),
Routine(True, True, "1", "scal", T, [S,D,C,Z], ["n"], [], [], ["x"], ["alpha"], "", "Vector scaling", "Multiplies all elements of vector x by a scalar constant alpha.", []),
Routine(True, True, "1", "copy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], [], "", "Vector copy", "Copies the contents of vector x into vector y.", []),
Routine(True, True, "1", "axpy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation y = alpha * x + y, in which x and y are vectors and alpha is a scalar constant.", []),
Routine(True, True, "1", "dot", T, [S,D], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two vectors", "Multiplies the vectors x and y element-wise and accumulates the results. The sum is stored in the dot buffer.", []),
Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []),
Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []),
Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz],["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of each element in the x vector and takes the square root. The resulting L2 norm is stored in the nrm2 buffer.", []),
Routine(True, True, "1", "asum", T, [S,D,Sc,Dz],["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of each element in the x vector. The results are stored in the asum buffer.", []),
Routine(True, False, "1", "sum", T, [S,D,Sc,Dz],["n"], [], ["x"], ["sum"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of each element in the x vector. The results are stored in the sum buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []),
Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the x vector. The resulting integer index is stored in the imax buffer.", []),
Routine(True, False, "1", "max", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the x vector. The resulting integer index is stored in the imax buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []),
Routine(True, False, "1", "min", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imin"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the x vector. The resulting integer index is stored in the imin buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []),
Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation", "", []),
Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], [], "", "Generate modified givens plane rotation", "", []),
Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation", "", []),
Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [], "", "Apply modified givens plane rotation", "", []),
Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], [], "", "Swap two vectors", "Interchanges the contents of vectors x and y.", []),
Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], ["alpha"], "", "Vector scaling", "Multiplies all elements of vector x by a scalar constant alpha.", []),
Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [], "", "Vector copy", "Copies the contents of vector x into vector y.", []),
Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation y = alpha * x + y, in which x and y are vectors and alpha is a scalar constant.", []),
Routine(True, True, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two vectors", "Multiplies the vectors x and y element-wise and accumulates the results. The sum is stored in the dot buffer.", []),
Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []),
Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []),
Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of each element in the x vector and takes the square root. The resulting L2 norm is stored in the nrm2 buffer.", []),
Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of each element in the x vector. The results are stored in the asum buffer.", []),
Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of each element in the x vector. The results are stored in the sum buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []),
Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the x vector. The resulting integer index is stored in the imax buffer.", []),
Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the x vector. The resulting integer index is stored in the imax buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []),
Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the x vector. The resulting integer index is stored in the imin buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []),
],
[ # Level 2: matrix-vector
Routine(True, True, "2a", "gemv", T, [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation y = alpha * A * x + beta * y, in which x is an input vector, y is an input and output vector, A is an input matrix, and alpha and beta are scalars. The matrix A can optionally be transposed before performing the operation.", []),
Routine(True, True, "2a", "gbmv", T, [S,D,C,Z], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is banded instead.", []),
Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix A is an Hermitian matrix instead.", []),
Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is an Hermitian banded matrix instead.", []),
Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix A is an Hermitian packed matrix instead and represented as AP.", []),
Routine(True, True, "2a", "symv", T, [S,D], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix A is symmetric instead.", []),
Routine(True, True, "2a", "sbmv", T, [S,D], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is symmetric and banded instead.", []),
Routine(True, True, "2a", "spmv", T, [S,D], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix A is a symmetric packed matrix instead and represented as AP.", []),
Routine(True, True, "2a", "trmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix A is triangular instead.", []),
Routine(True, True, "2a", "tbmv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is triangular and banded instead.", []),
Routine(True, True, "2a", "tpmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix A is a triangular packed matrix instead and repreented as AP.", []),
Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a triangular system of equations", "", []),
Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a banded triangular system of equations", "", []),
Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "", "Solves a packed triangular system of equations", "", []),
Routine(True, True, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation y = alpha * A * x + beta * y, in which x is an input vector, y is an input and output vector, A is an input matrix, and alpha and beta are scalars. The matrix A can optionally be transposed before performing the operation.", []),
Routine(True, True, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is banded instead.", []),
Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix A is an Hermitian matrix instead.", []),
Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is an Hermitian banded matrix instead.", []),
Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix A is an Hermitian packed matrix instead and represented as AP.", []),
Routine(True, True, "2a", "symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix A is symmetric instead.", []),
Routine(True, True, "2a", "sbmv", T, [S,D,H], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is symmetric and banded instead.", []),
Routine(True, True, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix A is a symmetric packed matrix instead and represented as AP.", []),
Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix A is triangular instead.", []),
Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is triangular and banded instead.", []),
Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix A is a triangular packed matrix instead and repreented as AP.", []),
Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a triangular system of equations", "", []),
Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a banded triangular system of equations", "", []),
Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "", "Solves a packed triangular system of equations", "", []),
# Level 2: matrix update
Routine(True, True, "2b", "ger", T, [S,D], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 matrix update", "", []),
Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex matrix update", "", []),
Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex conjugated matrix update", "", []),
Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Hermitian rank-1 matrix update", "", []),
Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Hermitian packed rank-1 matrix update", "", []),
Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Hermitian rank-2 matrix update", "", []),
Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Hermitian packed rank-2 matrix update", "", []),
Routine(True, True, "2b", "syr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Symmetric rank-1 matrix update", "", []),
Routine(True, True, "2b", "spr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Symmetric packed rank-1 matrix update", "", []),
Routine(True, True, "2b", "syr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Symmetric rank-2 matrix update", "", []),
Routine(True, True, "2b", "spr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Symmetric packed rank-2 matrix update", "", []),
Routine(True, True, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 matrix update", "", []),
Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex matrix update", "", []),
Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex conjugated matrix update", "", []),
Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Hermitian rank-1 matrix update", "", []),
Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Hermitian packed rank-1 matrix update", "", []),
Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Hermitian rank-2 matrix update", "", []),
Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Hermitian packed rank-2 matrix update", "", []),
Routine(True, True, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Symmetric rank-1 matrix update", "", []),
Routine(True, True, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Symmetric packed rank-1 matrix update", "", []),
Routine(True, True, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Symmetric rank-2 matrix update", "", []),
Routine(True, True, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Symmetric packed rank-2 matrix update", "", []),
],
[ # Level 3: matrix-matrix
Routine(True, True, "3", "gemm", T, [S,D,C,Z], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "General matrix-matrix multiplication", "", []),
Routine(True, True, "3", "symm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "", []),
Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "", []),
Routine(True, True, "3", "syrk", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "", []),
Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "", []),
Routine(True, True, "3", "syr2k", T, [S,D,C,Z], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "", []),
Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "", []),
Routine(True, True, "3", "trmm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Triangular matrix-matrix multiplication", "", []),
Routine(False, True, "3", "trsm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Solves a triangular system of equations", "", []),
Routine(True, True, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "General matrix-matrix multiplication", "", []),
Routine(True, True, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "", []),
Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "", []),
Routine(True, True, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "", []),
Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "", []),
Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "", []),
Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "", []),
Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Triangular matrix-matrix multiplication", "", []),
Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Solves a triangular system of equations", "", []),
]]
# ==================================================================================================
@ -229,21 +231,45 @@ def wrapper_clblas(routines):
result = ""
for routine in routines:
if routine.has_tests:
result += "\n// Forwards the clBLAS calls for %s\n" % (routine.ShortNames())
result += "\n// Forwards the clBLAS calls for %s\n" % (routine.ShortNamesTested())
if routine.NoScalars():
result += routine.RoutineHeaderWrapperCL(routine.template, True, 21)+";\n"
for flavour in routine.flavours:
indent = " "*(17 + routine.Length())
result += routine.RoutineHeaderWrapperCL(flavour, False, 21)+" {\n"
arguments = routine.ArgumentsWrapperCL(flavour)
if routine.scratch:
result += " auto queue = Queue(queues[0]);\n"
result += " auto context = queue.GetContext();\n"
result += " auto scratch_buffer = Buffer<"+flavour.template+">(context, "+routine.scratch+");\n"
arguments += ["scratch_buffer()"]
result += " return clblas"+flavour.name+routine.name+"("
result += (",\n"+indent).join([a for a in arguments])
result += ",\n"+indent+"num_queues, queues, num_wait_events, wait_events, events);"
# There is a version available in clBLAS
if flavour.precision_name in ["S","D","C","Z"]:
indent = " "*(17 + routine.Length())
arguments = routine.ArgumentsWrapperCL(flavour)
if routine.scratch:
result += " auto queue = Queue(queues[0]);\n"
result += " auto context = queue.GetContext();\n"
result += " auto scratch_buffer = Buffer<"+flavour.template+">(context, "+routine.scratch+");\n"
arguments += ["scratch_buffer()"]
result += " return clblas"+flavour.name+routine.name+"("
result += (",\n"+indent).join([a for a in arguments])
result += ",\n"+indent+"num_queues, queues, num_wait_events, wait_events, events);"
# There is no clBLAS available, forward the call to one of the available functions
else: # Half-precision
indent = " "*(24 + routine.Length())
# Convert to float (note: also integer buffers are stored as half/float)
for buf in routine.inputs + routine.outputs:
result += " auto "+buf+"_buffer_bis = HalfToFloatBuffer("+buf+"_buffer, queues[0]);\n"
# Call the float routine
result += " auto status = clblasX"+routine.name+"("
result += (",\n"+indent).join([a for a in routine.ArgumentsHalf()])
result += ",\n"+indent+"num_queues, queues, num_wait_events, wait_events, events);"
result += "\n"
# Convert back to half
for buf in routine.outputs:
result += " FloatToHalfBuffer("+buf+"_buffer, "+buf+"_buffer_bis, queues[0]);\n"
result += " return status;"
# Complete
result += "\n}\n"
return result
@ -252,44 +278,66 @@ def wrapper_cblas(routines):
result = ""
for routine in routines:
if routine.has_tests:
result += "\n// Forwards the Netlib BLAS calls for %s\n" % (routine.ShortNames())
result += "\n// Forwards the Netlib BLAS calls for %s\n" % (routine.ShortNamesTested())
for flavour in routine.flavours:
indent = " "*(10 + routine.Length())
result += routine.RoutineHeaderWrapperC(flavour, False, 12)+" {\n"
arguments = routine.ArgumentsWrapperC(flavour)
# Double-precision scalars
for scalar in routine.scalars:
if flavour.IsComplex(scalar):
result += " const auto "+scalar+"_array = std::vector<"+flavour.buffertype[:-1]+">{"+scalar+".real(), "+scalar+".imag()};\n"
# There is a version available in CBLAS
if flavour.precision_name in ["S","D","C","Z"]:
indent = " "*(10 + routine.Length())
arguments = routine.ArgumentsWrapperC(flavour)
# Special case for scalar outputs
assignment = ""
postfix = ""
endofline = ""
extra_argument = ""
for output_buffer in routine.outputs:
if output_buffer in routine.ScalarBuffersFirst():
if flavour in [C,Z]:
postfix += "_sub"
indent += " "
extra_argument += ",\n"+indent+"reinterpret_cast<return_pointer_"+flavour.buffertype[:-1]+">(&"+output_buffer+"_buffer["+output_buffer+"_offset])"
elif output_buffer in routine.IndexBuffers():
assignment = "((int*)&"+output_buffer+"_buffer[0])["+output_buffer+"_offset] = "
indent += " "*len(assignment)
else:
assignment = output_buffer+"_buffer["+output_buffer+"_offset]"
if (flavour.name in ["Sc","Dz"]):
assignment = assignment+".real("
endofline += ")"
# Complex scalars
for scalar in routine.scalars:
if flavour.IsComplex(scalar):
result += " const auto "+scalar+"_array = std::vector<"+flavour.buffertype[:-1]+">{"+scalar+".real(), "+scalar+".imag()};\n"
# Special case for scalar outputs
assignment = ""
postfix = ""
endofline = ""
extra_argument = ""
for output_buffer in routine.outputs:
if output_buffer in routine.ScalarBuffersFirst():
if flavour in [C,Z]:
postfix += "_sub"
indent += " "
extra_argument += ",\n"+indent+"reinterpret_cast<return_pointer_"+flavour.buffertype[:-1]+">(&"+output_buffer+"_buffer["+output_buffer+"_offset])"
elif output_buffer in routine.IndexBuffers():
assignment = "((int*)&"+output_buffer+"_buffer[0])["+output_buffer+"_offset] = "
indent += " "*len(assignment)
else:
assignment = assignment+" = "
indent += " "*len(assignment)
assignment = output_buffer+"_buffer["+output_buffer+"_offset]"
if (flavour.name in ["Sc","Dz"]):
assignment = assignment+".real("
endofline += ")"
else:
assignment = assignment+" = "
indent += " "*len(assignment)
result += " "+assignment+"cblas_"+flavour.name.lower()+routine.name+postfix+"("
result += (",\n"+indent).join([a for a in arguments])
result += extra_argument+endofline+");"
result += "\n}\n"
result += " "+assignment+"cblas_"+flavour.name.lower()+routine.name+postfix+"("
result += (",\n"+indent).join([a for a in arguments])
result += extra_argument+endofline+");\n"
# There is no CBLAS available, forward the call to one of the available functions
else: # Half-precision
indent = " "*(9 + routine.Length())
# Convert to float (note: also integer buffers are stored as half/float)
for buf in routine.inputs + routine.outputs:
result += " auto "+buf+"_buffer_bis = HalfToFloatBuffer("+buf+"_buffer);\n"
# Call the float routine
result += " cblasX"+routine.name+"("
result += (",\n"+indent).join([a for a in routine.ArgumentsHalf()])
result += ");\n"
# Convert back to half
for buf in routine.outputs:
result += " FloatToHalfBuffer("+buf+"_buffer, "+buf+"_buffer_bis);\n"
# Complete
result += "}\n"
return result
# ==================================================================================================

View file

@ -99,6 +99,18 @@ class Routine():
def IndexBuffers(self):
    # Names of the index (integer) buffers; a new list is returned on every call
    index_buffer_names = ["imax", "imin"]
    return index_buffer_names
# Input buffers that are not index (integer) buffers
def NonIndexInputs(self):
    # Filtering builds a fresh list, so self.inputs is never modified, and it drops every
    # occurrence of an index buffer name rather than only the first one.
    index_names = self.IndexBuffers()
    return [buf for buf in self.inputs if buf not in index_names]
# Output buffers that are not index (integer) buffers
def NonIndexOutputs(self):
    # Filtering builds a fresh list, so self.outputs is never modified, and it drops every
    # occurrence of an index buffer name rather than only the first one.
    index_names = self.IndexBuffers()
    return [buf for buf in self.outputs if buf not in index_names]
# Buffers that take neither an increment ('inc') nor a leading dimension ('ld'):
# all scalar buffers plus the packed matrix "ap"
def BuffersWithoutLdInc(self):
    scalar_buffers = self.ScalarBuffersFirst() + self.ScalarBuffersSecond()
    return scalar_buffers + ["ap"]
@ -119,6 +131,12 @@ class Routine():
def ShortNames(self):
    # One "<flavour name><ROUTINE NAME>" entry per flavour, joined with '/'
    labels = []
    for flavour in self.flavours:
        labels.append(flavour.name + self.name.upper())
    return "/".join(labels)
# As ShortNames(), but without the half-precision ('H') name. The wrapper generators use this
# for their "Forwards the ... calls for" header comments; for half-precision they note that no
# clBLAS/CBLAS version is available and forward the call to one of the available functions.
def ShortNamesTested(self):
    routine = self.name.upper()
    half_name = "H" + routine
    names = [flavour.name + routine for flavour in self.flavours]
    # Filter instead of build-then-remove: every half-precision entry is excluded
    return "/".join(name for name in names if name != half_name)
# Determines which buffers go first (between alpha and beta) and which ones go after
def BuffersFirst(self):
if self.level == "2b":
@ -146,6 +164,17 @@ class Routine():
return [", ".join(a+b+c)]
return []
# As Buffer(), but names the '<name>_buffer_bis' variable instead of '<name>_buffer'. Index
# buffers are deliberately not special-cased: the generated half-precision wrappers create a
# '_bis' (HalfToFloatBuffer) copy for every input and output buffer.
def BufferBis(self, name):
    if name not in self.inputs and name not in self.outputs:
        return []
    fields = [name + "_buffer_bis", name + "_offset"]
    # Only buffers with an increment/leading dimension get the third field
    if name not in self.BuffersWithoutLdInc():
        fields.append(name + "_" + self.Postfix(name))
    return [", ".join(fields)]
# As above but with data-types
def BufferDef(self, name):
prefix = "const " if (name in self.inputs) else ""
@ -156,6 +185,16 @@ class Routine():
return [", ".join(a+b+c)]
return []
# Buffer definition for the clBLAS wrapper: a typed 'Buffer<...>&' reference (const for
# inputs) followed by the offset and, where applicable, the increment/leading dimension
def BufferDefWrapperCL(self, name, flavour):
    is_input = name in self.inputs
    if not (is_input or name in self.outputs):
        return []
    qualifier = "const " if is_input else ""
    fields = [qualifier + "Buffer<" + flavour.buffertype + ">& " + name + "_buffer",
              "const size_t " + name + "_offset"]
    if name not in self.BuffersWithoutLdInc():
        fields.append("const size_t " + name + "_" + self.Postfix(name))
    return [", ".join(fields)]
# As above but as vectors
def BufferDefVector(self, name, flavour):
prefix = "const " if (name in self.inputs) else ""
@ -179,7 +218,7 @@ class Routine():
# As above but with a static cast for clBLAS wrapper
def BufferWrapperCL(self, name):
if (name in self.inputs) or (name in self.outputs):
a = [name+"_buffer"]
a = [name+"_buffer()"]
b = [name+"_offset"]
c = []
if (name in ["x","y"]):
@ -238,6 +277,12 @@ class Routine():
return [name]
return []
# Wraps a known scalar name in a 'HalfToFloat(...)' call (i.e. half to float conversion);
# names that are not scalars of this routine yield an empty list
def ScalarHalfToFloat(self, name):
    return ["HalfToFloat(" + name + ")"] if name in self.scalars else []
# Retrieves the use of a scalar (alpha/beta)
def ScalarUse(self, name, flavour):
if name in self.scalars:
@ -248,7 +293,7 @@ class Routine():
return [name]
return []
# Retrieves the use of a scalar (alpha/beta)
# As above, but for the clBLAS wrapper
def ScalarUseWrapper(self, name, flavour):
if name in self.scalars:
if name == "alpha":
@ -258,7 +303,7 @@ class Routine():
return [name]
return []
# Retrieves the use of a scalar for CBLAS (alpha/beta)
# As above, but for the CBLAS wrapper
def ScalarUseWrapperC(self, name, flavour):
if name in self.scalars:
if flavour.IsComplex(name):
@ -377,6 +422,28 @@ class Routine():
# ==============================================================================================
# Retrieves a combination of all the argument names (no types), in call order: options, sizes,
# first scalar buffers, alpha, first buffers, beta, second buffers, second scalar buffers and
# finally the remaining scalars
def Arguments(self):
    def flatten(groups):
        # Every helper returns a (possibly empty) list of argument strings
        return list(chain.from_iterable(groups))
    pieces = [
        self.Options(),
        self.Sizes(),
        flatten(self.Buffer(b) for b in self.ScalarBuffersFirst()),
        self.Scalar("alpha"),
        flatten(self.Buffer(b) for b in self.BuffersFirst()),
        self.Scalar("beta"),
        flatten(self.Buffer(b) for b in self.BuffersSecond()),
        flatten(self.Buffer(b) for b in self.ScalarBuffersSecond()),
        flatten(self.Scalar(s) for s in self.OtherScalars()),
    ]
    return flatten(pieces)
# Argument names for forwarding a half-precision call: buffers refer to their '_bis' variants
# and alpha/beta are wrapped in a half-to-float conversion
def ArgumentsHalf(self):
    def flatten(groups):
        # Every helper returns a (possibly empty) list of argument strings
        return list(chain.from_iterable(groups))
    pieces = [
        self.Options(),
        self.Sizes(),
        flatten(self.BufferBis(b) for b in self.ScalarBuffersFirst()),
        self.ScalarHalfToFloat("alpha"),
        flatten(self.BufferBis(b) for b in self.BuffersFirst()),
        self.ScalarHalfToFloat("beta"),
        flatten(self.BufferBis(b) for b in self.BuffersSecond()),
        flatten(self.BufferBis(b) for b in self.ScalarBuffersSecond()),
        # NOTE(review): the remaining scalars are passed through Scalar(), i.e. without a
        # half-to-float conversion - confirm this is intended
        flatten(self.Scalar(s) for s in self.OtherScalars()),
    ]
    return flatten(pieces)
# Retrieves a combination of all the argument names, with Claduc casts
def ArgumentsCladuc(self, flavour, indent):
return (self.Options() + self.Sizes() +
@ -388,7 +455,7 @@ class Routine():
list(chain(*[self.BufferCladuc(b) for b in self.ScalarBuffersSecond()])) +
list(chain(*[self.Scalar(s) for s in self.OtherScalars()])))
# Retrieves a combination of all the argument names, with CLBlast casts
# As above, but with CLBlast casts
def ArgumentsCast(self, flavour, indent):
return (self.OptionsCast(indent) + self.Sizes() +
list(chain(*[self.Buffer(b) for b in self.ScalarBuffersFirst()])) +
@ -434,12 +501,12 @@ class Routine():
# As above, but clBLAS wrapper plain datatypes. Every buffer group is declared exactly once,
# through the flavour-aware BufferDefWrapperCL() (typed 'Buffer<...>&' references); the
# superseded BufferDef() variants are no longer emitted next to them.
def ArgumentsDefWrapperCL(self, flavour):
    def buffer_defs(names):
        # One definition string per buffer that is an input or output of this routine
        return list(chain(*[self.BufferDefWrapperCL(b, flavour) for b in names]))
    return (self.OptionsDefWrapperCL() + self.SizesDef() +
            buffer_defs(self.ScalarBuffersFirst()) +
            self.ScalarDefPlain("alpha", flavour) +
            buffer_defs(self.BuffersFirst()) +
            self.ScalarDefPlain("beta", flavour) +
            buffer_defs(self.BuffersSecond()) +
            buffer_defs(self.ScalarBuffersSecond()) +
            list(chain(*[self.ScalarDefPlain(s, flavour) for s in self.OtherScalars()])))
# As above, but CBLAS wrapper plain datatypes

View file

@ -160,7 +160,7 @@ template StatusCode PUBLIC_API Rotm<double>(const size_t,
cl_mem, const size_t,
cl_command_queue*, cl_event*);
// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP
// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP
template <typename T>
StatusCode Swap(const size_t n,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
@ -190,8 +190,12 @@ template StatusCode PUBLIC_API Swap<double2>(const size_t,
cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Swap<half>(const size_t,
cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL
// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL
template <typename T>
StatusCode Scal(const size_t n,
const T alpha,
@ -221,8 +225,12 @@ template StatusCode PUBLIC_API Scal<double2>(const size_t,
const double2,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Scal<half>(const size_t,
const half,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY
// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY
template <typename T>
StatusCode Copy(const size_t n,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
@ -252,8 +260,12 @@ template StatusCode PUBLIC_API Copy<double2>(const size_t,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Copy<half>(const size_t,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY
// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY
template <typename T>
StatusCode Axpy(const size_t n,
const T alpha,
@ -289,8 +301,13 @@ template StatusCode PUBLIC_API Axpy<double2>(const size_t,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Axpy<half>(const size_t,
const half,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Dot product of two vectors: SDOT/DDOT
// Dot product of two vectors: SDOT/DDOT/HDOT
template <typename T>
StatusCode Dot(const size_t n,
cl_mem dot_buffer, const size_t dot_offset,
@ -316,6 +333,11 @@ template StatusCode PUBLIC_API Dot<double>(const size_t,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Dot<half>(const size_t,
cl_mem, const size_t,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Dot product of two complex vectors: CDOTU/ZDOTU
template <typename T>
@ -371,7 +393,7 @@ template StatusCode PUBLIC_API Dotc<double2>(const size_t,
const cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Euclidean norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2
// Euclidean norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2
template <typename T>
StatusCode Nrm2(const size_t n,
cl_mem nrm2_buffer, const size_t nrm2_offset,
@ -401,8 +423,12 @@ template StatusCode PUBLIC_API Nrm2<double2>(const size_t,
cl_mem, const size_t,
const cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Nrm2<half>(const size_t,
cl_mem, const size_t,
const cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM
// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM
template <typename T>
StatusCode Asum(const size_t n,
cl_mem asum_buffer, const size_t asum_offset,
@ -432,8 +458,12 @@ template StatusCode PUBLIC_API Asum<double2>(const size_t,
cl_mem, const size_t,
const cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Asum<half>(const size_t,
cl_mem, const size_t,
const cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM
// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM
template <typename T>
StatusCode Sum(const size_t n,
cl_mem sum_buffer, const size_t sum_offset,
@ -463,8 +493,12 @@ template StatusCode PUBLIC_API Sum<double2>(const size_t,
cl_mem, const size_t,
const cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Sum<half>(const size_t,
cl_mem, const size_t,
const cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX
// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX
template <typename T>
StatusCode Amax(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
@ -494,8 +528,12 @@ template StatusCode PUBLIC_API Amax<double2>(const size_t,
cl_mem, const size_t,
const cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Amax<half>(const size_t,
cl_mem, const size_t,
const cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX
// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX
template <typename T>
StatusCode Max(const size_t n,
cl_mem imax_buffer, const size_t imax_offset,
@ -525,8 +563,12 @@ template StatusCode PUBLIC_API Max<double2>(const size_t,
cl_mem, const size_t,
const cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Max<half>(const size_t,
cl_mem, const size_t,
const cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN
// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN
template <typename T>
StatusCode Min(const size_t n,
cl_mem imin_buffer, const size_t imin_offset,
@ -556,12 +598,16 @@ template StatusCode PUBLIC_API Min<double2>(const size_t,
cl_mem, const size_t,
const cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Min<half>(const size_t,
cl_mem, const size_t,
const cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// =================================================================================================
// BLAS level-2 (matrix-vector) routines
// =================================================================================================
// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV
// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV
template <typename T>
StatusCode Gemv(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n,
@ -615,8 +661,16 @@ template StatusCode PUBLIC_API Gemv<double2>(const Layout, const Transpose,
const double2,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Gemv<half>(const Layout, const Transpose,
const size_t, const size_t,
const half,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t,
const half,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV
// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV
template <typename T>
StatusCode Gbmv(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n, const size_t kl, const size_t ku,
@ -670,6 +724,14 @@ template StatusCode PUBLIC_API Gbmv<double2>(const Layout, const Transpose,
const double2,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Gbmv<half>(const Layout, const Transpose,
const size_t, const size_t, const size_t, const size_t,
const half,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t,
const half,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Hermitian matrix-vector multiplication: CHEMV/ZHEMV
template <typename T>
@ -788,7 +850,7 @@ template StatusCode PUBLIC_API Hpmv<double2>(const Layout, const Triangle,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Symmetric matrix-vector multiplication: SSYMV/DSYMV
// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV
template <typename T>
StatusCode Symv(const Layout layout, const Triangle triangle,
const size_t n,
@ -826,8 +888,16 @@ template StatusCode PUBLIC_API Symv<double>(const Layout, const Triangle,
const double,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Symv<half>(const Layout, const Triangle,
const size_t,
const half,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t,
const half,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV
// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV
template <typename T>
StatusCode Sbmv(const Layout layout, const Triangle triangle,
const size_t n, const size_t k,
@ -865,8 +935,16 @@ template StatusCode PUBLIC_API Sbmv<double>(const Layout, const Triangle,
const double,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Sbmv<half>(const Layout, const Triangle,
const size_t, const size_t,
const half,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t,
const half,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV
// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV
template <typename T>
StatusCode Spmv(const Layout layout, const Triangle triangle,
const size_t n,
@ -904,8 +982,16 @@ template StatusCode PUBLIC_API Spmv<double>(const Layout, const Triangle,
const double,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Spmv<half>(const Layout, const Triangle,
const size_t,
const half,
const cl_mem, const size_t,
const cl_mem, const size_t, const size_t,
const half,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV
// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV
template <typename T>
StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
@ -941,8 +1027,13 @@ template StatusCode PUBLIC_API Trmv<double2>(const Layout, const Triangle, const
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Explicit instantiation of Trmv for T = half, providing the half-precision (HTRMV) routine. No
// scalar arguments are taken, so T cannot be deduced by callers and must be given explicitly.
template StatusCode PUBLIC_API Trmv<half>(const Layout, const Triangle, const Transpose, const Diagonal,
const size_t,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV
// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV
template <typename T>
StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t n, const size_t k,
@ -978,8 +1069,13 @@ template StatusCode PUBLIC_API Tbmv<double2>(const Layout, const Triangle, const
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Explicit instantiation of Tbmv for T = half, providing the half-precision (HTBMV) routine. No
// scalar arguments are taken, so T cannot be deduced by callers and must be given explicitly.
template StatusCode PUBLIC_API Tbmv<half>(const Layout, const Triangle, const Transpose, const Diagonal,
const size_t, const size_t,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV
// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV
template <typename T>
StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
@ -1015,6 +1111,11 @@ template StatusCode PUBLIC_API Tpmv<double2>(const Layout, const Triangle, const
const cl_mem, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Explicit instantiation of Tpmv for T = half, providing the half-precision (HTPMV) routine. No
// scalar arguments are taken, so T cannot be deduced by callers and must be given explicitly.
template StatusCode PUBLIC_API Tpmv<half>(const Layout, const Triangle, const Transpose, const Diagonal,
const size_t,
const cl_mem, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV
template <typename T>
@ -1106,7 +1207,7 @@ template StatusCode PUBLIC_API Tpsv<double2>(const Layout, const Triangle, const
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// General rank-1 matrix update: SGER/DGER
// General rank-1 matrix update: SGER/DGER/HGER
template <typename T>
StatusCode Ger(const Layout layout,
const size_t m, const size_t n,
@ -1140,6 +1241,13 @@ template StatusCode PUBLIC_API Ger<double>(const Layout,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Explicit instantiation of Ger for T = half, providing the half-precision (HGER) routine.
template StatusCode PUBLIC_API Ger<half>(const Layout,
const size_t, const size_t,
const half,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// General rank-1 complex matrix update: CGERU/ZGERU
template <typename T>
@ -1343,7 +1451,7 @@ template StatusCode PUBLIC_API Hpr2<double2>(const Layout, const Triangle,
cl_mem, const size_t,
cl_command_queue*, cl_event*);
// Symmetric rank-1 matrix update: SSYR/DSYR
// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR
template <typename T>
StatusCode Syr(const Layout layout, const Triangle triangle,
const size_t n,
@ -1373,8 +1481,14 @@ template StatusCode PUBLIC_API Syr<double>(const Layout, const Triangle,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Explicit instantiation of Syr for T = half, providing the half-precision (HSYR) routine.
template StatusCode PUBLIC_API Syr<half>(const Layout, const Triangle,
const size_t,
const half,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Symmetric packed rank-1 matrix update: SSPR/DSPR
// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR
template <typename T>
StatusCode Spr(const Layout layout, const Triangle triangle,
const size_t n,
@ -1404,8 +1518,14 @@ template StatusCode PUBLIC_API Spr<double>(const Layout, const Triangle,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t,
cl_command_queue*, cl_event*);
// Explicit instantiation of Spr for T = half, providing the half-precision (HSPR) routine. The
// output buffer takes only an offset (no leading dimension), unlike the Syr instantiation.
template StatusCode PUBLIC_API Spr<half>(const Layout, const Triangle,
const size_t,
const half,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t,
cl_command_queue*, cl_event*);
// Symmetric rank-2 matrix update: SSYR2/DSYR2
// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2
template <typename T>
StatusCode Syr2(const Layout layout, const Triangle triangle,
const size_t n,
@ -1439,8 +1559,15 @@ template StatusCode PUBLIC_API Syr2<double>(const Layout, const Triangle,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Explicit instantiation of Syr2 for T = half, providing the half-precision (HSYR2) routine.
template StatusCode PUBLIC_API Syr2<half>(const Layout, const Triangle,
const size_t,
const half,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Symmetric packed rank-2 matrix update: SSPR2/DSPR2
// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2
template <typename T>
StatusCode Spr2(const Layout layout, const Triangle triangle,
const size_t n,
@ -1474,12 +1601,19 @@ template StatusCode PUBLIC_API Spr2<double>(const Layout, const Triangle,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t,
cl_command_queue*, cl_event*);
// Explicit instantiation of Spr2 for T = half, providing the half-precision (HSPR2) routine. The
// output buffer takes only an offset (no leading dimension), unlike the Syr2 instantiation.
template StatusCode PUBLIC_API Spr2<half>(const Layout, const Triangle,
const size_t,
const half,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t,
cl_command_queue*, cl_event*);
// =================================================================================================
// BLAS level-3 (matrix-matrix) routines
// =================================================================================================
// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM
// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM
template <typename T>
StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
const size_t m, const size_t n, const size_t k,
@ -1533,8 +1667,16 @@ template StatusCode PUBLIC_API Gemm<double2>(const Layout, const Transpose, cons
const double2,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Explicit instantiation of Gemm for T = half, providing the half-precision (HGEMM) routine.
template StatusCode PUBLIC_API Gemm<half>(const Layout, const Transpose, const Transpose,
const size_t, const size_t, const size_t,
const half,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t,
const half,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM
// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM
template <typename T>
StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
const size_t m, const size_t n,
@ -1588,6 +1730,14 @@ template StatusCode PUBLIC_API Symm<double2>(const Layout, const Side, const Tri
const double2,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Explicit instantiation of Symm for T = half, providing the half-precision (HSYMM) routine.
template StatusCode PUBLIC_API Symm<half>(const Layout, const Side, const Triangle,
const size_t, const size_t,
const half,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t,
const half,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM
template <typename T>
@ -1628,7 +1778,7 @@ template StatusCode PUBLIC_API Hemm<double2>(const Layout, const Side, const Tri
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK
// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK
template <typename T>
StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
const size_t n, const size_t k,
@ -1676,6 +1826,13 @@ template StatusCode PUBLIC_API Syrk<double2>(const Layout, const Triangle, const
const double2,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Explicit instantiation of Syrk for T = half, providing the half-precision (HSYRK) routine.
template StatusCode PUBLIC_API Syrk<half>(const Layout, const Triangle, const Transpose,
const size_t, const size_t,
const half,
const cl_mem, const size_t, const size_t,
const half,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Rank-K update of a hermitian matrix: CHERK/ZHERK
template <typename T>
@ -1712,7 +1869,7 @@ template StatusCode PUBLIC_API Herk<double>(const Layout, const Triangle, const
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K
// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K
template <typename T>
StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
const size_t n, const size_t k,
@ -1766,6 +1923,14 @@ template StatusCode PUBLIC_API Syr2k<double2>(const Layout, const Triangle, cons
const double2,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Explicit instantiation of Syr2k for T = half, providing the half-precision (HSYR2K) routine.
template StatusCode PUBLIC_API Syr2k<half>(const Layout, const Triangle, const Transpose,
const size_t, const size_t,
const half,
const cl_mem, const size_t, const size_t,
const cl_mem, const size_t, const size_t,
const half,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K
template <typename T, typename U>
@ -1806,7 +1971,7 @@ template StatusCode PUBLIC_API Her2k<double2,double>(const Layout, const Triangl
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM
// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM
template <typename T>
StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
const size_t m, const size_t n,
@ -1848,8 +2013,14 @@ template StatusCode PUBLIC_API Trmm<double2>(const Layout, const Side, const Tri
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Explicit instantiation of Trmm for T = half, providing the half-precision (HTRMM) routine.
template StatusCode PUBLIC_API Trmm<half>(const Layout, const Side, const Triangle, const Transpose, const Diagonal,
const size_t, const size_t,
const half,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM
template <typename T>
StatusCode Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal,
const size_t, const size_t,
@ -1883,6 +2054,12 @@ template StatusCode PUBLIC_API Trsm<double2>(const Layout, const Side, const Tri
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// Explicit instantiation of Trsm for T = half (HTRSM).
// NOTE(review): the Trsm template body is not visible from here; confirm what it actually computes.
template StatusCode PUBLIC_API Trsm<half>(const Layout, const Side, const Triangle, const Transpose, const Diagonal,
const size_t, const size_t,
const half,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
// =================================================================================================

View file

@ -178,6 +178,16 @@ StatusCode CLBlastZswap(const size_t n,
queue, event);
return static_cast<StatusCode>(status);
}
// Half-precision SWAP for the C API: forwards every argument unchanged to clblast::Swap<half> and
// converts the C++ status it returns into the C API's StatusCode.
StatusCode CLBlastHswap(const size_t n,
                        cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                        cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Swap<half>(n,
                                                     x_buffer, x_offset, x_inc,
                                                     y_buffer, y_offset, y_inc,
                                                     queue, event));
}
// SCAL
StatusCode CLBlastSscal(const size_t n,
@ -220,6 +230,16 @@ StatusCode CLBlastZscal(const size_t n,
queue, event);
return static_cast<StatusCode>(status);
}
// Half-precision SCAL for the C API: forwards to clblast::Scal, whose template type is deduced from
// the cl_half 'alpha' argument, and converts the returned status into the C API's StatusCode.
StatusCode CLBlastHscal(const size_t n,
                        const cl_half alpha,
                        cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Scal(n,
                                               alpha,
                                               x_buffer, x_offset, x_inc,
                                               queue, event));
}
// COPY
StatusCode CLBlastScopy(const size_t n,
@ -262,6 +282,16 @@ StatusCode CLBlastZcopy(const size_t n,
queue, event);
return static_cast<StatusCode>(status);
}
// Half-precision COPY for the C API: forwards every argument unchanged to clblast::Copy<half> and
// converts the C++ status it returns into the C API's StatusCode.
StatusCode CLBlastHcopy(const size_t n,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                        cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Copy<half>(n,
                                                     x_buffer, x_offset, x_inc,
                                                     y_buffer, y_offset, y_inc,
                                                     queue, event));
}
// AXPY
StatusCode CLBlastSaxpy(const size_t n,
@ -312,6 +342,18 @@ StatusCode CLBlastZaxpy(const size_t n,
queue, event);
return static_cast<StatusCode>(status);
}
// Half-precision AXPY for the C API: forwards to clblast::Axpy, whose template type is deduced from
// the cl_half 'alpha' argument, and converts the returned status into the C API's StatusCode.
StatusCode CLBlastHaxpy(const size_t n,
                        const cl_half alpha,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                        cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Axpy(n,
                                               alpha,
                                               x_buffer, x_offset, x_inc,
                                               y_buffer, y_offset, y_inc,
                                               queue, event));
}
// DOT
StatusCode CLBlastSdot(const size_t n,
@ -338,6 +380,18 @@ StatusCode CLBlastDdot(const size_t n,
queue, event);
return static_cast<StatusCode>(status);
}
// Half-precision DOT for the C API: forwards every argument unchanged to clblast::Dot<half> and
// converts the C++ status it returns into the C API's StatusCode.
StatusCode CLBlastHdot(const size_t n,
                       cl_mem dot_buffer, const size_t dot_offset,
                       const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                       const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                       cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Dot<half>(n,
                                                    dot_buffer, dot_offset,
                                                    x_buffer, x_offset, x_inc,
                                                    y_buffer, y_offset, y_inc,
                                                    queue, event));
}
// DOTU
StatusCode CLBlastCdotu(const size_t n,
@ -432,6 +486,16 @@ StatusCode CLBlastDznrm2(const size_t n,
queue, event);
return static_cast<StatusCode>(status);
}
// Half-precision NRM2 for the C API: forwards every argument unchanged to clblast::Nrm2<half> and
// converts the C++ status it returns into the C API's StatusCode.
StatusCode CLBlastHnrm2(const size_t n,
                        cl_mem nrm2_buffer, const size_t nrm2_offset,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Nrm2<half>(n,
                                                     nrm2_buffer, nrm2_offset,
                                                     x_buffer, x_offset, x_inc,
                                                     queue, event));
}
// ASUM
StatusCode CLBlastSasum(const size_t n,
@ -474,6 +538,16 @@ StatusCode CLBlastDzasum(const size_t n,
queue, event);
return static_cast<StatusCode>(status);
}
// Half-precision ASUM for the C API: forwards every argument unchanged to clblast::Asum<half> and
// converts the C++ status it returns into the C API's StatusCode.
StatusCode CLBlastHasum(const size_t n,
                        cl_mem asum_buffer, const size_t asum_offset,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Asum<half>(n,
                                                     asum_buffer, asum_offset,
                                                     x_buffer, x_offset, x_inc,
                                                     queue, event));
}
// SUM
StatusCode CLBlastSsum(const size_t n,
@ -516,6 +590,16 @@ StatusCode CLBlastDzsum(const size_t n,
queue, event);
return static_cast<StatusCode>(status);
}
// Half-precision SUM for the C API: forwards every argument unchanged to clblast::Sum<half> and
// converts the C++ status it returns into the C API's StatusCode.
StatusCode CLBlastHsum(const size_t n,
                       cl_mem sum_buffer, const size_t sum_offset,
                       const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                       cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Sum<half>(n,
                                                    sum_buffer, sum_offset,
                                                    x_buffer, x_offset, x_inc,
                                                    queue, event));
}
// AMAX
StatusCode CLBlastiSamax(const size_t n,
@ -558,6 +642,16 @@ StatusCode CLBlastiZamax(const size_t n,
queue, event);
return static_cast<StatusCode>(status);
}
// Half-precision AMAX for the C API: forwards every argument unchanged to clblast::Amax<half> and
// converts the C++ status it returns into the C API's StatusCode.
StatusCode CLBlastiHamax(const size_t n,
                         cl_mem imax_buffer, const size_t imax_offset,
                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                         cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Amax<half>(n,
                                                     imax_buffer, imax_offset,
                                                     x_buffer, x_offset, x_inc,
                                                     queue, event));
}
// MAX
StatusCode CLBlastiSmax(const size_t n,
@ -600,6 +694,16 @@ StatusCode CLBlastiZmax(const size_t n,
queue, event);
return static_cast<StatusCode>(status);
}
// Half-precision MAX for the C API: forwards every argument unchanged to clblast::Max<half> and
// converts the C++ status it returns into the C API's StatusCode.
StatusCode CLBlastiHmax(const size_t n,
                        cl_mem imax_buffer, const size_t imax_offset,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Max<half>(n,
                                                    imax_buffer, imax_offset,
                                                    x_buffer, x_offset, x_inc,
                                                    queue, event));
}
// MIN
StatusCode CLBlastiSmin(const size_t n,
@ -642,6 +746,16 @@ StatusCode CLBlastiZmin(const size_t n,
queue, event);
return static_cast<StatusCode>(status);
}
// Half-precision MIN for the C API: forwards every argument unchanged to clblast::Min<half> and
// converts the C++ status it returns into the C API's StatusCode.
StatusCode CLBlastiHmin(const size_t n,
                        cl_mem imin_buffer, const size_t imin_offset,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_command_queue* queue, cl_event* event) {
  return static_cast<StatusCode>(clblast::Min<half>(n,
                                                    imin_buffer, imin_offset,
                                                    x_buffer, x_offset, x_inc,
                                                    queue, event));
}
// =================================================================================================
// BLAS level-2 (matrix-vector) routines
@ -724,6 +838,25 @@ StatusCode CLBlastZgemv(const Layout layout, const Transpose a_transpose,
queue, event);
return static_cast<StatusCode>(status);
}
// Half-precision GEMV for the C API. The C enums are cast to their clblast:: counterparts, the call
// is forwarded to clblast::Gemv (template type deduced from the cl_half scalars), and the returned
// status is cast back to the C API's StatusCode.
StatusCode CLBlastHgemv(const Layout layout, const Transpose a_transpose,
                        const size_t m, const size_t n,
                        const cl_half alpha,
                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        const cl_half beta,
                        cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                        cl_command_queue* queue, cl_event* event) {
  const auto cpp_layout = static_cast<clblast::Layout>(layout);
  const auto cpp_a_transpose = static_cast<clblast::Transpose>(a_transpose);
  return static_cast<StatusCode>(clblast::Gemv(cpp_layout, cpp_a_transpose,
                                               m, n,
                                               alpha,
                                               a_buffer, a_offset, a_ld,
                                               x_buffer, x_offset, x_inc,
                                               beta,
                                               y_buffer, y_offset, y_inc,
                                               queue, event));
}
// GBMV
StatusCode CLBlastSgbmv(const Layout layout, const Transpose a_transpose,
@ -802,6 +935,25 @@ StatusCode CLBlastZgbmv(const Layout layout, const Transpose a_transpose,
queue, event);
return static_cast<StatusCode>(status);
}
// Half-precision GBMV for the C API. The C enums are cast to their clblast:: counterparts, the call
// is forwarded to clblast::Gbmv (template type deduced from the cl_half scalars), and the returned
// status is cast back to the C API's StatusCode.
StatusCode CLBlastHgbmv(const Layout layout, const Transpose a_transpose,
                        const size_t m, const size_t n, const size_t kl, const size_t ku,
                        const cl_half alpha,
                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        const cl_half beta,
                        cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                        cl_command_queue* queue, cl_event* event) {
  const auto cpp_layout = static_cast<clblast::Layout>(layout);
  const auto cpp_a_transpose = static_cast<clblast::Transpose>(a_transpose);
  return static_cast<StatusCode>(clblast::Gbmv(cpp_layout, cpp_a_transpose,
                                               m, n, kl, ku,
                                               alpha,
                                               a_buffer, a_offset, a_ld,
                                               x_buffer, x_offset, x_inc,
                                               beta,
                                               y_buffer, y_offset, y_inc,
                                               queue, event));
}
// HEMV
StatusCode CLBlastChemv(const Layout layout, const Triangle triangle,
@ -962,6 +1114,25 @@ StatusCode CLBlastDsymv(const Layout layout, const Triangle triangle,
queue, event);
return static_cast<StatusCode>(status);
}
// Half-precision SYMV for the C API. The C enums are cast to their clblast:: counterparts, the call
// is forwarded to clblast::Symv (template type deduced from the cl_half scalars), and the returned
// status is cast back to the C API's StatusCode.
StatusCode CLBlastHsymv(const Layout layout, const Triangle triangle,
                        const size_t n,
                        const cl_half alpha,
                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        const cl_half beta,
                        cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                        cl_command_queue* queue, cl_event* event) {
  const auto cpp_layout = static_cast<clblast::Layout>(layout);
  const auto cpp_triangle = static_cast<clblast::Triangle>(triangle);
  return static_cast<StatusCode>(clblast::Symv(cpp_layout, cpp_triangle,
                                               n,
                                               alpha,
                                               a_buffer, a_offset, a_ld,
                                               x_buffer, x_offset, x_inc,
                                               beta,
                                               y_buffer, y_offset, y_inc,
                                               queue, event));
}
// SBMV
StatusCode CLBlastSsbmv(const Layout layout, const Triangle triangle,
@ -1002,6 +1173,25 @@ StatusCode CLBlastDsbmv(const Layout layout, const Triangle triangle,
queue, event);
return static_cast<StatusCode>(status);
}
// Half-precision SBMV for the C API. The C enums are cast to their clblast:: counterparts, the call
// is forwarded to clblast::Sbmv (template type deduced from the cl_half scalars), and the returned
// status is cast back to the C API's StatusCode.
StatusCode CLBlastHsbmv(const Layout layout, const Triangle triangle,
                        const size_t n, const size_t k,
                        const cl_half alpha,
                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        const cl_half beta,
                        cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                        cl_command_queue* queue, cl_event* event) {
  const auto cpp_layout = static_cast<clblast::Layout>(layout);
  const auto cpp_triangle = static_cast<clblast::Triangle>(triangle);
  return static_cast<StatusCode>(clblast::Sbmv(cpp_layout, cpp_triangle,
                                               n, k,
                                               alpha,
                                               a_buffer, a_offset, a_ld,
                                               x_buffer, x_offset, x_inc,
                                               beta,
                                               y_buffer, y_offset, y_inc,
                                               queue, event));
}
// SPMV
StatusCode CLBlastSspmv(const Layout layout, const Triangle triangle,
@ -1042,6 +1232,25 @@ StatusCode CLBlastDspmv(const Layout layout, const Triangle triangle,
queue, event);
return static_cast<StatusCode>(status);
}
// Half-precision SPMV for the C API. The C enums are cast to their clblast:: counterparts, the call
// is forwarded to clblast::Spmv (template type deduced from the cl_half scalars), and the returned
// status is cast back to the C API's StatusCode. The 'ap' buffer is passed with an offset only.
StatusCode CLBlastHspmv(const Layout layout, const Triangle triangle,
                        const size_t n,
                        const cl_half alpha,
                        const cl_mem ap_buffer, const size_t ap_offset,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        const cl_half beta,
                        cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                        cl_command_queue* queue, cl_event* event) {
  const auto cpp_layout = static_cast<clblast::Layout>(layout);
  const auto cpp_triangle = static_cast<clblast::Triangle>(triangle);
  return static_cast<StatusCode>(clblast::Spmv(cpp_layout, cpp_triangle,
                                               n,
                                               alpha,
                                               ap_buffer, ap_offset,
                                               x_buffer, x_offset, x_inc,
                                               beta,
                                               y_buffer, y_offset, y_inc,
                                               queue, event));
}
// TRMV
StatusCode CLBlastStrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
@ -1104,6 +1313,21 @@ StatusCode CLBlastZtrmv(const Layout layout, const Triangle triangle, const Tran
queue, event);
return static_cast<StatusCode>(status);
}
// Half-precision TRMV for the C API. The C enums are cast to their clblast:: counterparts, the call
// is forwarded to clblast::Trmv<half> (explicit type: there is no scalar to deduce it from), and
// the returned status is cast back to the C API's StatusCode.
StatusCode CLBlastHtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
                        const size_t n,
                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                        cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_command_queue* queue, cl_event* event) {
  const auto cpp_layout = static_cast<clblast::Layout>(layout);
  const auto cpp_triangle = static_cast<clblast::Triangle>(triangle);
  const auto cpp_a_transpose = static_cast<clblast::Transpose>(a_transpose);
  const auto cpp_diagonal = static_cast<clblast::Diagonal>(diagonal);
  return static_cast<StatusCode>(clblast::Trmv<half>(cpp_layout, cpp_triangle,
                                                     cpp_a_transpose, cpp_diagonal,
                                                     n,
                                                     a_buffer, a_offset, a_ld,
                                                     x_buffer, x_offset, x_inc,
                                                     queue, event));
}
// TBMV
StatusCode CLBlastStbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
@ -1166,6 +1390,21 @@ StatusCode CLBlastZtbmv(const Layout layout, const Triangle triangle, const Tran
queue, event);
return static_cast<StatusCode>(status);
}
// Half-precision TBMV for the C API. The C enums are cast to their clblast:: counterparts, the call
// is forwarded to clblast::Tbmv<half> (explicit type: there is no scalar to deduce it from), and
// the returned status is cast back to the C API's StatusCode.
StatusCode CLBlastHtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
                        const size_t n, const size_t k,
                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                        cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_command_queue* queue, cl_event* event) {
  const auto cpp_layout = static_cast<clblast::Layout>(layout);
  const auto cpp_triangle = static_cast<clblast::Triangle>(triangle);
  const auto cpp_a_transpose = static_cast<clblast::Transpose>(a_transpose);
  const auto cpp_diagonal = static_cast<clblast::Diagonal>(diagonal);
  return static_cast<StatusCode>(clblast::Tbmv<half>(cpp_layout, cpp_triangle,
                                                     cpp_a_transpose, cpp_diagonal,
                                                     n, k,
                                                     a_buffer, a_offset, a_ld,
                                                     x_buffer, x_offset, x_inc,
                                                     queue, event));
}
// TPMV
StatusCode CLBlastStpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
@ -1228,6 +1467,21 @@ StatusCode CLBlastZtpmv(const Layout layout, const Triangle triangle, const Tran
queue, event);
return static_cast<StatusCode>(status);
}
// Half-precision TPMV for the C API. The C enums are cast to their clblast:: counterparts, the call
// is forwarded to clblast::Tpmv<half> (explicit type: there is no scalar to deduce it from), and
// the returned status is cast back to the C API's StatusCode.
StatusCode CLBlastHtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
                        const size_t n,
                        const cl_mem ap_buffer, const size_t ap_offset,
                        cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_command_queue* queue, cl_event* event) {
  const auto cpp_layout = static_cast<clblast::Layout>(layout);
  const auto cpp_triangle = static_cast<clblast::Triangle>(triangle);
  const auto cpp_a_transpose = static_cast<clblast::Transpose>(a_transpose);
  const auto cpp_diagonal = static_cast<clblast::Diagonal>(diagonal);
  return static_cast<StatusCode>(clblast::Tpmv<half>(cpp_layout, cpp_triangle,
                                                     cpp_a_transpose, cpp_diagonal,
                                                     n,
                                                     ap_buffer, ap_offset,
                                                     x_buffer, x_offset, x_inc,
                                                     queue, event));
}
// TRSV
StatusCode CLBlastStrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
@ -1448,6 +1702,22 @@ StatusCode CLBlastDger(const Layout layout,
queue, event);
return static_cast<StatusCode>(status);
}
// Half-precision GER for the C API. The layout enum is cast to clblast::Layout, the call is
// forwarded to clblast::Ger (template type deduced from the cl_half 'alpha'), and the returned
// status is cast back to the C API's StatusCode.
StatusCode CLBlastHger(const Layout layout,
                       const size_t m, const size_t n,
                       const cl_half alpha,
                       const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                       const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                       cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                       cl_command_queue* queue, cl_event* event) {
  const auto cpp_layout = static_cast<clblast::Layout>(layout);
  return static_cast<StatusCode>(clblast::Ger(cpp_layout,
                                              m, n,
                                              alpha,
                                              x_buffer, x_offset, x_inc,
                                              y_buffer, y_offset, y_inc,
                                              a_buffer, a_offset, a_ld,
                                              queue, event));
}
// GERU
StatusCode CLBlastCgeru(const Layout layout,
@ -1684,6 +1954,21 @@ StatusCode CLBlastDsyr(const Layout layout, const Triangle triangle,
queue, event);
return static_cast<StatusCode>(status);
}
// Half-precision SYR for the C API. The C enums are cast to their clblast:: counterparts, the call
// is forwarded to clblast::Syr (template type deduced from the cl_half 'alpha'), and the returned
// status is cast back to the C API's StatusCode.
StatusCode CLBlastHsyr(const Layout layout, const Triangle triangle,
                       const size_t n,
                       const cl_half alpha,
                       const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                       cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                       cl_command_queue* queue, cl_event* event) {
  const auto cpp_layout = static_cast<clblast::Layout>(layout);
  const auto cpp_triangle = static_cast<clblast::Triangle>(triangle);
  return static_cast<StatusCode>(clblast::Syr(cpp_layout, cpp_triangle,
                                              n,
                                              alpha,
                                              x_buffer, x_offset, x_inc,
                                              a_buffer, a_offset, a_ld,
                                              queue, event));
}
// SPR
StatusCode CLBlastSspr(const Layout layout, const Triangle triangle,
@ -1716,6 +2001,21 @@ StatusCode CLBlastDspr(const Layout layout, const Triangle triangle,
queue, event);
return static_cast<StatusCode>(status);
}
// Half-precision SPR for the C API. The C enums are cast to their clblast:: counterparts, the call
// is forwarded to clblast::Spr (template type deduced from the cl_half 'alpha'), and the returned
// status is cast back to the C API's StatusCode. The 'ap' buffer is passed with an offset only.
StatusCode CLBlastHspr(const Layout layout, const Triangle triangle,
                       const size_t n,
                       const cl_half alpha,
                       const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                       cl_mem ap_buffer, const size_t ap_offset,
                       cl_command_queue* queue, cl_event* event) {
  const auto cpp_layout = static_cast<clblast::Layout>(layout);
  const auto cpp_triangle = static_cast<clblast::Triangle>(triangle);
  return static_cast<StatusCode>(clblast::Spr(cpp_layout, cpp_triangle,
                                              n,
                                              alpha,
                                              x_buffer, x_offset, x_inc,
                                              ap_buffer, ap_offset,
                                              queue, event));
}
// SYR2
StatusCode CLBlastSsyr2(const Layout layout, const Triangle triangle,
@ -1752,6 +2052,23 @@ StatusCode CLBlastDsyr2(const Layout layout, const Triangle triangle,
queue, event);
return static_cast<StatusCode>(status);
}
// Half-precision SYR2 for the C API. The C enums are cast to their clblast:: counterparts, the call
// is forwarded to clblast::Syr2 (template type deduced from the cl_half 'alpha'), and the returned
// status is cast back to the C API's StatusCode.
StatusCode CLBlastHsyr2(const Layout layout, const Triangle triangle,
                        const size_t n,
                        const cl_half alpha,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                        cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                        cl_command_queue* queue, cl_event* event) {
  const auto cpp_layout = static_cast<clblast::Layout>(layout);
  const auto cpp_triangle = static_cast<clblast::Triangle>(triangle);
  return static_cast<StatusCode>(clblast::Syr2(cpp_layout, cpp_triangle,
                                               n,
                                               alpha,
                                               x_buffer, x_offset, x_inc,
                                               y_buffer, y_offset, y_inc,
                                               a_buffer, a_offset, a_ld,
                                               queue, event));
}
// SPR2
StatusCode CLBlastSspr2(const Layout layout, const Triangle triangle,
@ -1788,6 +2105,23 @@ StatusCode CLBlastDspr2(const Layout layout, const Triangle triangle,
queue, event);
return static_cast<StatusCode>(status);
}
// Half-precision SPR2 for the C API. The C enums are cast to their clblast:: counterparts, the call
// is forwarded to clblast::Spr2 (template type deduced from the cl_half 'alpha'), and the returned
// status is cast back to the C API's StatusCode. The 'ap' buffer is passed with an offset only.
StatusCode CLBlastHspr2(const Layout layout, const Triangle triangle,
                        const size_t n,
                        const cl_half alpha,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                        cl_mem ap_buffer, const size_t ap_offset,
                        cl_command_queue* queue, cl_event* event) {
  const auto cpp_layout = static_cast<clblast::Layout>(layout);
  const auto cpp_triangle = static_cast<clblast::Triangle>(triangle);
  return static_cast<StatusCode>(clblast::Spr2(cpp_layout, cpp_triangle,
                                               n,
                                               alpha,
                                               x_buffer, x_offset, x_inc,
                                               y_buffer, y_offset, y_inc,
                                               ap_buffer, ap_offset,
                                               queue, event));
}
// =================================================================================================
// BLAS level-3 (matrix-matrix) routines
@ -1874,6 +2208,26 @@ StatusCode CLBlastZgemm(const Layout layout, const Transpose a_transpose, const
queue, event);
return static_cast<StatusCode>(status);
}
// Half-precision GEMM for the C API. The C enums are cast to their clblast:: counterparts, the call
// is forwarded to clblast::Gemm (template type deduced from the cl_half scalars), and the returned
// status is cast back to the C API's StatusCode.
StatusCode CLBlastHgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
                        const size_t m, const size_t n, const size_t k,
                        const cl_half alpha,
                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                        const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                        const cl_half beta,
                        cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                        cl_command_queue* queue, cl_event* event) {
  const auto cpp_layout = static_cast<clblast::Layout>(layout);
  const auto cpp_a_transpose = static_cast<clblast::Transpose>(a_transpose);
  const auto cpp_b_transpose = static_cast<clblast::Transpose>(b_transpose);
  return static_cast<StatusCode>(clblast::Gemm(cpp_layout, cpp_a_transpose, cpp_b_transpose,
                                               m, n, k,
                                               alpha,
                                               a_buffer, a_offset, a_ld,
                                               b_buffer, b_offset, b_ld,
                                               beta,
                                               c_buffer, c_offset, c_ld,
                                               queue, event));
}
// SYMM
StatusCode CLBlastSsymm(const Layout layout, const Side side, const Triangle triangle,
@ -1956,6 +2310,26 @@ StatusCode CLBlastZsymm(const Layout layout, const Side side, const Triangle tri
queue, event);
return static_cast<StatusCode>(status);
}
// Half-precision SYMM for the C API. The C enums are cast to their clblast:: counterparts, the call
// is forwarded to clblast::Symm (template type deduced from the cl_half scalars), and the returned
// status is cast back to the C API's StatusCode.
StatusCode CLBlastHsymm(const Layout layout, const Side side, const Triangle triangle,
                        const size_t m, const size_t n,
                        const cl_half alpha,
                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                        const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                        const cl_half beta,
                        cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                        cl_command_queue* queue, cl_event* event) {
  const auto cpp_layout = static_cast<clblast::Layout>(layout);
  const auto cpp_side = static_cast<clblast::Side>(side);
  const auto cpp_triangle = static_cast<clblast::Triangle>(triangle);
  return static_cast<StatusCode>(clblast::Symm(cpp_layout, cpp_side, cpp_triangle,
                                               m, n,
                                               alpha,
                                               a_buffer, a_offset, a_ld,
                                               b_buffer, b_offset, b_ld,
                                               beta,
                                               c_buffer, c_offset, c_ld,
                                               queue, event));
}
// HEMM
StatusCode CLBlastChemm(const Layout layout, const Side side, const Triangle triangle,
@ -2072,6 +2446,24 @@ StatusCode CLBlastZsyrk(const Layout layout, const Triangle triangle, const Tran
queue, event);
return static_cast<StatusCode>(status);
}
// Half-precision SYRK for the C API. The C enums are cast to their clblast:: counterparts, the call
// is forwarded to clblast::Syrk (template type deduced from the cl_half scalars), and the returned
// status is cast back to the C API's StatusCode.
StatusCode CLBlastHsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
                        const size_t n, const size_t k,
                        const cl_half alpha,
                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                        const cl_half beta,
                        cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                        cl_command_queue* queue, cl_event* event) {
  const auto cpp_layout = static_cast<clblast::Layout>(layout);
  const auto cpp_triangle = static_cast<clblast::Triangle>(triangle);
  const auto cpp_a_transpose = static_cast<clblast::Transpose>(a_transpose);
  return static_cast<StatusCode>(clblast::Syrk(cpp_layout, cpp_triangle, cpp_a_transpose,
                                               n, k,
                                               alpha,
                                               a_buffer, a_offset, a_ld,
                                               beta,
                                               c_buffer, c_offset, c_ld,
                                               queue, event));
}
// HERK
StatusCode CLBlastCherk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
@ -2192,6 +2584,26 @@ StatusCode CLBlastZsyr2k(const Layout layout, const Triangle triangle, const Tra
queue, event);
return static_cast<StatusCode>(status);
}
// Half-precision SYR2K for the C API. The C enums are cast to their clblast:: counterparts, the
// call is forwarded to clblast::Syr2k (template type deduced from the cl_half scalars), and the
// returned status is cast back to the C API's StatusCode.
StatusCode CLBlastHsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
                         const size_t n, const size_t k,
                         const cl_half alpha,
                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                         const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                         const cl_half beta,
                         cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                         cl_command_queue* queue, cl_event* event) {
  const auto cpp_layout = static_cast<clblast::Layout>(layout);
  const auto cpp_triangle = static_cast<clblast::Triangle>(triangle);
  const auto cpp_ab_transpose = static_cast<clblast::Transpose>(ab_transpose);
  return static_cast<StatusCode>(clblast::Syr2k(cpp_layout, cpp_triangle, cpp_ab_transpose,
                                                n, k,
                                                alpha,
                                                a_buffer, a_offset, a_ld,
                                                b_buffer, b_offset, b_ld,
                                                beta,
                                                c_buffer, c_offset, c_ld,
                                                queue, event));
}
// HER2K
StatusCode CLBlastCher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
@ -2308,6 +2720,24 @@ StatusCode CLBlastZtrmm(const Layout layout, const Side side, const Triangle tri
queue, event);
return static_cast<StatusCode>(status);
}
// Half-precision TRMM for the C API. The C enums are cast to their clblast:: counterparts, the call
// is forwarded to clblast::Trmm (template type deduced from the cl_half 'alpha'), and the returned
// status is cast back to the C API's StatusCode.
StatusCode CLBlastHtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
                        const size_t m, const size_t n,
                        const cl_half alpha,
                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                        cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                        cl_command_queue* queue, cl_event* event) {
  const auto cpp_layout = static_cast<clblast::Layout>(layout);
  const auto cpp_side = static_cast<clblast::Side>(side);
  const auto cpp_triangle = static_cast<clblast::Triangle>(triangle);
  const auto cpp_a_transpose = static_cast<clblast::Transpose>(a_transpose);
  const auto cpp_diagonal = static_cast<clblast::Diagonal>(diagonal);
  return static_cast<StatusCode>(clblast::Trmm(cpp_layout, cpp_side, cpp_triangle,
                                               cpp_a_transpose, cpp_diagonal,
                                               m, n,
                                               alpha,
                                               a_buffer, a_offset, a_ld,
                                               b_buffer, b_offset, b_ld,
                                               queue, event));
}
// TRSM
StatusCode CLBlastStrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
@ -2382,6 +2812,24 @@ StatusCode CLBlastZtrsm(const Layout layout, const Side side, const Triangle tri
queue, event);
return static_cast<StatusCode>(status);
}
// Half-precision TRSM entry point of the C API: forwards to clblast::Trsm, converting the C
// enumeration arguments to their clblast:: counterparts and the resulting status to StatusCode.
StatusCode CLBlastHtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
                        const size_t m, const size_t n,
                        const cl_half alpha,
                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                        cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                        cl_command_queue* queue, cl_event* event) {
  // Translates the C enumerations into the C++ ones
  const auto cpp_layout = static_cast<clblast::Layout>(layout);
  const auto cpp_side = static_cast<clblast::Side>(side);
  const auto cpp_triangle = static_cast<clblast::Triangle>(triangle);
  const auto cpp_a_transpose = static_cast<clblast::Transpose>(a_transpose);
  const auto cpp_diagonal = static_cast<clblast::Diagonal>(diagonal);

  // Runs the C++ routine and hands its status code back in the C type
  return static_cast<StatusCode>(clblast::Trsm(cpp_layout, cpp_side, cpp_triangle,
                                               cpp_a_transpose, cpp_diagonal,
                                               m, n,
                                               alpha,
                                               a_buffer, a_offset, a_ld,
                                               b_buffer, b_offset, b_ld,
                                               queue, event));
}
// =================================================================================================

View file

@ -29,15 +29,15 @@ namespace clblast {
// Initializes the database
const std::vector<Database::DatabaseEntry> Database::database = {
XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble,
XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble,
XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble,
XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble,
XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble,
CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble,
PadSingle, PadDouble, PadComplexSingle, PadComplexDouble,
TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble,
PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble
XaxpyHalf, XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble,
XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble,
XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble,
XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble,
XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble,
CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble,
PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble,
TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble,
PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble
};
// =================================================================================================

View file

@ -19,11 +19,16 @@ R"(
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
// this file is used outside of the CLBlast library.
#ifndef PRECISION
#define PRECISION 32 // Data-types: single or double precision, complex or regular
#define PRECISION 32 // Data-types: half, single or double precision, complex or regular
#endif
// =================================================================================================
// Enable support for half-precision
#if PRECISION == 16
#pragma OPENCL EXTENSION cl_khr_fp16: enable
#endif
// Enable support for double-precision
#if PRECISION == 64 || PRECISION == 6464
#if __OPENCL_VERSION__ <= CL_VERSION_1_1
@ -31,8 +36,19 @@ R"(
#endif
#endif
// Half-precision
#if PRECISION == 16
typedef half real;
typedef half2 real2;
typedef half4 real4;
typedef half8 real8;
typedef half16 real16;
#define ZERO 0
#define ONE 1
#define SMALLEST -1.0e14
// Single-precision
#if PRECISION == 32
#elif PRECISION == 32
typedef float real;
typedef float2 real2;
typedef float4 real4;
@ -68,7 +84,7 @@ R"(
#define ONE 1.0f
#define SMALLEST -1.0e37f
// Complex Double-precision
// Complex double-precision
#elif PRECISION == 6464
typedef struct cdouble {double x; double y;} real;
typedef struct cdouble2 {real x; real y;} real2;

View file

@ -23,9 +23,10 @@ R"(
// Full version of the kernel with offsets and strided accesses
__attribute__((reqd_work_group_size(WGS, 1, 1)))
__kernel void Xaxpy(const int n, const real alpha,
__kernel void Xaxpy(const int n, const __constant real* restrict arg_alpha,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc) {
const real alpha = arg_alpha[0];
// Loops over the work that needs to be done (allows for an arbitrary number of threads)
#pragma unroll
@ -40,9 +41,11 @@ __kernel void Xaxpy(const int n, const real alpha,
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
// divisible by 'VW', 'WGS' and 'WPT'.
__attribute__((reqd_work_group_size(WGS, 1, 1)))
__kernel void XaxpyFast(const int n, const real alpha,
__kernel void XaxpyFast(const int n, const __constant real* restrict arg_alpha,
const __global realV* restrict xgm,
__global realV* ygm) {
const real alpha = arg_alpha[0];
#pragma unroll
for (int w=0; w<WPT; ++w) {
const int id = w*get_global_size(0) + get_global_id(0);

View file

@ -211,13 +211,17 @@ inline real LoadMatrixA(const __global real* restrict agm, const int x, const in
// Full version of the kernel
__attribute__((reqd_work_group_size(WGS1, 1, 1)))
__kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
__kernel void Xgemv(const int m, const int n,
const __constant real* restrict arg_alpha,
const __constant real* restrict arg_beta,
const int a_rotated,
const __global real* restrict agm, const int a_offset, const int a_ld,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc,
const int do_conjugate, const int parameter,
const int kl, const int ku) {
const real alpha = arg_alpha[0];
const real beta = arg_beta[0];
// Local memory for the vector X
__local real xlm[WGS1];

View file

@ -95,13 +95,18 @@ inline realVFR LoadMatrixAVFR(const __global realVFR* restrict agm, const int x,
// --> 'a_rotated' is 0
// --> 'do_conjugate' is 0
__attribute__((reqd_work_group_size(WGS2, 1, 1)))
__kernel void XgemvFast(const int m, const int n, const real alpha, const real beta,
__kernel void XgemvFast(const int m, const int n,
const __constant real* restrict arg_alpha,
const __constant real* restrict arg_beta,
const int a_rotated,
const __global realVF* restrict agm, const int a_offset, const int a_ld,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc,
const int do_conjugate, const int parameter,
const int kl, const int ku) {
const real alpha = arg_alpha[0];
const real beta = arg_beta[0];
// Local memory for the vector X
__local real xlm[WGS2];
@ -192,13 +197,18 @@ __kernel void XgemvFast(const int m, const int n, const real alpha, const real b
// --> 'a_rotated' is 1
// --> 'do_conjugate' is 0
__attribute__((reqd_work_group_size(WGS3, 1, 1)))
__kernel void XgemvFastRot(const int m, const int n, const real alpha, const real beta,
__kernel void XgemvFastRot(const int m, const int n,
const __constant real* restrict arg_alpha,
const __constant real* restrict arg_beta,
const int a_rotated,
const __global realVFR* restrict agm, const int a_offset, const int a_ld,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* ygm, const int y_offset, const int y_inc,
const int do_conjugate, const int parameter,
const int kl, const int ku) {
const real alpha = arg_alpha[0];
const real beta = arg_beta[0];
// Local memory for the vector X
__local real xlm[WGS3];

View file

@ -19,11 +19,13 @@ R"(
// Regular version of the rank-1 matrix update kernel (GER, GERU, GERC)
__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
__kernel void Xger(const int max1, const int max2, const real alpha,
__kernel void Xger(const int max1, const int max2,
const __constant real* restrict arg_alpha,
const __global real* restrict xgm, const int x_offset, const int x_inc,
const __global real* ygm, const int y_offset, const int y_inc,
__global real* restrict agm, const int a_offset, const int a_ld,
const int is_rowmajor) {
const real alpha = arg_alpha[0];
// Register storage for X and Y
real xvalues[WPT];

View file

@ -19,10 +19,12 @@ R"(
// Symmetric version of the rank-1 matrix update kernel (HER, HPR, SYR, SPR)
__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
__kernel void Xher(const int n, const real alpha,
__kernel void Xher(const int n,
const __constant real* restrict arg_alpha,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* restrict agm, const int a_offset, const int a_ld,
const int is_upper, const int is_rowmajor) {
const real alpha = arg_alpha[0];
// Register storage for X and XT
real xvalues[WPT];

View file

@ -19,11 +19,13 @@ R"(
// Symmetric version of the rank-2 matrix update kernel (HER2, HPR2, SYR2, SPR2)
__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
__kernel void Xher2(const int n, const real alpha,
__kernel void Xher2(const int n,
const __constant real* restrict arg_alpha,
const __global real* restrict xgm, const int x_offset, const int x_inc,
const __global real* restrict ygm, const int y_offset, const int y_inc,
__global real* restrict agm, const int a_offset, const int a_ld,
const int is_upper, const int is_rowmajor) {
const real alpha = arg_alpha[0];
// Register storage for X and Y
real xvalues[WPT];

View file

@ -267,10 +267,13 @@ inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
// Main entry point of the kernel. This is the upper-triangular version.
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
__kernel void XgemmUpper(const int kSizeN, const int kSizeK,
const real alpha, const real beta,
const __constant real* restrict arg_alpha,
const __constant real* restrict arg_beta,
const __global realM* restrict agm,
const __global realN* restrict bgm,
__global realM* cgm) {
const real alpha = arg_alpha[0];
const real beta = arg_beta[0];
// Skip these threads if they do not contain threads contributing to the upper-triangle
if (GetGroupID1()*NWG < GetGroupID0()*MWG) {
@ -304,10 +307,13 @@ __kernel void XgemmUpper(const int kSizeN, const int kSizeK,
// Main entry point of the kernel. This is the lower-triangular version.
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
__kernel void XgemmLower(const int kSizeN, const int kSizeK,
const real alpha, const real beta,
const __constant real* restrict arg_alpha,
const __constant real* restrict arg_beta,
const __global realM* restrict agm,
const __global realN* restrict bgm,
__global realM* cgm) {
const real alpha = arg_alpha[0];
const real beta = arg_beta[0];
// Skip these threads if they do not contain threads contributing to the lower-triangle
if (GetGroupID1()*NWG > GetGroupID0()*MWG) {
@ -345,10 +351,13 @@ __kernel void XgemmLower(const int kSizeN, const int kSizeK,
// Main entry point of the kernel. This is the regular full version.
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
const real alpha, const real beta,
const __constant real* restrict arg_alpha,
const __constant real* restrict arg_beta,
const __global realM* restrict agm,
const __global realN* restrict bgm,
__global realM* cgm) {
const real alpha = arg_alpha[0];
const real beta = arg_beta[0];
// Allocates workgroup-private memory (local memory)
#if SA == 1

View file

@ -406,6 +406,7 @@ StatusCode Routine<T>::PadCopyTransposeMatrix(EventPointer event, std::vector<Ev
// =================================================================================================
// Compiles the templated class
template class Routine<half>;
template class Routine<float>;
template class Routine<double>;
template class Routine<float2>;

View file

@ -20,6 +20,7 @@ namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xamax<half>::precision_ = Precision::kHalf;
template <> const Precision Xamax<float>::precision_ = Precision::kSingle;
template <> const Precision Xamax<double>::precision_ = Precision::kDouble;
template <> const Precision Xamax<float2>::precision_ = Precision::kComplexSingle;
@ -103,6 +104,7 @@ StatusCode Xamax<T>::DoAmax(const size_t n,
// =================================================================================================
// Compiles the templated class
template class Xamax<half>;
template class Xamax<float>;
template class Xamax<double>;
template class Xamax<float2>;

View file

@ -20,6 +20,7 @@ namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xasum<half>::precision_ = Precision::kHalf;
template <> const Precision Xasum<float>::precision_ = Precision::kSingle;
template <> const Precision Xasum<double>::precision_ = Precision::kDouble;
template <> const Precision Xasum<float2>::precision_ = Precision::kComplexSingle;
@ -100,6 +101,7 @@ StatusCode Xasum<T>::DoAsum(const size_t n,
// =================================================================================================
// Compiles the templated class
template class Xasum<half>;
template class Xasum<float>;
template class Xasum<double>;
template class Xasum<float2>;

View file

@ -20,6 +20,7 @@ namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xaxpy<half>::precision_ = Precision::kHalf;
template <> const Precision Xaxpy<float>::precision_ = Precision::kSingle;
template <> const Precision Xaxpy<double>::precision_ = Precision::kDouble;
template <> const Precision Xaxpy<float2>::precision_ = Precision::kComplexSingle;
@ -67,16 +68,20 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
const auto program = GetProgramFromCache();
auto kernel = Kernel(program, kernel_name);
// Upload the scalar argument as a constant buffer to the device (needed for half-precision)
auto alpha_buffer = Buffer<T>(context_, 1);
alpha_buffer.Write(queue_, 1, &alpha);
// Sets the kernel arguments
if (use_fast_kernel) {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, alpha);
kernel.SetArgument(1, alpha_buffer());
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, y_buffer());
}
else {
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, alpha);
kernel.SetArgument(1, alpha_buffer());
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, static_cast<int>(x_offset));
kernel.SetArgument(4, static_cast<int>(x_inc));
@ -107,6 +112,7 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
// =================================================================================================
// Compiles the templated class
template class Xaxpy<half>;
template class Xaxpy<float>;
template class Xaxpy<double>;
template class Xaxpy<float2>;

View file

@ -20,6 +20,7 @@ namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xcopy<half>::precision_ = Precision::kHalf;
template <> const Precision Xcopy<float>::precision_ = Precision::kSingle;
template <> const Precision Xcopy<double>::precision_ = Precision::kDouble;
template <> const Precision Xcopy<float2>::precision_ = Precision::kComplexSingle;
@ -105,6 +106,7 @@ StatusCode Xcopy<T>::DoCopy(const size_t n,
// =================================================================================================
// Compiles the templated class
template class Xcopy<half>;
template class Xcopy<float>;
template class Xcopy<double>;
template class Xcopy<float2>;

View file

@ -20,6 +20,7 @@ namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xdot<half>::precision_ = Precision::kHalf;
template <> const Precision Xdot<float>::precision_ = Precision::kSingle;
template <> const Precision Xdot<double>::precision_ = Precision::kDouble;
template <> const Precision Xdot<float2>::precision_ = Precision::kComplexSingle;
@ -108,6 +109,7 @@ StatusCode Xdot<T>::DoDot(const size_t n,
// =================================================================================================
// Compiles the templated class
template class Xdot<half>;
template class Xdot<float>;
template class Xdot<double>;
template class Xdot<float2>;

View file

@ -20,6 +20,7 @@ namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xnrm2<half>::precision_ = Precision::kHalf;
template <> const Precision Xnrm2<float>::precision_ = Precision::kSingle;
template <> const Precision Xnrm2<double>::precision_ = Precision::kDouble;
template <> const Precision Xnrm2<float2>::precision_ = Precision::kComplexSingle;
@ -100,6 +101,7 @@ StatusCode Xnrm2<T>::DoNrm2(const size_t n,
// =================================================================================================
// Compiles the templated class
template class Xnrm2<half>;
template class Xnrm2<float>;
template class Xnrm2<double>;
template class Xnrm2<float2>;

View file

@ -20,6 +20,7 @@ namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xscal<half>::precision_ = Precision::kHalf;
template <> const Precision Xscal<float>::precision_ = Precision::kSingle;
template <> const Precision Xscal<double>::precision_ = Precision::kDouble;
template <> const Precision Xscal<float2>::precision_ = Precision::kComplexSingle;
@ -99,6 +100,7 @@ StatusCode Xscal<T>::DoScal(const size_t n, const T alpha,
// =================================================================================================
// Compiles the templated class
template class Xscal<half>;
template class Xscal<float>;
template class Xscal<double>;
template class Xscal<float2>;

View file

@ -20,6 +20,7 @@ namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xswap<half>::precision_ = Precision::kHalf;
template <> const Precision Xswap<float>::precision_ = Precision::kSingle;
template <> const Precision Xswap<double>::precision_ = Precision::kDouble;
template <> const Precision Xswap<float2>::precision_ = Precision::kComplexSingle;
@ -105,6 +106,7 @@ StatusCode Xswap<T>::DoSwap(const size_t n,
// =================================================================================================
// Compiles the templated class
template class Xswap<half>;
template class Xswap<float>;
template class Xswap<double>;
template class Xswap<float2>;

View file

@ -58,6 +58,7 @@ StatusCode Xgbmv<T>::DoGbmv(const Layout layout, const Transpose a_transpose,
// =================================================================================================
// Compiles the templated class
template class Xgbmv<half>;
template class Xgbmv<float>;
template class Xgbmv<double>;
template class Xgbmv<float2>;

View file

@ -20,6 +20,7 @@ namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xgemv<half>::precision_ = Precision::kHalf;
template <> const Precision Xgemv<float>::precision_ = Precision::kSingle;
template <> const Precision Xgemv<double>::precision_ = Precision::kDouble;
template <> const Precision Xgemv<float2>::precision_ = Precision::kComplexSingle;
@ -134,6 +135,12 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
local_size = db_["WGS3"];
}
// Upload the scalar arguments as constant buffers to the device (needed for half-precision)
auto alpha_buffer = Buffer<T>(context_, 1);
auto beta_buffer = Buffer<T>(context_, 1);
alpha_buffer.Write(queue_, 1, &alpha);
beta_buffer.Write(queue_, 1, &beta);
// Retrieves the Xgemv kernel from the compiled binary
try {
const auto program = GetProgramFromCache();
@ -142,8 +149,8 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(m_real));
kernel.SetArgument(1, static_cast<int>(n_real));
kernel.SetArgument(2, alpha);
kernel.SetArgument(3, beta);
kernel.SetArgument(2, alpha_buffer());
kernel.SetArgument(3, beta_buffer());
kernel.SetArgument(4, static_cast<int>(a_rotated));
kernel.SetArgument(5, a_buffer());
kernel.SetArgument(6, static_cast<int>(a_offset));
@ -173,6 +180,7 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
// =================================================================================================
// Compiles the templated class
template class Xgemv<half>;
template class Xgemv<float>;
template class Xgemv<double>;
template class Xgemv<float2>;

View file

@ -20,6 +20,7 @@ namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xger<half>::precision_ = Precision::kHalf;
template <> const Precision Xger<float>::precision_ = Precision::kSingle;
template <> const Precision Xger<double>::precision_ = Precision::kDouble;
template <> const Precision Xger<float2>::precision_ = Precision::kComplexSingle;
@ -64,7 +65,11 @@ StatusCode Xger<T>::DoGer(const Layout layout,
status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
if (ErrorIn(status)) { return status; }
// Retrieves the Xgemv kernel from the compiled binary
// Upload the scalar argument as a constant buffer to the device (needed for half-precision)
auto alpha_buffer = Buffer<T>(context_, 1);
alpha_buffer.Write(queue_, 1, &alpha);
// Retrieves the kernel from the compiled binary
try {
const auto program = GetProgramFromCache();
auto kernel = Kernel(program, "Xger");
@ -72,7 +77,7 @@ StatusCode Xger<T>::DoGer(const Layout layout,
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(a_one));
kernel.SetArgument(1, static_cast<int>(a_two));
kernel.SetArgument(2, alpha);
kernel.SetArgument(2, alpha_buffer());
kernel.SetArgument(3, x_buffer());
kernel.SetArgument(4, static_cast<int>(x_offset));
kernel.SetArgument(5, static_cast<int>(x_inc));
@ -100,6 +105,7 @@ StatusCode Xger<T>::DoGer(const Layout layout,
// =================================================================================================
// Compiles the templated class
template class Xger<half>;
template class Xger<float>;
template class Xger<double>;
template class Xger<float2>;

View file

@ -19,6 +19,7 @@ namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xher<half, half>::precision_ = Precision::kHalf;
template <> const Precision Xher<float, float>::precision_ = Precision::kSingle;
template <> const Precision Xher<double, double>::precision_ = Precision::kDouble;
template <> const Precision Xher<float2, float>::precision_ = Precision::kComplexSingle;
@ -43,6 +44,7 @@ template <> float2 Xher<float2,float>::GetAlpha(const float alpha) { return floa
template <> double2 Xher<double2,double>::GetAlpha(const double alpha) { return double2{alpha, 0.0}; }
template <> float Xher<float,float>::GetAlpha(const float alpha) { return alpha; }
template <> double Xher<double,double>::GetAlpha(const double alpha) { return alpha; }
template <> half Xher<half,half>::GetAlpha(const half alpha) { return alpha; }
// =================================================================================================
@ -63,9 +65,6 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
(triangle == Triangle::kLower && layout == Layout::kRowMajor));
const auto is_rowmajor = (layout == Layout::kRowMajor);
// Creates a matching version of alpha
const auto matching_alpha = GetAlpha(alpha);
// Tests the matrix and the vectors for validity
auto status = StatusCode::kSuccess;
if (packed) { status = TestMatrixAP(n, a_buffer, a_offset, sizeof(T)); }
@ -77,14 +76,21 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
// If alpha is zero an update is not required
if (alpha == U{0}) { return StatusCode::kSuccess; }
// Retrieves the Xgemv kernel from the compiled binary
// Creates a matching version of alpha
const auto matching_alpha = GetAlpha(alpha);
// Upload the scalar argument as a constant buffer to the device (needed for half-precision)
auto alpha_buffer = Buffer<T>(context_, 1);
alpha_buffer.Write(queue_, 1, &matching_alpha);
// Retrieves the kernel from the compiled binary
try {
const auto program = GetProgramFromCache();
auto kernel = Kernel(program, "Xher");
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, matching_alpha);
kernel.SetArgument(1, alpha_buffer());
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, static_cast<int>(x_offset));
kernel.SetArgument(4, static_cast<int>(x_inc));
@ -110,6 +116,7 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
// =================================================================================================
// Compiles the templated class
template class Xher<half, half>;
template class Xher<float, float>;
template class Xher<double, double>;
template class Xher<float2, float>;

View file

@ -19,6 +19,7 @@ namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xher2<half>::precision_ = Precision::kHalf;
template <> const Precision Xher2<float>::precision_ = Precision::kSingle;
template <> const Precision Xher2<double>::precision_ = Precision::kDouble;
template <> const Precision Xher2<float2>::precision_ = Precision::kComplexSingle;
@ -66,14 +67,18 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
if (ErrorIn(status)) { return status; }
// Retrieves the Xgemv kernel from the compiled binary
// Upload the scalar argument as a constant buffer to the device (needed for half-precision)
auto alpha_buffer = Buffer<T>(context_, 1);
alpha_buffer.Write(queue_, 1, &alpha);
// Retrieves the kernel from the compiled binary
try {
const auto program = GetProgramFromCache();
auto kernel = Kernel(program, "Xher2");
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n));
kernel.SetArgument(1, alpha);
kernel.SetArgument(1, alpha_buffer());
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, static_cast<int>(x_offset));
kernel.SetArgument(4, static_cast<int>(x_inc));
@ -102,6 +107,7 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
// =================================================================================================
// Compiles the templated class
template class Xher2<half>;
template class Xher2<float>;
template class Xher2<double>;
template class Xher2<float2>;

View file

@ -57,6 +57,7 @@ StatusCode Xsbmv<T>::DoSbmv(const Layout layout, const Triangle triangle,
// =================================================================================================
// Compiles the templated class
template class Xsbmv<half>;
template class Xsbmv<float>;
template class Xsbmv<double>;

View file

@ -57,6 +57,7 @@ StatusCode Xspmv<T>::DoSpmv(const Layout layout, const Triangle triangle,
// =================================================================================================
// Compiles the templated class
template class Xspmv<half>;
template class Xspmv<float>;
template class Xspmv<double>;

View file

@ -44,6 +44,7 @@ StatusCode Xspr<T>::DoSpr(const Layout layout, const Triangle triangle,
// =================================================================================================
// Compiles the templated class
template class Xspr<half>;
template class Xspr<float>;
template class Xspr<double>;

View file

@ -46,6 +46,7 @@ StatusCode Xspr2<T>::DoSpr2(const Layout layout, const Triangle triangle,
// =================================================================================================
// Compiles the templated class
template class Xspr2<half>;
template class Xspr2<float>;
template class Xspr2<double>;

View file

@ -57,6 +57,7 @@ StatusCode Xsymv<T>::DoSymv(const Layout layout, const Triangle triangle,
// =================================================================================================
// Compiles the templated class
template class Xsymv<half>;
template class Xsymv<float>;
template class Xsymv<double>;

View file

@ -43,6 +43,7 @@ StatusCode Xsyr<T>::DoSyr(const Layout layout, const Triangle triangle,
// =================================================================================================
// Compiles the templated class
template class Xsyr<half>;
template class Xsyr<float>;
template class Xsyr<double>;

View file

@ -45,6 +45,7 @@ StatusCode Xsyr2<T>::DoSyr2(const Layout layout, const Triangle triangle,
// =================================================================================================
// Compiles the templated class
template class Xsyr2<half>;
template class Xsyr2<float>;
template class Xsyr2<double>;

View file

@ -72,6 +72,7 @@ StatusCode Xtbmv<T>::DoTbmv(const Layout layout, const Triangle triangle,
// =================================================================================================
// Compiles the templated class
template class Xtbmv<half>;
template class Xtbmv<float>;
template class Xtbmv<double>;
template class Xtbmv<float2>;

View file

@ -72,6 +72,7 @@ StatusCode Xtpmv<T>::DoTpmv(const Layout layout, const Triangle triangle,
// =================================================================================================
// Compiles the templated class
template class Xtpmv<half>;
template class Xtpmv<float>;
template class Xtpmv<double>;
template class Xtpmv<float2>;

View file

@ -72,6 +72,7 @@ StatusCode Xtrmv<T>::DoTrmv(const Layout layout, const Triangle triangle,
// =================================================================================================
// Compiles the templated class
template class Xtrmv<half>;
template class Xtrmv<float>;
template class Xtrmv<double>;
template class Xtrmv<float2>;

View file

@ -20,6 +20,7 @@ namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xgemm<half>::precision_ = Precision::kHalf;
template <> const Precision Xgemm<float>::precision_ = Precision::kSingle;
template <> const Precision Xgemm<double>::precision_ = Precision::kDouble;
template <> const Precision Xgemm<float2>::precision_ = Precision::kComplexSingle;
@ -122,6 +123,12 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
auto c_temp = (c_no_temp) ? c_buffer : Buffer<T>(context_, m_ceiled*n_ceiled);
// Upload the scalar arguments as constant buffers to the device (needed for half-precision)
auto alpha_buffer = Buffer<T>(context_, 1);
auto beta_buffer = Buffer<T>(context_, 1);
alpha_buffer.Write(queue_, 1, &alpha);
beta_buffer.Write(queue_, 1, &beta);
// Events of all kernels (including pre/post processing kernels)
auto eventWaitList = std::vector<Event>();
auto emptyEventList = std::vector<Event>();
@ -169,8 +176,8 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
kernel.SetArgument(0, static_cast<int>(m_ceiled));
kernel.SetArgument(1, static_cast<int>(n_ceiled));
kernel.SetArgument(2, static_cast<int>(k_ceiled));
kernel.SetArgument(3, alpha);
kernel.SetArgument(4, beta);
kernel.SetArgument(3, alpha_buffer());
kernel.SetArgument(4, beta_buffer());
kernel.SetArgument(5, a_temp());
kernel.SetArgument(6, b_temp());
kernel.SetArgument(7, c_temp());
@ -207,6 +214,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
// =================================================================================================
// Compiles the templated class
template class Xgemm<half>;
template class Xgemm<float>;
template class Xgemm<double>;
template class Xgemm<float2>;

View file

@ -112,6 +112,13 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
auto b2_temp = (b2_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
// Upload the scalar arguments as constant buffers to the device (needed for half-precision)
auto complex_beta = T{beta, static_cast<U>(0.0)};
auto alpha_buffer = Buffer<T>(context_, 1);
auto beta_buffer = Buffer<T>(context_, 1);
alpha_buffer.Write(queue_, 1, &alpha);
beta_buffer.Write(queue_, 1, &complex_beta);
// Events of all kernels (including pre/post processing kernels)
auto eventWaitList = std::vector<Event>();
auto emptyEventList = std::vector<Event>();
@ -171,11 +178,10 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
auto complex_beta = T{beta, static_cast<U>(0.0)};
kernel.SetArgument(0, static_cast<int>(n_ceiled));
kernel.SetArgument(1, static_cast<int>(k_ceiled));
kernel.SetArgument(2, alpha);
kernel.SetArgument(3, complex_beta);
kernel.SetArgument(2, alpha_buffer());
kernel.SetArgument(3, beta_buffer());
kernel.SetArgument(4, a1_temp());
kernel.SetArgument(5, b2_temp());
kernel.SetArgument(6, c_temp());
@ -196,8 +202,10 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
// Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha
auto conjugate_alpha = T{alpha.real(), -alpha.imag()};
auto complex_one = T{static_cast<U>(1.0), static_cast<U>(0.0)};
kernel.SetArgument(2, conjugate_alpha);
kernel.SetArgument(3, complex_one);
alpha_buffer.Write(queue_, 1, &conjugate_alpha);
beta_buffer.Write(queue_, 1, &complex_one);
kernel.SetArgument(2, alpha_buffer());
kernel.SetArgument(3, beta_buffer());
kernel.SetArgument(4, b1_temp());
kernel.SetArgument(5, a2_temp());

View file

@ -103,6 +103,14 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
auto b_temp = (b_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
// Upload the scalar arguments as constant buffers to the device (needed for half-precision)
auto complex_alpha = T{alpha, static_cast<U>(0.0)};
auto complex_beta = T{beta, static_cast<U>(0.0)};
auto alpha_buffer = Buffer<T>(context_, 1);
auto beta_buffer = Buffer<T>(context_, 1);
alpha_buffer.Write(queue_, 1, &complex_alpha);
beta_buffer.Write(queue_, 1, &complex_beta);
// Events of all kernels (including pre/post processing kernels)
auto eventWaitList = std::vector<Event>();
auto emptyEventList = std::vector<Event>();
@ -144,12 +152,10 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
auto complex_alpha = T{alpha, static_cast<U>(0.0)};
auto complex_beta = T{beta, static_cast<U>(0.0)};
kernel.SetArgument(0, static_cast<int>(n_ceiled));
kernel.SetArgument(1, static_cast<int>(k_ceiled));
kernel.SetArgument(2, complex_alpha);
kernel.SetArgument(3, complex_beta);
kernel.SetArgument(2, alpha_buffer());
kernel.SetArgument(3, beta_buffer());
kernel.SetArgument(4, a_temp());
kernel.SetArgument(5, b_temp());
kernel.SetArgument(6, c_temp());

View file

@ -127,6 +127,7 @@ StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle
// =================================================================================================
// Compiles the templated class
template class Xsymm<half>;
template class Xsymm<float>;
template class Xsymm<double>;
template class Xsymm<float2>;

View file

@ -20,6 +20,7 @@ namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xsyr2k<half>::precision_ = Precision::kHalf;
template <> const Precision Xsyr2k<float>::precision_ = Precision::kSingle;
template <> const Precision Xsyr2k<double>::precision_ = Precision::kDouble;
template <> const Precision Xsyr2k<float2>::precision_ = Precision::kComplexSingle;
@ -104,6 +105,12 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
// Upload the scalar arguments as constant buffers to the device (needed for half-precision)
auto alpha_buffer = Buffer<T>(context_, 1);
auto beta_buffer = Buffer<T>(context_, 1);
alpha_buffer.Write(queue_, 1, &alpha);
beta_buffer.Write(queue_, 1, &beta);
// Events of all kernels (including pre/post processing kernels)
auto eventWaitList = std::vector<Event>();
auto emptyEventList = std::vector<Event>();
@ -147,8 +154,8 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n_ceiled));
kernel.SetArgument(1, static_cast<int>(k_ceiled));
kernel.SetArgument(2, alpha);
kernel.SetArgument(3, beta);
kernel.SetArgument(2, alpha_buffer());
kernel.SetArgument(3, beta_buffer());
kernel.SetArgument(4, a_temp());
kernel.SetArgument(5, b_temp());
kernel.SetArgument(6, c_temp());
@ -168,7 +175,8 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
// Swaps the arguments for matrices A and B, and sets 'beta' to 1
auto one = static_cast<T>(1);
kernel.SetArgument(3, one);
beta_buffer.Write(queue_, 1, &one);
kernel.SetArgument(3, beta_buffer());
kernel.SetArgument(4, b_temp());
kernel.SetArgument(5, a_temp());
@ -196,6 +204,7 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
// =================================================================================================
// Compiles the templated class
template class Xsyr2k<half>;
template class Xsyr2k<float>;
template class Xsyr2k<double>;
template class Xsyr2k<float2>;

View file

@ -20,6 +20,7 @@ namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
template <> const Precision Xsyrk<half>::precision_ = Precision::kHalf;
template <> const Precision Xsyrk<float>::precision_ = Precision::kSingle;
template <> const Precision Xsyrk<double>::precision_ = Precision::kDouble;
template <> const Precision Xsyrk<float2>::precision_ = Precision::kComplexSingle;
@ -97,6 +98,12 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
// Upload the scalar arguments as constant buffers to the device (needed for half-precision)
auto alpha_buffer = Buffer<T>(context_, 1);
auto beta_buffer = Buffer<T>(context_, 1);
alpha_buffer.Write(queue_, 1, &alpha);
beta_buffer.Write(queue_, 1, &beta);
// Events of all kernels (including pre/post processing kernels)
auto eventWaitList = std::vector<Event>();
auto emptyEventList = std::vector<Event>();
@ -131,8 +138,8 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n_ceiled));
kernel.SetArgument(1, static_cast<int>(k_ceiled));
kernel.SetArgument(2, alpha);
kernel.SetArgument(3, beta);
kernel.SetArgument(2, alpha_buffer());
kernel.SetArgument(3, beta_buffer());
kernel.SetArgument(4, a_temp());
kernel.SetArgument(5, a_temp());
kernel.SetArgument(6, c_temp());
@ -169,6 +176,7 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
// =================================================================================================
// Compiles the templated class
template class Xsyrk<half>;
template class Xsyrk<float>;
template class Xsyrk<double>;
template class Xsyrk<float2>;

View file

@ -130,6 +130,7 @@ StatusCode Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle
// =================================================================================================
// Compiles the templated class
template class Xtrmm<half>;
template class Xtrmm<float>;
template class Xtrmm<double>;
template class Xtrmm<float2>;

View file

@ -107,7 +107,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneCopy<half>, half>(argc, argv); break;
case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneCopy<float>, float>(argc, argv); break;
case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneCopy<double>, double>(argc, argv); break;
case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneCopy<float2>, float2>(argc, argv); break;

View file

@ -85,17 +85,17 @@ class TunePad {
std::vector<T> &, std::vector<T> &,
std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
std::vector<T> &) {
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentScalar(static_cast<int>(args.n));
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentScalar(0);
tuner.AddArgumentInput(a_mat);
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentScalar(static_cast<int>(args.n));
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentScalar(0);
tuner.AddArgumentOutput(b_mat);
tuner.AddArgumentScalar(0);
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentScalar(static_cast<int>(args.n));
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentScalar(0);
tuner.AddArgumentInput(a_mat);
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentScalar(static_cast<int>(args.n));
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentScalar(0);
tuner.AddArgumentOutput(b_mat);
tuner.AddArgumentScalar(0);
}
// Describes how to compute the performance metrics
@ -115,7 +115,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kHalf: clblast::Tuner<clblast::TunePad<half>, half>(argc, argv); break;
case clblast::Precision::kSingle: clblast::Tuner<clblast::TunePad<float>, float>(argc, argv); break;
case clblast::Precision::kDouble: clblast::Tuner<clblast::TunePad<double>, double>(argc, argv); break;
case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TunePad<float2>, float2>(argc, argv); break;

View file

@ -119,7 +119,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kHalf: clblast::Tuner<clblast::TunePadTranspose<half>, half>(argc, argv); break;
case clblast::Precision::kSingle: clblast::Tuner<clblast::TunePadTranspose<float>, float>(argc, argv); break;
case clblast::Precision::kDouble: clblast::Tuner<clblast::TunePadTranspose<double>, double>(argc, argv); break;
case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TunePadTranspose<float2>, float2>(argc, argv); break;

View file

@ -112,7 +112,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneTranspose<half>, half>(argc, argv); break;
case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneTranspose<float>, float>(argc, argv); break;
case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneTranspose<double>, double>(argc, argv); break;
case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneTranspose<float2>, float2>(argc, argv); break;

View file

@ -89,8 +89,9 @@ class TuneXaxpy {
std::vector<T> &x_vec, std::vector<T> &y_vec,
std::vector<T> &, std::vector<T> &, std::vector<T> &,
std::vector<T> &) {
auto alpha_buffer = std::vector<T>{args.alpha};
tuner.AddArgumentScalar(static_cast<int>(args.n));
tuner.AddArgumentScalar(args.alpha);
tuner.AddArgumentInput(alpha_buffer);
tuner.AddArgumentInput(x_vec);
tuner.AddArgumentOutput(y_vec);
}
@ -112,7 +113,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneXaxpy<half>, half>(argc, argv); break;
case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneXaxpy<float>, float>(argc, argv); break;
case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneXaxpy<double>, double>(argc, argv); break;
case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneXaxpy<float2>, float2>(argc, argv); break;

View file

@ -119,7 +119,7 @@ using double2 = clblast::double2;
template <int V>
void StartVariation(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneXdot<half, V>, half>(argc, argv); break;
case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneXdot<float, V>, float>(argc, argv); break;
case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneXdot<double, V>, double>(argc, argv); break;
case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneXdot<float2, V>, float2>(argc, argv); break;

View file

@ -121,11 +121,13 @@ class TuneXgemm {
std::vector<T> &, std::vector<T> &,
std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &c_mat,
std::vector<T> &) {
auto alpha_buffer = std::vector<T>{args.alpha};
auto beta_buffer = std::vector<T>{args.beta};
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentScalar(static_cast<int>(args.n));
tuner.AddArgumentScalar(static_cast<int>(args.k));
tuner.AddArgumentScalar(args.alpha);
tuner.AddArgumentScalar(args.beta);
tuner.AddArgumentInput(alpha_buffer);
tuner.AddArgumentInput(beta_buffer);
tuner.AddArgumentInput(a_mat);
tuner.AddArgumentInput(b_mat);
tuner.AddArgumentOutput(c_mat);
@ -148,7 +150,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneXgemm<half>, half>(argc, argv); break;
case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneXgemm<float>, float>(argc, argv); break;
case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneXgemm<double>, double>(argc, argv); break;
case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneXgemm<float2>, float2>(argc, argv); break;

View file

@ -96,11 +96,13 @@ class TuneXgemv {
std::vector<T> &x_vec, std::vector<T> &y_vec,
std::vector<T> &a_mat, std::vector<T> &, std::vector<T> &,
std::vector<T> &) {
auto alpha_buffer = std::vector<T>{args.alpha};
auto beta_buffer = std::vector<T>{args.beta};
auto a_rotated = (V==3) ? 1 : 0;
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentScalar(static_cast<int>(args.n));
tuner.AddArgumentScalar(args.alpha);
tuner.AddArgumentScalar(args.beta);
tuner.AddArgumentInput(alpha_buffer);
tuner.AddArgumentInput(beta_buffer);
tuner.AddArgumentScalar(static_cast<int>(a_rotated));
tuner.AddArgumentInput(a_mat);
tuner.AddArgumentScalar(0);
@ -135,7 +137,7 @@ using double2 = clblast::double2;
template <int V>
void StartVariation(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneXgemv<half,V>, half>(argc, argv); break;
case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneXgemv<float,V>, float>(argc, argv); break;
case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneXgemv<double,V>, double>(argc, argv); break;
case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneXgemv<float2,V>, float2>(argc, argv); break;

View file

@ -85,9 +85,10 @@ class TuneXger {
std::vector<T> &x_vec, std::vector<T> &y_vec,
std::vector<T> &a_mat, std::vector<T> &, std::vector<T> &,
std::vector<T> &) {
auto alpha_buffer = std::vector<T>{args.alpha};
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentScalar(static_cast<int>(args.n));
tuner.AddArgumentScalar(args.alpha);
tuner.AddArgumentInput(alpha_buffer);
tuner.AddArgumentInput(x_vec);
tuner.AddArgumentScalar(0); // x_offset
tuner.AddArgumentScalar(1); // x_increment
@ -117,7 +118,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneXger<half>, half>(argc, argv); break;
case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneXger<float>, float>(argc, argv); break;
case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneXger<double>, double>(argc, argv); break;
case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneXger<float2>, float2>(argc, argv); break;

View file

@ -22,6 +22,56 @@
namespace clblast {
// =================================================================================================
// Provides the default scalar value (2.0), converted to the requested data-type
template <typename T>
T GetScalar() {
  const auto default_value = 2.0;
  return static_cast<T>(default_value);
}
// Explicit instantiations for the single and double-precision real data-types
template float GetScalar<float>();
template double GetScalar<double>();
// Half-precision variant of the default scalar: the value 2.0 is produced through the
// float-to-half conversion helper rather than through a static_cast
template <>
half GetScalar<half>() {
  const auto default_value = 2.0f;
  return FloatToHalf(default_value);
}
// Complex variants of the default scalar: real part 2.0 and imaginary part 0.5
template <>
float2 GetScalar<float2>() {
  return float2{2.0f, 0.5f};
}
template <>
double2 GetScalar<double2>() {
  return double2{2.0, 0.5};
}
// Provides the constant 1, converted to the requested data-type
template <typename T>
T ConstantOne() {
  const auto one = 1.0;
  return static_cast<T>(one);
}
// Explicit instantiations for the single and double-precision real data-types
template float ConstantOne<float>();
template double ConstantOne<double>();
// Half-precision variant of the constant 1: produced through the float-to-half conversion helper
template <>
half ConstantOne<half>() {
  const auto one = 1.0f;
  return FloatToHalf(one);
}
// Complex variants of the constant 1: real part 1.0 and imaginary part 0.0
template <>
float2 ConstantOne<float2>() {
  return float2{1.0f, 0.0f};
}
template <>
double2 ConstantOne<double2>() {
  return double2{1.0, 0.0};
}
// =================================================================================================
// Implements the string conversion using std::to_string if possible
template <typename T>
std::string ToString(T value) {
@ -48,6 +98,12 @@ std::string ToString(double2 value) {
return real.str()+"+"+imag.str()+"i";
}
// Half-precision values cannot be passed to std::to_string directly: they are first converted to
// single-precision and that float value is then turned into a string
template <>
std::string ToString(half value) {
  const float as_float = HalfToFloat(value);
  return std::to_string(as_float);
}
// If not possible directly: special cases for CLBlast data-types
template <>
std::string ToString(Layout value) {
@ -105,6 +161,9 @@ template <typename T>
T ConvertArgument(const char* value) {
return static_cast<T>(std::stoi(value));
}
// Half-precision variant: parses the text as a double, narrows it to float and converts to half
template <> half ConvertArgument(const char* value) {
  const auto parsed = std::stod(value);
  const auto narrowed = static_cast<float>(parsed);
  return FloatToHalf(narrowed);
}
// Single-precision variant: parses the text as a double and narrows the result to float
template <> float ConvertArgument(const char* value) {
  const auto parsed = std::stod(value);
  return static_cast<float>(parsed);
}
@ -147,6 +206,7 @@ T GetArgument(const int argc, char *argv[], std::string &help,
// Compiles the above function
template int GetArgument<int>(const int, char **, std::string&, const std::string&, const int);
template size_t GetArgument<size_t>(const int, char **, std::string&, const std::string&, const size_t);
template half GetArgument<half>(const int, char **, std::string&, const std::string&, const half);
template float GetArgument<float>(const int, char **, std::string&, const std::string&, const float);
template double GetArgument<double>(const int, char **, std::string&, const std::string&, const double);
template float2 GetArgument<float2>(const int, char **, std::string&, const std::string&, const float2);
@ -227,24 +287,49 @@ void PopulateVector(std::vector<double2> &vector) {
for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); }
}
// Half-precision variant: draws uniformly distributed single-precision values between the
// test-data limits and stores each of them converted to half-precision
template <>
void PopulateVector(std::vector<half> &vector) {
  const auto range_begin = static_cast<float>(kTestDataLowerLimit);
  const auto range_end = static_cast<float>(kTestDataUpperLimit);
  std::mt19937 generator(GetRandomSeed());
  std::uniform_real_distribution<float> distribution(range_begin, range_end);
  for (auto index = size_t{0}; index < vector.size(); ++index) {
    vector[index] = FloatToHalf(distribution(generator));
  }
}
// =================================================================================================
// Returns a scalar with a default value
template <typename T>
T GetScalar() {
return static_cast<T>(2.0);
// Conversion between half and single-precision
std::vector<float> HalfToFloatBuffer(const std::vector<half>& source) {
auto result = std::vector<float>(source.size());
for (auto i = size_t(0); i < source.size(); ++i) { result[i] = HalfToFloat(source[i]); }
return result;
}
void FloatToHalfBuffer(std::vector<half>& result, const std::vector<float>& source) {
for (auto i = size_t(0); i < source.size(); ++i) { result[i] = FloatToHalf(source[i]); }
}
template float GetScalar<float>();
template double GetScalar<double>();
// Specialized versions of the above for complex data-types
template <>
float2 GetScalar() {
return {2.0f, 0.5f};
// Same conversion, but operating on OpenCL buffers instead of std::vectors: reads the
// half-precision buffer into a host vector, converts it on the host, and writes the outcome into
// a newly created single-precision buffer belonging to the context of the supplied queue
Buffer<float> HalfToFloatBuffer(const Buffer<half>& source, cl_command_queue queue_raw) {
  const auto num_elements = source.GetSize() / sizeof(half);
  auto queue = Queue(queue_raw);
  auto context = queue.GetContext();

  // Reads the half-precision data and converts it element by element
  auto host_halfs = std::vector<half>(num_elements);
  source.Read(queue, num_elements, host_halfs);
  auto host_floats = HalfToFloatBuffer(host_halfs);

  // Creates the single-precision buffer and fills it with the converted data
  auto converted = Buffer<float>(context, num_elements);
  converted.Write(queue, num_elements, host_floats);
  return converted;
}
template <>
double2 GetScalar() {
return {2.0, 0.5};
// Converts the contents of a single-precision OpenCL buffer into half-precision and writes the
// outcome into the existing 'result' buffer. The number of elements is derived from the size of
// 'source'; the data is read into a host vector, converted on the host, and written to 'result'.
// No context is needed here because no new buffer is created: 'result' is supplied by the caller.
void FloatToHalfBuffer(Buffer<half>& result, const Buffer<float>& source, cl_command_queue queue_raw) {
  const auto size = source.GetSize() / sizeof(float);
  auto queue = Queue(queue_raw);

  // Reads the single-precision data into host memory
  auto source_cpu = std::vector<float>(size);
  source.Read(queue, size, source_cpu);

  // Converts on the host and stores the half-precision values in the caller's buffer
  auto result_cpu = std::vector<half>(size);
  FloatToHalfBuffer(result_cpu, source_cpu);
  result.Write(queue, size, result_cpu);
}
// =================================================================================================
@ -288,6 +373,10 @@ template <> bool PrecisionSupported<double2>(const Device &device) {
auto extensions = device.Capabilities();
return (extensions.find(kKhronosDoublePrecision) == std::string::npos) ? false : true;
}
// Half-precision is reported as supported only when the device's capabilities string contains the
// half-precision extension name
template <> bool PrecisionSupported<half>(const Device &device) {
  const auto capabilities = device.Capabilities();
  const auto position = capabilities.find(kKhronosHalfPrecision);
  return position != std::string::npos;
}
// =================================================================================================
} // namespace clblast

View file

@ -22,6 +22,7 @@ int main(int argc, char *argv[]) {
clblast::RunTests<clblast::TestXamax<double>, double, double>(argc, argv, true, "iDAMAX");
clblast::RunTests<clblast::TestXamax<float2>, float2, float2>(argc, argv, true, "iCAMAX");
clblast::RunTests<clblast::TestXamax<double2>, double2, double2>(argc, argv, true, "iZAMAX");
clblast::RunTests<clblast::TestXamax<half>, half, half>(argc, argv, true, "iHAMAX");
return 0;
}

View file

@ -22,6 +22,7 @@ int main(int argc, char *argv[]) {
clblast::RunTests<clblast::TestXasum<double>, double, double>(argc, argv, true, "DASUM");
clblast::RunTests<clblast::TestXasum<float2>, float2, float2>(argc, argv, true, "ScASUM");
clblast::RunTests<clblast::TestXasum<double2>, double2, double2>(argc, argv, true, "DzASUM");
clblast::RunTests<clblast::TestXasum<half>, half, half>(argc, argv, true, "HASUM");
return 0;
}

View file

@ -22,6 +22,7 @@ int main(int argc, char *argv[]) {
clblast::RunTests<clblast::TestXaxpy<double>, double, double>(argc, argv, true, "DAXPY");
clblast::RunTests<clblast::TestXaxpy<float2>, float2, float2>(argc, argv, true, "CAXPY");
clblast::RunTests<clblast::TestXaxpy<double2>, double2, double2>(argc, argv, true, "ZAXPY");
clblast::RunTests<clblast::TestXaxpy<half>, half, half>(argc, argv, true, "HAXPY");
return 0;
}

View file

@ -22,6 +22,7 @@ int main(int argc, char *argv[]) {
clblast::RunTests<clblast::TestXcopy<double>, double, double>(argc, argv, true, "DCOPY");
clblast::RunTests<clblast::TestXcopy<float2>, float2, float2>(argc, argv, true, "CCOPY");
clblast::RunTests<clblast::TestXcopy<double2>, double2, double2>(argc, argv, true, "ZCOPY");
clblast::RunTests<clblast::TestXcopy<half>, half, half>(argc, argv, true, "HCOPY");
return 0;
}

View file

@ -20,6 +20,7 @@ using double2 = clblast::double2;
// Entry point of the xDOT test program: runs the test-suite for the float, double and half
// data-types in that order, labelled SDOT, DDOT and HDOT, and always returns 0.
// NOTE(review): the boolean argument is 'false' for the first call only; its meaning is defined
// by RunTests and is not visible here — confirm before changing it.
int main(int argc, char *argv[]) {
clblast::RunTests<clblast::TestXdot<float>, float, float>(argc, argv, false, "SDOT");
clblast::RunTests<clblast::TestXdot<double>, double, double>(argc, argv, true, "DDOT");
clblast::RunTests<clblast::TestXdot<half>, half, half>(argc, argv, true, "HDOT");
return 0;
}

View file

@ -22,6 +22,7 @@ int main(int argc, char *argv[]) {
clblast::RunTests<clblast::TestXnrm2<double>, double, double>(argc, argv, true, "DNRM2");
clblast::RunTests<clblast::TestXnrm2<float2>, float2, float2>(argc, argv, true, "ScNRM2");
clblast::RunTests<clblast::TestXnrm2<double2>, double2, double2>(argc, argv, true, "DzNRM2");
clblast::RunTests<clblast::TestXnrm2<half>, half, half>(argc, argv, true, "HNRM2");
return 0;
}

View file

@ -22,6 +22,7 @@ int main(int argc, char *argv[]) {
clblast::RunTests<clblast::TestXscal<double>, double, double>(argc, argv, true, "DSCAL");
clblast::RunTests<clblast::TestXscal<float2>, float2, float2>(argc, argv, true, "CSCAL");
clblast::RunTests<clblast::TestXscal<double2>, double2, double2>(argc, argv, true, "ZSCAL");
clblast::RunTests<clblast::TestXscal<half>, half, half>(argc, argv, true, "HSCAL");
return 0;
}

View file

@ -22,6 +22,7 @@ int main(int argc, char *argv[]) {
clblast::RunTests<clblast::TestXswap<double>, double, double>(argc, argv, true, "DSWAP");
clblast::RunTests<clblast::TestXswap<float2>, float2, float2>(argc, argv, true, "CSWAP");
clblast::RunTests<clblast::TestXswap<double2>, double2, double2>(argc, argv, true, "ZSWAP");
clblast::RunTests<clblast::TestXswap<half>, half, half>(argc, argv, true, "HSWAP");
return 0;
}

View file

@ -22,6 +22,7 @@ int main(int argc, char *argv[]) {
clblast::RunTests<clblast::TestXgbmv<double>, double, double>(argc, argv, true, "DGBMV");
clblast::RunTests<clblast::TestXgbmv<float2>, float2, float2>(argc, argv, true, "CGBMV");
clblast::RunTests<clblast::TestXgbmv<double2>, double2, double2>(argc, argv, true, "ZGBMV");
clblast::RunTests<clblast::TestXgbmv<half>, half, half>(argc, argv, true, "HGBMV");
return 0;
}

View file

@ -22,6 +22,7 @@ int main(int argc, char *argv[]) {
clblast::RunTests<clblast::TestXgemv<double>, double, double>(argc, argv, true, "DGEMV");
clblast::RunTests<clblast::TestXgemv<float2>, float2, float2>(argc, argv, true, "CGEMV");
clblast::RunTests<clblast::TestXgemv<double2>, double2, double2>(argc, argv, true, "ZGEMV");
clblast::RunTests<clblast::TestXgemv<half>, half, half>(argc, argv, true, "HGEMV");
return 0;
}

View file

@ -20,6 +20,7 @@ using double2 = clblast::double2;
// Entry point of the xGER test program: runs the test-suite for the float, double and half
// data-types in that order, labelled SGER, DGER and HGER, and always returns 0.
// NOTE(review): the boolean argument is 'false' for the first call only; its meaning is defined
// by RunTests and is not visible here — confirm before changing it.
int main(int argc, char *argv[]) {
clblast::RunTests<clblast::TestXger<float>, float, float>(argc, argv, false, "SGER");
clblast::RunTests<clblast::TestXger<double>, double, double>(argc, argv, true, "DGER");
clblast::RunTests<clblast::TestXger<half>, half, half>(argc, argv, true, "HGER");
return 0;
}

View file

@ -20,6 +20,7 @@ using double2 = clblast::double2;
// Entry point of the xSBMV test program: runs the test-suite for the float, double and half
// data-types in that order, labelled SSBMV, DSBMV and HSBMV, and always returns 0.
// NOTE(review): the boolean argument is 'false' for the first call only; its meaning is defined
// by RunTests and is not visible here — confirm before changing it.
int main(int argc, char *argv[]) {
clblast::RunTests<clblast::TestXsbmv<float>, float, float>(argc, argv, false, "SSBMV");
clblast::RunTests<clblast::TestXsbmv<double>, double, double>(argc, argv, true, "DSBMV");
clblast::RunTests<clblast::TestXsbmv<half>, half, half>(argc, argv, true, "HSBMV");
return 0;
}

View file

@ -20,6 +20,7 @@ using double2 = clblast::double2;
// Entry point of the xSPMV test program: runs the test-suite for the float, double and half
// data-types in that order, labelled SSPMV, DSPMV and HSPMV, and always returns 0.
// NOTE(review): the boolean argument is 'false' for the first call only; its meaning is defined
// by RunTests and is not visible here — confirm before changing it.
int main(int argc, char *argv[]) {
clblast::RunTests<clblast::TestXspmv<float>, float, float>(argc, argv, false, "SSPMV");
clblast::RunTests<clblast::TestXspmv<double>, double, double>(argc, argv, true, "DSPMV");
clblast::RunTests<clblast::TestXspmv<half>, half, half>(argc, argv, true, "HSPMV");
return 0;
}

View file

@ -20,6 +20,7 @@ using double2 = clblast::double2;
// Entry point of the xSPR test program: runs the test-suite for the float, double and half
// data-types in that order, labelled SSPR, DSPR and HSPR, and always returns 0.
// NOTE(review): the boolean argument is 'false' for the first call only; its meaning is defined
// by RunTests and is not visible here — confirm before changing it.
int main(int argc, char *argv[]) {
clblast::RunTests<clblast::TestXspr<float>, float, float>(argc, argv, false, "SSPR");
clblast::RunTests<clblast::TestXspr<double>, double, double>(argc, argv, true, "DSPR");
clblast::RunTests<clblast::TestXspr<half>, half, half>(argc, argv, true, "HSPR");
return 0;
}

View file

@ -20,6 +20,7 @@ using double2 = clblast::double2;
// Entry point of the xSPR2 test program: runs the test-suite for the float, double and half
// data-types in that order, labelled SSPR2, DSPR2 and HSPR2, and always returns 0.
// NOTE(review): the boolean argument is 'false' for the first call only; its meaning is defined
// by RunTests and is not visible here — confirm before changing it.
int main(int argc, char *argv[]) {
clblast::RunTests<clblast::TestXspr2<float>, float, float>(argc, argv, false, "SSPR2");
clblast::RunTests<clblast::TestXspr2<double>, double, double>(argc, argv, true, "DSPR2");
clblast::RunTests<clblast::TestXspr2<half>, half, half>(argc, argv, true, "HSPR2");
return 0;
}

View file

@ -20,6 +20,7 @@ using double2 = clblast::double2;
// Entry point of the xSYMV test program: runs the test-suite for the float, double and half
// data-types in that order, labelled SSYMV, DSYMV and HSYMV, and always returns 0.
// NOTE(review): the boolean argument is 'false' for the first call only; its meaning is defined
// by RunTests and is not visible here — confirm before changing it.
int main(int argc, char *argv[]) {
clblast::RunTests<clblast::TestXsymv<float>, float, float>(argc, argv, false, "SSYMV");
clblast::RunTests<clblast::TestXsymv<double>, double, double>(argc, argv, true, "DSYMV");
clblast::RunTests<clblast::TestXsymv<half>, half, half>(argc, argv, true, "HSYMV");
return 0;
}

View file

@ -20,6 +20,7 @@ using double2 = clblast::double2;
// Entry point of the xSYR test program: runs the test-suite for the float, double and half
// data-types in that order, labelled SSYR, DSYR and HSYR, and always returns 0.
// NOTE(review): the boolean argument is 'false' for the first call only; its meaning is defined
// by RunTests and is not visible here — confirm before changing it.
int main(int argc, char *argv[]) {
clblast::RunTests<clblast::TestXsyr<float>, float, float>(argc, argv, false, "SSYR");
clblast::RunTests<clblast::TestXsyr<double>, double, double>(argc, argv, true, "DSYR");
clblast::RunTests<clblast::TestXsyr<half>, half, half>(argc, argv, true, "HSYR");
return 0;
}

View file

@ -20,6 +20,7 @@ using double2 = clblast::double2;
// Entry point of the xSYR2 test program: runs the test-suite for the float, double and half
// data-types in that order, labelled SSYR2, DSYR2 and HSYR2, and always returns 0.
// NOTE(review): the boolean argument is 'false' for the first call only; its meaning is defined
// by RunTests and is not visible here — confirm before changing it.
int main(int argc, char *argv[]) {
clblast::RunTests<clblast::TestXsyr2<float>, float, float>(argc, argv, false, "SSYR2");
clblast::RunTests<clblast::TestXsyr2<double>, double, double>(argc, argv, true, "DSYR2");
clblast::RunTests<clblast::TestXsyr2<half>, half, half>(argc, argv, true, "HSYR2");
return 0;
}

View file

@ -22,6 +22,7 @@ int main(int argc, char *argv[]) {
clblast::RunTests<clblast::TestXtbmv<double>, double, double>(argc, argv, true, "DTBMV");
clblast::RunTests<clblast::TestXtbmv<float2>, float2, float2>(argc, argv, true, "CTBMV");
clblast::RunTests<clblast::TestXtbmv<double2>, double2, double2>(argc, argv, true, "ZTBMV");
clblast::RunTests<clblast::TestXtbmv<half>, half, half>(argc, argv, true, "HTBMV");
return 0;
}

Some files were not shown because too many files have changed in this diff Show more