mirror of
https://github.com/CNugteren/CLBlast.git
synced 2024-07-07 12:23:46 +02:00
Merge branch 'half_precision' into development
This commit is contained in:
commit
61105e3810
11
CHANGELOG
11
CHANGELOG
|
@ -1,6 +1,15 @@
|
|||
|
||||
Development version (next release)
|
||||
-
|
||||
- Added support for half-precision floating-point (fp16) in the library
|
||||
- Added half-precision routines:
|
||||
* Level-1: HSWAP/HSCAL/HCOPY/HAXPY/HDOT/HNRM2/HASUM/HSUM/iHAMAX/iHMAX/iHMIN
|
||||
* Level-2: HGEMV/HGBMV/HHEMV/HHBMV/HHPMV/HSYMV/HSBMV/HSPMV/HTRMV/HTBMV/HTPMV/HGER/HSYR/HSPR/HSYR2/HSPR2
|
||||
* Level-3: HGEMM/HSYMM/HSYRK/HSYR2K/HTRMM
|
||||
|
||||
Version 0.7.1
|
||||
- Improved performance of large power-of-2 xGEMM kernels for AMD GPUs
|
||||
- Fixed a bug in the xGEMM routine related to the event incorrectly set
|
||||
- Made MSVC link the run-time libraries statically
|
||||
|
||||
Version 0.7.1
|
||||
- Improved performance of large power-of-2 xGEMM kernels for AMD GPUs
|
||||
|
|
|
@ -125,7 +125,7 @@ include_directories(${clblast_SOURCE_DIR}/include ${OPENCL_INCLUDE_DIRS})
|
|||
# Sets the supported routines and the used kernels. New routines and kernels should be added here.
|
||||
set(KERNELS copy pad transpose padtranspose xaxpy xdot xger xgemm xgemv)
|
||||
set(SAMPLE_PROGRAMS_CPP sgemm)
|
||||
set(SAMPLE_PROGRAMS_C sasum dgemv sgemm cache)
|
||||
set(SAMPLE_PROGRAMS_C sasum dgemv sgemm haxpy cache)
|
||||
set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
|
||||
set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv
|
||||
xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
|
||||
|
@ -156,6 +156,7 @@ target_link_libraries(clblast ${OPENCL_LIBRARIES})
|
|||
install(TARGETS clblast DESTINATION lib)
|
||||
install(FILES include/clblast.h DESTINATION include)
|
||||
install(FILES include/clblast_c.h DESTINATION include)
|
||||
install(FILES include/clblast_half.h DESTINATION include)
|
||||
|
||||
# ==================================================================================================
|
||||
|
||||
|
|
127
README.md
127
README.md
|
@ -20,6 +20,7 @@ Use CLBlast instead of clBLAS:
|
|||
* When you are still running on OpenCL 1.1 hardware.
|
||||
* When you value an organized and modern C++ codebase.
|
||||
* When you target Intel CPUs and GPUs or embedded devices
|
||||
* When you can benefit from the increased performance of half-precision fp16 data-types.
|
||||
|
||||
Use CLBlast instead of cuBLAS:
|
||||
|
||||
|
@ -127,7 +128,7 @@ If your device is not (yet) among this list or if you want to tune CLBlast for s
|
|||
|
||||
cmake -DTUNERS=ON ..
|
||||
|
||||
Note that CLBlast's tuners are based on the CLTune auto-tuning library, which has to be installed separately (version 1.7.0 or higher). CLTune is available from GitHub.
|
||||
Note that CLBlast's tuners are based on the CLTune auto-tuning library, which has to be installed separately (version 2.3.1 or higher). CLTune is available from GitHub.
|
||||
|
||||
Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels correspond to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance. Running `make alltuners` runs all tuners for all precisions in one go. You can set the default device and platform for `alltuners` by setting the `DEFAULT_DEVICE` and `DEFAULT_PLATFORM` environment variables before running CMake.
|
||||
|
||||
|
@ -177,64 +178,70 @@ These graphs can be generated automatically on your own device. First, compile C
|
|||
Supported routines
|
||||
-------------
|
||||
|
||||
CLBlast is in active development but already supports almost all the BLAS routines. The supported routines are marked with '✔' in the following tables. Routines marked with '-' do not exist: they are not part of BLAS at all.
|
||||
CLBlast is in active development but already supports almost all the BLAS routines. The supported routines are marked with '✔' in the following tables. Routines marked with '-' do not exist: they are not part of BLAS at all. The different data-types supported by the library are:
|
||||
|
||||
| Level-1 | S | D | C | Z |
|
||||
| ---------|---|---|---|---|
|
||||
| xSWAP | ✔ | ✔ | ✔ | ✔ |
|
||||
| xSCAL | ✔ | ✔ | ✔ | ✔ |
|
||||
| xCOPY | ✔ | ✔ | ✔ | ✔ |
|
||||
| xAXPY | ✔ | ✔ | ✔ | ✔ |
|
||||
| xDOT | ✔ | ✔ | - | - |
|
||||
| xDOTU | - | - | ✔ | ✔ |
|
||||
| xDOTC | - | - | ✔ | ✔ |
|
||||
| xNRM2 | ✔ | ✔ | ✔ | ✔ |
|
||||
| xASUM | ✔ | ✔ | ✔ | ✔ |
|
||||
| IxAMAX | ✔ | ✔ | ✔ | ✔ |
|
||||
* __S:__ Single-precision 32-bit floating-point (`float`).
|
||||
* __D:__ Double-precision 64-bit floating-point (`double`).
|
||||
* __C:__ Complex single-precision 2x32-bit floating-point (`std::complex<float>`).
|
||||
* __Z:__ Complex double-precision 2x64-bit floating-point (`std::complex<double>`).
|
||||
* __H:__ Half-precision 16-bit floating-point (`cl_half`). See section 'Half precision' for more information.
|
||||
|
||||
| Level-2 | S | D | C | Z |
|
||||
| ---------|---|---|---|---|
|
||||
| xGEMV | ✔ | ✔ | ✔ | ✔ |
|
||||
| xGBMV | ✔ | ✔ | ✔ | ✔ |
|
||||
| xHEMV | - | - | ✔ | ✔ |
|
||||
| xHBMV | - | - | ✔ | ✔ |
|
||||
| xHPMV | - | - | ✔ | ✔ |
|
||||
| xSYMV | ✔ | ✔ | - | - |
|
||||
| xSBMV | ✔ | ✔ | - | - |
|
||||
| xSPMV | ✔ | ✔ | - | - |
|
||||
| xTRMV | ✔ | ✔ | ✔ | ✔ |
|
||||
| xTBMV | ✔ | ✔ | ✔ | ✔ |
|
||||
| xTPMV | ✔ | ✔ | ✔ | ✔ |
|
||||
| xGER | ✔ | ✔ | - | - |
|
||||
| xGERU | - | - | ✔ | ✔ |
|
||||
| xGERC | - | - | ✔ | ✔ |
|
||||
| xHER | - | - | ✔ | ✔ |
|
||||
| xHPR | - | - | ✔ | ✔ |
|
||||
| xHER2 | - | - | ✔ | ✔ |
|
||||
| xHPR2 | - | - | ✔ | ✔ |
|
||||
| xSYR | ✔ | ✔ | - | - |
|
||||
| xSPR | ✔ | ✔ | - | - |
|
||||
| xSYR2 | ✔ | ✔ | - | - |
|
||||
| xSPR2 | ✔ | ✔ | - | - |
|
||||
| Level-1 | S | D | C | Z | H |
|
||||
| ---------|---|---|---|---|---|
|
||||
| xSWAP | ✔ | ✔ | ✔ | ✔ | ✔ |
|
||||
| xSCAL | ✔ | ✔ | ✔ | ✔ | ✔ |
|
||||
| xCOPY | ✔ | ✔ | ✔ | ✔ | ✔ |
|
||||
| xAXPY | ✔ | ✔ | ✔ | ✔ | ✔ |
|
||||
| xDOT | ✔ | ✔ | - | - | ✔ |
|
||||
| xDOTU | - | - | ✔ | ✔ | - |
|
||||
| xDOTC | - | - | ✔ | ✔ | - |
|
||||
| xNRM2 | ✔ | ✔ | ✔ | ✔ | ✔ |
|
||||
| xASUM | ✔ | ✔ | ✔ | ✔ | ✔ |
|
||||
| IxAMAX | ✔ | ✔ | ✔ | ✔ | ✔ |
|
||||
|
||||
| Level-3 | S | D | C | Z |
|
||||
| ---------|---|---|---|---|
|
||||
| xGEMM | ✔ | ✔ | ✔ | ✔ |
|
||||
| xSYMM | ✔ | ✔ | ✔ | ✔ |
|
||||
| xHEMM | - | - | ✔ | ✔ |
|
||||
| xSYRK | ✔ | ✔ | ✔ | ✔ |
|
||||
| xHERK | - | - | ✔ | ✔ |
|
||||
| xSYR2K | ✔ | ✔ | ✔ | ✔ |
|
||||
| xHER2K | - | - | ✔ | ✔ |
|
||||
| xTRMM | ✔ | ✔ | ✔ | ✔ |
|
||||
| Level-2 | S | D | C | Z | H |
|
||||
| ---------|---|---|---|---|---|
|
||||
| xGEMV | ✔ | ✔ | ✔ | ✔ | ✔ |
|
||||
| xGBMV | ✔ | ✔ | ✔ | ✔ | ✔ |
|
||||
| xHEMV | - | - | ✔ | ✔ | - |
|
||||
| xHBMV | - | - | ✔ | ✔ | - |
|
||||
| xHPMV | - | - | ✔ | ✔ | - |
|
||||
| xSYMV | ✔ | ✔ | - | - | ✔ |
|
||||
| xSBMV | ✔ | ✔ | - | - | ✔ |
|
||||
| xSPMV | ✔ | ✔ | - | - | ✔ |
|
||||
| xTRMV | ✔ | ✔ | ✔ | ✔ | ✔ |
|
||||
| xTBMV | ✔ | ✔ | ✔ | ✔ | ✔ |
|
||||
| xTPMV | ✔ | ✔ | ✔ | ✔ | ✔ |
|
||||
| xGER | ✔ | ✔ | - | - | ✔ |
|
||||
| xGERU | - | - | ✔ | ✔ | - |
|
||||
| xGERC | - | - | ✔ | ✔ | - |
|
||||
| xHER | - | - | ✔ | ✔ | - |
|
||||
| xHPR | - | - | ✔ | ✔ | - |
|
||||
| xHER2 | - | - | ✔ | ✔ | - |
|
||||
| xHPR2 | - | - | ✔ | ✔ | - |
|
||||
| xSYR | ✔ | ✔ | - | - | ✔ |
|
||||
| xSPR | ✔ | ✔ | - | - | ✔ |
|
||||
| xSYR2 | ✔ | ✔ | - | - | ✔ |
|
||||
| xSPR2 | ✔ | ✔ | - | - | ✔ |
|
||||
|
||||
| Level-3 | S | D | C | Z | H |
|
||||
| ---------|---|---|---|---|---|
|
||||
| xGEMM | ✔ | ✔ | ✔ | ✔ | ✔ |
|
||||
| xSYMM | ✔ | ✔ | ✔ | ✔ | ✔ |
|
||||
| xHEMM | - | - | ✔ | ✔ | - |
|
||||
| xSYRK | ✔ | ✔ | ✔ | ✔ | ✔ |
|
||||
| xHERK | - | - | ✔ | ✔ | - |
|
||||
| xSYR2K | ✔ | ✔ | ✔ | ✔ | ✔ |
|
||||
| xHER2K | - | - | ✔ | ✔ | - |
|
||||
| xTRMM | ✔ | ✔ | ✔ | ✔ | ✔ |
|
||||
|
||||
In addition, some non-BLAS routines are also supported by CLBlast. They are experimental and should be used with care:
|
||||
|
||||
| Additional | S | D | C | Z |
|
||||
| -----------|---|---|---|---|
|
||||
| xSUM | ✔ | ✔ | ✔ | ✔ |
|
||||
| IxMAX | ✔ | ✔ | ✔ | ✔ |
|
||||
| IxMIN | ✔ | ✔ | ✔ | ✔ |
|
||||
| Additional | S | D | C | Z | H |
|
||||
| -----------|---|---|---|---|---|
|
||||
| xSUM | ✔ | ✔ | ✔ | ✔ | ✔ |
|
||||
| IxMAX | ✔ | ✔ | ✔ | ✔ | ✔ |
|
||||
| IxMIN | ✔ | ✔ | ✔ | ✔ | ✔ |
|
||||
|
||||
Some BLAS routines are not supported yet by CLBlast. They are shown in the following table:
|
||||
|
||||
|
@ -250,6 +257,19 @@ Some BLAS routines are not supported yet by CLBlast. They are shown in the follo
|
|||
| xTRSM | | | | |
|
||||
|
||||
|
||||
Half precision (fp16)
|
||||
-------------
|
||||
|
||||
The half-precision fp16 format is a 16-bit floating-point data-type. Some OpenCL devices support the `cl_khr_fp16` extension, reducing storage and bandwidth requirements by a factor of 2 compared to single-precision floating-point. In case the hardware also accelerates arithmetic on half-precision data-types, this can also greatly improve compute performance of e.g. level-3 routines such as GEMM. Devices which can benefit from this are among others Intel GPUs, ARM Mali GPUs, and NVIDIA's latest Pascal GPUs. Half-precision is of particular interest to the deep-learning community, in which convolutional neural networks can be processed much faster at a minor accuracy loss.
|
||||
|
||||
Since there is no half-precision data-type in C or C++, OpenCL provides the `cl_half` type for the host device. Unfortunately, internally this translates to a 16-bit integer, so computations on the host using this data-type should be avoided. For convenience, CLBlast provides the `clblast_half.h` header (C99 and C++ compatible), defining the `half` type as a short-hand for `cl_half` and the following basic functions:
|
||||
|
||||
* `half FloatToHalf(const float value)`: Converts a 32-bit floating-point value to a 16-bit floating-point value.
|
||||
* `float HalfToFloat(const half value)`: Converts a 16-bit floating-point value to a 32-bit floating-point value.
|
||||
|
||||
The `/samples` folder contains examples of how to use these convenience functions when calling one of the half-precision BLAS routines.
|
||||
|
||||
|
||||
Contributing
|
||||
-------------
|
||||
|
||||
|
@ -270,6 +290,7 @@ Tuning and testing on a variety of OpenCL devices was made possible by:
|
|||
* [dividiti](http://www.dividiti.com)
|
||||
* [SURFsara HPC center](http://www.surfsara.com)
|
||||
|
||||
|
||||
Support us
|
||||
-------------
|
||||
|
||||
|
|
171
doc/clblast.md
171
doc/clblast.md
|
@ -34,6 +34,10 @@ StatusCode CLBlastZswap(const size_t n,
|
|||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
StatusCode CLBlastHswap(const size_t n,
|
||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
```
|
||||
|
||||
Arguments to SWAP:
|
||||
|
@ -82,6 +86,10 @@ StatusCode CLBlastZscal(const size_t n,
|
|||
const cl_double2 alpha,
|
||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
StatusCode CLBlastHscal(const size_t n,
|
||||
const cl_half alpha,
|
||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
```
|
||||
|
||||
Arguments to SCAL:
|
||||
|
@ -128,6 +136,10 @@ StatusCode CLBlastZcopy(const size_t n,
|
|||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
StatusCode CLBlastHcopy(const size_t n,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
```
|
||||
|
||||
Arguments to COPY:
|
||||
|
@ -181,6 +193,11 @@ StatusCode CLBlastZaxpy(const size_t n,
|
|||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
StatusCode CLBlastHaxpy(const size_t n,
|
||||
const cl_half alpha,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
```
|
||||
|
||||
Arguments to AXPY:
|
||||
|
@ -225,6 +242,11 @@ StatusCode CLBlastDdot(const size_t n,
|
|||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
StatusCode CLBlastHdot(const size_t n,
|
||||
cl_mem dot_buffer, const size_t dot_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
```
|
||||
|
||||
Arguments to DOT:
|
||||
|
@ -371,6 +393,10 @@ StatusCode CLBlastDznrm2(const size_t n,
|
|||
cl_mem nrm2_buffer, const size_t nrm2_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
StatusCode CLBlastHnrm2(const size_t n,
|
||||
cl_mem nrm2_buffer, const size_t nrm2_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
```
|
||||
|
||||
Arguments to NRM2:
|
||||
|
@ -420,6 +446,10 @@ StatusCode CLBlastDzasum(const size_t n,
|
|||
cl_mem asum_buffer, const size_t asum_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
StatusCode CLBlastHasum(const size_t n,
|
||||
cl_mem asum_buffer, const size_t asum_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
```
|
||||
|
||||
Arguments to ASUM:
|
||||
|
@ -469,6 +499,10 @@ StatusCode CLBlastDzsum(const size_t n,
|
|||
cl_mem sum_buffer, const size_t sum_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
StatusCode CLBlastHsum(const size_t n,
|
||||
cl_mem sum_buffer, const size_t sum_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
```
|
||||
|
||||
Arguments to SUM:
|
||||
|
@ -518,6 +552,10 @@ StatusCode CLBlastiZamax(const size_t n,
|
|||
cl_mem imax_buffer, const size_t imax_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
StatusCode CLBlastiHamax(const size_t n,
|
||||
cl_mem imax_buffer, const size_t imax_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
```
|
||||
|
||||
Arguments to AMAX:
|
||||
|
@ -567,6 +605,10 @@ StatusCode CLBlastiZmax(const size_t n,
|
|||
cl_mem imax_buffer, const size_t imax_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
StatusCode CLBlastiHmax(const size_t n,
|
||||
cl_mem imax_buffer, const size_t imax_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
```
|
||||
|
||||
Arguments to MAX:
|
||||
|
@ -616,6 +658,10 @@ StatusCode CLBlastiZmin(const size_t n,
|
|||
cl_mem imin_buffer, const size_t imin_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
StatusCode CLBlastiHmin(const size_t n,
|
||||
cl_mem imin_buffer, const size_t imin_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
```
|
||||
|
||||
Arguments to MIN:
|
||||
|
@ -685,6 +731,14 @@ StatusCode CLBlastZgemv(const Layout layout, const Transpose a_transpose,
|
|||
const cl_double2 beta,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
StatusCode CLBlastHgemv(const Layout layout, const Transpose a_transpose,
|
||||
const size_t m, const size_t n,
|
||||
const cl_half alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const cl_half beta,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
```
|
||||
|
||||
Arguments to GEMV:
|
||||
|
@ -761,6 +815,14 @@ StatusCode CLBlastZgbmv(const Layout layout, const Transpose a_transpose,
|
|||
const cl_double2 beta,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
StatusCode CLBlastHgbmv(const Layout layout, const Transpose a_transpose,
|
||||
const size_t m, const size_t n, const size_t kl, const size_t ku,
|
||||
const cl_half alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const cl_half beta,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
```
|
||||
|
||||
Arguments to GBMV:
|
||||
|
@ -1000,6 +1062,14 @@ StatusCode CLBlastDsymv(const Layout layout, const Triangle triangle,
|
|||
const double beta,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
StatusCode CLBlastHsymv(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
const cl_half alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const cl_half beta,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
```
|
||||
|
||||
Arguments to SYMV:
|
||||
|
@ -1059,6 +1129,14 @@ StatusCode CLBlastDsbmv(const Layout layout, const Triangle triangle,
|
|||
const double beta,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
StatusCode CLBlastHsbmv(const Layout layout, const Triangle triangle,
|
||||
const size_t n, const size_t k,
|
||||
const cl_half alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const cl_half beta,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
```
|
||||
|
||||
Arguments to SBMV:
|
||||
|
@ -1119,6 +1197,14 @@ StatusCode CLBlastDspmv(const Layout layout, const Triangle triangle,
|
|||
const double beta,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
StatusCode CLBlastHspmv(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
const cl_half alpha,
|
||||
const cl_mem ap_buffer, const size_t ap_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const cl_half beta,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
```
|
||||
|
||||
Arguments to SPMV:
|
||||
|
@ -1178,6 +1264,11 @@ StatusCode CLBlastZtrmv(const Layout layout, const Triangle triangle, const Tran
|
|||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
StatusCode CLBlastHtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t n,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
```
|
||||
|
||||
Arguments to TRMV:
|
||||
|
@ -1235,6 +1326,11 @@ StatusCode CLBlastZtbmv(const Layout layout, const Triangle triangle, const Tran
|
|||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
StatusCode CLBlastHtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t n, const size_t k,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
```
|
||||
|
||||
Arguments to TBMV:
|
||||
|
@ -1293,6 +1389,11 @@ StatusCode CLBlastZtpmv(const Layout layout, const Triangle triangle, const Tran
|
|||
const cl_mem ap_buffer, const size_t ap_offset,
|
||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
StatusCode CLBlastHtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t n,
|
||||
const cl_mem ap_buffer, const size_t ap_offset,
|
||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
```
|
||||
|
||||
Arguments to TPMV:
|
||||
|
@ -1345,6 +1446,13 @@ StatusCode CLBlastDger(const Layout layout,
|
|||
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
StatusCode CLBlastHger(const Layout layout,
|
||||
const size_t m, const size_t n,
|
||||
const cl_half alpha,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
```
|
||||
|
||||
Arguments to GER:
|
||||
|
@ -1713,6 +1821,12 @@ StatusCode CLBlastDsyr(const Layout layout, const Triangle triangle,
|
|||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
StatusCode CLBlastHsyr(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
const cl_half alpha,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
```
|
||||
|
||||
Arguments to SYR:
|
||||
|
@ -1762,6 +1876,12 @@ StatusCode CLBlastDspr(const Layout layout, const Triangle triangle,
|
|||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_mem ap_buffer, const size_t ap_offset,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
StatusCode CLBlastHspr(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
const cl_half alpha,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_mem ap_buffer, const size_t ap_offset,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
```
|
||||
|
||||
Arguments to SPR:
|
||||
|
@ -1813,6 +1933,13 @@ StatusCode CLBlastDsyr2(const Layout layout, const Triangle triangle,
|
|||
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
StatusCode CLBlastHsyr2(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
const cl_half alpha,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
```
|
||||
|
||||
Arguments to SYR2:
|
||||
|
@ -1868,6 +1995,13 @@ StatusCode CLBlastDspr2(const Layout layout, const Triangle triangle,
|
|||
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_mem ap_buffer, const size_t ap_offset,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
StatusCode CLBlastHspr2(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
const cl_half alpha,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_mem ap_buffer, const size_t ap_offset,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
```
|
||||
|
||||
Arguments to SPR2:
|
||||
|
@ -1941,6 +2075,14 @@ StatusCode CLBlastZgemm(const Layout layout, const Transpose a_transpose, const
|
|||
const cl_double2 beta,
|
||||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
StatusCode CLBlastHgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
|
||||
const size_t m, const size_t n, const size_t k,
|
||||
const cl_half alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
const cl_half beta,
|
||||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
```
|
||||
|
||||
Arguments to GEMM:
|
||||
|
@ -2019,6 +2161,14 @@ StatusCode CLBlastZsymm(const Layout layout, const Side side, const Triangle tri
|
|||
const cl_double2 beta,
|
||||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
StatusCode CLBlastHsymm(const Layout layout, const Side side, const Triangle triangle,
|
||||
const size_t m, const size_t n,
|
||||
const cl_half alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
const cl_half beta,
|
||||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
```
|
||||
|
||||
Arguments to SYMM:
|
||||
|
@ -2152,6 +2302,13 @@ StatusCode CLBlastZsyrk(const Layout layout, const Triangle triangle, const Tran
|
|||
const cl_double2 beta,
|
||||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
StatusCode CLBlastHsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
|
||||
const size_t n, const size_t k,
|
||||
const cl_half alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const cl_half beta,
|
||||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
```
|
||||
|
||||
Arguments to SYRK:
|
||||
|
@ -2281,6 +2438,14 @@ StatusCode CLBlastZsyr2k(const Layout layout, const Triangle triangle, const Tra
|
|||
const cl_double2 beta,
|
||||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
StatusCode CLBlastHsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
|
||||
const size_t n, const size_t k,
|
||||
const cl_half alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
const cl_half beta,
|
||||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
```
|
||||
|
||||
Arguments to SYR2K:
|
||||
|
@ -2409,6 +2574,12 @@ StatusCode CLBlastZtrmm(const Layout layout, const Side side, const Triangle tri
|
|||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
StatusCode CLBlastHtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t m, const size_t n,
|
||||
const cl_half alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
cl_command_queue* queue, cl_event* event)
|
||||
```
|
||||
|
||||
Arguments to TRMM:
|
||||
|
|
|
@ -121,28 +121,28 @@ StatusCode Rotm(const size_t n,
|
|||
cl_mem sparam_buffer, const size_t sparam_offset,
|
||||
cl_command_queue* queue, cl_event* event = nullptr);
|
||||
|
||||
// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP
|
||||
// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP
|
||||
template <typename T>
|
||||
StatusCode Swap(const size_t n,
|
||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event = nullptr);
|
||||
|
||||
// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL
|
||||
// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL
|
||||
template <typename T>
|
||||
StatusCode Scal(const size_t n,
|
||||
const T alpha,
|
||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event = nullptr);
|
||||
|
||||
// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY
|
||||
// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY
|
||||
template <typename T>
|
||||
StatusCode Copy(const size_t n,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event = nullptr);
|
||||
|
||||
// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY
|
||||
// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY
|
||||
template <typename T>
|
||||
StatusCode Axpy(const size_t n,
|
||||
const T alpha,
|
||||
|
@ -150,7 +150,7 @@ StatusCode Axpy(const size_t n,
|
|||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event = nullptr);
|
||||
|
||||
// Dot product of two vectors: SDOT/DDOT
|
||||
// Dot product of two vectors: SDOT/DDOT/HDOT
|
||||
template <typename T>
|
||||
StatusCode Dot(const size_t n,
|
||||
cl_mem dot_buffer, const size_t dot_offset,
|
||||
|
@ -174,42 +174,42 @@ StatusCode Dotc(const size_t n,
|
|||
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event = nullptr);
|
||||
|
||||
// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2
|
||||
// Euclidean norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2
|
||||
template <typename T>
|
||||
StatusCode Nrm2(const size_t n,
|
||||
cl_mem nrm2_buffer, const size_t nrm2_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event = nullptr);
|
||||
|
||||
// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM
|
||||
// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM
|
||||
template <typename T>
|
||||
StatusCode Asum(const size_t n,
|
||||
cl_mem asum_buffer, const size_t asum_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event = nullptr);
|
||||
|
||||
// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM
|
||||
// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM
|
||||
template <typename T>
|
||||
StatusCode Sum(const size_t n,
|
||||
cl_mem sum_buffer, const size_t sum_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event = nullptr);
|
||||
|
||||
// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX
|
||||
// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX
|
||||
template <typename T>
|
||||
StatusCode Amax(const size_t n,
|
||||
cl_mem imax_buffer, const size_t imax_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event = nullptr);
|
||||
|
||||
// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX
|
||||
// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX
|
||||
template <typename T>
|
||||
StatusCode Max(const size_t n,
|
||||
cl_mem imax_buffer, const size_t imax_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event = nullptr);
|
||||
|
||||
// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN
|
||||
// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN
|
||||
template <typename T>
|
||||
StatusCode Min(const size_t n,
|
||||
cl_mem imin_buffer, const size_t imin_offset,
|
||||
|
@ -220,7 +220,7 @@ StatusCode Min(const size_t n,
|
|||
// BLAS level-2 (matrix-vector) routines
|
||||
// =================================================================================================
|
||||
|
||||
// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV
|
||||
// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV
|
||||
template <typename T>
|
||||
StatusCode Gemv(const Layout layout, const Transpose a_transpose,
|
||||
const size_t m, const size_t n,
|
||||
|
@ -231,7 +231,7 @@ StatusCode Gemv(const Layout layout, const Transpose a_transpose,
|
|||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event = nullptr);
|
||||
|
||||
// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV
|
||||
// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV
|
||||
template <typename T>
|
||||
StatusCode Gbmv(const Layout layout, const Transpose a_transpose,
|
||||
const size_t m, const size_t n, const size_t kl, const size_t ku,
|
||||
|
@ -275,7 +275,7 @@ StatusCode Hpmv(const Layout layout, const Triangle triangle,
|
|||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event = nullptr);
|
||||
|
||||
// Symmetric matrix-vector multiplication: SSYMV/DSYMV
|
||||
// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV
|
||||
template <typename T>
|
||||
StatusCode Symv(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
|
@ -286,7 +286,7 @@ StatusCode Symv(const Layout layout, const Triangle triangle,
|
|||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event = nullptr);
|
||||
|
||||
// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV
|
||||
// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV
|
||||
template <typename T>
|
||||
StatusCode Sbmv(const Layout layout, const Triangle triangle,
|
||||
const size_t n, const size_t k,
|
||||
|
@ -297,7 +297,7 @@ StatusCode Sbmv(const Layout layout, const Triangle triangle,
|
|||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event = nullptr);
|
||||
|
||||
// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV
|
||||
// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV
|
||||
template <typename T>
|
||||
StatusCode Spmv(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
|
@ -308,7 +308,7 @@ StatusCode Spmv(const Layout layout, const Triangle triangle,
|
|||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event = nullptr);
|
||||
|
||||
// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV
|
||||
// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV
|
||||
template <typename T>
|
||||
StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t n,
|
||||
|
@ -316,7 +316,7 @@ StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_
|
|||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event = nullptr);
|
||||
|
||||
// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV
|
||||
// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV
|
||||
template <typename T>
|
||||
StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t n, const size_t k,
|
||||
|
@ -324,7 +324,7 @@ StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_
|
|||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event = nullptr);
|
||||
|
||||
// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV
|
||||
// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV
|
||||
template <typename T>
|
||||
StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t n,
|
||||
|
@ -356,7 +356,7 @@ StatusCode Tpsv(const Layout layout, const Triangle triangle, const Transpose a_
|
|||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event = nullptr);
|
||||
|
||||
// General rank-1 matrix update: SGER/DGER
|
||||
// General rank-1 matrix update: SGER/DGER/HGER
|
||||
template <typename T>
|
||||
StatusCode Ger(const Layout layout,
|
||||
const size_t m, const size_t n,
|
||||
|
@ -424,7 +424,7 @@ StatusCode Hpr2(const Layout layout, const Triangle triangle,
|
|||
cl_mem ap_buffer, const size_t ap_offset,
|
||||
cl_command_queue* queue, cl_event* event = nullptr);
|
||||
|
||||
// Symmetric rank-1 matrix update: SSYR/DSYR
|
||||
// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR
|
||||
template <typename T>
|
||||
StatusCode Syr(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
|
@ -433,7 +433,7 @@ StatusCode Syr(const Layout layout, const Triangle triangle,
|
|||
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_command_queue* queue, cl_event* event = nullptr);
|
||||
|
||||
// Symmetric packed rank-1 matrix update: SSPR/DSPR
|
||||
// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR
|
||||
template <typename T>
|
||||
StatusCode Spr(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
|
@ -442,7 +442,7 @@ StatusCode Spr(const Layout layout, const Triangle triangle,
|
|||
cl_mem ap_buffer, const size_t ap_offset,
|
||||
cl_command_queue* queue, cl_event* event = nullptr);
|
||||
|
||||
// Symmetric rank-2 matrix update: SSYR2/DSYR2
|
||||
// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2
|
||||
template <typename T>
|
||||
StatusCode Syr2(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
|
@ -452,7 +452,7 @@ StatusCode Syr2(const Layout layout, const Triangle triangle,
|
|||
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_command_queue* queue, cl_event* event = nullptr);
|
||||
|
||||
// Symmetric packed rank-2 matrix update: SSPR2/DSPR2
|
||||
// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2
|
||||
template <typename T>
|
||||
StatusCode Spr2(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
|
@ -466,7 +466,7 @@ StatusCode Spr2(const Layout layout, const Triangle triangle,
|
|||
// BLAS level-3 (matrix-matrix) routines
|
||||
// =================================================================================================
|
||||
|
||||
// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM
|
||||
// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM
|
||||
template <typename T>
|
||||
StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
|
||||
const size_t m, const size_t n, const size_t k,
|
||||
|
@ -477,7 +477,7 @@ StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpos
|
|||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
cl_command_queue* queue, cl_event* event = nullptr);
|
||||
|
||||
// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM
|
||||
// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM
|
||||
template <typename T>
|
||||
StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
|
||||
const size_t m, const size_t n,
|
||||
|
@ -499,7 +499,7 @@ StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle,
|
|||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
cl_command_queue* queue, cl_event* event = nullptr);
|
||||
|
||||
// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK
|
||||
// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK
|
||||
template <typename T>
|
||||
StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
|
||||
const size_t n, const size_t k,
|
||||
|
@ -519,7 +519,7 @@ StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_
|
|||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
cl_command_queue* queue, cl_event* event = nullptr);
|
||||
|
||||
// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K
|
||||
// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K
|
||||
template <typename T>
|
||||
StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
|
||||
const size_t n, const size_t k,
|
||||
|
@ -541,7 +541,7 @@ StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose a
|
|||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
cl_command_queue* queue, cl_event* event = nullptr);
|
||||
|
||||
// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM
|
||||
// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM
|
||||
template <typename T>
|
||||
StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t m, const size_t n,
|
||||
|
@ -550,7 +550,7 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, c
|
|||
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
cl_command_queue* queue, cl_event* event = nullptr);
|
||||
|
||||
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
|
||||
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM
|
||||
template <typename T>
|
||||
StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t m, const size_t n,
|
||||
|
|
|
@ -148,7 +148,7 @@ StatusCode PUBLIC_API CLBlastDrotm(const size_t n,
|
|||
cl_mem sparam_buffer, const size_t sparam_offset,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP
|
||||
// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP
|
||||
StatusCode PUBLIC_API CLBlastSswap(const size_t n,
|
||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
|
@ -165,8 +165,12 @@ StatusCode PUBLIC_API CLBlastZswap(const size_t n,
|
|||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
StatusCode PUBLIC_API CLBlastHswap(const size_t n,
|
||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL
|
||||
// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL
|
||||
StatusCode PUBLIC_API CLBlastSscal(const size_t n,
|
||||
const float alpha,
|
||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
|
@ -183,8 +187,12 @@ StatusCode PUBLIC_API CLBlastZscal(const size_t n,
|
|||
const cl_double2 alpha,
|
||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
StatusCode PUBLIC_API CLBlastHscal(const size_t n,
|
||||
const cl_half alpha,
|
||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY
|
||||
// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY
|
||||
StatusCode PUBLIC_API CLBlastScopy(const size_t n,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
|
@ -201,8 +209,12 @@ StatusCode PUBLIC_API CLBlastZcopy(const size_t n,
|
|||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
StatusCode PUBLIC_API CLBlastHcopy(const size_t n,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY
|
||||
// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY
|
||||
StatusCode PUBLIC_API CLBlastSaxpy(const size_t n,
|
||||
const float alpha,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
|
@ -223,8 +235,13 @@ StatusCode PUBLIC_API CLBlastZaxpy(const size_t n,
|
|||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
StatusCode PUBLIC_API CLBlastHaxpy(const size_t n,
|
||||
const cl_half alpha,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// Dot product of two vectors: SDOT/DDOT
|
||||
// Dot product of two vectors: SDOT/DDOT/HDOT
|
||||
StatusCode PUBLIC_API CLBlastSdot(const size_t n,
|
||||
cl_mem dot_buffer, const size_t dot_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
|
@ -235,6 +252,11 @@ StatusCode PUBLIC_API CLBlastDdot(const size_t n,
|
|||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
StatusCode PUBLIC_API CLBlastHdot(const size_t n,
|
||||
cl_mem dot_buffer, const size_t dot_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// Dot product of two complex vectors: CDOTU/ZDOTU
|
||||
StatusCode PUBLIC_API CLBlastCdotu(const size_t n,
|
||||
|
@ -260,7 +282,7 @@ StatusCode PUBLIC_API CLBlastZdotc(const size_t n,
|
|||
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// Euclidean norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2
|
||||
// Euclidean norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2
|
||||
StatusCode PUBLIC_API CLBlastSnrm2(const size_t n,
|
||||
cl_mem nrm2_buffer, const size_t nrm2_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
|
@ -277,8 +299,12 @@ StatusCode PUBLIC_API CLBlastDznrm2(const size_t n,
|
|||
cl_mem nrm2_buffer, const size_t nrm2_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
StatusCode PUBLIC_API CLBlastHnrm2(const size_t n,
|
||||
cl_mem nrm2_buffer, const size_t nrm2_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM
|
||||
// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM
|
||||
StatusCode PUBLIC_API CLBlastSasum(const size_t n,
|
||||
cl_mem asum_buffer, const size_t asum_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
|
@ -295,8 +321,12 @@ StatusCode PUBLIC_API CLBlastDzasum(const size_t n,
|
|||
cl_mem asum_buffer, const size_t asum_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
StatusCode PUBLIC_API CLBlastHasum(const size_t n,
|
||||
cl_mem asum_buffer, const size_t asum_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM
|
||||
// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM
|
||||
StatusCode PUBLIC_API CLBlastSsum(const size_t n,
|
||||
cl_mem sum_buffer, const size_t sum_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
|
@ -313,8 +343,12 @@ StatusCode PUBLIC_API CLBlastDzsum(const size_t n,
|
|||
cl_mem sum_buffer, const size_t sum_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
StatusCode PUBLIC_API CLBlastHsum(const size_t n,
|
||||
cl_mem sum_buffer, const size_t sum_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX
|
||||
// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX
|
||||
StatusCode PUBLIC_API CLBlastiSamax(const size_t n,
|
||||
cl_mem imax_buffer, const size_t imax_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
|
@ -331,8 +365,12 @@ StatusCode PUBLIC_API CLBlastiZamax(const size_t n,
|
|||
cl_mem imax_buffer, const size_t imax_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
StatusCode PUBLIC_API CLBlastiHamax(const size_t n,
|
||||
cl_mem imax_buffer, const size_t imax_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX
|
||||
// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX
|
||||
StatusCode PUBLIC_API CLBlastiSmax(const size_t n,
|
||||
cl_mem imax_buffer, const size_t imax_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
|
@ -349,8 +387,12 @@ StatusCode PUBLIC_API CLBlastiZmax(const size_t n,
|
|||
cl_mem imax_buffer, const size_t imax_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
StatusCode PUBLIC_API CLBlastiHmax(const size_t n,
|
||||
cl_mem imax_buffer, const size_t imax_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN
|
||||
// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN
|
||||
StatusCode PUBLIC_API CLBlastiSmin(const size_t n,
|
||||
cl_mem imin_buffer, const size_t imin_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
|
@ -367,12 +409,16 @@ StatusCode PUBLIC_API CLBlastiZmin(const size_t n,
|
|||
cl_mem imin_buffer, const size_t imin_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
StatusCode PUBLIC_API CLBlastiHmin(const size_t n,
|
||||
cl_mem imin_buffer, const size_t imin_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// =================================================================================================
|
||||
// BLAS level-2 (matrix-vector) routines
|
||||
// =================================================================================================
|
||||
|
||||
// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV
|
||||
// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV
|
||||
StatusCode PUBLIC_API CLBlastSgemv(const Layout layout, const Transpose a_transpose,
|
||||
const size_t m, const size_t n,
|
||||
const float alpha,
|
||||
|
@ -405,8 +451,16 @@ StatusCode PUBLIC_API CLBlastZgemv(const Layout layout, const Transpose a_transp
|
|||
const cl_double2 beta,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
StatusCode PUBLIC_API CLBlastHgemv(const Layout layout, const Transpose a_transpose,
|
||||
const size_t m, const size_t n,
|
||||
const cl_half alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const cl_half beta,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV
|
||||
// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV
|
||||
StatusCode PUBLIC_API CLBlastSgbmv(const Layout layout, const Transpose a_transpose,
|
||||
const size_t m, const size_t n, const size_t kl, const size_t ku,
|
||||
const float alpha,
|
||||
|
@ -439,6 +493,14 @@ StatusCode PUBLIC_API CLBlastZgbmv(const Layout layout, const Transpose a_transp
|
|||
const cl_double2 beta,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
StatusCode PUBLIC_API CLBlastHgbmv(const Layout layout, const Transpose a_transpose,
|
||||
const size_t m, const size_t n, const size_t kl, const size_t ku,
|
||||
const cl_half alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const cl_half beta,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// Hermitian matrix-vector multiplication: CHEMV/ZHEMV
|
||||
StatusCode PUBLIC_API CLBlastChemv(const Layout layout, const Triangle triangle,
|
||||
|
@ -494,7 +556,7 @@ StatusCode PUBLIC_API CLBlastZhpmv(const Layout layout, const Triangle triangle,
|
|||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// Symmetric matrix-vector multiplication: SSYMV/DSYMV
|
||||
// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV
|
||||
StatusCode PUBLIC_API CLBlastSsymv(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
const float alpha,
|
||||
|
@ -511,8 +573,16 @@ StatusCode PUBLIC_API CLBlastDsymv(const Layout layout, const Triangle triangle,
|
|||
const double beta,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
StatusCode PUBLIC_API CLBlastHsymv(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
const cl_half alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const cl_half beta,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV
|
||||
// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV
|
||||
StatusCode PUBLIC_API CLBlastSsbmv(const Layout layout, const Triangle triangle,
|
||||
const size_t n, const size_t k,
|
||||
const float alpha,
|
||||
|
@ -529,8 +599,16 @@ StatusCode PUBLIC_API CLBlastDsbmv(const Layout layout, const Triangle triangle,
|
|||
const double beta,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
StatusCode PUBLIC_API CLBlastHsbmv(const Layout layout, const Triangle triangle,
|
||||
const size_t n, const size_t k,
|
||||
const cl_half alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const cl_half beta,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV
|
||||
// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV
|
||||
StatusCode PUBLIC_API CLBlastSspmv(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
const float alpha,
|
||||
|
@ -547,8 +625,16 @@ StatusCode PUBLIC_API CLBlastDspmv(const Layout layout, const Triangle triangle,
|
|||
const double beta,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
StatusCode PUBLIC_API CLBlastHspmv(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
const cl_half alpha,
|
||||
const cl_mem ap_buffer, const size_t ap_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const cl_half beta,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV
|
||||
// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV
|
||||
StatusCode PUBLIC_API CLBlastStrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t n,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
|
@ -569,8 +655,13 @@ StatusCode PUBLIC_API CLBlastZtrmv(const Layout layout, const Triangle triangle,
|
|||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
StatusCode PUBLIC_API CLBlastHtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t n,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV
|
||||
// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV
|
||||
StatusCode PUBLIC_API CLBlastStbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t n, const size_t k,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
|
@ -591,8 +682,13 @@ StatusCode PUBLIC_API CLBlastZtbmv(const Layout layout, const Triangle triangle,
|
|||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
StatusCode PUBLIC_API CLBlastHtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t n, const size_t k,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV
|
||||
// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV
|
||||
StatusCode PUBLIC_API CLBlastStpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t n,
|
||||
const cl_mem ap_buffer, const size_t ap_offset,
|
||||
|
@ -613,6 +709,11 @@ StatusCode PUBLIC_API CLBlastZtpmv(const Layout layout, const Triangle triangle,
|
|||
const cl_mem ap_buffer, const size_t ap_offset,
|
||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
StatusCode PUBLIC_API CLBlastHtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t n,
|
||||
const cl_mem ap_buffer, const size_t ap_offset,
|
||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV
|
||||
StatusCode PUBLIC_API CLBlastStrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
|
@ -680,7 +781,7 @@ StatusCode PUBLIC_API CLBlastZtpsv(const Layout layout, const Triangle triangle,
|
|||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// General rank-1 matrix update: SGER/DGER
|
||||
// General rank-1 matrix update: SGER/DGER/HGER
|
||||
StatusCode PUBLIC_API CLBlastSger(const Layout layout,
|
||||
const size_t m, const size_t n,
|
||||
const float alpha,
|
||||
|
@ -695,6 +796,13 @@ StatusCode PUBLIC_API CLBlastDger(const Layout layout,
|
|||
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
StatusCode PUBLIC_API CLBlastHger(const Layout layout,
|
||||
const size_t m, const size_t n,
|
||||
const cl_half alpha,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// General rank-1 complex matrix update: CGERU/ZGERU
|
||||
StatusCode PUBLIC_API CLBlastCgeru(const Layout layout,
|
||||
|
@ -788,7 +896,7 @@ StatusCode PUBLIC_API CLBlastZhpr2(const Layout layout, const Triangle triangle,
|
|||
cl_mem ap_buffer, const size_t ap_offset,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// Symmetric rank-1 matrix update: SSYR/DSYR
|
||||
// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR
|
||||
StatusCode PUBLIC_API CLBlastSsyr(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
const float alpha,
|
||||
|
@ -801,8 +909,14 @@ StatusCode PUBLIC_API CLBlastDsyr(const Layout layout, const Triangle triangle,
|
|||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
// HSYR: half-precision (fp16) member of the 'Symmetric rank-1 matrix update' group. The scalar
// 'alpha' is passed as a cl_half. 'x_buffer' is a const cl_mem argument; 'a_buffer' is the only
// non-const buffer argument.
StatusCode PUBLIC_API CLBlastHsyr(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
const cl_half alpha,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// Symmetric packed rank-1 matrix update: SSPR/DSPR
|
||||
// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR
|
||||
StatusCode PUBLIC_API CLBlastSspr(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
const float alpha,
|
||||
|
@ -815,8 +929,14 @@ StatusCode PUBLIC_API CLBlastDspr(const Layout layout, const Triangle triangle,
|
|||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_mem ap_buffer, const size_t ap_offset,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
// HSPR: half-precision (fp16) member of the 'Symmetric packed rank-1 matrix update' group. The
// scalar 'alpha' is passed as a cl_half. 'x_buffer' is a const cl_mem argument; 'ap_buffer' is the
// only non-const buffer argument and takes an offset but no leading dimension.
StatusCode PUBLIC_API CLBlastHspr(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
const cl_half alpha,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_mem ap_buffer, const size_t ap_offset,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// Symmetric rank-2 matrix update: SSYR2/DSYR2
|
||||
// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2
|
||||
StatusCode PUBLIC_API CLBlastSsyr2(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
const float alpha,
|
||||
|
@ -831,8 +951,15 @@ StatusCode PUBLIC_API CLBlastDsyr2(const Layout layout, const Triangle triangle,
|
|||
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
// HSYR2: half-precision (fp16) member of the 'Symmetric rank-2 matrix update' group. The scalar
// 'alpha' is passed as a cl_half. 'x_buffer' and 'y_buffer' are const cl_mem arguments; 'a_buffer'
// is the only non-const buffer argument.
StatusCode PUBLIC_API CLBlastHsyr2(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
const cl_half alpha,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// Symmetric packed rank-2 matrix update: SSPR2/DSPR2
|
||||
// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2
|
||||
StatusCode PUBLIC_API CLBlastSspr2(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
const float alpha,
|
||||
|
@ -847,12 +974,19 @@ StatusCode PUBLIC_API CLBlastDspr2(const Layout layout, const Triangle triangle,
|
|||
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_mem ap_buffer, const size_t ap_offset,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
// HSPR2: half-precision (fp16) member of the 'Symmetric packed rank-2 matrix update' group. The
// scalar 'alpha' is passed as a cl_half. 'x_buffer' and 'y_buffer' are const cl_mem arguments;
// 'ap_buffer' is the only non-const buffer argument and takes an offset but no leading dimension.
StatusCode PUBLIC_API CLBlastHspr2(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
const cl_half alpha,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_mem ap_buffer, const size_t ap_offset,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// =================================================================================================
|
||||
// BLAS level-3 (matrix-matrix) routines
|
||||
// =================================================================================================
|
||||
|
||||
// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM
|
||||
// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM
|
||||
StatusCode PUBLIC_API CLBlastSgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
|
||||
const size_t m, const size_t n, const size_t k,
|
||||
const float alpha,
|
||||
|
@ -885,8 +1019,16 @@ StatusCode PUBLIC_API CLBlastZgemm(const Layout layout, const Transpose a_transp
|
|||
const cl_double2 beta,
|
||||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
// HGEMM: half-precision (fp16) member of the 'General matrix-matrix multiplication' group. Both
// scalars, 'alpha' and 'beta', are passed as cl_half. 'a_buffer' and 'b_buffer' are const cl_mem
// arguments; 'c_buffer' is the only non-const buffer argument.
StatusCode PUBLIC_API CLBlastHgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
|
||||
const size_t m, const size_t n, const size_t k,
|
||||
const cl_half alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
const cl_half beta,
|
||||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM
|
||||
// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM
|
||||
StatusCode PUBLIC_API CLBlastSsymm(const Layout layout, const Side side, const Triangle triangle,
|
||||
const size_t m, const size_t n,
|
||||
const float alpha,
|
||||
|
@ -919,6 +1061,14 @@ StatusCode PUBLIC_API CLBlastZsymm(const Layout layout, const Side side, const T
|
|||
const cl_double2 beta,
|
||||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
// HSYMM: half-precision (fp16) member of the 'Symmetric matrix-matrix multiplication' group. Both
// scalars, 'alpha' and 'beta', are passed as cl_half. 'a_buffer' and 'b_buffer' are const cl_mem
// arguments; 'c_buffer' is the only non-const buffer argument.
StatusCode PUBLIC_API CLBlastHsymm(const Layout layout, const Side side, const Triangle triangle,
|
||||
const size_t m, const size_t n,
|
||||
const cl_half alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
const cl_half beta,
|
||||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM
|
||||
StatusCode PUBLIC_API CLBlastChemm(const Layout layout, const Side side, const Triangle triangle,
|
||||
|
@ -938,7 +1088,7 @@ StatusCode PUBLIC_API CLBlastZhemm(const Layout layout, const Side side, const T
|
|||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK
|
||||
// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK
|
||||
StatusCode PUBLIC_API CLBlastSsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
|
||||
const size_t n, const size_t k,
|
||||
const float alpha,
|
||||
|
@ -967,6 +1117,13 @@ StatusCode PUBLIC_API CLBlastZsyrk(const Layout layout, const Triangle triangle,
|
|||
const cl_double2 beta,
|
||||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
// HSYRK: half-precision (fp16) member of the 'Rank-K update of a symmetric matrix' group. Both
// scalars, 'alpha' and 'beta', are passed as cl_half. 'a_buffer' is a const cl_mem argument;
// 'c_buffer' is the only non-const buffer argument.
StatusCode PUBLIC_API CLBlastHsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
|
||||
const size_t n, const size_t k,
|
||||
const cl_half alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const cl_half beta,
|
||||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// Rank-K update of a hermitian matrix: CHERK/ZHERK
|
||||
StatusCode PUBLIC_API CLBlastCherk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
|
||||
|
@ -984,7 +1141,7 @@ StatusCode PUBLIC_API CLBlastZherk(const Layout layout, const Triangle triangle,
|
|||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K
|
||||
// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K
|
||||
StatusCode PUBLIC_API CLBlastSsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
|
||||
const size_t n, const size_t k,
|
||||
const float alpha,
|
||||
|
@ -1017,6 +1174,14 @@ StatusCode PUBLIC_API CLBlastZsyr2k(const Layout layout, const Triangle triangle
|
|||
const cl_double2 beta,
|
||||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
// HSYR2K: half-precision (fp16) member of the 'Rank-2K update of a symmetric matrix' group. Both
// scalars, 'alpha' and 'beta', are passed as cl_half. A single 'ab_transpose' flag is taken for
// the two const inputs 'a_buffer' and 'b_buffer'; 'c_buffer' is the only non-const buffer argument.
StatusCode PUBLIC_API CLBlastHsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
|
||||
const size_t n, const size_t k,
|
||||
const cl_half alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
const cl_half beta,
|
||||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K
|
||||
StatusCode PUBLIC_API CLBlastCher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
|
||||
|
@ -1036,7 +1201,7 @@ StatusCode PUBLIC_API CLBlastZher2k(const Layout layout, const Triangle triangle
|
|||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM
|
||||
// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM
|
||||
StatusCode PUBLIC_API CLBlastStrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t m, const size_t n,
|
||||
const float alpha,
|
||||
|
@ -1061,8 +1226,14 @@ StatusCode PUBLIC_API CLBlastZtrmm(const Layout layout, const Side side, const T
|
|||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
// HTRMM: half-precision (fp16) member of the 'Triangular matrix-matrix multiplication' group. The
// scalar 'alpha' is passed as a cl_half. 'a_buffer' is a const cl_mem argument; 'b_buffer' is the
// only non-const buffer argument (there is no separate output buffer in this signature).
StatusCode PUBLIC_API CLBlastHtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t m, const size_t n,
|
||||
const cl_half alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
|
||||
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM
|
||||
StatusCode PUBLIC_API CLBlastStrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t m, const size_t n,
|
||||
const float alpha,
|
||||
|
@ -1087,6 +1258,12 @@ StatusCode PUBLIC_API CLBlastZtrsm(const Layout layout, const Side side, const T
|
|||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
// HTRSM: half-precision (fp16) member of the 'Solves a triangular system of equations' group. The
// scalar 'alpha' is passed as a cl_half. 'a_buffer' is a const cl_mem argument; 'b_buffer' is the
// only non-const buffer argument.
// NOTE(review): the CHANGELOG lists HGEMM/HSYMM/HSYRK/HSYR2K/HTRMM as the new half-precision
// Level-3 routines but not HTRSM - confirm that this declaration is backed by an implementation.
StatusCode PUBLIC_API CLBlastHtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t m, const size_t n,
|
||||
const cl_half alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
cl_command_queue* queue, cl_event* event);
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
|
|
256
include/clblast_half.h
Normal file
256
include/clblast_half.h
Normal file
|
@ -0,0 +1,256 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file provides simple conversion operations between fp16 (half) and fp32 (float). These
|
||||
// conversion functions are based on ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf and
|
||||
// are also part of the C++ half-precision header (http://half.sourceforge.net/).
|
||||
//
|
||||
// This file is pure C99.
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#ifndef CLBLAST_HALF_H_
|
||||
#define CLBLAST_HALF_H_
|
||||
|
||||
// Includes the normal OpenCL C header
|
||||
#if defined(__APPLE__) || defined(__MACOSX)
|
||||
#include <OpenCL/opencl.h>
|
||||
#else
|
||||
#include <CL/opencl.h>
|
||||
#endif
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Host data-type for half-precision floating-point (16-bit). This is based on the OpenCL type,
|
||||
// which is a typedef for unsigned short. A value of this type therefore holds a raw fp16 bit
// pattern rather than a number the host can do floating-point arithmetic on; FloatToHalf in this
// header produces such a bit pattern from a float.
typedef cl_half half;
|
||||
|
||||
// 32-bit union for conversions: lets the same storage be accessed either as a float ('f32') or
// as an unsigned integer ('i32'). FloatToHalf writes 'f32' and reads the bit pattern via 'i32'.
// NOTE(review): this assumes 'unsigned int' and 'float' are both 32 bits wide - confirm for the
// targeted platforms.
|
||||
typedef union ConversionBits_ {
|
||||
unsigned int i32;
|
||||
float f32;
|
||||
} ConversionBits;
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Converts a IEEE-compliant single-precision value to half-precision floating-point. This function
|
||||
// applies simple truncation (round toward zero, but with overflows set to infinity) as rounding
|
||||
// mode.
|
||||
inline half FloatToHalf(const float value) {
|
||||
static const unsigned short base_table[512] = {
|
||||
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
|
||||
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
|
||||
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
|
||||
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
|
||||
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
|
||||
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
|
||||
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100,
|
||||
0x0200, 0x0400, 0x0800, 0x0C00, 0x1000, 0x1400, 0x1800, 0x1C00, 0x2000, 0x2400, 0x2800, 0x2C00, 0x3000, 0x3400, 0x3800, 0x3C00,
|
||||
0x4000, 0x4400, 0x4800, 0x4C00, 0x5000, 0x5400, 0x5800, 0x5C00, 0x6000, 0x6400, 0x6800, 0x6C00, 0x7000, 0x7400, 0x7800, 0x7C00,
|
||||
0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
|
||||
0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
|
||||
0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
|
||||
0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
|
||||
0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
|
||||
0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
|
||||
0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
|
||||
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
|
||||
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
|
||||
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
|
||||
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
|
||||
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
|
||||
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
|
||||
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100,
|
||||
0x8200, 0x8400, 0x8800, 0x8C00, 0x9000, 0x9400, 0x9800, 0x9C00, 0xA000, 0xA400, 0xA800, 0xAC00, 0xB000, 0xB400, 0xB800, 0xBC00,
|
||||
0xC000, 0xC400, 0xC800, 0xCC00, 0xD000, 0xD400, 0xD800, 0xDC00, 0xE000, 0xE400, 0xE800, 0xEC00, 0xF000, 0xF400, 0xF800, 0xFC00,
|
||||
0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
|
||||
0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
|
||||
0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
|
||||
0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
|
||||
0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
|
||||
0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
|
||||
0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00
|
||||
};
|
||||
static const unsigned char shift_table[512] = {
|
||||
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
|
||||
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
|
||||
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
|
||||
24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
|
||||
13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
|
||||
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
|
||||
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
|
||||
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13,
|
||||
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
|
||||
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
|
||||
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
|
||||
24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
|
||||
13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
|
||||
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
|
||||
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
|
||||
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13
|
||||
};
|
||||
ConversionBits bits;
|
||||
bits.f32 = value;
|
||||
const unsigned short halfbits = base_table[bits.i32 >> 23] +
|
||||
(unsigned short)((bits.i32 & 0x7FFFFF) >> shift_table[bits.i32 >> 23]);
|
||||
return halfbits;
|
||||
}
|
||||
|
||||
// Converts a half-precision value to IEEE-compliant single-precision floating-point
// The conversion uses table look-ups only: the upper 6 bits of 'value' (value >> 10) select an
// entry in 'offset_table' and in 'exponent_table', and the lower 10 bits (value & 0x3FF) are added
// to that offset to select an entry in 'mantissa_table'. The two 32-bit table values are summed and
// the resulting bit pattern is returned as a float through the 'ConversionBits' union.
// NOTE(review): assumes 'half' is an unsigned 16-bit integer type, so that 'value >> 10' stays
// within 0..63 (the size of the 64-entry tables) - confirm against its typedef earlier in this file.
|
||||
inline float HalfToFloat(const half value) {
|
||||
// Pre-computed 32-bit patterns, indexed by 'offset_table[value >> 10] + (value & 0x3FF)'. The first
// 1024 entries are used when the offset is 0 (top-6-bit index 0 or 32, i.e. all five exponent bits
// clear). The last 1024 entries are 0x38000000 plus the 10-bit mantissa shifted left by 13 bits.
static const unsigned int mantissa_table[2048] = {
|
||||
0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34A00000, 0x34C00000, 0x34E00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, 0x35400000, 0x35500000, 0x35600000, 0x35700000,
|
||||
0x35800000, 0x35880000, 0x35900000, 0x35980000, 0x35A00000, 0x35A80000, 0x35B00000, 0x35B80000, 0x35C00000, 0x35C80000, 0x35D00000, 0x35D80000, 0x35E00000, 0x35E80000, 0x35F00000, 0x35F80000,
|
||||
0x36000000, 0x36040000, 0x36080000, 0x360C0000, 0x36100000, 0x36140000, 0x36180000, 0x361C0000, 0x36200000, 0x36240000, 0x36280000, 0x362C0000, 0x36300000, 0x36340000, 0x36380000, 0x363C0000,
|
||||
0x36400000, 0x36440000, 0x36480000, 0x364C0000, 0x36500000, 0x36540000, 0x36580000, 0x365C0000, 0x36600000, 0x36640000, 0x36680000, 0x366C0000, 0x36700000, 0x36740000, 0x36780000, 0x367C0000,
|
||||
0x36800000, 0x36820000, 0x36840000, 0x36860000, 0x36880000, 0x368A0000, 0x368C0000, 0x368E0000, 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369A0000, 0x369C0000, 0x369E0000,
|
||||
0x36A00000, 0x36A20000, 0x36A40000, 0x36A60000, 0x36A80000, 0x36AA0000, 0x36AC0000, 0x36AE0000, 0x36B00000, 0x36B20000, 0x36B40000, 0x36B60000, 0x36B80000, 0x36BA0000, 0x36BC0000, 0x36BE0000,
|
||||
0x36C00000, 0x36C20000, 0x36C40000, 0x36C60000, 0x36C80000, 0x36CA0000, 0x36CC0000, 0x36CE0000, 0x36D00000, 0x36D20000, 0x36D40000, 0x36D60000, 0x36D80000, 0x36DA0000, 0x36DC0000, 0x36DE0000,
|
||||
0x36E00000, 0x36E20000, 0x36E40000, 0x36E60000, 0x36E80000, 0x36EA0000, 0x36EC0000, 0x36EE0000, 0x36F00000, 0x36F20000, 0x36F40000, 0x36F60000, 0x36F80000, 0x36FA0000, 0x36FC0000, 0x36FE0000,
|
||||
0x37000000, 0x37010000, 0x37020000, 0x37030000, 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, 0x370A0000, 0x370B0000, 0x370C0000, 0x370D0000, 0x370E0000, 0x370F0000,
|
||||
0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371A0000, 0x371B0000, 0x371C0000, 0x371D0000, 0x371E0000, 0x371F0000,
|
||||
0x37200000, 0x37210000, 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, 0x37280000, 0x37290000, 0x372A0000, 0x372B0000, 0x372C0000, 0x372D0000, 0x372E0000, 0x372F0000,
|
||||
0x37300000, 0x37310000, 0x37320000, 0x37330000, 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, 0x373A0000, 0x373B0000, 0x373C0000, 0x373D0000, 0x373E0000, 0x373F0000,
|
||||
0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000, 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374A0000, 0x374B0000, 0x374C0000, 0x374D0000, 0x374E0000, 0x374F0000,
|
||||
0x37500000, 0x37510000, 0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, 0x37580000, 0x37590000, 0x375A0000, 0x375B0000, 0x375C0000, 0x375D0000, 0x375E0000, 0x375F0000,
|
||||
0x37600000, 0x37610000, 0x37620000, 0x37630000, 0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000, 0x376A0000, 0x376B0000, 0x376C0000, 0x376D0000, 0x376E0000, 0x376F0000,
|
||||
0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, 0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377A0000, 0x377B0000, 0x377C0000, 0x377D0000, 0x377E0000, 0x377F0000,
|
||||
0x37800000, 0x37808000, 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, 0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000, 0x37870000, 0x37878000,
|
||||
0x37880000, 0x37888000, 0x37890000, 0x37898000, 0x378A0000, 0x378A8000, 0x378B0000, 0x378B8000, 0x378C0000, 0x378C8000, 0x378D0000, 0x378D8000, 0x378E0000, 0x378E8000, 0x378F0000, 0x378F8000,
|
||||
0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, 0x37960000, 0x37968000, 0x37970000, 0x37978000,
|
||||
0x37980000, 0x37988000, 0x37990000, 0x37998000, 0x379A0000, 0x379A8000, 0x379B0000, 0x379B8000, 0x379C0000, 0x379C8000, 0x379D0000, 0x379D8000, 0x379E0000, 0x379E8000, 0x379F0000, 0x379F8000,
|
||||
0x37A00000, 0x37A08000, 0x37A10000, 0x37A18000, 0x37A20000, 0x37A28000, 0x37A30000, 0x37A38000, 0x37A40000, 0x37A48000, 0x37A50000, 0x37A58000, 0x37A60000, 0x37A68000, 0x37A70000, 0x37A78000,
|
||||
0x37A80000, 0x37A88000, 0x37A90000, 0x37A98000, 0x37AA0000, 0x37AA8000, 0x37AB0000, 0x37AB8000, 0x37AC0000, 0x37AC8000, 0x37AD0000, 0x37AD8000, 0x37AE0000, 0x37AE8000, 0x37AF0000, 0x37AF8000,
|
||||
0x37B00000, 0x37B08000, 0x37B10000, 0x37B18000, 0x37B20000, 0x37B28000, 0x37B30000, 0x37B38000, 0x37B40000, 0x37B48000, 0x37B50000, 0x37B58000, 0x37B60000, 0x37B68000, 0x37B70000, 0x37B78000,
|
||||
0x37B80000, 0x37B88000, 0x37B90000, 0x37B98000, 0x37BA0000, 0x37BA8000, 0x37BB0000, 0x37BB8000, 0x37BC0000, 0x37BC8000, 0x37BD0000, 0x37BD8000, 0x37BE0000, 0x37BE8000, 0x37BF0000, 0x37BF8000,
|
||||
0x37C00000, 0x37C08000, 0x37C10000, 0x37C18000, 0x37C20000, 0x37C28000, 0x37C30000, 0x37C38000, 0x37C40000, 0x37C48000, 0x37C50000, 0x37C58000, 0x37C60000, 0x37C68000, 0x37C70000, 0x37C78000,
|
||||
0x37C80000, 0x37C88000, 0x37C90000, 0x37C98000, 0x37CA0000, 0x37CA8000, 0x37CB0000, 0x37CB8000, 0x37CC0000, 0x37CC8000, 0x37CD0000, 0x37CD8000, 0x37CE0000, 0x37CE8000, 0x37CF0000, 0x37CF8000,
|
||||
0x37D00000, 0x37D08000, 0x37D10000, 0x37D18000, 0x37D20000, 0x37D28000, 0x37D30000, 0x37D38000, 0x37D40000, 0x37D48000, 0x37D50000, 0x37D58000, 0x37D60000, 0x37D68000, 0x37D70000, 0x37D78000,
|
||||
0x37D80000, 0x37D88000, 0x37D90000, 0x37D98000, 0x37DA0000, 0x37DA8000, 0x37DB0000, 0x37DB8000, 0x37DC0000, 0x37DC8000, 0x37DD0000, 0x37DD8000, 0x37DE0000, 0x37DE8000, 0x37DF0000, 0x37DF8000,
|
||||
0x37E00000, 0x37E08000, 0x37E10000, 0x37E18000, 0x37E20000, 0x37E28000, 0x37E30000, 0x37E38000, 0x37E40000, 0x37E48000, 0x37E50000, 0x37E58000, 0x37E60000, 0x37E68000, 0x37E70000, 0x37E78000,
|
||||
0x37E80000, 0x37E88000, 0x37E90000, 0x37E98000, 0x37EA0000, 0x37EA8000, 0x37EB0000, 0x37EB8000, 0x37EC0000, 0x37EC8000, 0x37ED0000, 0x37ED8000, 0x37EE0000, 0x37EE8000, 0x37EF0000, 0x37EF8000,
|
||||
0x37F00000, 0x37F08000, 0x37F10000, 0x37F18000, 0x37F20000, 0x37F28000, 0x37F30000, 0x37F38000, 0x37F40000, 0x37F48000, 0x37F50000, 0x37F58000, 0x37F60000, 0x37F68000, 0x37F70000, 0x37F78000,
|
||||
0x37F80000, 0x37F88000, 0x37F90000, 0x37F98000, 0x37FA0000, 0x37FA8000, 0x37FB0000, 0x37FB8000, 0x37FC0000, 0x37FC8000, 0x37FD0000, 0x37FD8000, 0x37FE0000, 0x37FE8000, 0x37FF0000, 0x37FF8000,
|
||||
0x38000000, 0x38004000, 0x38008000, 0x3800C000, 0x38010000, 0x38014000, 0x38018000, 0x3801C000, 0x38020000, 0x38024000, 0x38028000, 0x3802C000, 0x38030000, 0x38034000, 0x38038000, 0x3803C000,
|
||||
0x38040000, 0x38044000, 0x38048000, 0x3804C000, 0x38050000, 0x38054000, 0x38058000, 0x3805C000, 0x38060000, 0x38064000, 0x38068000, 0x3806C000, 0x38070000, 0x38074000, 0x38078000, 0x3807C000,
|
||||
0x38080000, 0x38084000, 0x38088000, 0x3808C000, 0x38090000, 0x38094000, 0x38098000, 0x3809C000, 0x380A0000, 0x380A4000, 0x380A8000, 0x380AC000, 0x380B0000, 0x380B4000, 0x380B8000, 0x380BC000,
|
||||
0x380C0000, 0x380C4000, 0x380C8000, 0x380CC000, 0x380D0000, 0x380D4000, 0x380D8000, 0x380DC000, 0x380E0000, 0x380E4000, 0x380E8000, 0x380EC000, 0x380F0000, 0x380F4000, 0x380F8000, 0x380FC000,
|
||||
0x38100000, 0x38104000, 0x38108000, 0x3810C000, 0x38110000, 0x38114000, 0x38118000, 0x3811C000, 0x38120000, 0x38124000, 0x38128000, 0x3812C000, 0x38130000, 0x38134000, 0x38138000, 0x3813C000,
|
||||
0x38140000, 0x38144000, 0x38148000, 0x3814C000, 0x38150000, 0x38154000, 0x38158000, 0x3815C000, 0x38160000, 0x38164000, 0x38168000, 0x3816C000, 0x38170000, 0x38174000, 0x38178000, 0x3817C000,
|
||||
0x38180000, 0x38184000, 0x38188000, 0x3818C000, 0x38190000, 0x38194000, 0x38198000, 0x3819C000, 0x381A0000, 0x381A4000, 0x381A8000, 0x381AC000, 0x381B0000, 0x381B4000, 0x381B8000, 0x381BC000,
|
||||
0x381C0000, 0x381C4000, 0x381C8000, 0x381CC000, 0x381D0000, 0x381D4000, 0x381D8000, 0x381DC000, 0x381E0000, 0x381E4000, 0x381E8000, 0x381EC000, 0x381F0000, 0x381F4000, 0x381F8000, 0x381FC000,
|
||||
0x38200000, 0x38204000, 0x38208000, 0x3820C000, 0x38210000, 0x38214000, 0x38218000, 0x3821C000, 0x38220000, 0x38224000, 0x38228000, 0x3822C000, 0x38230000, 0x38234000, 0x38238000, 0x3823C000,
|
||||
0x38240000, 0x38244000, 0x38248000, 0x3824C000, 0x38250000, 0x38254000, 0x38258000, 0x3825C000, 0x38260000, 0x38264000, 0x38268000, 0x3826C000, 0x38270000, 0x38274000, 0x38278000, 0x3827C000,
|
||||
0x38280000, 0x38284000, 0x38288000, 0x3828C000, 0x38290000, 0x38294000, 0x38298000, 0x3829C000, 0x382A0000, 0x382A4000, 0x382A8000, 0x382AC000, 0x382B0000, 0x382B4000, 0x382B8000, 0x382BC000,
|
||||
0x382C0000, 0x382C4000, 0x382C8000, 0x382CC000, 0x382D0000, 0x382D4000, 0x382D8000, 0x382DC000, 0x382E0000, 0x382E4000, 0x382E8000, 0x382EC000, 0x382F0000, 0x382F4000, 0x382F8000, 0x382FC000,
|
||||
0x38300000, 0x38304000, 0x38308000, 0x3830C000, 0x38310000, 0x38314000, 0x38318000, 0x3831C000, 0x38320000, 0x38324000, 0x38328000, 0x3832C000, 0x38330000, 0x38334000, 0x38338000, 0x3833C000,
|
||||
0x38340000, 0x38344000, 0x38348000, 0x3834C000, 0x38350000, 0x38354000, 0x38358000, 0x3835C000, 0x38360000, 0x38364000, 0x38368000, 0x3836C000, 0x38370000, 0x38374000, 0x38378000, 0x3837C000,
|
||||
0x38380000, 0x38384000, 0x38388000, 0x3838C000, 0x38390000, 0x38394000, 0x38398000, 0x3839C000, 0x383A0000, 0x383A4000, 0x383A8000, 0x383AC000, 0x383B0000, 0x383B4000, 0x383B8000, 0x383BC000,
|
||||
0x383C0000, 0x383C4000, 0x383C8000, 0x383CC000, 0x383D0000, 0x383D4000, 0x383D8000, 0x383DC000, 0x383E0000, 0x383E4000, 0x383E8000, 0x383EC000, 0x383F0000, 0x383F4000, 0x383F8000, 0x383FC000,
|
||||
0x38400000, 0x38404000, 0x38408000, 0x3840C000, 0x38410000, 0x38414000, 0x38418000, 0x3841C000, 0x38420000, 0x38424000, 0x38428000, 0x3842C000, 0x38430000, 0x38434000, 0x38438000, 0x3843C000,
|
||||
0x38440000, 0x38444000, 0x38448000, 0x3844C000, 0x38450000, 0x38454000, 0x38458000, 0x3845C000, 0x38460000, 0x38464000, 0x38468000, 0x3846C000, 0x38470000, 0x38474000, 0x38478000, 0x3847C000,
|
||||
0x38480000, 0x38484000, 0x38488000, 0x3848C000, 0x38490000, 0x38494000, 0x38498000, 0x3849C000, 0x384A0000, 0x384A4000, 0x384A8000, 0x384AC000, 0x384B0000, 0x384B4000, 0x384B8000, 0x384BC000,
|
||||
0x384C0000, 0x384C4000, 0x384C8000, 0x384CC000, 0x384D0000, 0x384D4000, 0x384D8000, 0x384DC000, 0x384E0000, 0x384E4000, 0x384E8000, 0x384EC000, 0x384F0000, 0x384F4000, 0x384F8000, 0x384FC000,
|
||||
0x38500000, 0x38504000, 0x38508000, 0x3850C000, 0x38510000, 0x38514000, 0x38518000, 0x3851C000, 0x38520000, 0x38524000, 0x38528000, 0x3852C000, 0x38530000, 0x38534000, 0x38538000, 0x3853C000,
|
||||
0x38540000, 0x38544000, 0x38548000, 0x3854C000, 0x38550000, 0x38554000, 0x38558000, 0x3855C000, 0x38560000, 0x38564000, 0x38568000, 0x3856C000, 0x38570000, 0x38574000, 0x38578000, 0x3857C000,
|
||||
0x38580000, 0x38584000, 0x38588000, 0x3858C000, 0x38590000, 0x38594000, 0x38598000, 0x3859C000, 0x385A0000, 0x385A4000, 0x385A8000, 0x385AC000, 0x385B0000, 0x385B4000, 0x385B8000, 0x385BC000,
|
||||
0x385C0000, 0x385C4000, 0x385C8000, 0x385CC000, 0x385D0000, 0x385D4000, 0x385D8000, 0x385DC000, 0x385E0000, 0x385E4000, 0x385E8000, 0x385EC000, 0x385F0000, 0x385F4000, 0x385F8000, 0x385FC000,
|
||||
0x38600000, 0x38604000, 0x38608000, 0x3860C000, 0x38610000, 0x38614000, 0x38618000, 0x3861C000, 0x38620000, 0x38624000, 0x38628000, 0x3862C000, 0x38630000, 0x38634000, 0x38638000, 0x3863C000,
|
||||
0x38640000, 0x38644000, 0x38648000, 0x3864C000, 0x38650000, 0x38654000, 0x38658000, 0x3865C000, 0x38660000, 0x38664000, 0x38668000, 0x3866C000, 0x38670000, 0x38674000, 0x38678000, 0x3867C000,
|
||||
0x38680000, 0x38684000, 0x38688000, 0x3868C000, 0x38690000, 0x38694000, 0x38698000, 0x3869C000, 0x386A0000, 0x386A4000, 0x386A8000, 0x386AC000, 0x386B0000, 0x386B4000, 0x386B8000, 0x386BC000,
|
||||
0x386C0000, 0x386C4000, 0x386C8000, 0x386CC000, 0x386D0000, 0x386D4000, 0x386D8000, 0x386DC000, 0x386E0000, 0x386E4000, 0x386E8000, 0x386EC000, 0x386F0000, 0x386F4000, 0x386F8000, 0x386FC000,
|
||||
0x38700000, 0x38704000, 0x38708000, 0x3870C000, 0x38710000, 0x38714000, 0x38718000, 0x3871C000, 0x38720000, 0x38724000, 0x38728000, 0x3872C000, 0x38730000, 0x38734000, 0x38738000, 0x3873C000,
|
||||
0x38740000, 0x38744000, 0x38748000, 0x3874C000, 0x38750000, 0x38754000, 0x38758000, 0x3875C000, 0x38760000, 0x38764000, 0x38768000, 0x3876C000, 0x38770000, 0x38774000, 0x38778000, 0x3877C000,
|
||||
0x38780000, 0x38784000, 0x38788000, 0x3878C000, 0x38790000, 0x38794000, 0x38798000, 0x3879C000, 0x387A0000, 0x387A4000, 0x387A8000, 0x387AC000, 0x387B0000, 0x387B4000, 0x387B8000, 0x387BC000,
|
||||
0x387C0000, 0x387C4000, 0x387C8000, 0x387CC000, 0x387D0000, 0x387D4000, 0x387D8000, 0x387DC000, 0x387E0000, 0x387E4000, 0x387E8000, 0x387EC000, 0x387F0000, 0x387F4000, 0x387F8000, 0x387FC000,
|
||||
0x38000000, 0x38002000, 0x38004000, 0x38006000, 0x38008000, 0x3800A000, 0x3800C000, 0x3800E000, 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801A000, 0x3801C000, 0x3801E000,
|
||||
0x38020000, 0x38022000, 0x38024000, 0x38026000, 0x38028000, 0x3802A000, 0x3802C000, 0x3802E000, 0x38030000, 0x38032000, 0x38034000, 0x38036000, 0x38038000, 0x3803A000, 0x3803C000, 0x3803E000,
|
||||
0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804A000, 0x3804C000, 0x3804E000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, 0x38058000, 0x3805A000, 0x3805C000, 0x3805E000,
|
||||
0x38060000, 0x38062000, 0x38064000, 0x38066000, 0x38068000, 0x3806A000, 0x3806C000, 0x3806E000, 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807A000, 0x3807C000, 0x3807E000,
|
||||
0x38080000, 0x38082000, 0x38084000, 0x38086000, 0x38088000, 0x3808A000, 0x3808C000, 0x3808E000, 0x38090000, 0x38092000, 0x38094000, 0x38096000, 0x38098000, 0x3809A000, 0x3809C000, 0x3809E000,
|
||||
0x380A0000, 0x380A2000, 0x380A4000, 0x380A6000, 0x380A8000, 0x380AA000, 0x380AC000, 0x380AE000, 0x380B0000, 0x380B2000, 0x380B4000, 0x380B6000, 0x380B8000, 0x380BA000, 0x380BC000, 0x380BE000,
|
||||
0x380C0000, 0x380C2000, 0x380C4000, 0x380C6000, 0x380C8000, 0x380CA000, 0x380CC000, 0x380CE000, 0x380D0000, 0x380D2000, 0x380D4000, 0x380D6000, 0x380D8000, 0x380DA000, 0x380DC000, 0x380DE000,
|
||||
0x380E0000, 0x380E2000, 0x380E4000, 0x380E6000, 0x380E8000, 0x380EA000, 0x380EC000, 0x380EE000, 0x380F0000, 0x380F2000, 0x380F4000, 0x380F6000, 0x380F8000, 0x380FA000, 0x380FC000, 0x380FE000,
|
||||
0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810A000, 0x3810C000, 0x3810E000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, 0x38118000, 0x3811A000, 0x3811C000, 0x3811E000,
|
||||
0x38120000, 0x38122000, 0x38124000, 0x38126000, 0x38128000, 0x3812A000, 0x3812C000, 0x3812E000, 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813A000, 0x3813C000, 0x3813E000,
|
||||
0x38140000, 0x38142000, 0x38144000, 0x38146000, 0x38148000, 0x3814A000, 0x3814C000, 0x3814E000, 0x38150000, 0x38152000, 0x38154000, 0x38156000, 0x38158000, 0x3815A000, 0x3815C000, 0x3815E000,
|
||||
0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816A000, 0x3816C000, 0x3816E000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, 0x38178000, 0x3817A000, 0x3817C000, 0x3817E000,
|
||||
0x38180000, 0x38182000, 0x38184000, 0x38186000, 0x38188000, 0x3818A000, 0x3818C000, 0x3818E000, 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819A000, 0x3819C000, 0x3819E000,
|
||||
0x381A0000, 0x381A2000, 0x381A4000, 0x381A6000, 0x381A8000, 0x381AA000, 0x381AC000, 0x381AE000, 0x381B0000, 0x381B2000, 0x381B4000, 0x381B6000, 0x381B8000, 0x381BA000, 0x381BC000, 0x381BE000,
|
||||
0x381C0000, 0x381C2000, 0x381C4000, 0x381C6000, 0x381C8000, 0x381CA000, 0x381CC000, 0x381CE000, 0x381D0000, 0x381D2000, 0x381D4000, 0x381D6000, 0x381D8000, 0x381DA000, 0x381DC000, 0x381DE000,
|
||||
0x381E0000, 0x381E2000, 0x381E4000, 0x381E6000, 0x381E8000, 0x381EA000, 0x381EC000, 0x381EE000, 0x381F0000, 0x381F2000, 0x381F4000, 0x381F6000, 0x381F8000, 0x381FA000, 0x381FC000, 0x381FE000,
|
||||
0x38200000, 0x38202000, 0x38204000, 0x38206000, 0x38208000, 0x3820A000, 0x3820C000, 0x3820E000, 0x38210000, 0x38212000, 0x38214000, 0x38216000, 0x38218000, 0x3821A000, 0x3821C000, 0x3821E000,
|
||||
0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822A000, 0x3822C000, 0x3822E000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, 0x38238000, 0x3823A000, 0x3823C000, 0x3823E000,
|
||||
0x38240000, 0x38242000, 0x38244000, 0x38246000, 0x38248000, 0x3824A000, 0x3824C000, 0x3824E000, 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825A000, 0x3825C000, 0x3825E000,
|
||||
0x38260000, 0x38262000, 0x38264000, 0x38266000, 0x38268000, 0x3826A000, 0x3826C000, 0x3826E000, 0x38270000, 0x38272000, 0x38274000, 0x38276000, 0x38278000, 0x3827A000, 0x3827C000, 0x3827E000,
|
||||
0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828A000, 0x3828C000, 0x3828E000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, 0x38298000, 0x3829A000, 0x3829C000, 0x3829E000,
|
||||
0x382A0000, 0x382A2000, 0x382A4000, 0x382A6000, 0x382A8000, 0x382AA000, 0x382AC000, 0x382AE000, 0x382B0000, 0x382B2000, 0x382B4000, 0x382B6000, 0x382B8000, 0x382BA000, 0x382BC000, 0x382BE000,
|
||||
0x382C0000, 0x382C2000, 0x382C4000, 0x382C6000, 0x382C8000, 0x382CA000, 0x382CC000, 0x382CE000, 0x382D0000, 0x382D2000, 0x382D4000, 0x382D6000, 0x382D8000, 0x382DA000, 0x382DC000, 0x382DE000,
|
||||
0x382E0000, 0x382E2000, 0x382E4000, 0x382E6000, 0x382E8000, 0x382EA000, 0x382EC000, 0x382EE000, 0x382F0000, 0x382F2000, 0x382F4000, 0x382F6000, 0x382F8000, 0x382FA000, 0x382FC000, 0x382FE000,
|
||||
0x38300000, 0x38302000, 0x38304000, 0x38306000, 0x38308000, 0x3830A000, 0x3830C000, 0x3830E000, 0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831A000, 0x3831C000, 0x3831E000,
|
||||
0x38320000, 0x38322000, 0x38324000, 0x38326000, 0x38328000, 0x3832A000, 0x3832C000, 0x3832E000, 0x38330000, 0x38332000, 0x38334000, 0x38336000, 0x38338000, 0x3833A000, 0x3833C000, 0x3833E000,
|
||||
0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834A000, 0x3834C000, 0x3834E000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, 0x38358000, 0x3835A000, 0x3835C000, 0x3835E000,
|
||||
0x38360000, 0x38362000, 0x38364000, 0x38366000, 0x38368000, 0x3836A000, 0x3836C000, 0x3836E000, 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837A000, 0x3837C000, 0x3837E000,
|
||||
0x38380000, 0x38382000, 0x38384000, 0x38386000, 0x38388000, 0x3838A000, 0x3838C000, 0x3838E000, 0x38390000, 0x38392000, 0x38394000, 0x38396000, 0x38398000, 0x3839A000, 0x3839C000, 0x3839E000,
|
||||
0x383A0000, 0x383A2000, 0x383A4000, 0x383A6000, 0x383A8000, 0x383AA000, 0x383AC000, 0x383AE000, 0x383B0000, 0x383B2000, 0x383B4000, 0x383B6000, 0x383B8000, 0x383BA000, 0x383BC000, 0x383BE000,
|
||||
0x383C0000, 0x383C2000, 0x383C4000, 0x383C6000, 0x383C8000, 0x383CA000, 0x383CC000, 0x383CE000, 0x383D0000, 0x383D2000, 0x383D4000, 0x383D6000, 0x383D8000, 0x383DA000, 0x383DC000, 0x383DE000,
|
||||
0x383E0000, 0x383E2000, 0x383E4000, 0x383E6000, 0x383E8000, 0x383EA000, 0x383EC000, 0x383EE000, 0x383F0000, 0x383F2000, 0x383F4000, 0x383F6000, 0x383F8000, 0x383FA000, 0x383FC000, 0x383FE000,
|
||||
0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840A000, 0x3840C000, 0x3840E000, 0x38410000, 0x38412000, 0x38414000, 0x38416000, 0x38418000, 0x3841A000, 0x3841C000, 0x3841E000,
|
||||
0x38420000, 0x38422000, 0x38424000, 0x38426000, 0x38428000, 0x3842A000, 0x3842C000, 0x3842E000, 0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843A000, 0x3843C000, 0x3843E000,
|
||||
0x38440000, 0x38442000, 0x38444000, 0x38446000, 0x38448000, 0x3844A000, 0x3844C000, 0x3844E000, 0x38450000, 0x38452000, 0x38454000, 0x38456000, 0x38458000, 0x3845A000, 0x3845C000, 0x3845E000,
|
||||
0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846A000, 0x3846C000, 0x3846E000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, 0x38478000, 0x3847A000, 0x3847C000, 0x3847E000,
|
||||
0x38480000, 0x38482000, 0x38484000, 0x38486000, 0x38488000, 0x3848A000, 0x3848C000, 0x3848E000, 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849A000, 0x3849C000, 0x3849E000,
|
||||
0x384A0000, 0x384A2000, 0x384A4000, 0x384A6000, 0x384A8000, 0x384AA000, 0x384AC000, 0x384AE000, 0x384B0000, 0x384B2000, 0x384B4000, 0x384B6000, 0x384B8000, 0x384BA000, 0x384BC000, 0x384BE000,
|
||||
0x384C0000, 0x384C2000, 0x384C4000, 0x384C6000, 0x384C8000, 0x384CA000, 0x384CC000, 0x384CE000, 0x384D0000, 0x384D2000, 0x384D4000, 0x384D6000, 0x384D8000, 0x384DA000, 0x384DC000, 0x384DE000,
|
||||
0x384E0000, 0x384E2000, 0x384E4000, 0x384E6000, 0x384E8000, 0x384EA000, 0x384EC000, 0x384EE000, 0x384F0000, 0x384F2000, 0x384F4000, 0x384F6000, 0x384F8000, 0x384FA000, 0x384FC000, 0x384FE000,
|
||||
0x38500000, 0x38502000, 0x38504000, 0x38506000, 0x38508000, 0x3850A000, 0x3850C000, 0x3850E000, 0x38510000, 0x38512000, 0x38514000, 0x38516000, 0x38518000, 0x3851A000, 0x3851C000, 0x3851E000,
|
||||
0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852A000, 0x3852C000, 0x3852E000, 0x38530000, 0x38532000, 0x38534000, 0x38536000, 0x38538000, 0x3853A000, 0x3853C000, 0x3853E000,
|
||||
0x38540000, 0x38542000, 0x38544000, 0x38546000, 0x38548000, 0x3854A000, 0x3854C000, 0x3854E000, 0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855A000, 0x3855C000, 0x3855E000,
|
||||
0x38560000, 0x38562000, 0x38564000, 0x38566000, 0x38568000, 0x3856A000, 0x3856C000, 0x3856E000, 0x38570000, 0x38572000, 0x38574000, 0x38576000, 0x38578000, 0x3857A000, 0x3857C000, 0x3857E000,
|
||||
0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858A000, 0x3858C000, 0x3858E000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, 0x38598000, 0x3859A000, 0x3859C000, 0x3859E000,
|
||||
0x385A0000, 0x385A2000, 0x385A4000, 0x385A6000, 0x385A8000, 0x385AA000, 0x385AC000, 0x385AE000, 0x385B0000, 0x385B2000, 0x385B4000, 0x385B6000, 0x385B8000, 0x385BA000, 0x385BC000, 0x385BE000,
|
||||
0x385C0000, 0x385C2000, 0x385C4000, 0x385C6000, 0x385C8000, 0x385CA000, 0x385CC000, 0x385CE000, 0x385D0000, 0x385D2000, 0x385D4000, 0x385D6000, 0x385D8000, 0x385DA000, 0x385DC000, 0x385DE000,
|
||||
0x385E0000, 0x385E2000, 0x385E4000, 0x385E6000, 0x385E8000, 0x385EA000, 0x385EC000, 0x385EE000, 0x385F0000, 0x385F2000, 0x385F4000, 0x385F6000, 0x385F8000, 0x385FA000, 0x385FC000, 0x385FE000,
|
||||
0x38600000, 0x38602000, 0x38604000, 0x38606000, 0x38608000, 0x3860A000, 0x3860C000, 0x3860E000, 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861A000, 0x3861C000, 0x3861E000,
|
||||
0x38620000, 0x38622000, 0x38624000, 0x38626000, 0x38628000, 0x3862A000, 0x3862C000, 0x3862E000, 0x38630000, 0x38632000, 0x38634000, 0x38636000, 0x38638000, 0x3863A000, 0x3863C000, 0x3863E000,
|
||||
0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864A000, 0x3864C000, 0x3864E000, 0x38650000, 0x38652000, 0x38654000, 0x38656000, 0x38658000, 0x3865A000, 0x3865C000, 0x3865E000,
|
||||
0x38660000, 0x38662000, 0x38664000, 0x38666000, 0x38668000, 0x3866A000, 0x3866C000, 0x3866E000, 0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867A000, 0x3867C000, 0x3867E000,
|
||||
0x38680000, 0x38682000, 0x38684000, 0x38686000, 0x38688000, 0x3868A000, 0x3868C000, 0x3868E000, 0x38690000, 0x38692000, 0x38694000, 0x38696000, 0x38698000, 0x3869A000, 0x3869C000, 0x3869E000,
|
||||
0x386A0000, 0x386A2000, 0x386A4000, 0x386A6000, 0x386A8000, 0x386AA000, 0x386AC000, 0x386AE000, 0x386B0000, 0x386B2000, 0x386B4000, 0x386B6000, 0x386B8000, 0x386BA000, 0x386BC000, 0x386BE000,
|
||||
0x386C0000, 0x386C2000, 0x386C4000, 0x386C6000, 0x386C8000, 0x386CA000, 0x386CC000, 0x386CE000, 0x386D0000, 0x386D2000, 0x386D4000, 0x386D6000, 0x386D8000, 0x386DA000, 0x386DC000, 0x386DE000,
|
||||
0x386E0000, 0x386E2000, 0x386E4000, 0x386E6000, 0x386E8000, 0x386EA000, 0x386EC000, 0x386EE000, 0x386F0000, 0x386F2000, 0x386F4000, 0x386F6000, 0x386F8000, 0x386FA000, 0x386FC000, 0x386FE000,
|
||||
0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870A000, 0x3870C000, 0x3870E000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, 0x38718000, 0x3871A000, 0x3871C000, 0x3871E000,
|
||||
0x38720000, 0x38722000, 0x38724000, 0x38726000, 0x38728000, 0x3872A000, 0x3872C000, 0x3872E000, 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873A000, 0x3873C000, 0x3873E000,
|
||||
0x38740000, 0x38742000, 0x38744000, 0x38746000, 0x38748000, 0x3874A000, 0x3874C000, 0x3874E000, 0x38750000, 0x38752000, 0x38754000, 0x38756000, 0x38758000, 0x3875A000, 0x3875C000, 0x3875E000,
|
||||
0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876A000, 0x3876C000, 0x3876E000, 0x38770000, 0x38772000, 0x38774000, 0x38776000, 0x38778000, 0x3877A000, 0x3877C000, 0x3877E000,
|
||||
0x38780000, 0x38782000, 0x38784000, 0x38786000, 0x38788000, 0x3878A000, 0x3878C000, 0x3878E000, 0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879A000, 0x3879C000, 0x3879E000,
|
||||
0x387A0000, 0x387A2000, 0x387A4000, 0x387A6000, 0x387A8000, 0x387AA000, 0x387AC000, 0x387AE000, 0x387B0000, 0x387B2000, 0x387B4000, 0x387B6000, 0x387B8000, 0x387BA000, 0x387BC000, 0x387BE000,
|
||||
0x387C0000, 0x387C2000, 0x387C4000, 0x387C6000, 0x387C8000, 0x387CA000, 0x387CC000, 0x387CE000, 0x387D0000, 0x387D2000, 0x387D4000, 0x387D6000, 0x387D8000, 0x387DA000, 0x387DC000, 0x387DE000,
|
||||
0x387E0000, 0x387E2000, 0x387E4000, 0x387E6000, 0x387E8000, 0x387EA000, 0x387EC000, 0x387EE000, 0x387F0000, 0x387F2000, 0x387F4000, 0x387F6000, 0x387F8000, 0x387FA000, 0x387FC000, 0x387FE000
|
||||
};
|
||||
// Indexed by 'value >> 10'. Entries 32..63 repeat entries 0..31 with bit 31 (0x80000000) set.
// Entries 31 and 63 break the regular 0x00800000 step: 0x47800000 added to the 0x38000000 base of
// the second half of mantissa_table gives 0x7F800000.
static const unsigned int exponent_table[64] = {
|
||||
0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, 0x06000000, 0x06800000, 0x07000000, 0x07800000,
|
||||
0x08000000, 0x08800000, 0x09000000, 0x09800000, 0x0A000000, 0x0A800000, 0x0B000000, 0x0B800000, 0x0C000000, 0x0C800000, 0x0D000000, 0x0D800000, 0x0E000000, 0x0E800000, 0x0F000000, 0x47800000,
|
||||
0x80000000, 0x80800000, 0x81000000, 0x81800000, 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000,
|
||||
0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8A000000, 0x8A800000, 0x8B000000, 0x8B800000, 0x8C000000, 0x8C800000, 0x8D000000, 0x8D800000, 0x8E000000, 0x8E800000, 0x8F000000, 0xC7800000
|
||||
};
|
||||
// Indexed by 'value >> 10': start position within mantissa_table. It is 0 for indices 0 and 32
// and 1024 for every other index.
static const unsigned short offset_table[64] = {
|
||||
0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024,
|
||||
0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024
|
||||
};
|
||||
// Union used to reinterpret the assembled 32-bit integer pattern as a float
ConversionBits bits;
|
||||
// Sum of the mantissa-table entry (selected by offset + lower 10 bits) and the exponent-table entry
bits.i32 = mantissa_table[offset_table[value >> 10] + (value & 0x3FF)] +
|
||||
exponent_table[value >> 10];
|
||||
return bits.f32;
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// CLBLAST_HALF_H_
|
||||
#endif
|
|
@ -67,15 +67,15 @@ class Database {
|
|||
};
|
||||
|
||||
// The database consists of separate database entries, stored together in a vector
|
||||
static const DatabaseEntry XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble;
|
||||
static const DatabaseEntry XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble;
|
||||
static const DatabaseEntry XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble;
|
||||
static const DatabaseEntry XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble;
|
||||
static const DatabaseEntry XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble;
|
||||
static const DatabaseEntry CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble;
|
||||
static const DatabaseEntry PadSingle, PadDouble, PadComplexSingle, PadComplexDouble;
|
||||
static const DatabaseEntry TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble;
|
||||
static const DatabaseEntry PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble;
|
||||
static const DatabaseEntry XaxpyHalf, XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble;
|
||||
static const DatabaseEntry XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble;
|
||||
static const DatabaseEntry XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble;
|
||||
static const DatabaseEntry XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble;
|
||||
static const DatabaseEntry XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble;
|
||||
static const DatabaseEntry CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble;
|
||||
static const DatabaseEntry PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble;
|
||||
static const DatabaseEntry TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble;
|
||||
static const DatabaseEntry PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble;
|
||||
static const std::vector<DatabaseEntry> database;
|
||||
|
||||
// The constructor
|
||||
|
|
|
@ -14,6 +14,24 @@
|
|||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Database entry with the parameters of the "Copy" kernel for half-precision (Precision::kHalf).
// The device groups are listed in the same order as before: the Intel GPU group first, followed by
// the all-devices "default" group.
const Database::DatabaseEntry Database::CopyHalf = {
  "Copy",
  Precision::kHalf,
  {
    // Intel GPUs
    {
      kDeviceTypeGPU,
      "Intel",
      {
        {
          "Intel(R) HD Graphics Skylake ULT GT2",
          { {"COPY_DIMX", 8}, {"COPY_DIMY", 8}, {"COPY_VW", 4}, {"COPY_WPT", 1} }
        },
        {
          "default",
          { {"COPY_DIMX", 8}, {"COPY_DIMY", 8}, {"COPY_VW", 4}, {"COPY_WPT", 1} }
        },
      }
    },
    // Default
    {
      kDeviceTypeAll,
      "default",
      {
        {
          "default",
          { {"COPY_DIMX", 8}, {"COPY_DIMY", 8}, {"COPY_VW", 4}, {"COPY_WPT", 1} }
        },
      }
    },
  }
};
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
const Database::DatabaseEntry Database::CopySingle = {
|
||||
"Copy", Precision::kSingle, {
|
||||
{ // AMD GPUs
|
||||
|
|
|
@ -14,6 +14,24 @@
|
|||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Database entry with the parameters of the "Pad" kernel for half-precision (Precision::kHalf).
// The device groups are listed in the same order as before: the Intel GPU group first, followed by
// the all-devices "default" group.
const Database::DatabaseEntry Database::PadHalf = {
  "Pad",
  Precision::kHalf,
  {
    // Intel GPUs
    {
      kDeviceTypeGPU,
      "Intel",
      {
        {
          "Intel(R) HD Graphics Skylake ULT GT2",
          { {"PAD_DIMX", 8}, {"PAD_DIMY", 8}, {"PAD_WPTX", 2}, {"PAD_WPTY", 1} }
        },
        {
          "default",
          { {"PAD_DIMX", 8}, {"PAD_DIMY", 8}, {"PAD_WPTX", 2}, {"PAD_WPTY", 1} }
        },
      }
    },
    // Default
    {
      kDeviceTypeAll,
      "default",
      {
        {
          "default",
          { {"PAD_DIMX", 8}, {"PAD_DIMY", 8}, {"PAD_WPTX", 2}, {"PAD_WPTY", 1} }
        },
      }
    },
  }
};
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
const Database::DatabaseEntry Database::PadSingle = {
|
||||
"Pad", Precision::kSingle, {
|
||||
{ // AMD GPUs
|
||||
|
|
|
@ -14,6 +14,24 @@
|
|||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Database entry with the parameters of the "Padtranspose" kernel for half-precision
// (Precision::kHalf). The device groups are listed in the same order as before: the Intel GPU
// group first, followed by the all-devices "default" group.
const Database::DatabaseEntry Database::PadtransposeHalf = {
  "Padtranspose",
  Precision::kHalf,
  {
    // Intel GPUs
    {
      kDeviceTypeGPU,
      "Intel",
      {
        {
          "Intel(R) HD Graphics Skylake ULT GT2",
          { {"PADTRA_PAD", 0}, {"PADTRA_TILE", 8}, {"PADTRA_WPT", 1} }
        },
        {
          "default",
          { {"PADTRA_PAD", 0}, {"PADTRA_TILE", 8}, {"PADTRA_WPT", 1} }
        },
      }
    },
    // Default
    {
      kDeviceTypeAll,
      "default",
      {
        {
          "default",
          { {"PADTRA_PAD", 0}, {"PADTRA_TILE", 8}, {"PADTRA_WPT", 1} }
        },
      }
    },
  }
};
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
const Database::DatabaseEntry Database::PadtransposeSingle = {
|
||||
"Padtranspose", Precision::kSingle, {
|
||||
{ // AMD GPUs
|
||||
|
|
|
@ -14,6 +14,24 @@
|
|||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Half-precision (fp16) parameter table for the "Transpose" kernel family. It holds one group for
// Intel GPUs (device-specific row plus vendor default) and one catch-all group.
const Database::DatabaseEntry Database::TransposeHalf = {
  "Transpose", Precision::kHalf, {
    { // Intel GPUs
      kDeviceTypeGPU, "Intel", {
        { "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
        { "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
      }
    },
    { // Default
      kDeviceTypeAll, "default", {
        { "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
      }
    },
  }
};
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
const Database::DatabaseEntry Database::TransposeSingle = {
|
||||
"Transpose", Precision::kSingle, {
|
||||
{ // AMD GPUs
|
||||
|
|
|
@ -14,6 +14,24 @@
|
|||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Half-precision (fp16) parameter table for the "Xaxpy" kernel family. It holds one group for
// Intel GPUs (device-specific row plus vendor default) and one catch-all group.
const Database::DatabaseEntry Database::XaxpyHalf = {
  "Xaxpy", Precision::kHalf, {
    { // Intel GPUs
      kDeviceTypeGPU, "Intel", {
        { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",4}, {"WGS",512}, {"WPT",8} } },
        { "default", { {"VW",4}, {"WGS",512}, {"WPT",8} } },
      }
    },
    { // Default
      kDeviceTypeAll, "default", {
        { "default", { {"VW",4}, {"WGS",512}, {"WPT",8} } },
      }
    },
  }
};
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
const Database::DatabaseEntry Database::XaxpySingle = {
|
||||
"Xaxpy", Precision::kSingle, {
|
||||
{ // AMD GPUs
|
||||
|
|
|
@ -14,6 +14,24 @@
|
|||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Half-precision (fp16) parameter table for the "Xdot" kernel family. It holds one group for
// Intel GPUs (device-specific row plus vendor default) and one catch-all group.
const Database::DatabaseEntry Database::XdotHalf = {
  "Xdot", Precision::kHalf, {
    { // Intel GPUs
      kDeviceTypeGPU, "Intel", {
        { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",32}, {"WGS2",32} } },
        { "default", { {"WGS1",32}, {"WGS2",32} } },
      }
    },
    { // Default
      kDeviceTypeAll, "default", {
        { "default", { {"WGS1",32}, {"WGS2",32} } },
      }
    },
  }
};
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
const Database::DatabaseEntry Database::XdotSingle = {
|
||||
"Xdot", Precision::kSingle, {
|
||||
{ // AMD GPUs
|
||||
|
|
|
@ -14,6 +14,18 @@
|
|||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Half-precision (fp16) parameter table for the "Xgemm" kernel family. Only a single catch-all
// group is present here; there are no vendor- or device-specific rows.
const Database::DatabaseEntry Database::XgemmHalf = {
  "Xgemm", Precision::kHalf, {
    { // Default
      kDeviceTypeAll, "default", {
        { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
      }
    },
  }
};
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
const Database::DatabaseEntry Database::XgemmSingle = {
|
||||
"Xgemm", Precision::kSingle, {
|
||||
{ // AMD GPUs
|
||||
|
|
|
@ -14,6 +14,24 @@
|
|||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Half-precision (fp16) parameter table for the "Xgemv" kernel family. It holds one group for
// Intel GPUs (device-specific row plus vendor default) and one catch-all group. Each row carries
// three sets of parameters, suffixed 1, 2 and 3.
const Database::DatabaseEntry Database::XgemvHalf = {
  "Xgemv", Precision::kHalf, {
    { // Intel GPUs
      kDeviceTypeGPU, "Intel", {
        { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
        { "default", { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
      }
    },
    { // Default
      kDeviceTypeAll, "default", {
        { "default", { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
      }
    },
  }
};
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
const Database::DatabaseEntry Database::XgemvSingle = {
|
||||
"Xgemv", Precision::kSingle, {
|
||||
{ // AMD GPUs
|
||||
|
|
|
@ -14,6 +14,24 @@
|
|||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Half-precision (fp16) parameter table for the "Xger" kernel family. It holds one group for
// Intel GPUs (device-specific row plus vendor default) and one catch-all group.
const Database::DatabaseEntry Database::XgerHalf = {
  "Xger", Precision::kHalf, {
    { // Intel GPUs
      kDeviceTypeGPU, "Intel", {
        { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
        { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
      }
    },
    { // Default
      kDeviceTypeAll, "default", {
        { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
      }
    },
  }
};
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
const Database::DatabaseEntry Database::XgerSingle = {
|
||||
"Xger", Precision::kSingle, {
|
||||
{ // AMD GPUs
|
||||
|
|
|
@ -29,6 +29,7 @@ class Xaxpy: public Routine<T> {
|
|||
using Routine<T>::source_string_;
|
||||
using Routine<T>::queue_;
|
||||
using Routine<T>::event_;
|
||||
using Routine<T>::context_;
|
||||
using Routine<T>::GetProgramFromCache;
|
||||
using Routine<T>::TestVectorX;
|
||||
using Routine<T>::TestVectorY;
|
||||
|
|
|
@ -29,6 +29,7 @@ class Xgemv: public Routine<T> {
|
|||
using Routine<T>::source_string_;
|
||||
using Routine<T>::queue_;
|
||||
using Routine<T>::event_;
|
||||
using Routine<T>::context_;
|
||||
using Routine<T>::GetProgramFromCache;
|
||||
using Routine<T>::TestVectorX;
|
||||
using Routine<T>::TestVectorY;
|
||||
|
|
|
@ -29,6 +29,7 @@ class Xger: public Routine<T> {
|
|||
using Routine<T>::source_string_;
|
||||
using Routine<T>::queue_;
|
||||
using Routine<T>::event_;
|
||||
using Routine<T>::context_;
|
||||
using Routine<T>::GetProgramFromCache;
|
||||
using Routine<T>::TestVectorX;
|
||||
using Routine<T>::TestVectorY;
|
||||
|
|
|
@ -29,6 +29,7 @@ class Xher: public Routine<T> {
|
|||
using Routine<T>::source_string_;
|
||||
using Routine<T>::queue_;
|
||||
using Routine<T>::event_;
|
||||
using Routine<T>::context_;
|
||||
using Routine<T>::GetProgramFromCache;
|
||||
using Routine<T>::TestVectorX;
|
||||
using Routine<T>::TestMatrixA;
|
||||
|
|
|
@ -29,6 +29,7 @@ class Xher2: public Routine<T> {
|
|||
using Routine<T>::source_string_;
|
||||
using Routine<T>::queue_;
|
||||
using Routine<T>::event_;
|
||||
using Routine<T>::context_;
|
||||
using Routine<T>::GetProgramFromCache;
|
||||
using Routine<T>::TestVectorX;
|
||||
using Routine<T>::TestVectorY;
|
||||
|
|
|
@ -20,6 +20,8 @@
|
|||
|
||||
#include <cltune.h>
|
||||
|
||||
#include "internal/utilities.h"
|
||||
|
||||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
|
|
|
@ -22,6 +22,7 @@
|
|||
#include <complex>
|
||||
|
||||
#include "clblast.h"
|
||||
#include "clblast_half.h"
|
||||
#include "internal/clpp11.h"
|
||||
|
||||
namespace clblast {
|
||||
|
@ -94,6 +95,16 @@ constexpr auto kArgNoAbbreviations = "no_abbrv";
|
|||
|
||||
// =================================================================================================
|
||||
|
||||
// Returns a scalar with a default value
|
||||
template <typename T>
|
||||
T GetScalar();
|
||||
|
||||
// Returns a scalar of value 1
|
||||
template <typename T>
|
||||
T ConstantOne();
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Structure containing all possible arguments for test clients, including their default values
|
||||
template <typename T>
|
||||
struct Arguments {
|
||||
|
@ -124,8 +135,8 @@ struct Arguments {
|
|||
size_t nrm2_offset = 0;
|
||||
size_t asum_offset = 0;
|
||||
size_t imax_offset = 0;
|
||||
T alpha = T{1.0};
|
||||
T beta = T{1.0};
|
||||
T alpha = ConstantOne<T>();
|
||||
T beta = ConstantOne<T>();
|
||||
size_t x_size = 1;
|
||||
size_t y_size = 1;
|
||||
size_t a_size = 1;
|
||||
|
@ -202,9 +213,13 @@ void PopulateVector(std::vector<T> &vector);
|
|||
|
||||
// =================================================================================================
|
||||
|
||||
// Returns a scalar with a default value
|
||||
template <typename T>
|
||||
T GetScalar();
|
||||
// Conversion between half and single-precision
|
||||
std::vector<float> HalfToFloatBuffer(const std::vector<half>& source);
|
||||
void FloatToHalfBuffer(std::vector<half>& result, const std::vector<float>& source);
|
||||
|
||||
// As above, but now for OpenCL data-types instead of std::vectors
|
||||
Buffer<float> HalfToFloatBuffer(const Buffer<half>& source, cl_command_queue queue_raw);
|
||||
void FloatToHalfBuffer(Buffer<half>& result, const Buffer<float>& source, cl_command_queue queue_raw);
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
|
|
105
samples/haxpy.c
Normal file
105
samples/haxpy.c
Normal file
|
@ -0,0 +1,105 @@
|
|||
|
||||
// =================================================================================================
|
||||
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
|
||||
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
|
||||
// width of 100 characters per line.
|
||||
//
|
||||
// Author(s):
|
||||
// Cedric Nugteren <www.cedricnugteren.nl>
|
||||
//
|
||||
// This file demonstrates the use of the HAXPY routine. It demonstrates the use of half-precision.
|
||||
//
|
||||
// Note that this example is meant for illustration purposes only. CLBlast provides other programs
|
||||
// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
|
||||
//
|
||||
// =================================================================================================
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
// Includes the CLBlast library (C interface)
|
||||
#include <clblast_c.h>
|
||||
|
||||
// Includes the float-to-half and half-to-float conversion utilities
|
||||
#include <clblast_half.h>
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Example use of the half-precision routine HAXPY
|
||||
int main(void) {
|
||||
|
||||
// OpenCL platform/device settings
|
||||
const size_t platform_id = 0;
|
||||
const size_t device_id = 0;
|
||||
|
||||
// Example HAXPY arguments
|
||||
const size_t n = 8192;
|
||||
const cl_half alpha = FloatToHalf(0.5f);
|
||||
|
||||
// Initializes the OpenCL platform
|
||||
cl_uint num_platforms;
|
||||
clGetPlatformIDs(0, NULL, &num_platforms);
|
||||
cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
|
||||
clGetPlatformIDs(num_platforms, platforms, NULL);
|
||||
cl_platform_id platform = platforms[platform_id];
|
||||
|
||||
// Initializes the OpenCL device
|
||||
cl_uint num_devices;
|
||||
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
|
||||
cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
|
||||
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
|
||||
cl_device_id device = devices[device_id];
|
||||
|
||||
// Creates the OpenCL context, queue, and an event
|
||||
cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
|
||||
cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL);
|
||||
cl_event event = NULL;
|
||||
|
||||
// Populate host vectors with some example data
|
||||
cl_half* host_a = (cl_half*)malloc(sizeof(cl_half)*n);
|
||||
cl_half* host_b = (cl_half*)malloc(sizeof(cl_half)*n);
|
||||
for (size_t i=0; i<n; ++i) { host_a[i] = FloatToHalf(2.2f); }
|
||||
for (size_t i=0; i<n; ++i) { host_b[i] = FloatToHalf(0.4f); }
|
||||
printf("Input values at index 0: alpha * a[0] + b[0] == %.3lf * %.3lf + %.3lf\n",
|
||||
HalfToFloat(alpha), HalfToFloat(host_a[0]), HalfToFloat(host_b[0]));
|
||||
|
||||
// Copy the matrices to the device
|
||||
cl_mem device_a = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(cl_half), NULL, NULL);
|
||||
cl_mem device_b = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(cl_half), NULL, NULL);
|
||||
clEnqueueWriteBuffer(queue, device_a, CL_TRUE, 0, n*sizeof(cl_half), host_a, 0, NULL, NULL);
|
||||
clEnqueueWriteBuffer(queue, device_b, CL_TRUE, 0, n*sizeof(cl_half), host_b, 0, NULL, NULL);
|
||||
|
||||
// Call the HAXPY routine.
|
||||
StatusCode status = CLBlastHaxpy(n, alpha,
|
||||
device_a, 0, 1,
|
||||
device_b, 0, 1,
|
||||
&queue, &event);
|
||||
|
||||
// Wait for completion
|
||||
clWaitForEvents(1, &event);
|
||||
|
||||
// Copies the result back to the host
|
||||
clEnqueueReadBuffer(queue, device_b, CL_TRUE, 0, n*sizeof(cl_half), host_b, 0, NULL, NULL);
|
||||
|
||||
// Example completed. See "clblast_c.h" for status codes (0 -> success).
|
||||
printf("Completed HAXPY with status %d\n", status);
|
||||
|
||||
// Prints the first output value
|
||||
if (status == 0) {
|
||||
printf("Output value at index 0: b[0] = %.3lf\n", HalfToFloat(host_b[0]));
|
||||
}
|
||||
|
||||
// Clean-up
|
||||
free(platforms);
|
||||
free(devices);
|
||||
free(host_a);
|
||||
free(host_b);
|
||||
clReleaseMemObject(device_a);
|
||||
clReleaseMemObject(device_b);
|
||||
clReleaseCommandQueue(queue);
|
||||
clReleaseContext(context);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// =================================================================================================
|
|
@ -189,13 +189,20 @@ def GetFooter():
|
|||
|
||||
# The start of a new C++ precision entry
|
||||
def GetPrecision(family, precision):
    """Return the opening lines of the C++ database entry for one kernel family and precision.

    Args:
        family: name of the kernel family; it is title-cased for use in the C++ identifier
            and in the quoted family name.
        precision: precision code as a string, one of "16", "32", "64", "3232" or "6464".

    Returns:
        The C++ source text that starts the `Database::<Family><Precision>` definition.

    Any other precision code prints an error message and terminates the script through
    sys.exit().
    """
    # Maps the precision code onto the suffix used both in the C++ entry name and in the
    # Precision::k<...> enumerator.
    precision_names = {
        "16": "Half",
        "32": "Single",
        "64": "Double",
        "3232": "ComplexSingle",
        "6464": "ComplexDouble",
    }
    if precision not in precision_names:
        print("[ERROR] Unknown precision")
        sys.exit()
    precisionstring = precision_names[precision]
    return("\n\nconst Database::DatabaseEntry Database::%s%s = {\n \"%s\", Precision::k%s, {\n"
           % (family.title(), precisionstring, family.title(), precisionstring))
|
||||
|
||||
|
|
|
@ -13,10 +13,13 @@
|
|||
# ==================================================================================================
|
||||
|
||||
# Short-hands for data-types
|
||||
HLF = "half"
|
||||
FLT = "float"
|
||||
DBL = "double"
|
||||
FLT2 = "float2"
|
||||
DBL2 = "double2"
|
||||
|
||||
HCL = "cl_half"
|
||||
F2CL = "cl_float2"
|
||||
D2CL = "cl_double2"
|
||||
|
||||
|
|
|
@ -28,11 +28,12 @@ import os.path
|
|||
|
||||
# Local files
|
||||
from routine import Routine
|
||||
from datatype import DataType, FLT, DBL, FLT2, DBL2, F2CL, D2CL
|
||||
from datatype import DataType, HLF, FLT, DBL, FLT2, DBL2, HCL, F2CL, D2CL
|
||||
|
||||
# ==================================================================================================
|
||||
|
||||
# Regular data-types
|
||||
H = DataType("H", "H", HLF, [HLF, HLF, HCL, HCL], HLF ) # half (16)
|
||||
S = DataType("S", "S", FLT, [FLT, FLT, FLT, FLT], FLT ) # single (32)
|
||||
D = DataType("D", "D", DBL, [DBL, DBL, DBL, DBL], DBL ) # double (64)
|
||||
C = DataType("C", "C", FLT2, [FLT2, FLT2, F2CL, F2CL], FLT2) # single-complex (3232)
|
||||
|
@ -41,6 +42,7 @@ Z = DataType("Z", "Z", DBL2, [DBL2, DBL2, D2CL, D2CL], DBL2) # double-complex (6
|
|||
# Special cases
|
||||
Sc = DataType("C", "Sc", FLT2, [FLT2, FLT2, FLT2, FLT2], FLT2) # As C, but with real output
|
||||
Dz = DataType("Z", "Dz", DBL2, [DBL2, DBL2, DBL2, DBL2], DBL2) # As Z, but with real output
|
||||
iH = DataType("H", "iH", HLF, [HLF, HLF, HLF, HLF], HLF ) # As H, but with integer output
|
||||
iS = DataType("S", "iS", FLT, [FLT, FLT, FLT, FLT], FLT ) # As S, but with integer output
|
||||
iD = DataType("D", "iD", DBL, [DBL, DBL, DBL, DBL], DBL ) # As D, but with integer output
|
||||
iC = DataType("C", "iC", FLT2, [FLT2, FLT2, F2CL, F2CL], FLT2) # As C, but with integer output
|
||||
|
@ -60,62 +62,62 @@ TU = DataType("TU", "typename T, typename U", "T,U", ["T", "U", "T", "U"], "T")
|
|||
# Populates a list of routines
|
||||
routines = [
|
||||
[ # Level 1: vector-vector
|
||||
Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation", "", []),
|
||||
Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], [], "", "Generate modified givens plane rotation", "", []),
|
||||
Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation", "", []),
|
||||
Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [], "", "Apply modified givens plane rotation", "", []),
|
||||
Routine(True, True, "1", "swap", T, [S,D,C,Z], ["n"], [], [], ["x","y"], [], "", "Swap two vectors", "Interchanges the contents of vectors x and y.", []),
|
||||
Routine(True, True, "1", "scal", T, [S,D,C,Z], ["n"], [], [], ["x"], ["alpha"], "", "Vector scaling", "Multiplies all elements of vector x by a scalar constant alpha.", []),
|
||||
Routine(True, True, "1", "copy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], [], "", "Vector copy", "Copies the contents of vector x into vector y.", []),
|
||||
Routine(True, True, "1", "axpy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation y = alpha * x + y, in which x and y are vectors and alpha is a scalar constant.", []),
|
||||
Routine(True, True, "1", "dot", T, [S,D], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two vectors", "Multiplies the vectors x and y element-wise and accumulates the results. The sum is stored in the dot buffer.", []),
|
||||
Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []),
|
||||
Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []),
|
||||
Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz],["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of each element in the x vector and takes the square root. The resulting L2 norm is stored in the nrm2 buffer.", []),
|
||||
Routine(True, True, "1", "asum", T, [S,D,Sc,Dz],["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of each element in the x vector. The results are stored in the asum buffer.", []),
|
||||
Routine(True, False, "1", "sum", T, [S,D,Sc,Dz],["n"], [], ["x"], ["sum"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of each element in the x vector. The results are stored in the sum buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []),
|
||||
Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the x vector. The resulting integer index is stored in the imax buffer.", []),
|
||||
Routine(True, False, "1", "max", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the x vector. The resulting integer index is stored in the imax buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []),
|
||||
Routine(True, False, "1", "min", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imin"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the x vector. The resulting integer index is stored in the imin buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []),
|
||||
Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation", "", []),
|
||||
Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], [], "", "Generate modified givens plane rotation", "", []),
|
||||
Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation", "", []),
|
||||
Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [], "", "Apply modified givens plane rotation", "", []),
|
||||
Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], [], "", "Swap two vectors", "Interchanges the contents of vectors x and y.", []),
|
||||
Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], ["alpha"], "", "Vector scaling", "Multiplies all elements of vector x by a scalar constant alpha.", []),
|
||||
Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [], "", "Vector copy", "Copies the contents of vector x into vector y.", []),
|
||||
Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation y = alpha * x + y, in which x and y are vectors and alpha is a scalar constant.", []),
|
||||
Routine(True, True, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two vectors", "Multiplies the vectors x and y element-wise and accumulates the results. The sum is stored in the dot buffer.", []),
|
||||
Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []),
|
||||
Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []),
|
||||
Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of each element in the x vector and takes the square root. The resulting L2 norm is stored in the nrm2 buffer.", []),
|
||||
Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of each element in the x vector. The results are stored in the asum buffer.", []),
|
||||
Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of each element in the x vector. The results are stored in the sum buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []),
|
||||
Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the x vector. The resulting integer index is stored in the imax buffer.", []),
|
||||
Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the x vector. The resulting integer index is stored in the imax buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []),
|
||||
Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the x vector. The resulting integer index is stored in the imin buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []),
|
||||
],
|
||||
[ # Level 2: matrix-vector
|
||||
Routine(True, True, "2a", "gemv", T, [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation y = alpha * A * x + beta * y, in which x is an input vector, y is an input and output vector, A is an input matrix, and alpha and beta are scalars. The matrix A can optionally be transposed before performing the operation.", []),
|
||||
Routine(True, True, "2a", "gbmv", T, [S,D,C,Z], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is banded instead.", []),
|
||||
Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix A is an Hermitian matrix instead.", []),
|
||||
Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is an Hermitian banded matrix instead.", []),
|
||||
Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix A is an Hermitian packed matrix instead and represented as AP.", []),
|
||||
Routine(True, True, "2a", "symv", T, [S,D], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix A is symmetric instead.", []),
|
||||
Routine(True, True, "2a", "sbmv", T, [S,D], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is symmetric and banded instead.", []),
|
||||
Routine(True, True, "2a", "spmv", T, [S,D], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix A is a symmetric packed matrix instead and represented as AP.", []),
|
||||
Routine(True, True, "2a", "trmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix A is triangular instead.", []),
|
||||
Routine(True, True, "2a", "tbmv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is triangular and banded instead.", []),
|
||||
Routine(True, True, "2a", "tpmv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix A is a triangular packed matrix instead and repreented as AP.", []),
|
||||
Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a triangular system of equations", "", []),
|
||||
Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a banded triangular system of equations", "", []),
|
||||
Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "", "Solves a packed triangular system of equations", "", []),
|
||||
Routine(True, True, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation y = alpha * A * x + beta * y, in which x is an input vector, y is an input and output vector, A is an input matrix, and alpha and beta are scalars. The matrix A can optionally be transposed before performing the operation.", []),
|
||||
Routine(True, True, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is banded instead.", []),
|
||||
Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix A is an Hermitian matrix instead.", []),
|
||||
Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is an Hermitian banded matrix instead.", []),
|
||||
Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix A is an Hermitian packed matrix instead and represented as AP.", []),
|
||||
Routine(True, True, "2a", "symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix A is symmetric instead.", []),
|
||||
Routine(True, True, "2a", "sbmv", T, [S,D,H], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is symmetric and banded instead.", []),
|
||||
Routine(True, True, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix A is a symmetric packed matrix instead and represented as AP.", []),
|
||||
Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix A is triangular instead.", []),
|
||||
Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is triangular and banded instead.", []),
|
||||
Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix A is a triangular packed matrix instead and repreented as AP.", []),
|
||||
Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a triangular system of equations", "", []),
|
||||
Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a banded triangular system of equations", "", []),
|
||||
Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "", "Solves a packed triangular system of equations", "", []),
|
||||
# Level 2: matrix update
|
||||
Routine(True, True, "2b", "ger", T, [S,D], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 matrix update", "", []),
|
||||
Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex matrix update", "", []),
|
||||
Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex conjugated matrix update", "", []),
|
||||
Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Hermitian rank-1 matrix update", "", []),
|
||||
Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Hermitian packed rank-1 matrix update", "", []),
|
||||
Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Hermitian rank-2 matrix update", "", []),
|
||||
Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Hermitian packed rank-2 matrix update", "", []),
|
||||
Routine(True, True, "2b", "syr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Symmetric rank-1 matrix update", "", []),
|
||||
Routine(True, True, "2b", "spr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Symmetric packed rank-1 matrix update", "", []),
|
||||
Routine(True, True, "2b", "syr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Symmetric rank-2 matrix update", "", []),
|
||||
Routine(True, True, "2b", "spr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Symmetric packed rank-2 matrix update", "", []),
|
||||
Routine(True, True, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 matrix update", "", []),
|
||||
Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex matrix update", "", []),
|
||||
Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex conjugated matrix update", "", []),
|
||||
Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Hermitian rank-1 matrix update", "", []),
|
||||
Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Hermitian packed rank-1 matrix update", "", []),
|
||||
Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Hermitian rank-2 matrix update", "", []),
|
||||
Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Hermitian packed rank-2 matrix update", "", []),
|
||||
Routine(True, True, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Symmetric rank-1 matrix update", "", []),
|
||||
Routine(True, True, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Symmetric packed rank-1 matrix update", "", []),
|
||||
Routine(True, True, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Symmetric rank-2 matrix update", "", []),
|
||||
Routine(True, True, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Symmetric packed rank-2 matrix update", "", []),
|
||||
],
|
||||
[ # Level 3: matrix-matrix
|
||||
Routine(True, True, "3", "gemm", T, [S,D,C,Z], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "General matrix-matrix multiplication", "", []),
|
||||
Routine(True, True, "3", "symm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "", []),
|
||||
Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "", []),
|
||||
Routine(True, True, "3", "syrk", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "", []),
|
||||
Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "", []),
|
||||
Routine(True, True, "3", "syr2k", T, [S,D,C,Z], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "", []),
|
||||
Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "", []),
|
||||
Routine(True, True, "3", "trmm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Triangular matrix-matrix multiplication", "", []),
|
||||
Routine(False, True, "3", "trsm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Solves a triangular system of equations", "", []),
|
||||
Routine(True, True, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "General matrix-matrix multiplication", "", []),
|
||||
Routine(True, True, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "", []),
|
||||
Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "", []),
|
||||
Routine(True, True, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "", []),
|
||||
Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "", []),
|
||||
Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "", []),
|
||||
Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "", []),
|
||||
Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Triangular matrix-matrix multiplication", "", []),
|
||||
Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Solves a triangular system of equations", "", []),
|
||||
]]
|
||||
|
||||
# ==================================================================================================
|
||||
|
@ -229,21 +231,45 @@ def wrapper_clblas(routines):
|
|||
result = ""
|
||||
for routine in routines:
|
||||
if routine.has_tests:
|
||||
result += "\n// Forwards the clBLAS calls for %s\n" % (routine.ShortNames())
|
||||
result += "\n// Forwards the clBLAS calls for %s\n" % (routine.ShortNamesTested())
|
||||
if routine.NoScalars():
|
||||
result += routine.RoutineHeaderWrapperCL(routine.template, True, 21)+";\n"
|
||||
for flavour in routine.flavours:
|
||||
indent = " "*(17 + routine.Length())
|
||||
result += routine.RoutineHeaderWrapperCL(flavour, False, 21)+" {\n"
|
||||
arguments = routine.ArgumentsWrapperCL(flavour)
|
||||
if routine.scratch:
|
||||
result += " auto queue = Queue(queues[0]);\n"
|
||||
result += " auto context = queue.GetContext();\n"
|
||||
result += " auto scratch_buffer = Buffer<"+flavour.template+">(context, "+routine.scratch+");\n"
|
||||
arguments += ["scratch_buffer()"]
|
||||
result += " return clblas"+flavour.name+routine.name+"("
|
||||
result += (",\n"+indent).join([a for a in arguments])
|
||||
result += ",\n"+indent+"num_queues, queues, num_wait_events, wait_events, events);"
|
||||
|
||||
# There is a version available in clBLAS
|
||||
if flavour.precision_name in ["S","D","C","Z"]:
|
||||
indent = " "*(17 + routine.Length())
|
||||
arguments = routine.ArgumentsWrapperCL(flavour)
|
||||
if routine.scratch:
|
||||
result += " auto queue = Queue(queues[0]);\n"
|
||||
result += " auto context = queue.GetContext();\n"
|
||||
result += " auto scratch_buffer = Buffer<"+flavour.template+">(context, "+routine.scratch+");\n"
|
||||
arguments += ["scratch_buffer()"]
|
||||
result += " return clblas"+flavour.name+routine.name+"("
|
||||
result += (",\n"+indent).join([a for a in arguments])
|
||||
result += ",\n"+indent+"num_queues, queues, num_wait_events, wait_events, events);"
|
||||
|
||||
# There is no clBLAS available, forward the call to one of the available functions
|
||||
else: # Half-precision
|
||||
indent = " "*(24 + routine.Length())
|
||||
|
||||
# Convert to float (note: also integer buffers are stored as half/float)
|
||||
for buf in routine.inputs + routine.outputs:
|
||||
result += " auto "+buf+"_buffer_bis = HalfToFloatBuffer("+buf+"_buffer, queues[0]);\n"
|
||||
|
||||
# Call the float routine
|
||||
result += " auto status = clblasX"+routine.name+"("
|
||||
result += (",\n"+indent).join([a for a in routine.ArgumentsHalf()])
|
||||
result += ",\n"+indent+"num_queues, queues, num_wait_events, wait_events, events);"
|
||||
result += "\n"
|
||||
|
||||
# Convert back to half
|
||||
for buf in routine.outputs:
|
||||
result += " FloatToHalfBuffer("+buf+"_buffer, "+buf+"_buffer_bis, queues[0]);\n"
|
||||
result += " return status;"
|
||||
|
||||
# Complete
|
||||
result += "\n}\n"
|
||||
return result
|
||||
|
||||
|
@ -252,44 +278,66 @@ def wrapper_cblas(routines):
|
|||
result = ""
|
||||
for routine in routines:
|
||||
if routine.has_tests:
|
||||
result += "\n// Forwards the Netlib BLAS calls for %s\n" % (routine.ShortNames())
|
||||
result += "\n// Forwards the Netlib BLAS calls for %s\n" % (routine.ShortNamesTested())
|
||||
for flavour in routine.flavours:
|
||||
indent = " "*(10 + routine.Length())
|
||||
result += routine.RoutineHeaderWrapperC(flavour, False, 12)+" {\n"
|
||||
arguments = routine.ArgumentsWrapperC(flavour)
|
||||
|
||||
# Double-precision scalars
|
||||
for scalar in routine.scalars:
|
||||
if flavour.IsComplex(scalar):
|
||||
result += " const auto "+scalar+"_array = std::vector<"+flavour.buffertype[:-1]+">{"+scalar+".real(), "+scalar+".imag()};\n"
|
||||
# There is a version available in CBLAS
|
||||
if flavour.precision_name in ["S","D","C","Z"]:
|
||||
indent = " "*(10 + routine.Length())
|
||||
arguments = routine.ArgumentsWrapperC(flavour)
|
||||
|
||||
# Special case for scalar outputs
|
||||
assignment = ""
|
||||
postfix = ""
|
||||
endofline = ""
|
||||
extra_argument = ""
|
||||
for output_buffer in routine.outputs:
|
||||
if output_buffer in routine.ScalarBuffersFirst():
|
||||
if flavour in [C,Z]:
|
||||
postfix += "_sub"
|
||||
indent += " "
|
||||
extra_argument += ",\n"+indent+"reinterpret_cast<return_pointer_"+flavour.buffertype[:-1]+">(&"+output_buffer+"_buffer["+output_buffer+"_offset])"
|
||||
elif output_buffer in routine.IndexBuffers():
|
||||
assignment = "((int*)&"+output_buffer+"_buffer[0])["+output_buffer+"_offset] = "
|
||||
indent += " "*len(assignment)
|
||||
else:
|
||||
assignment = output_buffer+"_buffer["+output_buffer+"_offset]"
|
||||
if (flavour.name in ["Sc","Dz"]):
|
||||
assignment = assignment+".real("
|
||||
endofline += ")"
|
||||
# Complex scalars
|
||||
for scalar in routine.scalars:
|
||||
if flavour.IsComplex(scalar):
|
||||
result += " const auto "+scalar+"_array = std::vector<"+flavour.buffertype[:-1]+">{"+scalar+".real(), "+scalar+".imag()};\n"
|
||||
|
||||
# Special case for scalar outputs
|
||||
assignment = ""
|
||||
postfix = ""
|
||||
endofline = ""
|
||||
extra_argument = ""
|
||||
for output_buffer in routine.outputs:
|
||||
if output_buffer in routine.ScalarBuffersFirst():
|
||||
if flavour in [C,Z]:
|
||||
postfix += "_sub"
|
||||
indent += " "
|
||||
extra_argument += ",\n"+indent+"reinterpret_cast<return_pointer_"+flavour.buffertype[:-1]+">(&"+output_buffer+"_buffer["+output_buffer+"_offset])"
|
||||
elif output_buffer in routine.IndexBuffers():
|
||||
assignment = "((int*)&"+output_buffer+"_buffer[0])["+output_buffer+"_offset] = "
|
||||
indent += " "*len(assignment)
|
||||
else:
|
||||
assignment = assignment+" = "
|
||||
indent += " "*len(assignment)
|
||||
assignment = output_buffer+"_buffer["+output_buffer+"_offset]"
|
||||
if (flavour.name in ["Sc","Dz"]):
|
||||
assignment = assignment+".real("
|
||||
endofline += ")"
|
||||
else:
|
||||
assignment = assignment+" = "
|
||||
indent += " "*len(assignment)
|
||||
|
||||
result += " "+assignment+"cblas_"+flavour.name.lower()+routine.name+postfix+"("
|
||||
result += (",\n"+indent).join([a for a in arguments])
|
||||
result += extra_argument+endofline+");"
|
||||
result += "\n}\n"
|
||||
result += " "+assignment+"cblas_"+flavour.name.lower()+routine.name+postfix+"("
|
||||
result += (",\n"+indent).join([a for a in arguments])
|
||||
result += extra_argument+endofline+");\n"
|
||||
|
||||
# There is no CBLAS available, forward the call to one of the available functions
|
||||
else: # Half-precision
|
||||
indent = " "*(9 + routine.Length())
|
||||
|
||||
# Convert to float (note: also integer buffers are stored as half/float)
|
||||
for buf in routine.inputs + routine.outputs:
|
||||
result += " auto "+buf+"_buffer_bis = HalfToFloatBuffer("+buf+"_buffer);\n"
|
||||
|
||||
# Call the float routine
|
||||
result += " cblasX"+routine.name+"("
|
||||
result += (",\n"+indent).join([a for a in routine.ArgumentsHalf()])
|
||||
result += ");\n"
|
||||
|
||||
# Convert back to half
|
||||
for buf in routine.outputs:
|
||||
result += " FloatToHalfBuffer("+buf+"_buffer, "+buf+"_buffer_bis);\n"
|
||||
|
||||
# Complete
|
||||
result += "}\n"
|
||||
return result
|
||||
|
||||
# ==================================================================================================
|
||||
|
|
|
@ -99,6 +99,18 @@ class Routine():
|
|||
def IndexBuffers(self):
|
||||
return ["imax","imin"]
|
||||
|
||||
# Lists of the input/output buffers, excluding the index (integer) buffers.
# This variant returns the input buffer names: a fresh list is built from self.inputs and the
# first occurrence of every name reported by IndexBuffers() is dropped from it.
def NonIndexInputs(self):
    remaining = list(self.inputs)  # work on a copy, self.inputs stays untouched
    for index_name in self.IndexBuffers():
        try:
            remaining.remove(index_name)
        except ValueError:
            pass  # this index buffer is not among the inputs
    return remaining
|
||||
# Returns the output buffer names without the index (integer) buffers: a fresh list is built
# from self.outputs and the first occurrence of every name reported by IndexBuffers() is dropped.
def NonIndexOutputs(self):
    remaining = list(self.outputs)  # work on a copy, self.outputs stays untouched
    for index_name in self.IndexBuffers():
        try:
            remaining.remove(index_name)
        except ValueError:
            pass  # this index buffer is not among the outputs
    return remaining
|
||||
|
||||
# List of buffers without 'inc' or 'ld'
|
||||
def BuffersWithoutLdInc(self):
|
||||
return self.ScalarBuffersFirst() + self.ScalarBuffersSecond() + ["ap"]
|
||||
|
@ -119,6 +131,12 @@ class Routine():
|
|||
# Joins the per-flavour routine names (flavour name followed by the upper-cased routine name)
# into a single '/'-separated string, e.g. "SAXPY/DAXPY".
def ShortNames(self):
    labels = []
    for flavour in self.flavours:
        labels.append(flavour.name + self.name.upper())
    return "/".join(labels)
|
||||
|
||||
# Joins the per-flavour routine names into a '/'-separated string, but leaves out the
# half-precision entry ("H" followed by the upper-cased routine name) when it is present.
def ShortNamesTested(self):
    labels = []
    for flavour in self.flavours:
        labels.append(flavour.name + self.name.upper())
    half_label = "H" + self.name.upper()
    try:
        labels.remove(half_label)  # drops the first occurrence only
    except ValueError:
        pass  # no half-precision flavour for this routine
    return "/".join(labels)
|
||||
|
||||
# Determines which buffers go first (between alpha and beta) and which ones go after
|
||||
def BuffersFirst(self):
|
||||
if self.level == "2b":
|
||||
|
@ -146,6 +164,17 @@ class Routine():
|
|||
return [", ".join(a+b+c)]
|
||||
return []
|
||||
|
||||
# Retrieves the argument string of a buffer, using the '_bis' suffixed buffer name:
# "<name>_buffer_bis, <name>_offset" plus "<name>_<postfix>" unless the buffer is listed by
# BuffersWithoutLdInc(). Names that are neither an input nor an output yield an empty list.
# Index buffers are not special-cased: they go through the same path as all other buffers.
def BufferBis(self, name):
    if (name not in self.inputs) and (name not in self.outputs):
        return []
    parts = [name+"_buffer_bis", name+"_offset"]
    if name not in self.BuffersWithoutLdInc():
        parts.append(name+"_"+self.Postfix(name))
    return [", ".join(parts)]
|
||||
|
||||
# As above but with data-types
|
||||
def BufferDef(self, name):
|
||||
prefix = "const " if (name in self.inputs) else ""
|
||||
|
@ -156,6 +185,16 @@ class Routine():
|
|||
return [", ".join(a+b+c)]
|
||||
return []
|
||||
|
||||
# Retrieves the typed declaration of a buffer argument, with the buffer declared as a
# "Buffer<type>&" reference (type taken from flavour.buffertype; 'const' for input buffers).
# The declaration consists of the buffer, its offset and - unless the buffer is listed by
# BuffersWithoutLdInc() - its "<name>_<postfix>" size argument. Names that are neither an input
# nor an output yield an empty list.
def BufferDefWrapperCL(self, name, flavour):
    is_input = name in self.inputs
    if not (is_input or (name in self.outputs)):
        return []
    qualifier = "const " if is_input else ""
    declarations = [qualifier+"Buffer<"+flavour.buffertype+">& "+name+"_buffer",
                    "const size_t "+name+"_offset"]
    if name not in self.BuffersWithoutLdInc():
        declarations.append("const size_t "+name+"_"+self.Postfix(name))
    return [", ".join(declarations)]
|
||||
|
||||
# As above but as vectors
|
||||
def BufferDefVector(self, name, flavour):
|
||||
prefix = "const " if (name in self.inputs) else ""
|
||||
|
@ -179,7 +218,7 @@ class Routine():
|
|||
# As above but with a static cast for clBLAS wrapper
|
||||
def BufferWrapperCL(self, name):
|
||||
if (name in self.inputs) or (name in self.outputs):
|
||||
a = [name+"_buffer"]
|
||||
a = [name+"_buffer()"]
|
||||
b = [name+"_offset"]
|
||||
c = []
|
||||
if (name in ["x","y"]):
|
||||
|
@ -238,6 +277,12 @@ class Routine():
|
|||
return [name]
|
||||
return []
|
||||
|
||||
# Retrieves the name of a scalar (alpha/beta) wrapped in a HalfToFloat(...) conversion call.
# Names that are not among this routine's scalars yield an empty list.
def ScalarHalfToFloat(self, name):
    return ["HalfToFloat("+name+")"] if name in self.scalars else []
|
||||
|
||||
# Retrieves the use of a scalar (alpha/beta)
|
||||
def ScalarUse(self, name, flavour):
|
||||
if name in self.scalars:
|
||||
|
@ -248,7 +293,7 @@ class Routine():
|
|||
return [name]
|
||||
return []
|
||||
|
||||
# Retrieves the use of a scalar (alpha/beta)
|
||||
# As above, but for the clBLAS wrapper
|
||||
def ScalarUseWrapper(self, name, flavour):
|
||||
if name in self.scalars:
|
||||
if name == "alpha":
|
||||
|
@ -258,7 +303,7 @@ class Routine():
|
|||
return [name]
|
||||
return []
|
||||
|
||||
# Retrieves the use of a scalar for CBLAS (alpha/beta)
|
||||
# As above, but for the CBLAS wrapper
|
||||
def ScalarUseWrapperC(self, name, flavour):
|
||||
if name in self.scalars:
|
||||
if flavour.IsComplex(name):
|
||||
|
@ -377,6 +422,28 @@ class Routine():
|
|||
|
||||
# ==============================================================================================
|
||||
|
||||
# Retrieves a combination of all the argument names (no types), in this order: options, sizes,
# the first scalar buffers, alpha, the first buffers, beta, the second buffers, the second
# scalar buffers and finally the other scalars.
def Arguments(self):

    # Flattens the per-buffer argument lists of the given buffer names into one list
    def expand(buffer_names):
        return list(chain.from_iterable(self.Buffer(b) for b in buffer_names))

    arguments = self.Options() + self.Sizes()
    arguments = arguments + expand(self.ScalarBuffersFirst())
    arguments = arguments + self.Scalar("alpha")
    arguments = arguments + expand(self.BuffersFirst())
    arguments = arguments + self.Scalar("beta")
    arguments = arguments + expand(self.BuffersSecond())
    arguments = arguments + expand(self.ScalarBuffersSecond())
    arguments = arguments + list(chain.from_iterable(self.Scalar(s) for s in self.OtherScalars()))
    return arguments
|
||||
|
||||
# Retrieves a combination of all the argument names (no types) for forwarding a half-precision
# call: the order is options, sizes, first scalar buffers, alpha, first buffers, beta, second
# buffers, second scalar buffers and other scalars. Buffers are named through BufferBis() and
# alpha/beta go through ScalarHalfToFloat(); the other scalars are passed by their plain name.
def ArgumentsHalf(self):

    # Flattens the per-buffer '_bis' argument lists of the given buffer names into one list
    def expand(buffer_names):
        return list(chain.from_iterable(self.BufferBis(b) for b in buffer_names))

    arguments = self.Options() + self.Sizes()
    arguments = arguments + expand(self.ScalarBuffersFirst())
    arguments = arguments + self.ScalarHalfToFloat("alpha")
    arguments = arguments + expand(self.BuffersFirst())
    arguments = arguments + self.ScalarHalfToFloat("beta")
    arguments = arguments + expand(self.BuffersSecond())
    arguments = arguments + expand(self.ScalarBuffersSecond())
    arguments = arguments + list(chain.from_iterable(self.Scalar(s) for s in self.OtherScalars()))
    return arguments
|
||||
|
||||
# Retrieves a combination of all the argument names, with Claduc casts
|
||||
def ArgumentsCladuc(self, flavour, indent):
|
||||
return (self.Options() + self.Sizes() +
|
||||
|
@ -388,7 +455,7 @@ class Routine():
|
|||
list(chain(*[self.BufferCladuc(b) for b in self.ScalarBuffersSecond()])) +
|
||||
list(chain(*[self.Scalar(s) for s in self.OtherScalars()])))
|
||||
|
||||
# Retrieves a combination of all the argument names, with CLBlast casts
|
||||
# As above, but with CLBlast casts
|
||||
def ArgumentsCast(self, flavour, indent):
|
||||
return (self.OptionsCast(indent) + self.Sizes() +
|
||||
list(chain(*[self.Buffer(b) for b in self.ScalarBuffersFirst()])) +
|
||||
|
@ -434,12 +501,12 @@ class Routine():
|
|||
# As above, but clBLAS wrapper plain datatypes
|
||||
def ArgumentsDefWrapperCL(self, flavour):
|
||||
return (self.OptionsDefWrapperCL() + self.SizesDef() +
|
||||
list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersFirst()])) +
|
||||
list(chain(*[self.BufferDefWrapperCL(b, flavour) for b in self.ScalarBuffersFirst()])) +
|
||||
self.ScalarDefPlain("alpha", flavour) +
|
||||
list(chain(*[self.BufferDef(b) for b in self.BuffersFirst()])) +
|
||||
list(chain(*[self.BufferDefWrapperCL(b, flavour) for b in self.BuffersFirst()])) +
|
||||
self.ScalarDefPlain("beta", flavour) +
|
||||
list(chain(*[self.BufferDef(b) for b in self.BuffersSecond()])) +
|
||||
list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersSecond()])) +
|
||||
list(chain(*[self.BufferDefWrapperCL(b, flavour) for b in self.BuffersSecond()])) +
|
||||
list(chain(*[self.BufferDefWrapperCL(b, flavour) for b in self.ScalarBuffersSecond()])) +
|
||||
list(chain(*[self.ScalarDefPlain(s, flavour) for s in self.OtherScalars()])))
|
||||
|
||||
# As above, but CBLAS wrapper plain datatypes
|
||||
|
|
237
src/clblast.cc
237
src/clblast.cc
|
@ -160,7 +160,7 @@ template StatusCode PUBLIC_API Rotm<double>(const size_t,
|
|||
cl_mem, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP
|
||||
// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP
|
||||
template <typename T>
|
||||
StatusCode Swap(const size_t n,
|
||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
|
@ -190,8 +190,12 @@ template StatusCode PUBLIC_API Swap<double2>(const size_t,
|
|||
cl_mem, const size_t, const size_t,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
template StatusCode PUBLIC_API Swap<half>(const size_t,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL
|
||||
// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL
|
||||
template <typename T>
|
||||
StatusCode Scal(const size_t n,
|
||||
const T alpha,
|
||||
|
@ -221,8 +225,12 @@ template StatusCode PUBLIC_API Scal<double2>(const size_t,
|
|||
const double2,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
template StatusCode PUBLIC_API Scal<half>(const size_t,
|
||||
const half,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY
|
||||
// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY
|
||||
template <typename T>
|
||||
StatusCode Copy(const size_t n,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
|
@ -252,8 +260,12 @@ template StatusCode PUBLIC_API Copy<double2>(const size_t,
|
|||
const cl_mem, const size_t, const size_t,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
template StatusCode PUBLIC_API Copy<half>(const size_t,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY
|
||||
// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY
|
||||
template <typename T>
|
||||
StatusCode Axpy(const size_t n,
|
||||
const T alpha,
|
||||
|
@ -289,8 +301,13 @@ template StatusCode PUBLIC_API Axpy<double2>(const size_t,
|
|||
const cl_mem, const size_t, const size_t,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
template StatusCode PUBLIC_API Axpy<half>(const size_t,
|
||||
const half,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// Dot product of two vectors: SDOT/DDOT
|
||||
// Dot product of two vectors: SDOT/DDOT/HDOT
|
||||
template <typename T>
|
||||
StatusCode Dot(const size_t n,
|
||||
cl_mem dot_buffer, const size_t dot_offset,
|
||||
|
@ -316,6 +333,11 @@ template StatusCode PUBLIC_API Dot<double>(const size_t,
|
|||
const cl_mem, const size_t, const size_t,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
template StatusCode PUBLIC_API Dot<half>(const size_t,
|
||||
cl_mem, const size_t,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// Dot product of two complex vectors: CDOTU/ZDOTU
|
||||
template <typename T>
|
||||
|
@ -371,7 +393,7 @@ template StatusCode PUBLIC_API Dotc<double2>(const size_t,
|
|||
const cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// Euclidean norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2
|
||||
// Euclidean norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2
|
||||
template <typename T>
|
||||
StatusCode Nrm2(const size_t n,
|
||||
cl_mem nrm2_buffer, const size_t nrm2_offset,
|
||||
|
@ -401,8 +423,12 @@ template StatusCode PUBLIC_API Nrm2<double2>(const size_t,
|
|||
cl_mem, const size_t,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
template StatusCode PUBLIC_API Nrm2<half>(const size_t,
|
||||
cl_mem, const size_t,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM
|
||||
// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM
|
||||
template <typename T>
|
||||
StatusCode Asum(const size_t n,
|
||||
cl_mem asum_buffer, const size_t asum_offset,
|
||||
|
@ -432,8 +458,12 @@ template StatusCode PUBLIC_API Asum<double2>(const size_t,
|
|||
cl_mem, const size_t,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
template StatusCode PUBLIC_API Asum<half>(const size_t,
|
||||
cl_mem, const size_t,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM
|
||||
// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM
|
||||
template <typename T>
|
||||
StatusCode Sum(const size_t n,
|
||||
cl_mem sum_buffer, const size_t sum_offset,
|
||||
|
@ -463,8 +493,12 @@ template StatusCode PUBLIC_API Sum<double2>(const size_t,
|
|||
cl_mem, const size_t,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
template StatusCode PUBLIC_API Sum<half>(const size_t,
|
||||
cl_mem, const size_t,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX
|
||||
// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX
|
||||
template <typename T>
|
||||
StatusCode Amax(const size_t n,
|
||||
cl_mem imax_buffer, const size_t imax_offset,
|
||||
|
@ -494,8 +528,12 @@ template StatusCode PUBLIC_API Amax<double2>(const size_t,
|
|||
cl_mem, const size_t,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
template StatusCode PUBLIC_API Amax<half>(const size_t,
|
||||
cl_mem, const size_t,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX
|
||||
// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX
|
||||
template <typename T>
|
||||
StatusCode Max(const size_t n,
|
||||
cl_mem imax_buffer, const size_t imax_offset,
|
||||
|
@ -525,8 +563,12 @@ template StatusCode PUBLIC_API Max<double2>(const size_t,
|
|||
cl_mem, const size_t,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
template StatusCode PUBLIC_API Max<half>(const size_t,
|
||||
cl_mem, const size_t,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN
|
||||
// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN
|
||||
template <typename T>
|
||||
StatusCode Min(const size_t n,
|
||||
cl_mem imin_buffer, const size_t imin_offset,
|
||||
|
@ -556,12 +598,16 @@ template StatusCode PUBLIC_API Min<double2>(const size_t,
|
|||
cl_mem, const size_t,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
template StatusCode PUBLIC_API Min<half>(const size_t,
|
||||
cl_mem, const size_t,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// =================================================================================================
|
||||
// BLAS level-2 (matrix-vector) routines
|
||||
// =================================================================================================
|
||||
|
||||
// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV
|
||||
// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV
|
||||
template <typename T>
|
||||
StatusCode Gemv(const Layout layout, const Transpose a_transpose,
|
||||
const size_t m, const size_t n,
|
||||
|
@ -615,8 +661,16 @@ template StatusCode PUBLIC_API Gemv<double2>(const Layout, const Transpose,
|
|||
const double2,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
template StatusCode PUBLIC_API Gemv<half>(const Layout, const Transpose,
|
||||
const size_t, const size_t,
|
||||
const half,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
const half,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV
|
||||
// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV
|
||||
template <typename T>
|
||||
StatusCode Gbmv(const Layout layout, const Transpose a_transpose,
|
||||
const size_t m, const size_t n, const size_t kl, const size_t ku,
|
||||
|
@ -670,6 +724,14 @@ template StatusCode PUBLIC_API Gbmv<double2>(const Layout, const Transpose,
|
|||
const double2,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
template StatusCode PUBLIC_API Gbmv<half>(const Layout, const Transpose,
|
||||
const size_t, const size_t, const size_t, const size_t,
|
||||
const half,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
const half,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// Hermitian matrix-vector multiplication: CHEMV/ZHEMV
|
||||
template <typename T>
|
||||
|
@ -788,7 +850,7 @@ template StatusCode PUBLIC_API Hpmv<double2>(const Layout, const Triangle,
|
|||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// Symmetric matrix-vector multiplication: SSYMV/DSYMV
|
||||
// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV
|
||||
template <typename T>
|
||||
StatusCode Symv(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
|
@ -826,8 +888,16 @@ template StatusCode PUBLIC_API Symv<double>(const Layout, const Triangle,
|
|||
const double,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
// Explicit instantiation of the Symv template for T = half (the half-precision variant).
// One size_t dimension (named n in the template definition above) and two `const half` scalars;
// every cl_mem buffer is followed by two size_t arguments.
template StatusCode PUBLIC_API Symv<half>(const Layout, const Triangle,
|
||||
const size_t,
|
||||
const half,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
const half,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV
|
||||
// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV
|
||||
template <typename T>
|
||||
StatusCode Sbmv(const Layout layout, const Triangle triangle,
|
||||
const size_t n, const size_t k,
|
||||
|
@ -865,8 +935,16 @@ template StatusCode PUBLIC_API Sbmv<double>(const Layout, const Triangle,
|
|||
const double,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
// Explicit instantiation of the Sbmv template for T = half (the half-precision variant).
// Two size_t dimensions (named n and k in the template definition above) and two `const half`
// scalars; every cl_mem buffer is followed by two size_t arguments.
template StatusCode PUBLIC_API Sbmv<half>(const Layout, const Triangle,
|
||||
const size_t, const size_t,
|
||||
const half,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
const half,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV
|
||||
// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV
|
||||
template <typename T>
|
||||
StatusCode Spmv(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
|
@ -904,8 +982,16 @@ template StatusCode PUBLIC_API Spmv<double>(const Layout, const Triangle,
|
|||
const double,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
// Explicit instantiation of the Spmv template for T = half (the half-precision variant).
// Unlike the other two buffers, the first cl_mem is followed by a single size_t.
// NOTE(review): presumably that is the packed matrix, which needs only an offset and no leading
// dimension -- confirm against the Spmv template definition.
template StatusCode PUBLIC_API Spmv<half>(const Layout, const Triangle,
|
||||
const size_t,
|
||||
const half,
|
||||
const cl_mem, const size_t,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
const half,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV
|
||||
// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV
|
||||
template <typename T>
|
||||
StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t n,
|
||||
|
@ -941,8 +1027,13 @@ template StatusCode PUBLIC_API Trmv<double2>(const Layout, const Triangle, const
|
|||
const cl_mem, const size_t, const size_t,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
// Explicit instantiation of the Trmv template for T = half (the half-precision variant).
// No scalar parameters: one read-only (const) cl_mem buffer and one writable cl_mem buffer, each
// followed by two size_t arguments.
template StatusCode PUBLIC_API Trmv<half>(const Layout, const Triangle, const Transpose, const Diagonal,
|
||||
const size_t,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV
|
||||
// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV
|
||||
template <typename T>
|
||||
StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t n, const size_t k,
|
||||
|
@ -978,8 +1069,13 @@ template StatusCode PUBLIC_API Tbmv<double2>(const Layout, const Triangle, const
|
|||
const cl_mem, const size_t, const size_t,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
// Explicit instantiation of the Tbmv template for T = half (the half-precision variant).
// Two size_t dimensions (named n and k in the template definition above), no scalar parameters;
// one const cl_mem buffer and one writable cl_mem buffer, each followed by two size_t arguments.
template StatusCode PUBLIC_API Tbmv<half>(const Layout, const Triangle, const Transpose, const Diagonal,
|
||||
const size_t, const size_t,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV
|
||||
// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV
|
||||
template <typename T>
|
||||
StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t n,
|
||||
|
@ -1015,6 +1111,11 @@ template StatusCode PUBLIC_API Tpmv<double2>(const Layout, const Triangle, const
|
|||
const cl_mem, const size_t,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
// Explicit instantiation of the Tpmv template for T = half (the half-precision variant).
// The const cl_mem buffer is followed by a single size_t, the writable buffer by two.
// NOTE(review): presumably the single-size_t buffer is the packed matrix (offset only) -- confirm
// against the Tpmv template definition.
template StatusCode PUBLIC_API Tpmv<half>(const Layout, const Triangle, const Transpose, const Diagonal,
|
||||
const size_t,
|
||||
const cl_mem, const size_t,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV
|
||||
template <typename T>
|
||||
|
@ -1106,7 +1207,7 @@ template StatusCode PUBLIC_API Tpsv<double2>(const Layout, const Triangle, const
|
|||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// General rank-1 matrix update: SGER/DGER
|
||||
// General rank-1 matrix update: SGER/DGER/HGER
|
||||
template <typename T>
|
||||
StatusCode Ger(const Layout layout,
|
||||
const size_t m, const size_t n,
|
||||
|
@ -1140,6 +1241,13 @@ template StatusCode PUBLIC_API Ger<double>(const Layout,
|
|||
const cl_mem, const size_t, const size_t,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
// Explicit instantiation of the Ger template for T = half (the half-precision variant).
// Two size_t dimensions (named m and n in the template definition above), a single `const half`
// scalar, two const cl_mem inputs and one writable cl_mem, each followed by two size_t arguments.
template StatusCode PUBLIC_API Ger<half>(const Layout,
|
||||
const size_t, const size_t,
|
||||
const half,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// General rank-1 complex matrix update: CGERU/ZGERU
|
||||
template <typename T>
|
||||
|
@ -1343,7 +1451,7 @@ template StatusCode PUBLIC_API Hpr2<double2>(const Layout, const Triangle,
|
|||
cl_mem, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// Symmetric rank-1 matrix update: SSYR/DSYR
|
||||
// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR
|
||||
template <typename T>
|
||||
StatusCode Syr(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
|
@ -1373,8 +1481,14 @@ template StatusCode PUBLIC_API Syr<double>(const Layout, const Triangle,
|
|||
const cl_mem, const size_t, const size_t,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
// Explicit instantiation of the Syr template for T = half (the half-precision variant).
// A single `const half` scalar, one const cl_mem input and one writable cl_mem, each followed by
// two size_t arguments.
template StatusCode PUBLIC_API Syr<half>(const Layout, const Triangle,
|
||||
const size_t,
|
||||
const half,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// Symmetric packed rank-1 matrix update: SSPR/DSPR
|
||||
// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR
|
||||
template <typename T>
|
||||
StatusCode Spr(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
|
@ -1404,8 +1518,14 @@ template StatusCode PUBLIC_API Spr<double>(const Layout, const Triangle,
|
|||
const cl_mem, const size_t, const size_t,
|
||||
cl_mem, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
// Explicit instantiation of the Spr template for T = half (the half-precision variant).
// A single `const half` scalar; the writable cl_mem is followed by one size_t only.
// NOTE(review): presumably the writable buffer is the packed matrix (offset only) -- confirm
// against the Spr template definition.
template StatusCode PUBLIC_API Spr<half>(const Layout, const Triangle,
|
||||
const size_t,
|
||||
const half,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
cl_mem, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// Symmetric rank-2 matrix update: SSYR2/DSYR2
|
||||
// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2
|
||||
template <typename T>
|
||||
StatusCode Syr2(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
|
@ -1439,8 +1559,15 @@ template StatusCode PUBLIC_API Syr2<double>(const Layout, const Triangle,
|
|||
const cl_mem, const size_t, const size_t,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
// Explicit instantiation of the Syr2 template for T = half (the half-precision variant).
// A single `const half` scalar, two const cl_mem inputs and one writable cl_mem, each followed by
// two size_t arguments.
template StatusCode PUBLIC_API Syr2<half>(const Layout, const Triangle,
|
||||
const size_t,
|
||||
const half,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// Symmetric packed rank-2 matrix update: SSPR2/DSPR2
|
||||
// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2
|
||||
template <typename T>
|
||||
StatusCode Spr2(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
|
@ -1474,12 +1601,19 @@ template StatusCode PUBLIC_API Spr2<double>(const Layout, const Triangle,
|
|||
const cl_mem, const size_t, const size_t,
|
||||
cl_mem, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
// Explicit instantiation of the Spr2 template for T = half (the half-precision variant).
// A single `const half` scalar and two const cl_mem inputs; the writable cl_mem is followed by
// one size_t only.
// NOTE(review): presumably the writable buffer is the packed matrix (offset only) -- confirm
// against the Spr2 template definition.
template StatusCode PUBLIC_API Spr2<half>(const Layout, const Triangle,
|
||||
const size_t,
|
||||
const half,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
cl_mem, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// =================================================================================================
|
||||
// BLAS level-3 (matrix-matrix) routines
|
||||
// =================================================================================================
|
||||
|
||||
// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM
|
||||
// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM
|
||||
template <typename T>
|
||||
StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
|
||||
const size_t m, const size_t n, const size_t k,
|
||||
|
@ -1533,8 +1667,16 @@ template StatusCode PUBLIC_API Gemm<double2>(const Layout, const Transpose, cons
|
|||
const double2,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
// Explicit instantiation of the Gemm template for T = half (the half-precision variant).
// Three size_t dimensions (named m, n, k in the template definition above) and two `const half`
// scalars; every cl_mem buffer is followed by two size_t arguments.
template StatusCode PUBLIC_API Gemm<half>(const Layout, const Transpose, const Transpose,
|
||||
const size_t, const size_t, const size_t,
|
||||
const half,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
const half,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM
|
||||
// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM
|
||||
template <typename T>
|
||||
StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
|
||||
const size_t m, const size_t n,
|
||||
|
@ -1588,6 +1730,14 @@ template StatusCode PUBLIC_API Symm<double2>(const Layout, const Side, const Tri
|
|||
const double2,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
// Explicit instantiation of the Symm template for T = half (the half-precision variant).
// Two size_t dimensions (named m and n in the template definition above) and two `const half`
// scalars; every cl_mem buffer is followed by two size_t arguments.
template StatusCode PUBLIC_API Symm<half>(const Layout, const Side, const Triangle,
|
||||
const size_t, const size_t,
|
||||
const half,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
const half,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM
|
||||
template <typename T>
|
||||
|
@ -1628,7 +1778,7 @@ template StatusCode PUBLIC_API Hemm<double2>(const Layout, const Side, const Tri
|
|||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK
|
||||
// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK
|
||||
template <typename T>
|
||||
StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
|
||||
const size_t n, const size_t k,
|
||||
|
@ -1676,6 +1826,13 @@ template StatusCode PUBLIC_API Syrk<double2>(const Layout, const Triangle, const
|
|||
const double2,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
// Explicit instantiation of the Syrk template for T = half (the half-precision variant).
// Two size_t dimensions (named n and k in the template definition above), two `const half`
// scalars, one const cl_mem input and one writable cl_mem, each followed by two size_t arguments.
template StatusCode PUBLIC_API Syrk<half>(const Layout, const Triangle, const Transpose,
|
||||
const size_t, const size_t,
|
||||
const half,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
const half,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// Rank-K update of a hermitian matrix: CHERK/ZHERK
|
||||
template <typename T>
|
||||
|
@ -1712,7 +1869,7 @@ template StatusCode PUBLIC_API Herk<double>(const Layout, const Triangle, const
|
|||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K
|
||||
// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K
|
||||
template <typename T>
|
||||
StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
|
||||
const size_t n, const size_t k,
|
||||
|
@ -1766,6 +1923,14 @@ template StatusCode PUBLIC_API Syr2k<double2>(const Layout, const Triangle, cons
|
|||
const double2,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
// Explicit instantiation of the Syr2k template for T = half (the half-precision variant).
// Two size_t dimensions (named n and k in the template definition above), two `const half`
// scalars, two const cl_mem inputs and one writable cl_mem, each followed by two size_t arguments.
template StatusCode PUBLIC_API Syr2k<half>(const Layout, const Triangle, const Transpose,
|
||||
const size_t, const size_t,
|
||||
const half,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
const half,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K
|
||||
template <typename T, typename U>
|
||||
|
@ -1806,7 +1971,7 @@ template StatusCode PUBLIC_API Her2k<double2,double>(const Layout, const Triangl
|
|||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM
|
||||
// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM
|
||||
template <typename T>
|
||||
StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t m, const size_t n,
|
||||
|
@ -1848,8 +2013,14 @@ template StatusCode PUBLIC_API Trmm<double2>(const Layout, const Side, const Tri
|
|||
const cl_mem, const size_t, const size_t,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
// Explicit instantiation of the Trmm template for T = half (the half-precision variant).
// Two size_t dimensions (named m and n in the template definition above), a single `const half`
// scalar, one const cl_mem input and one writable cl_mem, each followed by two size_t arguments.
template StatusCode PUBLIC_API Trmm<half>(const Layout, const Side, const Triangle, const Transpose, const Diagonal,
|
||||
const size_t, const size_t,
|
||||
const half,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
|
||||
// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM
|
||||
template <typename T>
|
||||
StatusCode Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal,
|
||||
const size_t, const size_t,
|
||||
|
@ -1883,6 +2054,12 @@ template StatusCode PUBLIC_API Trsm<double2>(const Layout, const Side, const Tri
|
|||
const cl_mem, const size_t, const size_t,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
// Explicit instantiation of the Trsm template for T = half (the half-precision variant).
// A single `const half` scalar, one const cl_mem input and one writable cl_mem, each followed by
// two size_t arguments.
// NOTE(review): the Trsm template definition in this file declares all of its parameters without
// names, which suggests its body does not use them -- confirm that a half-precision TRSM is
// actually implemented before relying on this symbol.
template StatusCode PUBLIC_API Trsm<half>(const Layout, const Side, const Triangle, const Transpose, const Diagonal,
|
||||
const size_t, const size_t,
|
||||
const half,
|
||||
const cl_mem, const size_t, const size_t,
|
||||
cl_mem, const size_t, const size_t,
|
||||
cl_command_queue*, cl_event*);
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
|
|
448
src/clblast_c.cc
448
src/clblast_c.cc
|
@ -178,6 +178,16 @@ StatusCode CLBlastZswap(const size_t n,
|
|||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
// HSWAP: half-precision C-API wrapper. Forwards every argument unchanged to clblast::Swap<half>
// and converts the returned status to the C StatusCode type with a static_cast.
// Both x_buffer and y_buffer are taken as non-const cl_mem.
StatusCode CLBlastHswap(const size_t n,
|
||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
auto status = clblast::Swap<half>(n,
|
||||
x_buffer, x_offset, x_inc,
|
||||
y_buffer, y_offset, y_inc,
|
||||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
|
||||
// SCAL
|
||||
StatusCode CLBlastSscal(const size_t n,
|
||||
|
@ -220,6 +230,16 @@ StatusCode CLBlastZscal(const size_t n,
|
|||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
// HSCAL: half-precision C-API wrapper. Forwards every argument unchanged to clblast::Scal and
// converts the returned status to the C StatusCode type with a static_cast.
// The template argument is not spelled out here, so it is deduced from the cl_half `alpha`.
// NOTE(review): this relies on cl_half deducing to the type the library instantiates (half) --
// confirm the two names refer to the same type.
StatusCode CLBlastHscal(const size_t n,
|
||||
const cl_half alpha,
|
||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
auto status = clblast::Scal(n,
|
||||
alpha,
|
||||
x_buffer, x_offset, x_inc,
|
||||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
|
||||
// COPY
|
||||
StatusCode CLBlastScopy(const size_t n,
|
||||
|
@ -262,6 +282,16 @@ StatusCode CLBlastZcopy(const size_t n,
|
|||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
// HCOPY: half-precision C-API wrapper. Forwards every argument unchanged to clblast::Copy<half>
// and converts the returned status to the C StatusCode type with a static_cast.
// x_buffer is the const (input) buffer; y_buffer is the non-const one.
StatusCode CLBlastHcopy(const size_t n,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
auto status = clblast::Copy<half>(n,
|
||||
x_buffer, x_offset, x_inc,
|
||||
y_buffer, y_offset, y_inc,
|
||||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
|
||||
// AXPY
|
||||
StatusCode CLBlastSaxpy(const size_t n,
|
||||
|
@ -312,6 +342,18 @@ StatusCode CLBlastZaxpy(const size_t n,
|
|||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
// HAXPY: half-precision C-API wrapper. Forwards every argument unchanged to clblast::Axpy and
// converts the returned status to the C StatusCode type with a static_cast.
// The template argument is not spelled out here, so it is deduced from the cl_half `alpha`.
// NOTE(review): this relies on cl_half deducing to the type the library instantiates (half) --
// confirm the two names refer to the same type.
StatusCode CLBlastHaxpy(const size_t n,
|
||||
const cl_half alpha,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
auto status = clblast::Axpy(n,
|
||||
alpha,
|
||||
x_buffer, x_offset, x_inc,
|
||||
y_buffer, y_offset, y_inc,
|
||||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
|
||||
// DOT
|
||||
StatusCode CLBlastSdot(const size_t n,
|
||||
|
@ -338,6 +380,18 @@ StatusCode CLBlastDdot(const size_t n,
|
|||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
// HDOT: half-precision C-API wrapper. Forwards every argument unchanged to clblast::Dot<half>
// and converts the returned status to the C StatusCode type with a static_cast.
// dot_buffer is the only non-const buffer (addressed by dot_offset alone, no increment);
// x_buffer and y_buffer are const inputs.
StatusCode CLBlastHdot(const size_t n,
|
||||
cl_mem dot_buffer, const size_t dot_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
auto status = clblast::Dot<half>(n,
|
||||
dot_buffer, dot_offset,
|
||||
x_buffer, x_offset, x_inc,
|
||||
y_buffer, y_offset, y_inc,
|
||||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
|
||||
// DOTU
|
||||
StatusCode CLBlastCdotu(const size_t n,
|
||||
|
@ -432,6 +486,16 @@ StatusCode CLBlastDznrm2(const size_t n,
|
|||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
// HNRM2: half-precision C-API wrapper. Forwards every argument unchanged to clblast::Nrm2<half>
// and converts the returned status to the C StatusCode type with a static_cast.
// nrm2_buffer is the non-const buffer (addressed by nrm2_offset alone); x_buffer is a const input.
StatusCode CLBlastHnrm2(const size_t n,
|
||||
cl_mem nrm2_buffer, const size_t nrm2_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
auto status = clblast::Nrm2<half>(n,
|
||||
nrm2_buffer, nrm2_offset,
|
||||
x_buffer, x_offset, x_inc,
|
||||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
|
||||
// ASUM
|
||||
StatusCode CLBlastSasum(const size_t n,
|
||||
|
@ -474,6 +538,16 @@ StatusCode CLBlastDzasum(const size_t n,
|
|||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
// HASUM: half-precision C-API wrapper. Forwards every argument unchanged to clblast::Asum<half>
// and converts the returned status to the C StatusCode type with a static_cast.
// asum_buffer is the non-const buffer (addressed by asum_offset alone); x_buffer is a const input.
StatusCode CLBlastHasum(const size_t n,
|
||||
cl_mem asum_buffer, const size_t asum_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
auto status = clblast::Asum<half>(n,
|
||||
asum_buffer, asum_offset,
|
||||
x_buffer, x_offset, x_inc,
|
||||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
|
||||
// SUM
|
||||
StatusCode CLBlastSsum(const size_t n,
|
||||
|
@ -516,6 +590,16 @@ StatusCode CLBlastDzsum(const size_t n,
|
|||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
// HSUM: half-precision C-API wrapper. Forwards every argument unchanged to clblast::Sum<half>
// and converts the returned status to the C StatusCode type with a static_cast.
// sum_buffer is the non-const buffer (addressed by sum_offset alone); x_buffer is a const input.
StatusCode CLBlastHsum(const size_t n,
|
||||
cl_mem sum_buffer, const size_t sum_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
auto status = clblast::Sum<half>(n,
|
||||
sum_buffer, sum_offset,
|
||||
x_buffer, x_offset, x_inc,
|
||||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
|
||||
// AMAX
|
||||
StatusCode CLBlastiSamax(const size_t n,
|
||||
|
@ -558,6 +642,16 @@ StatusCode CLBlastiZamax(const size_t n,
|
|||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
// iHAMAX: half-precision C-API wrapper. Forwards every argument unchanged to clblast::Amax<half>
// and converts the returned status to the C StatusCode type with a static_cast.
// imax_buffer is the non-const buffer (addressed by imax_offset alone); x_buffer is a const input.
StatusCode CLBlastiHamax(const size_t n,
|
||||
cl_mem imax_buffer, const size_t imax_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
auto status = clblast::Amax<half>(n,
|
||||
imax_buffer, imax_offset,
|
||||
x_buffer, x_offset, x_inc,
|
||||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
|
||||
// MAX
|
||||
StatusCode CLBlastiSmax(const size_t n,
|
||||
|
@ -600,6 +694,16 @@ StatusCode CLBlastiZmax(const size_t n,
|
|||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
// iHMAX: half-precision C-API wrapper. Forwards every argument unchanged to clblast::Max<half>
// and converts the returned status to the C StatusCode type with a static_cast.
// imax_buffer is the non-const buffer (addressed by imax_offset alone); x_buffer is a const input.
StatusCode CLBlastiHmax(const size_t n,
|
||||
cl_mem imax_buffer, const size_t imax_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
auto status = clblast::Max<half>(n,
|
||||
imax_buffer, imax_offset,
|
||||
x_buffer, x_offset, x_inc,
|
||||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
|
||||
// MIN
|
||||
StatusCode CLBlastiSmin(const size_t n,
|
||||
|
@ -642,6 +746,16 @@ StatusCode CLBlastiZmin(const size_t n,
|
|||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
// iHMIN: half-precision C-API wrapper. Forwards every argument unchanged to clblast::Min<half>
// and converts the returned status to the C StatusCode type with a static_cast.
// imin_buffer is the non-const buffer (addressed by imin_offset alone); x_buffer is a const input.
StatusCode CLBlastiHmin(const size_t n,
|
||||
cl_mem imin_buffer, const size_t imin_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
auto status = clblast::Min<half>(n,
|
||||
imin_buffer, imin_offset,
|
||||
x_buffer, x_offset, x_inc,
|
||||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
// BLAS level-2 (matrix-vector) routines
|
||||
|
@ -724,6 +838,25 @@ StatusCode CLBlastZgemv(const Layout layout, const Transpose a_transpose,
|
|||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
// HGEMV: half-precision C-API wrapper around clblast::Gemv.
// The C enum arguments (layout, a_transpose) are converted to their clblast:: counterparts with
// static_cast; all other arguments are forwarded unchanged, and the returned status is cast back
// to the C StatusCode type.
// The template argument is not spelled out here, so it is deduced from the cl_half scalars.
// NOTE(review): this relies on cl_half deducing to the type the library instantiates (half) --
// confirm the two names refer to the same type.
StatusCode CLBlastHgemv(const Layout layout, const Transpose a_transpose,
|
||||
const size_t m, const size_t n,
|
||||
const cl_half alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const cl_half beta,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
auto status = clblast::Gemv(static_cast<clblast::Layout>(layout),
|
||||
static_cast<clblast::Transpose>(a_transpose),
|
||||
m, n,
|
||||
alpha,
|
||||
a_buffer, a_offset, a_ld,
|
||||
x_buffer, x_offset, x_inc,
|
||||
beta,
|
||||
y_buffer, y_offset, y_inc,
|
||||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
|
||||
// GBMV
|
||||
StatusCode CLBlastSgbmv(const Layout layout, const Transpose a_transpose,
|
||||
|
@ -802,6 +935,25 @@ StatusCode CLBlastZgbmv(const Layout layout, const Transpose a_transpose,
|
|||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
// HGBMV: half-precision C-API wrapper around clblast::Gbmv.
// The C enum arguments (layout, a_transpose) are converted to their clblast:: counterparts with
// static_cast; all other arguments (including kl and ku) are forwarded unchanged, and the
// returned status is cast back to the C StatusCode type.
// The template argument is not spelled out here, so it is deduced from the cl_half scalars.
// NOTE(review): this relies on cl_half deducing to the type the library instantiates (half) --
// confirm the two names refer to the same type.
StatusCode CLBlastHgbmv(const Layout layout, const Transpose a_transpose,
|
||||
const size_t m, const size_t n, const size_t kl, const size_t ku,
|
||||
const cl_half alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const cl_half beta,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
auto status = clblast::Gbmv(static_cast<clblast::Layout>(layout),
|
||||
static_cast<clblast::Transpose>(a_transpose),
|
||||
m, n, kl, ku,
|
||||
alpha,
|
||||
a_buffer, a_offset, a_ld,
|
||||
x_buffer, x_offset, x_inc,
|
||||
beta,
|
||||
y_buffer, y_offset, y_inc,
|
||||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
|
||||
// HEMV
|
||||
StatusCode CLBlastChemv(const Layout layout, const Triangle triangle,
|
||||
|
@ -962,6 +1114,25 @@ StatusCode CLBlastDsymv(const Layout layout, const Triangle triangle,
|
|||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
// HSYMV: half-precision C-API wrapper around clblast::Symv.
// The C enum arguments (layout, triangle) are converted to their clblast:: counterparts with
// static_cast; all other arguments are forwarded unchanged, and the returned status is cast back
// to the C StatusCode type.
// The template argument is not spelled out here, so it is deduced from the cl_half scalars.
// NOTE(review): this relies on cl_half deducing to the type the library instantiates (half) --
// confirm the two names refer to the same type.
StatusCode CLBlastHsymv(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
const cl_half alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const cl_half beta,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
auto status = clblast::Symv(static_cast<clblast::Layout>(layout),
|
||||
static_cast<clblast::Triangle>(triangle),
|
||||
n,
|
||||
alpha,
|
||||
a_buffer, a_offset, a_ld,
|
||||
x_buffer, x_offset, x_inc,
|
||||
beta,
|
||||
y_buffer, y_offset, y_inc,
|
||||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
|
||||
// SBMV
|
||||
StatusCode CLBlastSsbmv(const Layout layout, const Triangle triangle,
|
||||
|
@ -1002,6 +1173,25 @@ StatusCode CLBlastDsbmv(const Layout layout, const Triangle triangle,
|
|||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
// HSBMV: half-precision C-API wrapper around clblast::Sbmv.
// The C enum arguments (layout, triangle) are converted to their clblast:: counterparts with
// static_cast; all other arguments (including k) are forwarded unchanged, and the returned status
// is cast back to the C StatusCode type.
// The template argument is not spelled out here, so it is deduced from the cl_half scalars.
// NOTE(review): this relies on cl_half deducing to the type the library instantiates (half) --
// confirm the two names refer to the same type.
StatusCode CLBlastHsbmv(const Layout layout, const Triangle triangle,
|
||||
const size_t n, const size_t k,
|
||||
const cl_half alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const cl_half beta,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
auto status = clblast::Sbmv(static_cast<clblast::Layout>(layout),
|
||||
static_cast<clblast::Triangle>(triangle),
|
||||
n, k,
|
||||
alpha,
|
||||
a_buffer, a_offset, a_ld,
|
||||
x_buffer, x_offset, x_inc,
|
||||
beta,
|
||||
y_buffer, y_offset, y_inc,
|
||||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
|
||||
// SPMV
|
||||
StatusCode CLBlastSspmv(const Layout layout, const Triangle triangle,
|
||||
|
@ -1042,6 +1232,25 @@ StatusCode CLBlastDspmv(const Layout layout, const Triangle triangle,
|
|||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
// HSPMV: half-precision C-API wrapper around clblast::Spmv.
// The C enum arguments (layout, triangle) are converted to their clblast:: counterparts with
// static_cast; all other arguments are forwarded unchanged, and the returned status is cast back
// to the C StatusCode type. ap_buffer is passed with an offset only (no leading dimension).
// The template argument is not spelled out here, so it is deduced from the cl_half scalars.
// NOTE(review): this relies on cl_half deducing to the type the library instantiates (half) --
// confirm the two names refer to the same type.
StatusCode CLBlastHspmv(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
const cl_half alpha,
|
||||
const cl_mem ap_buffer, const size_t ap_offset,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const cl_half beta,
|
||||
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
auto status = clblast::Spmv(static_cast<clblast::Layout>(layout),
|
||||
static_cast<clblast::Triangle>(triangle),
|
||||
n,
|
||||
alpha,
|
||||
ap_buffer, ap_offset,
|
||||
x_buffer, x_offset, x_inc,
|
||||
beta,
|
||||
y_buffer, y_offset, y_inc,
|
||||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
|
||||
// TRMV
|
||||
StatusCode CLBlastStrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
|
@ -1104,6 +1313,21 @@ StatusCode CLBlastZtrmv(const Layout layout, const Triangle triangle, const Tran
|
|||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
// HTRMV: half-precision C-API wrapper around clblast::Trmv<half>.
// The four C enum arguments (layout, triangle, a_transpose, diagonal) are converted to their
// clblast:: counterparts with static_cast; the remaining arguments are forwarded unchanged, and
// the returned status is cast back to the C StatusCode type.
// There is no scalar argument, so the template type is given explicitly as <half>.
StatusCode CLBlastHtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t n,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
auto status = clblast::Trmv<half>(static_cast<clblast::Layout>(layout),
|
||||
static_cast<clblast::Triangle>(triangle),
|
||||
static_cast<clblast::Transpose>(a_transpose),
|
||||
static_cast<clblast::Diagonal>(diagonal),
|
||||
n,
|
||||
a_buffer, a_offset, a_ld,
|
||||
x_buffer, x_offset, x_inc,
|
||||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
|
||||
// TBMV
|
||||
StatusCode CLBlastStbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
|
@ -1166,6 +1390,21 @@ StatusCode CLBlastZtbmv(const Layout layout, const Triangle triangle, const Tran
|
|||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
// HTBMV: half-precision C-API wrapper around clblast::Tbmv<half>.
// The four C enum arguments (layout, triangle, a_transpose, diagonal) are converted to their
// clblast:: counterparts with static_cast; the remaining arguments (including k) are forwarded
// unchanged, and the returned status is cast back to the C StatusCode type.
// There is no scalar argument, so the template type is given explicitly as <half>.
StatusCode CLBlastHtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t n, const size_t k,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
auto status = clblast::Tbmv<half>(static_cast<clblast::Layout>(layout),
|
||||
static_cast<clblast::Triangle>(triangle),
|
||||
static_cast<clblast::Transpose>(a_transpose),
|
||||
static_cast<clblast::Diagonal>(diagonal),
|
||||
n, k,
|
||||
a_buffer, a_offset, a_ld,
|
||||
x_buffer, x_offset, x_inc,
|
||||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
|
||||
// TPMV
|
||||
StatusCode CLBlastStpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
|
@ -1228,6 +1467,21 @@ StatusCode CLBlastZtpmv(const Layout layout, const Triangle triangle, const Tran
|
|||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
// HTPMV: half-precision C-API wrapper around clblast::Tpmv<half>.
// The four C enum arguments (layout, triangle, a_transpose, diagonal) are converted to their
// clblast:: counterparts with static_cast; the remaining arguments are forwarded unchanged, and
// the returned status is cast back to the C StatusCode type.
// ap_buffer is passed with an offset only (no leading dimension). There is no scalar argument,
// so the template type is given explicitly as <half>.
StatusCode CLBlastHtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t n,
|
||||
const cl_mem ap_buffer, const size_t ap_offset,
|
||||
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
auto status = clblast::Tpmv<half>(static_cast<clblast::Layout>(layout),
|
||||
static_cast<clblast::Triangle>(triangle),
|
||||
static_cast<clblast::Transpose>(a_transpose),
|
||||
static_cast<clblast::Diagonal>(diagonal),
|
||||
n,
|
||||
ap_buffer, ap_offset,
|
||||
x_buffer, x_offset, x_inc,
|
||||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
|
||||
// TRSV
|
||||
StatusCode CLBlastStrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
|
@ -1448,6 +1702,22 @@ StatusCode CLBlastDger(const Layout layout,
|
|||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
// HGER: half-precision C-API wrapper around clblast::Ger.
// The C `layout` enum is converted to clblast::Layout with static_cast; all other arguments are
// forwarded unchanged, and the returned status is cast back to the C StatusCode type.
// x_buffer and y_buffer are const inputs; a_buffer is the non-const buffer.
// The template argument is not spelled out here, so it is deduced from the cl_half `alpha`.
// NOTE(review): this relies on cl_half deducing to the type the library instantiates (half) --
// confirm the two names refer to the same type.
StatusCode CLBlastHger(const Layout layout,
|
||||
const size_t m, const size_t n,
|
||||
const cl_half alpha,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
auto status = clblast::Ger(static_cast<clblast::Layout>(layout),
|
||||
m, n,
|
||||
alpha,
|
||||
x_buffer, x_offset, x_inc,
|
||||
y_buffer, y_offset, y_inc,
|
||||
a_buffer, a_offset, a_ld,
|
||||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
|
||||
// GERU
|
||||
StatusCode CLBlastCgeru(const Layout layout,
|
||||
|
@ -1684,6 +1954,21 @@ StatusCode CLBlastDsyr(const Layout layout, const Triangle triangle,
|
|||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
// Half-precision (cl_half alpha) C entry point for SYR. Casts the layout and triangle enums to
// their clblast:: counterparts, forwards n, alpha, the x and a buffers with their offsets and
// stride/leading dimension, the queue and the event unchanged to clblast::Syr, and returns its
// status cast to the C StatusCode type.
// NOTE(review): the template argument is not spelled out; presumably deduced from the cl_half
// alpha -- confirm.
StatusCode CLBlastHsyr(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
const cl_half alpha,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
auto status = clblast::Syr(static_cast<clblast::Layout>(layout),
|
||||
static_cast<clblast::Triangle>(triangle),
|
||||
n,
|
||||
alpha,
|
||||
x_buffer, x_offset, x_inc,
|
||||
a_buffer, a_offset, a_ld,
|
||||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
|
||||
// SPR
|
||||
StatusCode CLBlastSspr(const Layout layout, const Triangle triangle,
|
||||
|
@ -1716,6 +2001,21 @@ StatusCode CLBlastDspr(const Layout layout, const Triangle triangle,
|
|||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
// Half-precision (cl_half alpha) C entry point for SPR. Casts the layout and triangle enums to
// their clblast:: counterparts, forwards n, alpha, the x buffer (offset, increment), the ap
// buffer (offset only), the queue and the event unchanged to clblast::Spr, and returns its status
// cast to the C StatusCode type.
// NOTE(review): the template argument is not spelled out; presumably deduced from the cl_half
// alpha -- confirm.
StatusCode CLBlastHspr(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
const cl_half alpha,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
cl_mem ap_buffer, const size_t ap_offset,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
auto status = clblast::Spr(static_cast<clblast::Layout>(layout),
|
||||
static_cast<clblast::Triangle>(triangle),
|
||||
n,
|
||||
alpha,
|
||||
x_buffer, x_offset, x_inc,
|
||||
ap_buffer, ap_offset,
|
||||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
|
||||
// SYR2
|
||||
StatusCode CLBlastSsyr2(const Layout layout, const Triangle triangle,
|
||||
|
@ -1752,6 +2052,23 @@ StatusCode CLBlastDsyr2(const Layout layout, const Triangle triangle,
|
|||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
// Half-precision (cl_half alpha) C entry point for SYR2. Casts the layout and triangle enums to
// their clblast:: counterparts, forwards n, alpha, the x, y and a buffers with their offsets and
// strides/leading dimension, the queue and the event unchanged to clblast::Syr2, and returns its
// status cast to the C StatusCode type.
// NOTE(review): the template argument is not spelled out; presumably deduced from the cl_half
// alpha -- confirm.
StatusCode CLBlastHsyr2(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
const cl_half alpha,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
auto status = clblast::Syr2(static_cast<clblast::Layout>(layout),
|
||||
static_cast<clblast::Triangle>(triangle),
|
||||
n,
|
||||
alpha,
|
||||
x_buffer, x_offset, x_inc,
|
||||
y_buffer, y_offset, y_inc,
|
||||
a_buffer, a_offset, a_ld,
|
||||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
|
||||
// SPR2
|
||||
StatusCode CLBlastSspr2(const Layout layout, const Triangle triangle,
|
||||
|
@ -1788,6 +2105,23 @@ StatusCode CLBlastDspr2(const Layout layout, const Triangle triangle,
|
|||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
// Half-precision (cl_half alpha) C entry point for SPR2. Casts the layout and triangle enums to
// their clblast:: counterparts, forwards n, alpha, the x and y buffers (offset, increment), the
// ap buffer (offset only), the queue and the event unchanged to clblast::Spr2, and returns its
// status cast to the C StatusCode type.
// NOTE(review): the template argument is not spelled out; presumably deduced from the cl_half
// alpha -- confirm.
StatusCode CLBlastHspr2(const Layout layout, const Triangle triangle,
|
||||
const size_t n,
|
||||
const cl_half alpha,
|
||||
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
|
||||
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
|
||||
cl_mem ap_buffer, const size_t ap_offset,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
auto status = clblast::Spr2(static_cast<clblast::Layout>(layout),
|
||||
static_cast<clblast::Triangle>(triangle),
|
||||
n,
|
||||
alpha,
|
||||
x_buffer, x_offset, x_inc,
|
||||
y_buffer, y_offset, y_inc,
|
||||
ap_buffer, ap_offset,
|
||||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
// BLAS level-3 (matrix-matrix) routines
|
||||
|
@ -1874,6 +2208,26 @@ StatusCode CLBlastZgemm(const Layout layout, const Transpose a_transpose, const
|
|||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
// Half-precision (cl_half alpha/beta) C entry point for GEMM. Casts the layout and the two
// transpose enums to their clblast:: counterparts, forwards m/n/k, alpha, beta, the a, b and c
// buffers with their offsets and leading dimensions, the queue and the event unchanged to
// clblast::Gemm, and returns its status cast to the C StatusCode type.
// NOTE(review): the template argument is not spelled out; presumably deduced from the cl_half
// scalars -- confirm.
StatusCode CLBlastHgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
|
||||
const size_t m, const size_t n, const size_t k,
|
||||
const cl_half alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
const cl_half beta,
|
||||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
auto status = clblast::Gemm(static_cast<clblast::Layout>(layout),
|
||||
static_cast<clblast::Transpose>(a_transpose),
|
||||
static_cast<clblast::Transpose>(b_transpose),
|
||||
m, n, k,
|
||||
alpha,
|
||||
a_buffer, a_offset, a_ld,
|
||||
b_buffer, b_offset, b_ld,
|
||||
beta,
|
||||
c_buffer, c_offset, c_ld,
|
||||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
|
||||
// SYMM
|
||||
StatusCode CLBlastSsymm(const Layout layout, const Side side, const Triangle triangle,
|
||||
|
@ -1956,6 +2310,26 @@ StatusCode CLBlastZsymm(const Layout layout, const Side side, const Triangle tri
|
|||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
// Half-precision (cl_half alpha/beta) C entry point for SYMM. Casts the layout, side and triangle
// enums to their clblast:: counterparts, forwards m/n, alpha, beta, the a, b and c buffers with
// their offsets and leading dimensions, the queue and the event unchanged to clblast::Symm, and
// returns its status cast to the C StatusCode type.
// NOTE(review): the template argument is not spelled out; presumably deduced from the cl_half
// scalars -- confirm.
StatusCode CLBlastHsymm(const Layout layout, const Side side, const Triangle triangle,
|
||||
const size_t m, const size_t n,
|
||||
const cl_half alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
const cl_half beta,
|
||||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
auto status = clblast::Symm(static_cast<clblast::Layout>(layout),
|
||||
static_cast<clblast::Side>(side),
|
||||
static_cast<clblast::Triangle>(triangle),
|
||||
m, n,
|
||||
alpha,
|
||||
a_buffer, a_offset, a_ld,
|
||||
b_buffer, b_offset, b_ld,
|
||||
beta,
|
||||
c_buffer, c_offset, c_ld,
|
||||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
|
||||
// HEMM
|
||||
StatusCode CLBlastChemm(const Layout layout, const Side side, const Triangle triangle,
|
||||
|
@ -2072,6 +2446,24 @@ StatusCode CLBlastZsyrk(const Layout layout, const Triangle triangle, const Tran
|
|||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
// Half-precision (cl_half alpha/beta) C entry point for SYRK. Casts the layout, triangle and
// a_transpose enums to their clblast:: counterparts, forwards n/k, alpha, beta, the a and c
// buffers with their offsets and leading dimensions, the queue and the event unchanged to
// clblast::Syrk, and returns its status cast to the C StatusCode type.
// NOTE(review): the template argument is not spelled out; presumably deduced from the cl_half
// scalars -- confirm.
StatusCode CLBlastHsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
|
||||
const size_t n, const size_t k,
|
||||
const cl_half alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const cl_half beta,
|
||||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
auto status = clblast::Syrk(static_cast<clblast::Layout>(layout),
|
||||
static_cast<clblast::Triangle>(triangle),
|
||||
static_cast<clblast::Transpose>(a_transpose),
|
||||
n, k,
|
||||
alpha,
|
||||
a_buffer, a_offset, a_ld,
|
||||
beta,
|
||||
c_buffer, c_offset, c_ld,
|
||||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
|
||||
// HERK
|
||||
StatusCode CLBlastCherk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
|
||||
|
@ -2192,6 +2584,26 @@ StatusCode CLBlastZsyr2k(const Layout layout, const Triangle triangle, const Tra
|
|||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
// Half-precision (cl_half alpha/beta) C entry point for SYR2K. Casts the layout, triangle and
// ab_transpose enums to their clblast:: counterparts, forwards n/k, alpha, beta, the a, b and c
// buffers with their offsets and leading dimensions, the queue and the event unchanged to
// clblast::Syr2k, and returns its status cast to the C StatusCode type.
// NOTE(review): the template argument is not spelled out; presumably deduced from the cl_half
// scalars -- confirm.
StatusCode CLBlastHsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
|
||||
const size_t n, const size_t k,
|
||||
const cl_half alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
const cl_half beta,
|
||||
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
auto status = clblast::Syr2k(static_cast<clblast::Layout>(layout),
|
||||
static_cast<clblast::Triangle>(triangle),
|
||||
static_cast<clblast::Transpose>(ab_transpose),
|
||||
n, k,
|
||||
alpha,
|
||||
a_buffer, a_offset, a_ld,
|
||||
b_buffer, b_offset, b_ld,
|
||||
beta,
|
||||
c_buffer, c_offset, c_ld,
|
||||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
|
||||
// HER2K
|
||||
StatusCode CLBlastCher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
|
||||
|
@ -2308,6 +2720,24 @@ StatusCode CLBlastZtrmm(const Layout layout, const Side side, const Triangle tri
|
|||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
// Half-precision (cl_half alpha) C entry point for TRMM. Casts the layout, side, triangle,
// a_transpose and diagonal enums to their clblast:: counterparts, forwards m/n, alpha, the a and
// b buffers with their offsets and leading dimensions, the queue and the event unchanged to
// clblast::Trmm, and returns its status cast to the C StatusCode type.
// NOTE(review): the template argument is not spelled out; presumably deduced from the cl_half
// alpha -- confirm.
StatusCode CLBlastHtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t m, const size_t n,
|
||||
const cl_half alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
auto status = clblast::Trmm(static_cast<clblast::Layout>(layout),
|
||||
static_cast<clblast::Side>(side),
|
||||
static_cast<clblast::Triangle>(triangle),
|
||||
static_cast<clblast::Transpose>(a_transpose),
|
||||
static_cast<clblast::Diagonal>(diagonal),
|
||||
m, n,
|
||||
alpha,
|
||||
a_buffer, a_offset, a_ld,
|
||||
b_buffer, b_offset, b_ld,
|
||||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
|
||||
// TRSM
|
||||
StatusCode CLBlastStrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
|
@ -2382,6 +2812,24 @@ StatusCode CLBlastZtrsm(const Layout layout, const Side side, const Triangle tri
|
|||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
// Half-precision (cl_half alpha) C entry point for TRSM. Casts the layout, side, triangle,
// a_transpose and diagonal enums to their clblast:: counterparts, forwards m/n, alpha, the a and
// b buffers with their offsets and leading dimensions, the queue and the event unchanged to
// clblast::Trsm, and returns its status cast to the C StatusCode type.
// NOTE(review): the changelog entry accompanying this change lists HTRMM but not HTRSM among the
// new half-precision level-3 routines -- confirm whether clblast::Trsm is actually usable with
// half before relying on this wrapper.
StatusCode CLBlastHtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
|
||||
const size_t m, const size_t n,
|
||||
const cl_half alpha,
|
||||
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
|
||||
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
|
||||
cl_command_queue* queue, cl_event* event) {
|
||||
auto status = clblast::Trsm(static_cast<clblast::Layout>(layout),
|
||||
static_cast<clblast::Side>(side),
|
||||
static_cast<clblast::Triangle>(triangle),
|
||||
static_cast<clblast::Transpose>(a_transpose),
|
||||
static_cast<clblast::Diagonal>(diagonal),
|
||||
m, n,
|
||||
alpha,
|
||||
a_buffer, a_offset, a_ld,
|
||||
b_buffer, b_offset, b_ld,
|
||||
queue, event);
|
||||
return static_cast<StatusCode>(status);
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
|
|
|
@ -29,15 +29,15 @@ namespace clblast {
|
|||
|
||||
// Initializes the database
|
||||
// Static list of database entries, one row per kernel family (Xaxpy, Xdot, Xgemv, Xger, Xgemm,
// Copy, Pad, Transpose, Padtranspose) with one entry per precision.
// NOTE(review): this is a rendered diff hunk (-29,15 +29,15) without +/- markers. The first nine
// rows (no *Half entry) appear to be the removed lines and the following nine rows (each starting
// with a *Half entry) their replacements; presumably only the latter remain in the resulting
// initializer -- confirm against the actual source file.
const std::vector<Database::DatabaseEntry> Database::database = {
|
||||
XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble,
|
||||
XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble,
|
||||
XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble,
|
||||
XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble,
|
||||
XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble,
|
||||
CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble,
|
||||
PadSingle, PadDouble, PadComplexSingle, PadComplexDouble,
|
||||
TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble,
|
||||
PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble
|
||||
XaxpyHalf, XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble,
|
||||
XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble,
|
||||
XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble,
|
||||
XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble,
|
||||
XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble,
|
||||
CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble,
|
||||
PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble,
|
||||
TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble,
|
||||
PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble
|
||||
};
|
||||
|
||||
// =================================================================================================
|
||||
|
|
|
@ -19,11 +19,16 @@ R"(
|
|||
// Parameters set by the tuner or by the database. Here they are given a basic default value in case
|
||||
// this file is used outside of the CLBlast library.
|
||||
#ifndef PRECISION
|
||||
#define PRECISION 32 // Data-types: single or double precision, complex or regular
|
||||
#define PRECISION 32 // Data-types: half, single or double precision, complex or regular
|
||||
#endif
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Enable support for half-precision
|
||||
#if PRECISION == 16
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16: enable
|
||||
#endif
|
||||
|
||||
// Enable support for double-precision
|
||||
#if PRECISION == 64 || PRECISION == 6464
|
||||
#if __OPENCL_VERSION__ <= CL_VERSION_1_1
|
||||
|
@ -31,8 +36,19 @@ R"(
|
|||
#endif
|
||||
#endif
|
||||
|
||||
// Half-precision
|
||||
#if PRECISION == 16
|
||||
typedef half real;
|
||||
typedef half2 real2;
|
||||
typedef half4 real4;
|
||||
typedef half8 real8;
|
||||
typedef half16 real16;
|
||||
#define ZERO 0
|
||||
#define ONE 1
|
||||
#define SMALLEST -1.0e14
|
||||
|
||||
// Single-precision
|
||||
#if PRECISION == 32
|
||||
#elif PRECISION == 32
|
||||
typedef float real;
|
||||
typedef float2 real2;
|
||||
typedef float4 real4;
|
||||
|
@ -68,7 +84,7 @@ R"(
|
|||
#define ONE 1.0f
|
||||
#define SMALLEST -1.0e37f
|
||||
|
||||
// Complex Double-precision
|
||||
// Complex double-precision
|
||||
#elif PRECISION == 6464
|
||||
typedef struct cdouble {double x; double y;} real;
|
||||
typedef struct cdouble2 {real x; real y;} real2;
|
||||
|
|
|
@ -23,9 +23,10 @@ R"(
|
|||
|
||||
// Full version of the kernel with offsets and strided accesses
|
||||
__attribute__((reqd_work_group_size(WGS, 1, 1)))
|
||||
__kernel void Xaxpy(const int n, const real alpha,
|
||||
__kernel void Xaxpy(const int n, const __constant real* restrict arg_alpha,
|
||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
||||
__global real* ygm, const int y_offset, const int y_inc) {
|
||||
const real alpha = arg_alpha[0];
|
||||
|
||||
// Loops over the work that needs to be done (allows for an arbitrary number of threads)
|
||||
#pragma unroll
|
||||
|
@ -40,9 +41,11 @@ __kernel void Xaxpy(const int n, const real alpha,
|
|||
// Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
|
||||
// divisible by 'VW', 'WGS' and 'WPT'.
|
||||
__attribute__((reqd_work_group_size(WGS, 1, 1)))
|
||||
__kernel void XaxpyFast(const int n, const real alpha,
|
||||
__kernel void XaxpyFast(const int n, const __constant real* restrict arg_alpha,
|
||||
const __global realV* restrict xgm,
|
||||
__global realV* ygm) {
|
||||
const real alpha = arg_alpha[0];
|
||||
|
||||
#pragma unroll
|
||||
for (int w=0; w<WPT; ++w) {
|
||||
const int id = w*get_global_size(0) + get_global_id(0);
|
||||
|
|
|
@ -211,13 +211,17 @@ inline real LoadMatrixA(const __global real* restrict agm, const int x, const in
|
|||
|
||||
// Full version of the kernel
|
||||
__attribute__((reqd_work_group_size(WGS1, 1, 1)))
|
||||
__kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
|
||||
__kernel void Xgemv(const int m, const int n,
|
||||
const __constant real* restrict arg_alpha,
|
||||
const __constant real* restrict arg_beta,
|
||||
const int a_rotated,
|
||||
const __global real* restrict agm, const int a_offset, const int a_ld,
|
||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
||||
__global real* ygm, const int y_offset, const int y_inc,
|
||||
const int do_conjugate, const int parameter,
|
||||
const int kl, const int ku) {
|
||||
const real alpha = arg_alpha[0];
|
||||
const real beta = arg_beta[0];
|
||||
|
||||
// Local memory for the vector X
|
||||
__local real xlm[WGS1];
|
||||
|
|
|
@ -95,13 +95,18 @@ inline realVFR LoadMatrixAVFR(const __global realVFR* restrict agm, const int x,
|
|||
// --> 'a_rotated' is 0
|
||||
// --> 'do_conjugate' is 0
|
||||
__attribute__((reqd_work_group_size(WGS2, 1, 1)))
|
||||
__kernel void XgemvFast(const int m, const int n, const real alpha, const real beta,
|
||||
__kernel void XgemvFast(const int m, const int n,
|
||||
const __constant real* restrict arg_alpha,
|
||||
const __constant real* restrict arg_beta,
|
||||
const int a_rotated,
|
||||
const __global realVF* restrict agm, const int a_offset, const int a_ld,
|
||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
||||
__global real* ygm, const int y_offset, const int y_inc,
|
||||
const int do_conjugate, const int parameter,
|
||||
const int kl, const int ku) {
|
||||
const real alpha = arg_alpha[0];
|
||||
const real beta = arg_beta[0];
|
||||
|
||||
// Local memory for the vector X
|
||||
__local real xlm[WGS2];
|
||||
|
||||
|
@ -192,13 +197,18 @@ __kernel void XgemvFast(const int m, const int n, const real alpha, const real b
|
|||
// --> 'a_rotated' is 1
|
||||
// --> 'do_conjugate' is 0
|
||||
__attribute__((reqd_work_group_size(WGS3, 1, 1)))
|
||||
__kernel void XgemvFastRot(const int m, const int n, const real alpha, const real beta,
|
||||
__kernel void XgemvFastRot(const int m, const int n,
|
||||
const __constant real* restrict arg_alpha,
|
||||
const __constant real* restrict arg_beta,
|
||||
const int a_rotated,
|
||||
const __global realVFR* restrict agm, const int a_offset, const int a_ld,
|
||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
||||
__global real* ygm, const int y_offset, const int y_inc,
|
||||
const int do_conjugate, const int parameter,
|
||||
const int kl, const int ku) {
|
||||
const real alpha = arg_alpha[0];
|
||||
const real beta = arg_beta[0];
|
||||
|
||||
// Local memory for the vector X
|
||||
__local real xlm[WGS3];
|
||||
|
||||
|
|
|
@ -19,11 +19,13 @@ R"(
|
|||
|
||||
// Regular version of the rank-1 matrix update kernel (GER, GERU, GERC)
|
||||
__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
|
||||
__kernel void Xger(const int max1, const int max2, const real alpha,
|
||||
__kernel void Xger(const int max1, const int max2,
|
||||
const __constant real* restrict arg_alpha,
|
||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
||||
const __global real* ygm, const int y_offset, const int y_inc,
|
||||
__global real* restrict agm, const int a_offset, const int a_ld,
|
||||
const int is_rowmajor) {
|
||||
const real alpha = arg_alpha[0];
|
||||
|
||||
// Register storage for X and Y
|
||||
real xvalues[WPT];
|
||||
|
|
|
@ -19,10 +19,12 @@ R"(
|
|||
|
||||
// Symmetric version of the rank-1 matrix update kernel (HER, HPR, SYR, SPR)
|
||||
__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
|
||||
__kernel void Xher(const int n, const real alpha,
|
||||
__kernel void Xher(const int n,
|
||||
const __constant real* restrict arg_alpha,
|
||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
||||
__global real* restrict agm, const int a_offset, const int a_ld,
|
||||
const int is_upper, const int is_rowmajor) {
|
||||
const real alpha = arg_alpha[0];
|
||||
|
||||
// Register storage for X and XT
|
||||
real xvalues[WPT];
|
||||
|
|
|
@ -19,11 +19,13 @@ R"(
|
|||
|
||||
// Symmetric version of the rank-2 matrix update kernel (HER2, HPR2, SYR2, SPR2)
|
||||
__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
|
||||
__kernel void Xher2(const int n, const real alpha,
|
||||
__kernel void Xher2(const int n,
|
||||
const __constant real* restrict arg_alpha,
|
||||
const __global real* restrict xgm, const int x_offset, const int x_inc,
|
||||
const __global real* restrict ygm, const int y_offset, const int y_inc,
|
||||
__global real* restrict agm, const int a_offset, const int a_ld,
|
||||
const int is_upper, const int is_rowmajor) {
|
||||
const real alpha = arg_alpha[0];
|
||||
|
||||
// Register storage for X and Y
|
||||
real xvalues[WPT];
|
||||
|
|
|
@ -267,10 +267,13 @@ inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
|
|||
// Main entry point of the kernel. This is the upper-triangular version.
|
||||
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
|
||||
__kernel void XgemmUpper(const int kSizeN, const int kSizeK,
|
||||
const real alpha, const real beta,
|
||||
const __constant real* restrict arg_alpha,
|
||||
const __constant real* restrict arg_beta,
|
||||
const __global realM* restrict agm,
|
||||
const __global realN* restrict bgm,
|
||||
__global realM* cgm) {
|
||||
const real alpha = arg_alpha[0];
|
||||
const real beta = arg_beta[0];
|
||||
|
||||
// Skip these threads if they do not contain threads contributing to the upper-triangle
|
||||
if (GetGroupID1()*NWG < GetGroupID0()*MWG) {
|
||||
|
@ -304,10 +307,13 @@ __kernel void XgemmUpper(const int kSizeN, const int kSizeK,
|
|||
// Main entry point of the kernel. This is the lower-triangular version.
|
||||
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
|
||||
__kernel void XgemmLower(const int kSizeN, const int kSizeK,
|
||||
const real alpha, const real beta,
|
||||
const __constant real* restrict arg_alpha,
|
||||
const __constant real* restrict arg_beta,
|
||||
const __global realM* restrict agm,
|
||||
const __global realN* restrict bgm,
|
||||
__global realM* cgm) {
|
||||
const real alpha = arg_alpha[0];
|
||||
const real beta = arg_beta[0];
|
||||
|
||||
// Skip these threads if they do not contain threads contributing to the lower-triangle
|
||||
if (GetGroupID1()*NWG > GetGroupID0()*MWG) {
|
||||
|
@ -345,10 +351,13 @@ __kernel void XgemmLower(const int kSizeN, const int kSizeK,
|
|||
// Main entry point of the kernel. This is the regular full version.
|
||||
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
|
||||
__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
|
||||
const real alpha, const real beta,
|
||||
const __constant real* restrict arg_alpha,
|
||||
const __constant real* restrict arg_beta,
|
||||
const __global realM* restrict agm,
|
||||
const __global realN* restrict bgm,
|
||||
__global realM* cgm) {
|
||||
const real alpha = arg_alpha[0];
|
||||
const real beta = arg_beta[0];
|
||||
|
||||
// Allocates workgroup-private memory (local memory)
|
||||
#if SA == 1
|
||||
|
|
|
@ -406,6 +406,7 @@ StatusCode Routine<T>::PadCopyTransposeMatrix(EventPointer event, std::vector<Ev
|
|||
// =================================================================================================
|
||||
|
||||
// Compiles the templated class
|
||||
template class Routine<half>;
|
||||
template class Routine<float>;
|
||||
template class Routine<double>;
|
||||
template class Routine<float2>;
|
||||
|
|
|
@ -20,6 +20,7 @@ namespace clblast {
|
|||
// =================================================================================================
|
||||
|
||||
// Specific implementations to get the memory-type based on a template argument
|
||||
template <> const Precision Xamax<half>::precision_ = Precision::kHalf;
|
||||
template <> const Precision Xamax<float>::precision_ = Precision::kSingle;
|
||||
template <> const Precision Xamax<double>::precision_ = Precision::kDouble;
|
||||
template <> const Precision Xamax<float2>::precision_ = Precision::kComplexSingle;
|
||||
|
@ -103,6 +104,7 @@ StatusCode Xamax<T>::DoAmax(const size_t n,
|
|||
// =================================================================================================
|
||||
|
||||
// Compiles the templated class
|
||||
template class Xamax<half>;
|
||||
template class Xamax<float>;
|
||||
template class Xamax<double>;
|
||||
template class Xamax<float2>;
|
||||
|
|
|
@ -20,6 +20,7 @@ namespace clblast {
|
|||
// =================================================================================================
|
||||
|
||||
// Specific implementations to get the memory-type based on a template argument
|
||||
template <> const Precision Xasum<half>::precision_ = Precision::kHalf;
|
||||
template <> const Precision Xasum<float>::precision_ = Precision::kSingle;
|
||||
template <> const Precision Xasum<double>::precision_ = Precision::kDouble;
|
||||
template <> const Precision Xasum<float2>::precision_ = Precision::kComplexSingle;
|
||||
|
@ -100,6 +101,7 @@ StatusCode Xasum<T>::DoAsum(const size_t n,
|
|||
// =================================================================================================
|
||||
|
||||
// Compiles the templated class
|
||||
template class Xasum<half>;
|
||||
template class Xasum<float>;
|
||||
template class Xasum<double>;
|
||||
template class Xasum<float2>;
|
||||
|
|
|
@ -20,6 +20,7 @@ namespace clblast {
|
|||
// =================================================================================================
|
||||
|
||||
// Specific implementations to get the memory-type based on a template argument
|
||||
template <> const Precision Xaxpy<half>::precision_ = Precision::kHalf;
|
||||
template <> const Precision Xaxpy<float>::precision_ = Precision::kSingle;
|
||||
template <> const Precision Xaxpy<double>::precision_ = Precision::kDouble;
|
||||
template <> const Precision Xaxpy<float2>::precision_ = Precision::kComplexSingle;
|
||||
|
@ -67,16 +68,20 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
|
|||
const auto program = GetProgramFromCache();
|
||||
auto kernel = Kernel(program, kernel_name);
|
||||
|
||||
// Upload the scalar argument as a constant buffer to the device (needed for half-precision)
|
||||
auto alpha_buffer = Buffer<T>(context_, 1);
|
||||
alpha_buffer.Write(queue_, 1, &alpha);
|
||||
|
||||
// Sets the kernel arguments
|
||||
if (use_fast_kernel) {
|
||||
kernel.SetArgument(0, static_cast<int>(n));
|
||||
kernel.SetArgument(1, alpha);
|
||||
kernel.SetArgument(1, alpha_buffer());
|
||||
kernel.SetArgument(2, x_buffer());
|
||||
kernel.SetArgument(3, y_buffer());
|
||||
}
|
||||
else {
|
||||
kernel.SetArgument(0, static_cast<int>(n));
|
||||
kernel.SetArgument(1, alpha);
|
||||
kernel.SetArgument(1, alpha_buffer());
|
||||
kernel.SetArgument(2, x_buffer());
|
||||
kernel.SetArgument(3, static_cast<int>(x_offset));
|
||||
kernel.SetArgument(4, static_cast<int>(x_inc));
|
||||
|
@ -107,6 +112,7 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
|
|||
// =================================================================================================
|
||||
|
||||
// Compiles the templated class
|
||||
template class Xaxpy<half>;
|
||||
template class Xaxpy<float>;
|
||||
template class Xaxpy<double>;
|
||||
template class Xaxpy<float2>;
|
||||
|
|
|
@ -20,6 +20,7 @@ namespace clblast {
|
|||
// =================================================================================================
|
||||
|
||||
// Specific implementations to get the memory-type based on a template argument
|
||||
template <> const Precision Xcopy<half>::precision_ = Precision::kHalf;
|
||||
template <> const Precision Xcopy<float>::precision_ = Precision::kSingle;
|
||||
template <> const Precision Xcopy<double>::precision_ = Precision::kDouble;
|
||||
template <> const Precision Xcopy<float2>::precision_ = Precision::kComplexSingle;
|
||||
|
@ -105,6 +106,7 @@ StatusCode Xcopy<T>::DoCopy(const size_t n,
|
|||
// =================================================================================================
|
||||
|
||||
// Compiles the templated class
|
||||
template class Xcopy<half>;
|
||||
template class Xcopy<float>;
|
||||
template class Xcopy<double>;
|
||||
template class Xcopy<float2>;
|
||||
|
|
|
@ -20,6 +20,7 @@ namespace clblast {
|
|||
// =================================================================================================
|
||||
|
||||
// Specific implementations to get the memory-type based on a template argument
|
||||
template <> const Precision Xdot<half>::precision_ = Precision::kHalf;
|
||||
template <> const Precision Xdot<float>::precision_ = Precision::kSingle;
|
||||
template <> const Precision Xdot<double>::precision_ = Precision::kDouble;
|
||||
template <> const Precision Xdot<float2>::precision_ = Precision::kComplexSingle;
|
||||
|
@ -108,6 +109,7 @@ StatusCode Xdot<T>::DoDot(const size_t n,
|
|||
// =================================================================================================
|
||||
|
||||
// Compiles the templated class
|
||||
template class Xdot<half>;
|
||||
template class Xdot<float>;
|
||||
template class Xdot<double>;
|
||||
template class Xdot<float2>;
|
||||
|
|
|
@ -20,6 +20,7 @@ namespace clblast {
|
|||
// =================================================================================================
|
||||
|
||||
// Specific implementations to get the memory-type based on a template argument
|
||||
template <> const Precision Xnrm2<half>::precision_ = Precision::kHalf;
|
||||
template <> const Precision Xnrm2<float>::precision_ = Precision::kSingle;
|
||||
template <> const Precision Xnrm2<double>::precision_ = Precision::kDouble;
|
||||
template <> const Precision Xnrm2<float2>::precision_ = Precision::kComplexSingle;
|
||||
|
@ -100,6 +101,7 @@ StatusCode Xnrm2<T>::DoNrm2(const size_t n,
|
|||
// =================================================================================================
|
||||
|
||||
// Compiles the templated class
|
||||
template class Xnrm2<half>;
|
||||
template class Xnrm2<float>;
|
||||
template class Xnrm2<double>;
|
||||
template class Xnrm2<float2>;
|
||||
|
|
|
@ -20,6 +20,7 @@ namespace clblast {
|
|||
// =================================================================================================
|
||||
|
||||
// Specific implementations to get the memory-type based on a template argument
|
||||
template <> const Precision Xscal<half>::precision_ = Precision::kHalf;
|
||||
template <> const Precision Xscal<float>::precision_ = Precision::kSingle;
|
||||
template <> const Precision Xscal<double>::precision_ = Precision::kDouble;
|
||||
template <> const Precision Xscal<float2>::precision_ = Precision::kComplexSingle;
|
||||
|
@ -99,6 +100,7 @@ StatusCode Xscal<T>::DoScal(const size_t n, const T alpha,
|
|||
// =================================================================================================
|
||||
|
||||
// Compiles the templated class
|
||||
template class Xscal<half>;
|
||||
template class Xscal<float>;
|
||||
template class Xscal<double>;
|
||||
template class Xscal<float2>;
|
||||
|
|
|
@ -20,6 +20,7 @@ namespace clblast {
|
|||
// =================================================================================================
|
||||
|
||||
// Specific implementations to get the memory-type based on a template argument
|
||||
template <> const Precision Xswap<half>::precision_ = Precision::kHalf;
|
||||
template <> const Precision Xswap<float>::precision_ = Precision::kSingle;
|
||||
template <> const Precision Xswap<double>::precision_ = Precision::kDouble;
|
||||
template <> const Precision Xswap<float2>::precision_ = Precision::kComplexSingle;
|
||||
|
@ -105,6 +106,7 @@ StatusCode Xswap<T>::DoSwap(const size_t n,
|
|||
// =================================================================================================
|
||||
|
||||
// Compiles the templated class
|
||||
template class Xswap<half>;
|
||||
template class Xswap<float>;
|
||||
template class Xswap<double>;
|
||||
template class Xswap<float2>;
|
||||
|
|
|
@ -58,6 +58,7 @@ StatusCode Xgbmv<T>::DoGbmv(const Layout layout, const Transpose a_transpose,
|
|||
// =================================================================================================
|
||||
|
||||
// Compiles the templated class
|
||||
template class Xgbmv<half>;
|
||||
template class Xgbmv<float>;
|
||||
template class Xgbmv<double>;
|
||||
template class Xgbmv<float2>;
|
||||
|
|
|
@ -20,6 +20,7 @@ namespace clblast {
|
|||
// =================================================================================================
|
||||
|
||||
// Specific implementations to set the precision based on the template argument
|
||||
template <> const Precision Xgemv<half>::precision_ = Precision::kHalf;
|
||||
template <> const Precision Xgemv<float>::precision_ = Precision::kSingle;
|
||||
template <> const Precision Xgemv<double>::precision_ = Precision::kDouble;
|
||||
template <> const Precision Xgemv<float2>::precision_ = Precision::kComplexSingle;
|
||||
|
@ -134,6 +135,12 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
|
|||
local_size = db_["WGS3"];
|
||||
}
|
||||
|
||||
// Upload the scalar arguments as constant buffers to the device (needed for half-precision)
|
||||
auto alpha_buffer = Buffer<T>(context_, 1);
|
||||
auto beta_buffer = Buffer<T>(context_, 1);
|
||||
alpha_buffer.Write(queue_, 1, &alpha);
|
||||
beta_buffer.Write(queue_, 1, &beta);
|
||||
|
||||
// Retrieves the Xgemv kernel from the compiled binary
|
||||
try {
|
||||
const auto program = GetProgramFromCache();
|
||||
|
@ -142,8 +149,8 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
|
|||
// Sets the kernel arguments
|
||||
kernel.SetArgument(0, static_cast<int>(m_real));
|
||||
kernel.SetArgument(1, static_cast<int>(n_real));
|
||||
kernel.SetArgument(2, alpha);
|
||||
kernel.SetArgument(3, beta);
|
||||
kernel.SetArgument(2, alpha_buffer());
|
||||
kernel.SetArgument(3, beta_buffer());
|
||||
kernel.SetArgument(4, static_cast<int>(a_rotated));
|
||||
kernel.SetArgument(5, a_buffer());
|
||||
kernel.SetArgument(6, static_cast<int>(a_offset));
|
||||
|
@ -173,6 +180,7 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
|
|||
// =================================================================================================
|
||||
|
||||
// Compiles the templated class
|
||||
template class Xgemv<half>;
|
||||
template class Xgemv<float>;
|
||||
template class Xgemv<double>;
|
||||
template class Xgemv<float2>;
|
||||
|
|
|
@ -20,6 +20,7 @@ namespace clblast {
|
|||
// =================================================================================================
|
||||
|
||||
// Specific implementations to set the precision based on the template argument
|
||||
template <> const Precision Xger<half>::precision_ = Precision::kHalf;
|
||||
template <> const Precision Xger<float>::precision_ = Precision::kSingle;
|
||||
template <> const Precision Xger<double>::precision_ = Precision::kDouble;
|
||||
template <> const Precision Xger<float2>::precision_ = Precision::kComplexSingle;
|
||||
|
@ -64,7 +65,11 @@ StatusCode Xger<T>::DoGer(const Layout layout,
|
|||
status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Retrieves the Xgemv kernel from the compiled binary
|
||||
// Upload the scalar argument as a constant buffer to the device (needed for half-precision)
|
||||
auto alpha_buffer = Buffer<T>(context_, 1);
|
||||
alpha_buffer.Write(queue_, 1, &alpha);
|
||||
|
||||
// Retrieves the kernel from the compiled binary
|
||||
try {
|
||||
const auto program = GetProgramFromCache();
|
||||
auto kernel = Kernel(program, "Xger");
|
||||
|
@ -72,7 +77,7 @@ StatusCode Xger<T>::DoGer(const Layout layout,
|
|||
// Sets the kernel arguments
|
||||
kernel.SetArgument(0, static_cast<int>(a_one));
|
||||
kernel.SetArgument(1, static_cast<int>(a_two));
|
||||
kernel.SetArgument(2, alpha);
|
||||
kernel.SetArgument(2, alpha_buffer());
|
||||
kernel.SetArgument(3, x_buffer());
|
||||
kernel.SetArgument(4, static_cast<int>(x_offset));
|
||||
kernel.SetArgument(5, static_cast<int>(x_inc));
|
||||
|
@ -100,6 +105,7 @@ StatusCode Xger<T>::DoGer(const Layout layout,
|
|||
// =================================================================================================
|
||||
|
||||
// Compiles the templated class
|
||||
template class Xger<half>;
|
||||
template class Xger<float>;
|
||||
template class Xger<double>;
|
||||
template class Xger<float2>;
|
||||
|
|
|
@ -19,6 +19,7 @@ namespace clblast {
|
|||
// =================================================================================================
|
||||
|
||||
// Specific implementations to set the precision based on the template arguments
|
||||
template <> const Precision Xher<half, half>::precision_ = Precision::kHalf;
|
||||
template <> const Precision Xher<float, float>::precision_ = Precision::kSingle;
|
||||
template <> const Precision Xher<double, double>::precision_ = Precision::kDouble;
|
||||
template <> const Precision Xher<float2, float>::precision_ = Precision::kComplexSingle;
|
||||
|
@ -43,6 +44,7 @@ template <> float2 Xher<float2,float>::GetAlpha(const float alpha) { return floa
|
|||
// Complex double case: wraps the real alpha in a double2 whose second (presumably imaginary) component is 0.0
template <> double2 Xher<double2,double>::GetAlpha(const double alpha) { return double2{alpha, 0.0}; }
|
||||
// Real single-precision case: T and U are both float, so alpha is returned unchanged
template <> float Xher<float,float>::GetAlpha(const float alpha) { return alpha; }
|
||||
// Real double-precision case: T and U are both double, so alpha is returned unchanged
template <> double Xher<double,double>::GetAlpha(const double alpha) { return alpha; }
|
||||
// Real half-precision case: T and U are both half, so alpha is returned unchanged
template <> half Xher<half,half>::GetAlpha(const half alpha) { return alpha; }
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
|
@ -63,9 +65,6 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
|
|||
(triangle == Triangle::kLower && layout == Layout::kRowMajor));
|
||||
const auto is_rowmajor = (layout == Layout::kRowMajor);
|
||||
|
||||
// Creates a matching version of alpha
|
||||
const auto matching_alpha = GetAlpha(alpha);
|
||||
|
||||
// Tests the matrix and the vectors for validity
|
||||
auto status = StatusCode::kSuccess;
|
||||
if (packed) { status = TestMatrixAP(n, a_buffer, a_offset, sizeof(T)); }
|
||||
|
@ -77,14 +76,21 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
|
|||
// If alpha is zero an update is not required
|
||||
if (alpha == U{0}) { return StatusCode::kSuccess; }
|
||||
|
||||
// Retrieves the Xgemv kernel from the compiled binary
|
||||
// Creates a matching version of alpha
|
||||
const auto matching_alpha = GetAlpha(alpha);
|
||||
|
||||
// Upload the scalar argument as a constant buffer to the device (needed for half-precision)
|
||||
auto alpha_buffer = Buffer<T>(context_, 1);
|
||||
alpha_buffer.Write(queue_, 1, &matching_alpha);
|
||||
|
||||
// Retrieves the kernel from the compiled binary
|
||||
try {
|
||||
const auto program = GetProgramFromCache();
|
||||
auto kernel = Kernel(program, "Xher");
|
||||
|
||||
// Sets the kernel arguments
|
||||
kernel.SetArgument(0, static_cast<int>(n));
|
||||
kernel.SetArgument(1, matching_alpha);
|
||||
kernel.SetArgument(1, alpha_buffer());
|
||||
kernel.SetArgument(2, x_buffer());
|
||||
kernel.SetArgument(3, static_cast<int>(x_offset));
|
||||
kernel.SetArgument(4, static_cast<int>(x_inc));
|
||||
|
@ -110,6 +116,7 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
|
|||
// =================================================================================================
|
||||
|
||||
// Compiles the templated class
|
||||
template class Xher<half, half>;
|
||||
template class Xher<float, float>;
|
||||
template class Xher<double, double>;
|
||||
template class Xher<float2, float>;
|
||||
|
|
|
@ -19,6 +19,7 @@ namespace clblast {
|
|||
// =================================================================================================
|
||||
|
||||
// Specific implementations to set the precision based on the template argument
|
||||
template <> const Precision Xher2<half>::precision_ = Precision::kHalf;
|
||||
template <> const Precision Xher2<float>::precision_ = Precision::kSingle;
|
||||
template <> const Precision Xher2<double>::precision_ = Precision::kDouble;
|
||||
template <> const Precision Xher2<float2>::precision_ = Precision::kComplexSingle;
|
||||
|
@ -66,14 +67,18 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
|
|||
status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
|
||||
if (ErrorIn(status)) { return status; }
|
||||
|
||||
// Retrieves the Xgemv kernel from the compiled binary
|
||||
// Upload the scalar argument as a constant buffer to the device (needed for half-precision)
|
||||
auto alpha_buffer = Buffer<T>(context_, 1);
|
||||
alpha_buffer.Write(queue_, 1, &alpha);
|
||||
|
||||
// Retrieves the kernel from the compiled binary
|
||||
try {
|
||||
const auto program = GetProgramFromCache();
|
||||
auto kernel = Kernel(program, "Xher2");
|
||||
|
||||
// Sets the kernel arguments
|
||||
kernel.SetArgument(0, static_cast<int>(n));
|
||||
kernel.SetArgument(1, alpha);
|
||||
kernel.SetArgument(1, alpha_buffer());
|
||||
kernel.SetArgument(2, x_buffer());
|
||||
kernel.SetArgument(3, static_cast<int>(x_offset));
|
||||
kernel.SetArgument(4, static_cast<int>(x_inc));
|
||||
|
@ -102,6 +107,7 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
|
|||
// =================================================================================================
|
||||
|
||||
// Compiles the templated class
|
||||
template class Xher2<half>;
|
||||
template class Xher2<float>;
|
||||
template class Xher2<double>;
|
||||
template class Xher2<float2>;
|
||||
|
|
|
@ -57,6 +57,7 @@ StatusCode Xsbmv<T>::DoSbmv(const Layout layout, const Triangle triangle,
|
|||
// =================================================================================================
|
||||
|
||||
// Compiles the templated class
|
||||
template class Xsbmv<half>;
|
||||
template class Xsbmv<float>;
|
||||
template class Xsbmv<double>;
|
||||
|
||||
|
|
|
@ -57,6 +57,7 @@ StatusCode Xspmv<T>::DoSpmv(const Layout layout, const Triangle triangle,
|
|||
// =================================================================================================
|
||||
|
||||
// Compiles the templated class
|
||||
template class Xspmv<half>;
|
||||
template class Xspmv<float>;
|
||||
template class Xspmv<double>;
|
||||
|
||||
|
|
|
@ -44,6 +44,7 @@ StatusCode Xspr<T>::DoSpr(const Layout layout, const Triangle triangle,
|
|||
// =================================================================================================
|
||||
|
||||
// Compiles the templated class
|
||||
template class Xspr<half>;
|
||||
template class Xspr<float>;
|
||||
template class Xspr<double>;
|
||||
|
||||
|
|
|
@ -46,6 +46,7 @@ StatusCode Xspr2<T>::DoSpr2(const Layout layout, const Triangle triangle,
|
|||
// =================================================================================================
|
||||
|
||||
// Compiles the templated class
|
||||
template class Xspr2<half>;
|
||||
template class Xspr2<float>;
|
||||
template class Xspr2<double>;
|
||||
|
||||
|
|
|
@ -57,6 +57,7 @@ StatusCode Xsymv<T>::DoSymv(const Layout layout, const Triangle triangle,
|
|||
// =================================================================================================
|
||||
|
||||
// Compiles the templated class
|
||||
template class Xsymv<half>;
|
||||
template class Xsymv<float>;
|
||||
template class Xsymv<double>;
|
||||
|
||||
|
|
|
@ -43,6 +43,7 @@ StatusCode Xsyr<T>::DoSyr(const Layout layout, const Triangle triangle,
|
|||
// =================================================================================================
|
||||
|
||||
// Compiles the templated class
|
||||
template class Xsyr<half>;
|
||||
template class Xsyr<float>;
|
||||
template class Xsyr<double>;
|
||||
|
||||
|
|
|
@ -45,6 +45,7 @@ StatusCode Xsyr2<T>::DoSyr2(const Layout layout, const Triangle triangle,
|
|||
// =================================================================================================
|
||||
|
||||
// Compiles the templated class
|
||||
template class Xsyr2<half>;
|
||||
template class Xsyr2<float>;
|
||||
template class Xsyr2<double>;
|
||||
|
||||
|
|
|
@ -72,6 +72,7 @@ StatusCode Xtbmv<T>::DoTbmv(const Layout layout, const Triangle triangle,
|
|||
// =================================================================================================
|
||||
|
||||
// Compiles the templated class
|
||||
template class Xtbmv<half>;
|
||||
template class Xtbmv<float>;
|
||||
template class Xtbmv<double>;
|
||||
template class Xtbmv<float2>;
|
||||
|
|
|
@ -72,6 +72,7 @@ StatusCode Xtpmv<T>::DoTpmv(const Layout layout, const Triangle triangle,
|
|||
// =================================================================================================
|
||||
|
||||
// Compiles the templated class
|
||||
template class Xtpmv<half>;
|
||||
template class Xtpmv<float>;
|
||||
template class Xtpmv<double>;
|
||||
template class Xtpmv<float2>;
|
||||
|
|
|
@ -72,6 +72,7 @@ StatusCode Xtrmv<T>::DoTrmv(const Layout layout, const Triangle triangle,
|
|||
// =================================================================================================
|
||||
|
||||
// Compiles the templated class
|
||||
template class Xtrmv<half>;
|
||||
template class Xtrmv<float>;
|
||||
template class Xtrmv<double>;
|
||||
template class Xtrmv<float2>;
|
||||
|
|
|
@ -20,6 +20,7 @@ namespace clblast {
|
|||
// =================================================================================================
|
||||
|
||||
// Specific implementations to set the precision based on the template argument
|
||||
template <> const Precision Xgemm<half>::precision_ = Precision::kHalf;
|
||||
template <> const Precision Xgemm<float>::precision_ = Precision::kSingle;
|
||||
template <> const Precision Xgemm<double>::precision_ = Precision::kDouble;
|
||||
template <> const Precision Xgemm<float2>::precision_ = Precision::kComplexSingle;
|
||||
|
@ -122,6 +123,12 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
|
|||
auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
|
||||
auto c_temp = (c_no_temp) ? c_buffer : Buffer<T>(context_, m_ceiled*n_ceiled);
|
||||
|
||||
// Upload the scalar arguments as constant buffers to the device (needed for half-precision)
|
||||
auto alpha_buffer = Buffer<T>(context_, 1);
|
||||
auto beta_buffer = Buffer<T>(context_, 1);
|
||||
alpha_buffer.Write(queue_, 1, &alpha);
|
||||
beta_buffer.Write(queue_, 1, &beta);
|
||||
|
||||
// Events of all kernels (including pre/post processing kernels)
|
||||
auto eventWaitList = std::vector<Event>();
|
||||
auto emptyEventList = std::vector<Event>();
|
||||
|
@ -169,8 +176,8 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
|
|||
kernel.SetArgument(0, static_cast<int>(m_ceiled));
|
||||
kernel.SetArgument(1, static_cast<int>(n_ceiled));
|
||||
kernel.SetArgument(2, static_cast<int>(k_ceiled));
|
||||
kernel.SetArgument(3, alpha);
|
||||
kernel.SetArgument(4, beta);
|
||||
kernel.SetArgument(3, alpha_buffer());
|
||||
kernel.SetArgument(4, beta_buffer());
|
||||
kernel.SetArgument(5, a_temp());
|
||||
kernel.SetArgument(6, b_temp());
|
||||
kernel.SetArgument(7, c_temp());
|
||||
|
@ -207,6 +214,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
|
|||
// =================================================================================================
|
||||
|
||||
// Compiles the templated class
|
||||
template class Xgemm<half>;
|
||||
template class Xgemm<float>;
|
||||
template class Xgemm<double>;
|
||||
template class Xgemm<float2>;
|
||||
|
|
|
@ -112,6 +112,13 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
|
|||
auto b2_temp = (b2_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
|
||||
auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
|
||||
|
||||
// Upload the scalar arguments as constant buffers to the device (needed for half-precision)
|
||||
auto complex_beta = T{beta, static_cast<U>(0.0)};
|
||||
auto alpha_buffer = Buffer<T>(context_, 1);
|
||||
auto beta_buffer = Buffer<T>(context_, 1);
|
||||
alpha_buffer.Write(queue_, 1, &alpha);
|
||||
beta_buffer.Write(queue_, 1, &complex_beta);
|
||||
|
||||
// Events of all kernels (including pre/post processing kernels)
|
||||
auto eventWaitList = std::vector<Event>();
|
||||
auto emptyEventList = std::vector<Event>();
|
||||
|
@ -171,11 +178,10 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
|
|||
auto kernel = Kernel(program, kernel_name);
|
||||
|
||||
// Sets the kernel arguments
|
||||
auto complex_beta = T{beta, static_cast<U>(0.0)};
|
||||
kernel.SetArgument(0, static_cast<int>(n_ceiled));
|
||||
kernel.SetArgument(1, static_cast<int>(k_ceiled));
|
||||
kernel.SetArgument(2, alpha);
|
||||
kernel.SetArgument(3, complex_beta);
|
||||
kernel.SetArgument(2, alpha_buffer());
|
||||
kernel.SetArgument(3, beta_buffer());
|
||||
kernel.SetArgument(4, a1_temp());
|
||||
kernel.SetArgument(5, b2_temp());
|
||||
kernel.SetArgument(6, c_temp());
|
||||
|
@ -196,8 +202,10 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
|
|||
// Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha
|
||||
auto conjugate_alpha = T{alpha.real(), -alpha.imag()};
|
||||
auto complex_one = T{static_cast<U>(1.0), static_cast<U>(0.0)};
|
||||
kernel.SetArgument(2, conjugate_alpha);
|
||||
kernel.SetArgument(3, complex_one);
|
||||
alpha_buffer.Write(queue_, 1, &conjugate_alpha);
|
||||
beta_buffer.Write(queue_, 1, &complex_one);
|
||||
kernel.SetArgument(2, alpha_buffer());
|
||||
kernel.SetArgument(3, beta_buffer());
|
||||
kernel.SetArgument(4, b1_temp());
|
||||
kernel.SetArgument(5, a2_temp());
|
||||
|
||||
|
|
|
@ -103,6 +103,14 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
|
|||
auto b_temp = (b_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
|
||||
auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
|
||||
|
||||
// Upload the scalar arguments as constant buffers to the device (needed for half-precision)
|
||||
auto complex_alpha = T{alpha, static_cast<U>(0.0)};
|
||||
auto complex_beta = T{beta, static_cast<U>(0.0)};
|
||||
auto alpha_buffer = Buffer<T>(context_, 1);
|
||||
auto beta_buffer = Buffer<T>(context_, 1);
|
||||
alpha_buffer.Write(queue_, 1, &complex_alpha);
|
||||
beta_buffer.Write(queue_, 1, &complex_beta);
|
||||
|
||||
// Events of all kernels (including pre/post processing kernels)
|
||||
auto eventWaitList = std::vector<Event>();
|
||||
auto emptyEventList = std::vector<Event>();
|
||||
|
@ -144,12 +152,10 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
|
|||
auto kernel = Kernel(program, kernel_name);
|
||||
|
||||
// Sets the kernel arguments
|
||||
auto complex_alpha = T{alpha, static_cast<U>(0.0)};
|
||||
auto complex_beta = T{beta, static_cast<U>(0.0)};
|
||||
kernel.SetArgument(0, static_cast<int>(n_ceiled));
|
||||
kernel.SetArgument(1, static_cast<int>(k_ceiled));
|
||||
kernel.SetArgument(2, complex_alpha);
|
||||
kernel.SetArgument(3, complex_beta);
|
||||
kernel.SetArgument(2, alpha_buffer());
|
||||
kernel.SetArgument(3, beta_buffer());
|
||||
kernel.SetArgument(4, a_temp());
|
||||
kernel.SetArgument(5, b_temp());
|
||||
kernel.SetArgument(6, c_temp());
|
||||
|
|
|
@ -127,6 +127,7 @@ StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle
|
|||
// =================================================================================================
|
||||
|
||||
// Compiles the templated class
|
||||
template class Xsymm<half>;
|
||||
template class Xsymm<float>;
|
||||
template class Xsymm<double>;
|
||||
template class Xsymm<float2>;
|
||||
|
|
|
@ -20,6 +20,7 @@ namespace clblast {
|
|||
// =================================================================================================
|
||||
|
||||
// Specific implementations to set the precision based on the template argument
|
||||
template <> const Precision Xsyr2k<half>::precision_ = Precision::kHalf;
|
||||
template <> const Precision Xsyr2k<float>::precision_ = Precision::kSingle;
|
||||
template <> const Precision Xsyr2k<double>::precision_ = Precision::kDouble;
|
||||
template <> const Precision Xsyr2k<float2>::precision_ = Precision::kComplexSingle;
|
||||
|
@ -104,6 +105,12 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
|
|||
auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
|
||||
auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
|
||||
|
||||
// Upload the scalar arguments as constant buffers to the device (needed for half-precision)
|
||||
auto alpha_buffer = Buffer<T>(context_, 1);
|
||||
auto beta_buffer = Buffer<T>(context_, 1);
|
||||
alpha_buffer.Write(queue_, 1, &alpha);
|
||||
beta_buffer.Write(queue_, 1, &beta);
|
||||
|
||||
// Events of all kernels (including pre/post processing kernels)
|
||||
auto eventWaitList = std::vector<Event>();
|
||||
auto emptyEventList = std::vector<Event>();
|
||||
|
@ -147,8 +154,8 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
|
|||
// Sets the kernel arguments
|
||||
kernel.SetArgument(0, static_cast<int>(n_ceiled));
|
||||
kernel.SetArgument(1, static_cast<int>(k_ceiled));
|
||||
kernel.SetArgument(2, alpha);
|
||||
kernel.SetArgument(3, beta);
|
||||
kernel.SetArgument(2, alpha_buffer());
|
||||
kernel.SetArgument(3, beta_buffer());
|
||||
kernel.SetArgument(4, a_temp());
|
||||
kernel.SetArgument(5, b_temp());
|
||||
kernel.SetArgument(6, c_temp());
|
||||
|
@ -168,7 +175,8 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
|
|||
|
||||
// Swaps the arguments for matrices A and B, and sets 'beta' to 1
|
||||
auto one = static_cast<T>(1);
|
||||
kernel.SetArgument(3, one);
|
||||
beta_buffer.Write(queue_, 1, &one);
|
||||
kernel.SetArgument(3, beta_buffer());
|
||||
kernel.SetArgument(4, b_temp());
|
||||
kernel.SetArgument(5, a_temp());
|
||||
|
||||
|
@ -196,6 +204,7 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
|
|||
// =================================================================================================
|
||||
|
||||
// Compiles the templated class
|
||||
template class Xsyr2k<half>;
|
||||
template class Xsyr2k<float>;
|
||||
template class Xsyr2k<double>;
|
||||
template class Xsyr2k<float2>;
|
||||
|
|
|
@ -20,6 +20,7 @@ namespace clblast {
|
|||
// =================================================================================================
|
||||
|
||||
// Specific implementations to set the precision based on the template argument
|
||||
template <> const Precision Xsyrk<half>::precision_ = Precision::kHalf;
|
||||
template <> const Precision Xsyrk<float>::precision_ = Precision::kSingle;
|
||||
template <> const Precision Xsyrk<double>::precision_ = Precision::kDouble;
|
||||
template <> const Precision Xsyrk<float2>::precision_ = Precision::kComplexSingle;
|
||||
|
@ -97,6 +98,12 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
|
|||
auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
|
||||
auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
|
||||
|
||||
// Upload the scalar arguments as constant buffers to the device (needed for half-precision)
|
||||
auto alpha_buffer = Buffer<T>(context_, 1);
|
||||
auto beta_buffer = Buffer<T>(context_, 1);
|
||||
alpha_buffer.Write(queue_, 1, &alpha);
|
||||
beta_buffer.Write(queue_, 1, &beta);
|
||||
|
||||
// Events of all kernels (including pre/post processing kernels)
|
||||
auto eventWaitList = std::vector<Event>();
|
||||
auto emptyEventList = std::vector<Event>();
|
||||
|
@ -131,8 +138,8 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
|
|||
// Sets the kernel arguments
|
||||
kernel.SetArgument(0, static_cast<int>(n_ceiled));
|
||||
kernel.SetArgument(1, static_cast<int>(k_ceiled));
|
||||
kernel.SetArgument(2, alpha);
|
||||
kernel.SetArgument(3, beta);
|
||||
kernel.SetArgument(2, alpha_buffer());
|
||||
kernel.SetArgument(3, beta_buffer());
|
||||
kernel.SetArgument(4, a_temp());
|
||||
kernel.SetArgument(5, a_temp());
|
||||
kernel.SetArgument(6, c_temp());
|
||||
|
@ -169,6 +176,7 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
|
|||
// =================================================================================================
|
||||
|
||||
// Compiles the templated class
|
||||
template class Xsyrk<half>;
|
||||
template class Xsyrk<float>;
|
||||
template class Xsyrk<double>;
|
||||
template class Xsyrk<float2>;
|
||||
|
|
|
@ -130,6 +130,7 @@ StatusCode Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle
|
|||
// =================================================================================================
|
||||
|
||||
// Compiles the templated class
|
||||
template class Xtrmm<half>;
|
||||
template class Xtrmm<float>;
|
||||
template class Xtrmm<double>;
|
||||
template class Xtrmm<float2>;
|
||||
|
|
|
@ -107,7 +107,7 @@ using double2 = clblast::double2;
|
|||
// Main function (not within the clblast namespace)
|
||||
int main(int argc, char *argv[]) {
|
||||
switch(clblast::GetPrecision(argc, argv)) {
|
||||
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
||||
case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneCopy<half>, half>(argc, argv); break;
|
||||
case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneCopy<float>, float>(argc, argv); break;
|
||||
case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneCopy<double>, double>(argc, argv); break;
|
||||
case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneCopy<float2>, float2>(argc, argv); break;
|
||||
|
|
|
@ -85,17 +85,17 @@ class TunePad {
|
|||
std::vector<T> &, std::vector<T> &,
|
||||
std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
|
||||
std::vector<T> &) {
|
||||
tuner.AddArgumentScalar(static_cast<int>(args.m));
|
||||
tuner.AddArgumentScalar(static_cast<int>(args.n));
|
||||
tuner.AddArgumentScalar(static_cast<int>(args.m));
|
||||
tuner.AddArgumentScalar(0);
|
||||
tuner.AddArgumentInput(a_mat);
|
||||
tuner.AddArgumentScalar(static_cast<int>(args.m));
|
||||
tuner.AddArgumentScalar(static_cast<int>(args.n));
|
||||
tuner.AddArgumentScalar(static_cast<int>(args.m));
|
||||
tuner.AddArgumentScalar(0);
|
||||
tuner.AddArgumentOutput(b_mat);
|
||||
tuner.AddArgumentScalar(0);
|
||||
tuner.AddArgumentScalar(static_cast<int>(args.m));
|
||||
tuner.AddArgumentScalar(static_cast<int>(args.n));
|
||||
tuner.AddArgumentScalar(static_cast<int>(args.m));
|
||||
tuner.AddArgumentScalar(0);
|
||||
tuner.AddArgumentInput(a_mat);
|
||||
tuner.AddArgumentScalar(static_cast<int>(args.m));
|
||||
tuner.AddArgumentScalar(static_cast<int>(args.n));
|
||||
tuner.AddArgumentScalar(static_cast<int>(args.m));
|
||||
tuner.AddArgumentScalar(0);
|
||||
tuner.AddArgumentOutput(b_mat);
|
||||
tuner.AddArgumentScalar(0);
|
||||
}
|
||||
|
||||
// Describes how to compute the performance metrics
|
||||
|
@ -115,7 +115,7 @@ using double2 = clblast::double2;
|
|||
// Main function (not within the clblast namespace)
|
||||
int main(int argc, char *argv[]) {
|
||||
switch(clblast::GetPrecision(argc, argv)) {
|
||||
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
||||
case clblast::Precision::kHalf: clblast::Tuner<clblast::TunePad<half>, half>(argc, argv); break;
|
||||
case clblast::Precision::kSingle: clblast::Tuner<clblast::TunePad<float>, float>(argc, argv); break;
|
||||
case clblast::Precision::kDouble: clblast::Tuner<clblast::TunePad<double>, double>(argc, argv); break;
|
||||
case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TunePad<float2>, float2>(argc, argv); break;
|
||||
|
|
|
@ -119,7 +119,7 @@ using double2 = clblast::double2;
|
|||
// Main function (not within the clblast namespace)
|
||||
int main(int argc, char *argv[]) {
|
||||
switch(clblast::GetPrecision(argc, argv)) {
|
||||
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
||||
case clblast::Precision::kHalf: clblast::Tuner<clblast::TunePadTranspose<half>, half>(argc, argv); break;
|
||||
case clblast::Precision::kSingle: clblast::Tuner<clblast::TunePadTranspose<float>, float>(argc, argv); break;
|
||||
case clblast::Precision::kDouble: clblast::Tuner<clblast::TunePadTranspose<double>, double>(argc, argv); break;
|
||||
case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TunePadTranspose<float2>, float2>(argc, argv); break;
|
||||
|
|
|
@ -112,7 +112,7 @@ using double2 = clblast::double2;
|
|||
// Main function (not within the clblast namespace)
|
||||
int main(int argc, char *argv[]) {
|
||||
switch(clblast::GetPrecision(argc, argv)) {
|
||||
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
||||
case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneTranspose<half>, half>(argc, argv); break;
|
||||
case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneTranspose<float>, float>(argc, argv); break;
|
||||
case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneTranspose<double>, double>(argc, argv); break;
|
||||
case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneTranspose<float2>, float2>(argc, argv); break;
|
||||
|
|
|
@ -89,8 +89,9 @@ class TuneXaxpy {
|
|||
std::vector<T> &x_vec, std::vector<T> &y_vec,
|
||||
std::vector<T> &, std::vector<T> &, std::vector<T> &,
|
||||
std::vector<T> &) {
|
||||
auto alpha_buffer = std::vector<T>{args.alpha};
|
||||
tuner.AddArgumentScalar(static_cast<int>(args.n));
|
||||
tuner.AddArgumentScalar(args.alpha);
|
||||
tuner.AddArgumentInput(alpha_buffer);
|
||||
tuner.AddArgumentInput(x_vec);
|
||||
tuner.AddArgumentOutput(y_vec);
|
||||
}
|
||||
|
@ -112,7 +113,7 @@ using double2 = clblast::double2;
|
|||
// Main function (not within the clblast namespace)
|
||||
int main(int argc, char *argv[]) {
|
||||
switch(clblast::GetPrecision(argc, argv)) {
|
||||
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
||||
case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneXaxpy<half>, half>(argc, argv); break;
|
||||
case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneXaxpy<float>, float>(argc, argv); break;
|
||||
case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneXaxpy<double>, double>(argc, argv); break;
|
||||
case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneXaxpy<float2>, float2>(argc, argv); break;
|
||||
|
|
|
@ -119,7 +119,7 @@ using double2 = clblast::double2;
|
|||
template <int V>
|
||||
void StartVariation(int argc, char *argv[]) {
|
||||
switch(clblast::GetPrecision(argc, argv)) {
|
||||
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
||||
case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneXdot<half, V>, half>(argc, argv); break;
|
||||
case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneXdot<float, V>, float>(argc, argv); break;
|
||||
case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneXdot<double, V>, double>(argc, argv); break;
|
||||
case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneXdot<float2, V>, float2>(argc, argv); break;
|
||||
|
|
|
@ -121,11 +121,13 @@ class TuneXgemm {
|
|||
std::vector<T> &, std::vector<T> &,
|
||||
std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &c_mat,
|
||||
std::vector<T> &) {
|
||||
auto alpha_buffer = std::vector<T>{args.alpha};
|
||||
auto beta_buffer = std::vector<T>{args.beta};
|
||||
tuner.AddArgumentScalar(static_cast<int>(args.m));
|
||||
tuner.AddArgumentScalar(static_cast<int>(args.n));
|
||||
tuner.AddArgumentScalar(static_cast<int>(args.k));
|
||||
tuner.AddArgumentScalar(args.alpha);
|
||||
tuner.AddArgumentScalar(args.beta);
|
||||
tuner.AddArgumentInput(alpha_buffer);
|
||||
tuner.AddArgumentInput(beta_buffer);
|
||||
tuner.AddArgumentInput(a_mat);
|
||||
tuner.AddArgumentInput(b_mat);
|
||||
tuner.AddArgumentOutput(c_mat);
|
||||
|
@ -148,7 +150,7 @@ using double2 = clblast::double2;
|
|||
// Main function (not within the clblast namespace)
|
||||
int main(int argc, char *argv[]) {
|
||||
switch(clblast::GetPrecision(argc, argv)) {
|
||||
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
||||
case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneXgemm<half>, half>(argc, argv); break;
|
||||
case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneXgemm<float>, float>(argc, argv); break;
|
||||
case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneXgemm<double>, double>(argc, argv); break;
|
||||
case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneXgemm<float2>, float2>(argc, argv); break;
|
||||
|
|
|
@ -96,11 +96,13 @@ class TuneXgemv {
|
|||
std::vector<T> &x_vec, std::vector<T> &y_vec,
|
||||
std::vector<T> &a_mat, std::vector<T> &, std::vector<T> &,
|
||||
std::vector<T> &) {
|
||||
auto alpha_buffer = std::vector<T>{args.alpha};
|
||||
auto beta_buffer = std::vector<T>{args.beta};
|
||||
auto a_rotated = (V==3) ? 1 : 0;
|
||||
tuner.AddArgumentScalar(static_cast<int>(args.m));
|
||||
tuner.AddArgumentScalar(static_cast<int>(args.n));
|
||||
tuner.AddArgumentScalar(args.alpha);
|
||||
tuner.AddArgumentScalar(args.beta);
|
||||
tuner.AddArgumentInput(alpha_buffer);
|
||||
tuner.AddArgumentInput(beta_buffer);
|
||||
tuner.AddArgumentScalar(static_cast<int>(a_rotated));
|
||||
tuner.AddArgumentInput(a_mat);
|
||||
tuner.AddArgumentScalar(0);
|
||||
|
@ -135,7 +137,7 @@ using double2 = clblast::double2;
|
|||
template <int V>
|
||||
void StartVariation(int argc, char *argv[]) {
|
||||
switch(clblast::GetPrecision(argc, argv)) {
|
||||
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
||||
case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneXgemv<half,V>, half>(argc, argv); break;
|
||||
case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneXgemv<float,V>, float>(argc, argv); break;
|
||||
case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneXgemv<double,V>, double>(argc, argv); break;
|
||||
case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneXgemv<float2,V>, float2>(argc, argv); break;
|
||||
|
|
|
@ -85,9 +85,10 @@ class TuneXger {
|
|||
std::vector<T> &x_vec, std::vector<T> &y_vec,
|
||||
std::vector<T> &a_mat, std::vector<T> &, std::vector<T> &,
|
||||
std::vector<T> &) {
|
||||
auto alpha_buffer = std::vector<T>{args.alpha};
|
||||
tuner.AddArgumentScalar(static_cast<int>(args.m));
|
||||
tuner.AddArgumentScalar(static_cast<int>(args.n));
|
||||
tuner.AddArgumentScalar(args.alpha);
|
||||
tuner.AddArgumentInput(alpha_buffer);
|
||||
tuner.AddArgumentInput(x_vec);
|
||||
tuner.AddArgumentScalar(0); // x_offset
|
||||
tuner.AddArgumentScalar(1); // x_increment
|
||||
|
@ -117,7 +118,7 @@ using double2 = clblast::double2;
|
|||
// Main function (not within the clblast namespace)
|
||||
int main(int argc, char *argv[]) {
|
||||
switch(clblast::GetPrecision(argc, argv)) {
|
||||
case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
|
||||
case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneXger<half>, half>(argc, argv); break;
|
||||
case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneXger<float>, float>(argc, argv); break;
|
||||
case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneXger<double>, double>(argc, argv); break;
|
||||
case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneXger<float2>, float2>(argc, argv); break;
|
||||
|
|
115
src/utilities.cc
115
src/utilities.cc
|
@ -22,6 +22,56 @@
|
|||
namespace clblast {
|
||||
// =================================================================================================
|
||||
|
||||
// Produces the default scalar value (2.0), converted to the requested type T.
// Explicit instantiations for float and double follow the definition.
template <typename T>
T GetScalar() {
  constexpr auto kDefaultScalar = 2.0;
  return static_cast<T>(kDefaultScalar);
}
template float GetScalar<float>();
template double GetScalar<double>();
|
||||
|
||||
// Specialized version of the above for half-precision
|
||||
template <>
|
||||
half GetScalar() {
|
||||
return FloatToHalf(2.0f);
|
||||
}
|
||||
|
||||
// Specialized versions of the above for complex data-types
|
||||
template <>
|
||||
float2 GetScalar() {
|
||||
return {2.0f, 0.5f};
|
||||
}
|
||||
template <>
|
||||
double2 GetScalar() {
|
||||
return {2.0, 0.5};
|
||||
}
|
||||
|
||||
// Produces the value one, converted to the requested type T.
// Explicit instantiations for float and double follow the definition.
template <typename T>
T ConstantOne() {
  constexpr auto kOne = 1.0;
  return static_cast<T>(kOne);
}
template float ConstantOne<float>();
template double ConstantOne<double>();
|
||||
|
||||
// Specialized version of the above for half-precision
|
||||
template <>
|
||||
half ConstantOne() {
|
||||
return FloatToHalf(1.0f);
|
||||
}
|
||||
|
||||
// Specialized versions of the above for complex data-types
|
||||
template <>
|
||||
float2 ConstantOne() {
|
||||
return {1.0f, 0.0f};
|
||||
}
|
||||
template <>
|
||||
double2 ConstantOne() {
|
||||
return {1.0, 0.0};
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Implements the string conversion using std::to_string if possible
|
||||
template <typename T>
|
||||
std::string ToString(T value) {
|
||||
|
@ -48,6 +98,12 @@ std::string ToString(double2 value) {
|
|||
return real.str()+"+"+imag.str()+"i";
|
||||
}
|
||||
|
||||
// If not possible directly: special case for half-precision
|
||||
template <>
|
||||
std::string ToString(half value) {
|
||||
return std::to_string(HalfToFloat(value));
|
||||
}
|
||||
|
||||
// If not possible directly: special cases for CLBlast data-types
|
||||
template <>
|
||||
std::string ToString(Layout value) {
|
||||
|
@ -105,6 +161,9 @@ template <typename T>
|
|||
T ConvertArgument(const char* value) {
|
||||
return static_cast<T>(std::stoi(value));
|
||||
}
|
||||
// Half-precision specialization: parses the text with std::stod, narrows the result to
// float, and converts that float with FloatToHalf.
template <>
half ConvertArgument(const char* value) {
  const auto parsed = std::stod(value);
  const auto narrowed = static_cast<float>(parsed);
  return FloatToHalf(narrowed);
}
|
||||
// Single-precision specialization: parses the text with std::stod and narrows the
// result to float.
template <>
float ConvertArgument(const char* value) {
  const auto parsed = std::stod(value);
  return static_cast<float>(parsed);
}
|
||||
|
@ -147,6 +206,7 @@ T GetArgument(const int argc, char *argv[], std::string &help,
|
|||
// Compiles the above function
|
||||
template int GetArgument<int>(const int, char **, std::string&, const std::string&, const int);
|
||||
template size_t GetArgument<size_t>(const int, char **, std::string&, const std::string&, const size_t);
|
||||
template half GetArgument<half>(const int, char **, std::string&, const std::string&, const half);
|
||||
template float GetArgument<float>(const int, char **, std::string&, const std::string&, const float);
|
||||
template double GetArgument<double>(const int, char **, std::string&, const std::string&, const double);
|
||||
template float2 GetArgument<float2>(const int, char **, std::string&, const std::string&, const float2);
|
||||
|
@ -227,24 +287,49 @@ void PopulateVector(std::vector<double2> &vector) {
|
|||
for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); }
|
||||
}
|
||||
|
||||
// Specialized versions of the above for half-precision
|
||||
template <>
|
||||
void PopulateVector(std::vector<half> &vector) {
|
||||
const auto lower_limit = static_cast<float>(kTestDataLowerLimit);
|
||||
const auto upper_limit = static_cast<float>(kTestDataUpperLimit);
|
||||
std::mt19937 mt(GetRandomSeed());
|
||||
std::uniform_real_distribution<float> dist(lower_limit, upper_limit);
|
||||
for (auto &element: vector) { element = FloatToHalf(dist(mt)); }
|
||||
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// Returns a scalar with a default value
|
||||
template <typename T>
|
||||
T GetScalar() {
|
||||
return static_cast<T>(2.0);
|
||||
// Conversion between half and single-precision
|
||||
std::vector<float> HalfToFloatBuffer(const std::vector<half>& source) {
|
||||
auto result = std::vector<float>(source.size());
|
||||
for (auto i = size_t(0); i < source.size(); ++i) { result[i] = HalfToFloat(source[i]); }
|
||||
return result;
|
||||
}
|
||||
void FloatToHalfBuffer(std::vector<half>& result, const std::vector<float>& source) {
|
||||
for (auto i = size_t(0); i < source.size(); ++i) { result[i] = FloatToHalf(source[i]); }
|
||||
}
|
||||
template float GetScalar<float>();
|
||||
template double GetScalar<double>();
|
||||
|
||||
// Specialized versions of the above for complex data-types
|
||||
template <>
|
||||
float2 GetScalar() {
|
||||
return {2.0f, 0.5f};
|
||||
// As above, but now for OpenCL data-types instead of std::vectors
|
||||
Buffer<float> HalfToFloatBuffer(const Buffer<half>& source, cl_command_queue queue_raw) {
|
||||
const auto size = source.GetSize() / sizeof(half);
|
||||
auto queue = Queue(queue_raw);
|
||||
auto context = queue.GetContext();
|
||||
auto source_cpu = std::vector<half>(size);
|
||||
source.Read(queue, size, source_cpu);
|
||||
auto result_cpu = HalfToFloatBuffer(source_cpu);
|
||||
auto result = Buffer<float>(context, size);
|
||||
result.Write(queue, size, result_cpu);
|
||||
return result;
|
||||
}
|
||||
template <>
|
||||
double2 GetScalar() {
|
||||
return {2.0, 0.5};
|
||||
// Device-buffer counterpart of the std::vector conversion: reads a single-precision buffer
// back to the host, converts it to half-precision, and writes the converted data into the
// caller-provided `result` buffer.
void FloatToHalfBuffer(Buffer<half>& result, const Buffer<float>& source, cl_command_queue queue_raw) {
  // GetSize() is divided by the element size to obtain the number of float values
  const auto num_elements = source.GetSize() / sizeof(float);

  Queue queue(queue_raw);
  // NOTE(review): the context is retrieved but not used further in this function — confirm
  // whether the call is needed.
  auto context = queue.GetContext();

  // Device -> host
  std::vector<float> host_floats(num_elements);
  source.Read(queue, num_elements, host_floats);

  // Host-side conversion into a temporary of matching length
  std::vector<half> host_halves(num_elements);
  FloatToHalfBuffer(host_halves, host_floats);

  // Host -> destination device buffer
  result.Write(queue, num_elements, host_halves);
}
|
||||
|
||||
// =================================================================================================
|
||||
|
@ -288,6 +373,10 @@ template <> bool PrecisionSupported<double2>(const Device &device) {
|
|||
auto extensions = device.Capabilities();
|
||||
return (extensions.find(kKhronosDoublePrecision) == std::string::npos) ? false : true;
|
||||
}
|
||||
// Half-precision specialization: reports support when the device's capabilities string
// contains the kKhronosHalfPrecision extension name.
template <>
bool PrecisionSupported<half>(const Device &device) {
  const auto capabilities = device.Capabilities();
  return capabilities.find(kKhronosHalfPrecision) != std::string::npos;
}
|
||||
|
||||
// =================================================================================================
|
||||
} // namespace clblast
|
||||
|
|
|
@ -22,6 +22,7 @@ int main(int argc, char *argv[]) {
|
|||
clblast::RunTests<clblast::TestXamax<double>, double, double>(argc, argv, true, "iDAMAX");
|
||||
clblast::RunTests<clblast::TestXamax<float2>, float2, float2>(argc, argv, true, "iCAMAX");
|
||||
clblast::RunTests<clblast::TestXamax<double2>, double2, double2>(argc, argv, true, "iZAMAX");
|
||||
clblast::RunTests<clblast::TestXamax<half>, half, half>(argc, argv, true, "iHAMAX");
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -22,6 +22,7 @@ int main(int argc, char *argv[]) {
|
|||
clblast::RunTests<clblast::TestXasum<double>, double, double>(argc, argv, true, "DASUM");
|
||||
clblast::RunTests<clblast::TestXasum<float2>, float2, float2>(argc, argv, true, "ScASUM");
|
||||
clblast::RunTests<clblast::TestXasum<double2>, double2, double2>(argc, argv, true, "DzASUM");
|
||||
clblast::RunTests<clblast::TestXasum<half>, half, half>(argc, argv, true, "HASUM");
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -22,6 +22,7 @@ int main(int argc, char *argv[]) {
|
|||
clblast::RunTests<clblast::TestXaxpy<double>, double, double>(argc, argv, true, "DAXPY");
|
||||
clblast::RunTests<clblast::TestXaxpy<float2>, float2, float2>(argc, argv, true, "CAXPY");
|
||||
clblast::RunTests<clblast::TestXaxpy<double2>, double2, double2>(argc, argv, true, "ZAXPY");
|
||||
clblast::RunTests<clblast::TestXaxpy<half>, half, half>(argc, argv, true, "HAXPY");
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -22,6 +22,7 @@ int main(int argc, char *argv[]) {
|
|||
clblast::RunTests<clblast::TestXcopy<double>, double, double>(argc, argv, true, "DCOPY");
|
||||
clblast::RunTests<clblast::TestXcopy<float2>, float2, float2>(argc, argv, true, "CCOPY");
|
||||
clblast::RunTests<clblast::TestXcopy<double2>, double2, double2>(argc, argv, true, "ZCOPY");
|
||||
clblast::RunTests<clblast::TestXcopy<half>, half, half>(argc, argv, true, "HCOPY");
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -20,6 +20,7 @@ using double2 = clblast::double2;
|
|||
int main(int argc, char *argv[]) {
|
||||
clblast::RunTests<clblast::TestXdot<float>, float, float>(argc, argv, false, "SDOT");
|
||||
clblast::RunTests<clblast::TestXdot<double>, double, double>(argc, argv, true, "DDOT");
|
||||
clblast::RunTests<clblast::TestXdot<half>, half, half>(argc, argv, true, "HDOT");
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -22,6 +22,7 @@ int main(int argc, char *argv[]) {
|
|||
clblast::RunTests<clblast::TestXnrm2<double>, double, double>(argc, argv, true, "DNRM2");
|
||||
clblast::RunTests<clblast::TestXnrm2<float2>, float2, float2>(argc, argv, true, "ScNRM2");
|
||||
clblast::RunTests<clblast::TestXnrm2<double2>, double2, double2>(argc, argv, true, "DzNRM2");
|
||||
clblast::RunTests<clblast::TestXnrm2<half>, half, half>(argc, argv, true, "HNRM2");
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -22,6 +22,7 @@ int main(int argc, char *argv[]) {
|
|||
clblast::RunTests<clblast::TestXscal<double>, double, double>(argc, argv, true, "DSCAL");
|
||||
clblast::RunTests<clblast::TestXscal<float2>, float2, float2>(argc, argv, true, "CSCAL");
|
||||
clblast::RunTests<clblast::TestXscal<double2>, double2, double2>(argc, argv, true, "ZSCAL");
|
||||
clblast::RunTests<clblast::TestXscal<half>, half, half>(argc, argv, true, "HSCAL");
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -22,6 +22,7 @@ int main(int argc, char *argv[]) {
|
|||
clblast::RunTests<clblast::TestXswap<double>, double, double>(argc, argv, true, "DSWAP");
|
||||
clblast::RunTests<clblast::TestXswap<float2>, float2, float2>(argc, argv, true, "CSWAP");
|
||||
clblast::RunTests<clblast::TestXswap<double2>, double2, double2>(argc, argv, true, "ZSWAP");
|
||||
clblast::RunTests<clblast::TestXswap<half>, half, half>(argc, argv, true, "HSWAP");
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -22,6 +22,7 @@ int main(int argc, char *argv[]) {
|
|||
clblast::RunTests<clblast::TestXgbmv<double>, double, double>(argc, argv, true, "DGBMV");
|
||||
clblast::RunTests<clblast::TestXgbmv<float2>, float2, float2>(argc, argv, true, "CGBMV");
|
||||
clblast::RunTests<clblast::TestXgbmv<double2>, double2, double2>(argc, argv, true, "ZGBMV");
|
||||
clblast::RunTests<clblast::TestXgbmv<half>, half, half>(argc, argv, true, "HGBMV");
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -22,6 +22,7 @@ int main(int argc, char *argv[]) {
|
|||
clblast::RunTests<clblast::TestXgemv<double>, double, double>(argc, argv, true, "DGEMV");
|
||||
clblast::RunTests<clblast::TestXgemv<float2>, float2, float2>(argc, argv, true, "CGEMV");
|
||||
clblast::RunTests<clblast::TestXgemv<double2>, double2, double2>(argc, argv, true, "ZGEMV");
|
||||
clblast::RunTests<clblast::TestXgemv<half>, half, half>(argc, argv, true, "HGEMV");
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -20,6 +20,7 @@ using double2 = clblast::double2;
|
|||
int main(int argc, char *argv[]) {
|
||||
clblast::RunTests<clblast::TestXger<float>, float, float>(argc, argv, false, "SGER");
|
||||
clblast::RunTests<clblast::TestXger<double>, double, double>(argc, argv, true, "DGER");
|
||||
clblast::RunTests<clblast::TestXger<half>, half, half>(argc, argv, true, "HGER");
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -20,6 +20,7 @@ using double2 = clblast::double2;
|
|||
int main(int argc, char *argv[]) {
|
||||
clblast::RunTests<clblast::TestXsbmv<float>, float, float>(argc, argv, false, "SSBMV");
|
||||
clblast::RunTests<clblast::TestXsbmv<double>, double, double>(argc, argv, true, "DSBMV");
|
||||
clblast::RunTests<clblast::TestXsbmv<half>, half, half>(argc, argv, true, "HSBMV");
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -20,6 +20,7 @@ using double2 = clblast::double2;
|
|||
int main(int argc, char *argv[]) {
|
||||
clblast::RunTests<clblast::TestXspmv<float>, float, float>(argc, argv, false, "SSPMV");
|
||||
clblast::RunTests<clblast::TestXspmv<double>, double, double>(argc, argv, true, "DSPMV");
|
||||
clblast::RunTests<clblast::TestXspmv<half>, half, half>(argc, argv, true, "HSPMV");
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -20,6 +20,7 @@ using double2 = clblast::double2;
|
|||
int main(int argc, char *argv[]) {
|
||||
clblast::RunTests<clblast::TestXspr<float>, float, float>(argc, argv, false, "SSPR");
|
||||
clblast::RunTests<clblast::TestXspr<double>, double, double>(argc, argv, true, "DSPR");
|
||||
clblast::RunTests<clblast::TestXspr<half>, half, half>(argc, argv, true, "HSPR");
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -20,6 +20,7 @@ using double2 = clblast::double2;
|
|||
int main(int argc, char *argv[]) {
|
||||
clblast::RunTests<clblast::TestXspr2<float>, float, float>(argc, argv, false, "SSPR2");
|
||||
clblast::RunTests<clblast::TestXspr2<double>, double, double>(argc, argv, true, "DSPR2");
|
||||
clblast::RunTests<clblast::TestXspr2<half>, half, half>(argc, argv, true, "HSPR2");
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -20,6 +20,7 @@ using double2 = clblast::double2;
|
|||
int main(int argc, char *argv[]) {
|
||||
clblast::RunTests<clblast::TestXsymv<float>, float, float>(argc, argv, false, "SSYMV");
|
||||
clblast::RunTests<clblast::TestXsymv<double>, double, double>(argc, argv, true, "DSYMV");
|
||||
clblast::RunTests<clblast::TestXsymv<half>, half, half>(argc, argv, true, "HSYMV");
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -20,6 +20,7 @@ using double2 = clblast::double2;
|
|||
int main(int argc, char *argv[]) {
|
||||
clblast::RunTests<clblast::TestXsyr<float>, float, float>(argc, argv, false, "SSYR");
|
||||
clblast::RunTests<clblast::TestXsyr<double>, double, double>(argc, argv, true, "DSYR");
|
||||
clblast::RunTests<clblast::TestXsyr<half>, half, half>(argc, argv, true, "HSYR");
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -20,6 +20,7 @@ using double2 = clblast::double2;
|
|||
int main(int argc, char *argv[]) {
|
||||
clblast::RunTests<clblast::TestXsyr2<float>, float, float>(argc, argv, false, "SSYR2");
|
||||
clblast::RunTests<clblast::TestXsyr2<double>, double, double>(argc, argv, true, "DSYR2");
|
||||
clblast::RunTests<clblast::TestXsyr2<half>, half, half>(argc, argv, true, "HSYR2");
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -22,6 +22,7 @@ int main(int argc, char *argv[]) {
|
|||
clblast::RunTests<clblast::TestXtbmv<double>, double, double>(argc, argv, true, "DTBMV");
|
||||
clblast::RunTests<clblast::TestXtbmv<float2>, float2, float2>(argc, argv, true, "CTBMV");
|
||||
clblast::RunTests<clblast::TestXtbmv<double2>, double2, double2>(argc, argv, true, "ZTBMV");
|
||||
clblast::RunTests<clblast::TestXtbmv<half>, half, half>(argc, argv, true, "HTBMV");
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Reference in a new issue